In [1]:
import re

from wikitools import wiki, category, api

In [2]:
class Article:
    
    def __init__(self, title, body, links=None, categories=None, templates=None):
        self.title = title
        self.body = body
        self.links = links
        self.categories = categories
        self.templates = templates
        
    def to_json(self):
        result = dict()
        result.update({"title": self.title})
        result.update({"body": self.body})
        
        links = list()
        for l in self.links:
            links.append({"link": l[0], "anchor": l[1]})
       
        result.update({"links": links})   
        
        categories = list()
        for c in self.categories:
            categories.append(c)
       
        result.update({"categories": categories})
        
        templates = list()
        for t in self.templates:
            templates.append(t)
        
        result.update({"templates": templates})
        return json.dumps(result, encoding="utf-8")

In [3]:
def link_finder(content_string):
    links = list()
    for i,j in re.findall(r'\[\[([^|\]]*\|)?([^\]]+)\]\]',content_string):
        if len(i) == 0:
            links.append((j, j))
        elif u'#' not in i :
            links.append((j, i[:-1]))
        elif u'#' in i:
            new_i = i[:i.index(u'#')]
            links.append((j, new_i))
    links = [l for l in links if u'|' not in l[1] and u'Category:' not in l[1] and u'File:' not in l[1]]
    return links

In [4]:
# utils
def convert_to_datetime(string):
    dt = datetime.datetime.strptime(string,'%Y-%m-%dT%H:%M:%SZ')
    return dt
    
def convert_from_datetime(dt):
    string = dt.strftime('%Y%m%d%H%M%S')
    return string

def cast_to_unicode(string):
    if isinstance(string,str):
        try:
            string2 = string.decode('utf8')
        except:
            try:
                string2 = string.decode('latin1')
            except:
                print "Some messed up encoding here"
    elif isinstance(string,unicode):
        string2 = string
    return string2

In [5]:
def rename_on_redirect(article_title,lang='en'):
    result = query_wikipedia({'titles': article_title,
                              'prop': 'info',
                              'action': 'query',
                              'redirects': 'True'},lang)
    if 'redirects' in result.keys() and 'pages' in result.keys():
        article_title = result['redirects'][0]['to']
    return article_title

In [6]:
def query_wikipedia(query_params, lang="en"):
    site = wiki.Wiki(url='http://'+lang+'.wikipedia.org/w/api.php')
    request = api.APIRequest(site, query_params)
    result = request.query()
    return result
#     return result[query_params['action']]

In [7]:
def get_category_members(category_name, depth, lang='en'):
    articles = []
    if depth < 0:
        return articles
    continue_query = ""
    while (True):
        #Begin crawling articles in category
        res = query_wikipedia({'list': 'categorymembers',
                                   'cmtitle': category_name,
                                   'cmtype': 'page',
                                   'cmlimit': '500',
                                   'action': 'query',
                                   'cmcontinue': continue_query},lang) 
        if 'continue' not in res.keys():
            break
        else:
            continue_query = res['continue']['cmcontinue']
        results = res['query']
        if 'categorymembers' in results.keys() and len(results['categorymembers']) > 0:
            for i, page in enumerate(results['categorymembers']):
                article = page['title']
                articles.append(article)

        # Begin crawling subcategories
        results = query_wikipedia({'list': 'categorymembers',
                                       'cmtitle': category_name,
                                       'cmtype': 'subcat',
                                       'cmlimit': '500',
                                       'action': 'query',
                                       'cmcontinue': continue_query},lang)
        subcategories = []
        if 'categorymembers' in results.keys() and len(results['categorymembers']) > 0:
            for i, category in enumerate(results['categorymembers']):
                cat_title = category['title']
                subcategories.append(cat_title)
        for category in subcategories:
            articles += get_category_members(category,depth-1)      
    return articles

In [8]:
def get_page_content(page_title,lang):
    article_title = rename_on_redirect(page_title, lang)
    revisions_dict = dict()
    result = query_wikipedia({'titles': article_title,
                              'prop': 'revisions',
                              'rvprop': 'ids|content',
                              'rvlimit': '5000',
                              'action': 'query'},lang)
    if result and 'pages' in result.keys():
        page_number = result['pages'].keys()[0]
        try:
            revisions = result['pages'][page_number]['revisions']
            for revision in revisions:
                rev = dict()
                rev['pageid'] = page_number
                rev['title'] = result['pages'][page_number]['title']
                rev['content'] = revision.get('*',unicode()) # Sometimes content hidden, return with empty unicode string
                rev['revid'] = revision['revid']
                revisions_dict[revision['revid']] = rev
        except:
            pass
    return revisions_dict

In [9]:
def get_page_categories(page_title, lang='en'):
    page_title = rename_on_redirect(page_title, lang)
    results = query_wikipedia({'prop': 'categories',
                               'titles': page_title,
                               'cllimit': '500',
                               'clshow':'!hidden',
                               'action': 'query'},lang)
    if 'pages' in results.keys():
        page_number = results['pages'].keys()[0]
        categories = results['pages'][page_number]['categories']
        categories = [i['title'] for i in categories]
        cat = list()
        for c in categories:
            res = False
            r = re.compile(u'Category:(.*)')
            cat.append(re.findall(r, c)[0])
    else:
        print u"{0} not found in category results".format(page_title)
    return cat

In [10]:
def get_page_templates(page_title, lang):
    page_title = cast_to_unicode(page_title)
    page_title = rename_on_redirect(page_title, lang)
    result = query_wikipedia({'titles': page_title,
                              'prop': 'templates',
                              'tllimit': '500',
                              'action': 'query'},lang)
    if 'pages' in result.keys():
        page_id = result['pages'].keys()[0]
        templates = list()
        if 'templates' in result['pages'][page_id].keys():
            templates = [i['title'] for i in result['pages'][page_id]['templates']]
    return templates

In [11]:
def get_article(title, lang="en"):
    result = None
    content = get_page_content(title, lang)
    # we take only last revision
    rev_key = content.keys()[0]
    categories = get_page_categories(title, lang)
    templates = get_page_templates(title, lang)
    result = Article(content[rev_key]["title"], 
                          content[rev_key]["content"],
                          link_finder(content[rev_key]["content"]),
                          categories,
                          templates)
    return result

In [12]:
dis_pages = get_category_members("Category:All_disambiguation_pages", 1)

Server lag, sleeping for 6 seconds
Server lag, sleeping for 6 seconds
Server lag, sleeping for 9 seconds
Server lag, sleeping for 6 seconds
Server lag, sleeping for 6 seconds


In [13]:
len(dis_pages)

268000

In [14]:
dis_pages

[u'! (disambiguation)',
 u'!!',
 u'" (disambiguation)',
 u'$ (disambiguation)',
 u'$1',
 u'$1 coin',
 u'$10',
 u'$100',
 u'$1000',
 u'$2',
 u'$20',
 u'$200',
 u'$3',
 u'$5',
 u'$50',
 u'$500',
 u'%s',
 u'& (disambiguation)',
 u'&&',
 u"' (disambiguation)",
 u"'01",
 u"'77",
 u"'78",
 u"'A'",
 u"'Abd al-Malik of Samanid",
 u"'Ad",
 u"'Amakihi",
 u"'Are'are",
 u"'hood",
 u"'Nique",
 u"'Phags-pa",
 u"'s",
 u"'t Haantje",
 u"'t Zand",
 u"'Tis the Season",
 u'( ) (disambiguation)',
 u'(A)',
 u'* (disambiguation)',
 u'***',
 u'*=',
 u'+ (disambiguation)',
 u'+++',
 u'+1',
 u'+972',
 u'- (disambiguation)',
 u'--',
 u'-er',
 u'-ey',
 u'-i',
 u'-O',
 u'-s',
 u'-th',
 u'. .',
 u'.223 (disambiguation)',
 u'.303',
 u'.32',
 u'.357',
 u'.460',
 u'.577',
 u'.abc',
 u'.cab',
 u'.codes (disambiguation)',
 u'.cpp',
 u'.dot',
 u'.dss',
 u'.erl',
 u'.m',
 u'.mm (disambiguation)',
 u'.net (disambiguation)',
 u'.odc',
 u'.one',
 u'.package',
 u'.pdm',
 u'.pot',
 u'.rex',
 u'.sdc',
 u'.sgm',
 u'.sol',
 u'.s

In [52]:
# Filter titles containing non alphabetical symbols
non_alpha_elements = "[\\\'\"@+?.,\/#!$%\^&\*;:{}=\-_`~()\d]"
a = re.findall(non_alpha_elements, "a,")

In [53]:
clean_dis_pages = [title for title in dis_pages if len(re.findall(non_alpha_elements, title)) == 0 ]     

In [58]:
len(clean_dis_pages)

200138

In [62]:
# Filter stop words
from nltk.corpus import stopwords

filtered_clean_dis_pages = [word for word in clean_dis_pages if word.lower() not in stopwords.words('english')]

In [68]:
import codecs

with open('disambiguation_pages.csv', 'w') as f:
    f.write(codecs.encode("\n".join(filtered_clean_dis_pages), 'utf-8'))
        

In [73]:
# Select random (uniformly distributed) N pages
import random
N = 1000 
selected_indices = [random.randrange(0, len(filtered_clean_dis_pages)) for i in range(N)]


In [76]:
sampled_dis_pages = [filtered_clean_dis_pages[i] for i in selected_indices]

In [77]:
sampled_dis_pages

[u'VLV',
 u'Wirral Grammar',
 u'White cell',
 u'We Are the People',
 u'Valona',
 u'Sukumar Sen',
 u'Parian',
 u'Tabor',
 u'Tityus',
 u'Gregg House',
 u'Thomas Bayly',
 u'Kingsgate',
 u'Arthur Goddard',
 u'Good book',
 u'Anterior cutaneous branch',
 u'Morrighan',
 u'George Riddell',
 u'Kooringal',
 u'Keenaght',
 u'We Are One',
 u'Inseparable',
 u'XT',
 u'Shineh',
 u'Hilton House',
 u'Alberto P\xe9rez',
 u'Cadabra',
 u'Piel Pinocchio',
 u'Freedom Act',
 u'Wieliczna',
 u'BITS',
 u'New Hudson',
 u'Altcar',
 u'Kingdom',
 u'Bill Schultz',
 u'Valley of Peace',
 u'Kitts',
 u'Katie Anderson',
 u'Peter Nelson',
 u'Kryl',
 u'NLI',
 u'Al Hubbard',
 u'Sterpu River',
 u'Dancy',
 u'Dryden Historic District',
 u'Vadsky',
 u'Identity token',
 u'Wicks',
 u'Abu Nasir',
 u'Willowbrook',
 u'Club',
 u'IEE',
 u'Molinos de viento',
 u'German Order',
 u'Sequential system',
 u'Scission',
 u'Zhanghuai',
 u'Hungry Hearts',
 u'Czarnia',
 u'Christopher Batt',
 u'Rampage',
 u'Similitude',
 u'Tibor K\xe1rolyi',
 u'Ze