In [53]:
import lxml.etree
import urllib
import urllib.request
import pandas as pd


def get_wiki_xml(title):
    title = title
    params = { "format":"xml", "action":"query", "prop":"revisions", "rvprop":"timestamp|user|comment|content" }
    params["titles"] = "API|%s" % urllib.parse.quote(title.encode("utf8"))
    qs = "&".join("%s=%s" % (k, v)  for k, v in params.items())
    url = "http://en.wikipedia.org/w/api.php?%s" % qs
    tree = lxml.etree.parse(urllib.request.urlopen(url))
    revs = tree.xpath('//rev')
    return (revs[-1].text)

In [98]:
popular_dataframe = pd.read_csv('../data/popular_categories.csv')

In [99]:
popular_dataframe = popular_dataframe.loc[:, ['Category', 'Page']]

In [109]:
import pymongo

In [110]:
from pymongo import MongoClient

In [111]:
client = MongoClient('mongodb://localhost:27017/')

In [112]:
wiki_db = client['popular_wiki_database']

In [113]:
collection = wiki_db['popular_wiki_database']

An important note about collections (and databases) in MongoDB is that they are created lazily - none of the above commands have actually performed any operations on the MongoDB server. Collections and databases are created when the first document is inserted into them.

In [50]:
def get_wiki_xml(title):
    title = title
    params = { "format":"xml", "action":"query", "prop":"revisions", "rvprop":"timestamp|user|comment|content" }
    params["titles"] = "API|%s" % urllib.parse.quote(title.encode("utf8"))
    qs = "&".join("%s=%s" % (k, v)  for k, v in params.items())
    url = "http://en.wikipedia.org/w/api.php?%s" % qs
    tree = lxml.etree.parse(urllib.request.urlopen(url))
    revs = tree.xpath('//rev')
    return (revs[-1].text)

In [114]:
pop = popular_dataframe.replace('\xa0', ' ', regex=True)
pop.drop_duplicates(subset='Page', inplace=True)
pop.Page = pop.Page.astype(str, inplace=True)
pop.dropna(inplace=True)

In [115]:
for category, title in zip(pop.Category, pop.Page):
    wiki_xml = get_wiki_xml(title)
    post = {'category': category,
            'text': wiki_xml}
    posts = wiki_db.posts
    post_id = posts.insert_one(post).inserted_id

In [141]:
data = pd.DataFrame(list(posts.find()))

In [142]:
data = data[data['text'] != ""]
data = data[data['text'].str.contains("#redirect") == False]
data = data[data['text'].str.contains("may refer to:\n\n*") == False]
data = data[data['text'].str.contains("can refer to:\n") == False]
data = data[data['text'].str.contains("could refer to:\n") == False]
data = data[data['text'].str.contains("#REDIRECT") == False]
data = data[data['text'].str.contains("== Matches ==\n:") == False]
data = data[data['text'].str.contains("{{underconstruction") == False]

In [176]:
file = '../data/enwiki.observations.text_wp10.30k.tsv'
raw_data = pd.read_csv(file, sep='\t', header=None)
data = pd.DataFrame(data=list(raw_data[0].apply(literal_eval)))
data = data[:100]
data = data[data['text'] != ""]
data = data[data['text'].str.contains("#redirect") == False]
data = data[data['text'].str.contains("may refer to:\n\n*") == False]
data = data[data['text'].str.contains("can refer to:\n") == False]
data = data[data['text'].str.contains("could refer to:\n") == False]
data = data[data['text'].str.contains("#REDIRECT") == False]
data = data[data['text'].str.contains("== Matches ==\n:") == False]
data = data[data['text'].str.contains("{{underconstruction") == False]

In [175]:
def create_engineered_features(raw_text):
    cleaned_text = clean_wiki_markup(raw_text)
    return {
        'cleaned_text': cleaned_text,
        'num_web_citations': find_num_web_citations(raw_text),
        'num_book_citations': find_num_book_citations(raw_text),
        'num_news_citations': find_num_news_citations(raw_text),
        'num_quotes': find_num_quotes(raw_text),
        'num_h3_headers': find_num_h3_headers(raw_text),
        'num_internal_links': find_num_internal_links(raw_text),
        'num_h2_headers': find_num_h2_headers(raw_text),
        'has_infobox': find_infobox(raw_text),
        'num_categories': find_num_categories(raw_text),
        'num_images': find_num_images(raw_text),
        'num_ISBN': find_num_ISBN(raw_text),
        'num_references': find_num_references(raw_text),
        'article_length': find_article_length(raw_text),
        'num_difficult_words': find_num_difficult_words(cleaned_text),
        'dale_chall_readability_score': find_dale_chall_readability_score(cleaned_text),
        'readability_index': find_automated_readability_index(cleaned_text),
        'linsear_write_formula': find_linsear_write_formula(cleaned_text),
        'gunning_fog_index': find_gunning_fog_index(cleaned_text),
        'smog_index': find_smog_index(cleaned_text),
        'syllable_count': find_syllable_count(cleaned_text),
        'lexicon_count': find_lexicon_count(cleaned_text),
        'sentence_count': find_sentence_count(cleaned_text),
        'num_footnotes': find_num_footnotes(raw_text),
        'num_note_tags': find_num_note_tags(raw_text),
        'num_underlines': find_num_underlines(raw_text),
        'num_journal_citations': find_num_journal_citations(raw_text),
        'num_about_links': find_num_about_links(raw_text),
        'num_wikitables': find_num_wikitables(raw_text)}

def get_engineered_dataframe(raw_dataframe):
    engineered_df = pd.DataFrame(raw_dataframe['text'].apply(create_engineered_features).tolist())
    engineered_df['label'] = raw_dataframe['label']
    return engineered_df

In [177]:
engineered_df = get_engineered_dataframe(data)

In [185]:
X = engineered_df.loc[:, ['has_infobox','num_categories','num_images','num_ISBN','num_references','article_length',
                'num_difficult_words','dale_chall_readability_score','readability_index','linsear_write_formula',
                'gunning_fog_index', 'num_web_citations','num_book_citations','num_news_citations',
                'num_quotes','num_h3_headers','num_internal_links', 'num_h2_headers', 'syllable_count',
                'lexicon_count', 'sentence_count','num_footnotes', 'num_note_tags', 'num_underlines', 'num_journal_citations',
                'num_about_links', 'num_wikitables', 'smog_index']].values
X

array([[ 0. ,  3. , 15. , ...,  0. ,  1. ,  9.9],
       [ 1. ,  4. , 12. , ...,  0. ,  0. ,  9.4],
       [ 1. ,  9. ,  1. , ...,  0. ,  0. ,  8.6],
       ...,
       [ 0. ,  1. ,  0. , ...,  0. ,  0. ,  8.8],
       [ 0. ,  6. ,  0. , ...,  0. ,  0. ,  9.4],
       [ 0. ,  7. ,  1. , ...,  1. ,  1. , 10.2]])

In [163]:
import pandas as pd
import numpy as np
from ast import literal_eval
from textstat.textstat import textstat
from gensim.corpora import wikicorpus

def clean_wiki_markup(raw_article):
    """ Removes Wikipedia markup from text and return cleaned text.

        Parameters
        ----------
        raw_article (str): Wikipedia markup text

        Returns
        -------
        cleaned_article (str): Cleaned Wikipedia text
    """
    semi_cleaned_article = wikicorpus.filter_wiki(raw_article)
    cleaned_article = semi_cleaned_article.replace("\n", "").replace("\'", "").replace("()", "").replace("=", "").replace("|alt","").replace("\xa0","")
    return cleaned_article


def find_num_categories(raw_article):
    """ Finds the estimated number of categories listed at the bottom of a Wikipedia article.

        Parameters
        ----------
        raw_article (str): Wikipedia markup text

        Returns
        -------
        (int): Number of categories listed in text
    """
    return raw_article.count("[[Category:")


def find_num_images(raw_article):
    """ Finds the estimated number of images in a Wikipedia article.

        Parameters
        ----------
        raw_article (str): Wikipedia markup text

        Returns
        -------
        (int): Number of images present in text
    """
    return raw_article.count("[[Image:")


def find_num_ISBN(raw_article):
    """ Finds the estimated number of ISBN's listed in a Wikipedia article.

        Parameters
        ----------
        raw_article (str): Wikipedia markup text

        Returns
        -------
        (int): Number of ISBN's listed in text
    """
    return raw_article.count("ISBN")


def find_num_references(raw_article):
    """ Finds the estimated number of references listed in a Wikipedia article.

        Parameters
        ----------
        raw_article (str): Wikipedia markup text

        Returns
        -------
        (int): Number of references listed in text
    """
    return raw_article.count("</ref>")


def find_article_length(cleaned_article):
    """ Finds the article length (in characters) of a Wikipedia article.

        Parameters
        ----------
        cleaned_article (str): Cleaned Wikipedia text

        Returns
        -------
        (int): Article length (in characters)
    """
    return len(cleaned_article)


def find_num_difficult_words(cleaned_article):
    """ Finds the number of difficult words in a Wikipedia article. Words are considered difficult if they do not 
        appear in a list of the 3,000 most common words that a 4th grader can understand.

        Parameters
        ----------
        cleaned_article (str): Cleaned Wikipedia text

        Returns
        -------
        (int): Number of 'difficult' words
    """
    return textstat.difficult_words(cleaned_article)


def find_dale_chall_readability_score(cleaned_article):
    """ Uses the New Dale-Chall Formula to find a score that represents the grade-level of reading that characterizes the text.
        Scores can be interpreted as:

                Score              Level of Understanding
            ____________________________________________________
            4.9 or lower	|   average 4th-grade student or lower
            5.0–5.9	average |   5th or 6th-grade student
            6.0–6.9	average |   7th or 8th-grade student
            7.0–7.9	average |   9th or 10th-grade student
            8.0–8.9	average |   11th or 12th-grade student
            9.0–9.9	average |   13th to 15th-grade (college) student

        Parameters
        ----------
        cleaned_article (str): Cleaned Wikipedia text

        Returns
        -------
        (float): Number describing the text's Dale-Chall score
    """
    return textstat.dale_chall_readability_score(cleaned_article)


def find_automated_readability_index(cleaned_article):
    """ Uses the Automated Readability Index to calculate a score that approximates the grade level needed
          to comprehend the text. 

            For example: If the score is 8, then the grade-level needed to comprehend the text is 8th. 

        Parameters
        ----------
        cleaned_article (str): Cleaned Wikipedia text

        Returns
        -------
        (float): Number describing the Automated Readability Index 
    """
    return textstat.automated_readability_index(cleaned_article)


def find_linsear_write_formula(cleaned_article):
    """ Uses the Linsear Write Formula to calculate a score that approximates the grade level needed
          to comprehend the text. 

            For example: If the score is 8, then the grade-level needed to comprehend the text is 8th. 

        Parameters
        ----------
        cleaned_article (str): Cleaned Wikipedia text

        Returns
        -------
        (float): Number describing the Linsear Write score 
    """
    return textstat.linsear_write_formula(cleaned_article)


def find_gunning_fog_index(cleaned_article):
    """ Uses the Gunning Gog Index to calculate a score that approximates the grade level needed
          to comprehend the text. 

            For example: If the score is 8, then the grade-level needed to comprehend the text is 8th. 

        Parameters
        ----------
        cleaned_article (str): Cleaned Wikipedia text

        Returns
        -------
        (float): Number describing the Gunning Gog Index 
    """
    return textstat.gunning_fog(cleaned_article)


def find_smog_index(cleaned_article):
    """ Uses the SMOG index to calculate a score that approximates the grade level needed to comprehend the text. 

        For example: If the score is 8, then the grade-level needed to comprehend the text is 8th. 

        Texts of fewer than 30 sentences are statistically invalid, because the SMOG formula was normed on 30-sentence samples. 
        
        textstat requires at least 3 sentences for a result.

        Parameters
        ----------
        cleaned_article (str): Cleaned Wikipedia text

        Returns
        -------
        (float): Number describing the Smog Index
    """
    return textstat.smog_index(cleaned_article)

def find_num_web_citations(raw_article):
    """ Finds the estimated number of web citations within a Wikipedia article.

        Parameters
        ----------
        raw_article (str): Wikipedia markup text

        Returns
        -------
        (int): Number of web citations
    """
    return raw_article.count("{{cite web")


def find_num_book_citations(raw_article):
    """ Finds the estimated number of book citations within a Wikipedia article.

        Parameters
        ----------
        raw_article (str): Wikipedia markup text

        Returns
        -------
        (int): Number of book citations
    """
    return raw_article.count("{{cite book")


def find_num_news_citations(raw_article):
    """ Finds the estimated number of news citations within a Wikipedia article.

        Parameters
        ----------
        raw_article (str): Wikipedia markup text

        Returns
        -------
        (int): Number of news citations
    """
    return raw_article.count("{{cite news")


def find_num_quotes(raw_article):
    """ Finds the estimated number of quotes mentioned in a Wikipedia article.

        Parameters
        ----------
        raw_article (str): Wikipedia markup text

        Returns
        -------
        (int): Number of quotes in Wikipedia article
    """
    return raw_article.count("{{quote")


def find_num_h3_headers(raw_article):
    """ Finds the estimated number of h3 headers in a Wikipedia article.

        Parameters
        ----------
        raw_article (str): Wikipedia markup text

        Returns
        -------
        (int): Number of h3 headers in Wikipedia article
    """
    return raw_article.count("\n===")


def find_num_internal_links(raw_article):
    """ Finds the estimated number of internal links in a Wikipedia article.

        Parameters
        ----------
        raw_article (str): Wikipedia markup text

        Returns
        -------
        (int): Number of internal links in Wikipedia article
    """
    return raw_article.count("[[")


def find_num_h2_headers(raw_article):
    """ Finds the estimated number of h2 headers in a Wikipedia article.

        Parameters
        ----------
        raw_article (str): Wikipedia markup text

        Returns
        -------
        (int): Number of h2 headers in Wikipedia article
    """
    return (raw_article.count("\n==") - find_num_h3_headers(raw_article))


def find_syllable_count(cleaned_article):
    """ Returns the number of syllables present in text.

        Parameters
        ----------
        cleaned_article (str): Cleaned Wikipedia text

        Returns
        -------
        (float): Number syllables present in text
    """
    return textstat.syllable_count(cleaned_article)


def find_lexicon_count(cleaned_article):
    """ Returns the number of words in text. 
        Optional removepunct arugment specifies whether or not to remove punctuation symbols while counting lexicons. 
        Default value for removepunct is True. This removes  punctuation before counting lexicon items.

        Parameters
        ----------
        cleaned_article (str): Cleaned Wikipedia text

        Returns
        -------
        (int): Number lexicon items in text
    """ 
    return textstat.lexicon_count(cleaned_article, removepunct=True)


def find_sentence_count(cleaned_article):
    """ Returns the number of sentences in text. 
        Parameters
        ----------
        cleaned_article (str): Cleaned Wikipedia text

        Returns
        -------
        (int): Number sentences in text
    """ 
    return textstat.sentence_count(cleaned_article)

def find_num_footnotes(raw_article):
    """ Finds the estimated number of footnotes in a Wikipedia article.

    Parameters
    ----------
    raw_article (str): Wikipedia markup text

    Returns
    -------
    (int): Number of footnotes in Wikipedia article
    """
    return raw_article.count("{{")


def find_num_note_tags(raw_article):
    """ Finds the estimated number of note tags in a Wikipedia article.

    Parameters
    ----------
    raw_article (str): Wikipedia markup text

    Returns
    -------
    (int): Number of note tags in Wikipedia article
    """
    return raw_article.count("{{note")


def find_num_bullet_points(raw_article):
    """ Finds the estimated number of bullet points in a Wikipedia article.

    Parameters
    ----------
    raw_article (str): Wikipedia markup text

    Returns
    -------
    (int): Number of bullet points in Wikipedia article
    """
    return (raw_article.count("*"))


def find_num_underlines(raw_article):
    """ Finds the estimated number of underlines in a Wikipedia article.

    Parameters
    ----------
    raw_article (str): Wikipedia markup text

    Returns
    -------
    (int): Number of underlines in Wikipedia article
    """
    return (raw_article.count("<u>"))


def find_num_journal_citations(raw_article):
    """ Finds the estimated number of journal citations in a Wikipedia article.

    Parameters
    ----------
    raw_article (str): Wikipedia markup text

    Returns
    -------
    (int): Number of journal citations in Wikipedia article
    """
    return (raw_article.count("{{cite journal"))


def find_num_about_links(raw_article):
    """ Finds the estimated number of 'About' links in a Wikipedia article.

    Parameters
    ----------
    raw_article (str): Wikipedia markup text

    Returns
    -------
    (int): Number of 'About' links in Wikipedia article
    """
    return (raw_article.count("{{About"))


def find_num_wikitables(raw_article):
    """ Finds the estimated number of Wiki Tables in a Wikipedia article.

    Parameters
    ----------
    raw_article (str): Wikipedia markup text

    Returns
    -------
    (int): Number of Wiki Tables in Wikipedia article
    """
    return (raw_article.count('class="wikitable'))


def find_infobox(raw_article):
    """ Determines if the Wikipedia article has an infobox or not.

    Parameters
    ----------
    raw_article (str): Wikipedia markup text

    Returns
    -------
    (int): 0 if no infobox. 1 if yes infobox
    """
    return int('{{Infobox' in raw_article)