In [158]:
import lxml.etree
import urllib
import urllib.request
import pandas as pd
import pickle
import numpy as np
import torch
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from ast import literal_eval
import matplotlib.pyplot as plt
from textstat.textstat import textstat
from gensim.corpora import wikicorpus
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import nltk
from sklearn.feature_extraction.text import HashingVectorizer
from collections import defaultdict
%matplotlib inline
# Make it pretty
plt.style.use('ggplot')


def _load_pickle(path):
    """Unpickle the object stored at ``path`` and close the file afterwards.

    NOTE: ``pickle.load`` can execute arbitrary code while deserialising, so
    this must only be pointed at files from a trusted source.
    """
    with open(path, "rb") as handle:
        return pickle.load(handle)


# Pre-trained artefacts used later in the notebook.
hash_vec_rf_model = _load_pickle("../src/hash_vec2_aug23.pkl")
rf_model = _load_pickle("../src/random_forest_aug22.pkl")
hash_vec_fitter = _load_pickle("../src/hash_vec_fitter.pkl")

def get_wiki_xml(title):
    """Fetch page text through the MediaWiki ``action=query`` revisions API.

    The request asks for the revisions of two titles at once, the literal
    page "API" and ``title`` (``titles`` is sent as ``"API|<title>"``), and
    the text of the last ``<rev>`` element in the XML response is returned.

    Parameters
    ----------
    title : str
        Page title to look up.

    Returns
    -------
    str or None
        Text of the last ``<rev>`` element (``None`` when that element has
        no text).

    Raises
    ------
    IndexError
        If the response contains no ``<rev>`` element at all.
    """
    params = {
        "format": "xml",
        "action": "query",
        "prop": "revisions",
        "rvprop": "timestamp|user|comment|content",
        "titles": "API|%s" % title,
    }
    # urlencode escapes every key and value (including "|" and non-ASCII
    # characters); quote_via=quote keeps spaces as %20 rather than "+".
    query = urllib.parse.urlencode(params, quote_via=urllib.parse.quote)
    url = "http://en.wikipedia.org/w/api.php?%s" % query
    # Close the HTTP response as soon as the document has been parsed.
    with urllib.request.urlopen(url) as response:
        tree = lxml.etree.parse(response)
    revs = tree.xpath('//rev')
    if not revs:
        raise IndexError("no revisions returned for title %r" % (title,))
    return revs[-1].text

In [159]:
# Load the category/page listing from disk.
popular_categories_csv = '../data/popular_categories.csv'
popular_dataframe = pd.read_csv(popular_categories_csv)

In [160]:
# Keep only the two columns the rest of the notebook reads.
category_page_columns = ['Category', 'Page']
popular_dataframe = popular_dataframe.loc[:, category_page_columns]

In [161]:
import pymongo

In [162]:
from pymongo import MongoClient

In [163]:
# Connection string for the MongoDB server this notebook talks to.
mongo_uri = 'mongodb://localhost:27017/'
client = MongoClient(mongo_uri)

In [164]:
# Database handle used for storing the fetched pages.
wiki_database_name = 'popular_wiki_database'
wiki_db = client[wiki_database_name]

In [165]:
# Collection handle named like the database.
# NOTE(review): the visible cells write to `wiki_db.posts`, not to this
# collection - confirm whether `collection` is still needed.
wiki_collection_name = 'popular_wiki_database'
collection = wiki_db[wiki_collection_name]

An important note about collections (and databases) in MongoDB is that they are created lazily - none of the above commands have actually performed any operations on the MongoDB server. Collections and databases are created when the first document is inserted into them.

In [166]:
def get_wiki_xml(title):
    """Fetch page text through the MediaWiki ``action=query`` revisions API.

    The request asks for the revisions of two titles at once, the literal
    page "API" and ``title`` (``titles`` is sent as ``"API|<title>"``), and
    the text of the last ``<rev>`` element in the XML response is returned.

    Parameters
    ----------
    title : str
        Page title to look up.

    Returns
    -------
    str or None
        Text of the last ``<rev>`` element (``None`` when that element has
        no text).

    Raises
    ------
    IndexError
        If the response contains no ``<rev>`` element at all.
    """
    params = {
        "format": "xml",
        "action": "query",
        "prop": "revisions",
        "rvprop": "timestamp|user|comment|content",
        "titles": "API|%s" % title,
    }
    # urlencode escapes every key and value (including "|" and non-ASCII
    # characters); quote_via=quote keeps spaces as %20 rather than "+".
    query = urllib.parse.urlencode(params, quote_via=urllib.parse.quote)
    url = "http://en.wikipedia.org/w/api.php?%s" % query
    # Close the HTTP response as soon as the document has been parsed.
    with urllib.request.urlopen(url) as response:
        tree = lxml.etree.parse(response)
    revs = tree.xpath('//rev')
    if not revs:
        raise IndexError("no revisions returned for title %r" % (title,))
    return revs[-1].text

In [187]:
# Build a de-duplicated, NaN-free copy of the page listing.
# - Series.astype has no `inplace` argument, so that keyword is dropped.
# - Missing values are removed *before* the cast to str; casting first would
#   turn NaN page names into the literal string 'nan', which dropna() can no
#   longer detect.
pop = (
    popular_dataframe
    .replace('\xa0', ' ', regex=True)   # non-breaking spaces -> plain spaces
    .drop_duplicates(subset='Page')
    .dropna()
    .astype({'Page': str})
)

In [195]:
# Resolve the target collection once, outside the loop: it is the same for
# every iteration, and `posts` stays defined even when `pop` has no rows.
posts = wiki_db.posts
for category, title in zip(pop.Category, pop.Page):
    # One network round-trip per page.
    wiki_xml = get_wiki_xml(title)
    post = {'category': category,
            'title': title,
            'text': wiki_xml}
    post_id = posts.insert_one(post).inserted_id

In [197]:
# Pull every stored document back out of MongoDB into a DataFrame.
stored_posts = list(posts.find())
data = pd.DataFrame(stored_posts)

In [201]:
# Series.astype returns a new Series and takes no `inplace` argument, so the
# result is simply assigned back.
popular_dataframe['Page'] = popular_dataframe['Page'].astype(str)
# Fetch the page text for every row (one HTTP request per page).
popular_dataframe['text'] = popular_dataframe['Page'].apply(get_wiki_xml)

In [203]:
# Drop rows whose text is empty, then rows whose text matches any of the
# markers below. The markers are applied one after another, in this order.
# NOTE(review): str.contains treats each marker as a regular expression by
# default, so the trailing "*" in "may refer to:\n\n*" acts as a quantifier
# rather than a literal asterisk - confirm that this is intended.
unwanted_text_markers = [
    "#redirect",
    "may refer to:\n\n*",
    "can refer to:\n",
    "could refer to:\n",
    "#REDIRECT",
    "== Matches ==\n:",
    "{{underconstruction",
]
has_text = popular_dataframe['text'] != ""
popular_dataframe = popular_dataframe[has_text]
for text_marker in unwanted_text_markers:
    lacks_marker = popular_dataframe['text'].str.contains(text_marker) == False
    popular_dataframe = popular_dataframe[lacks_marker]

In [86]:
def create_engineered_features(raw_text):
    """Compute every engineered feature for one article.

    Parameters
    ----------
    raw_text : str
        Raw wiki markup of the article.

    Returns
    -------
    dict
        Maps feature name to value. ``'cleaned_text'`` holds the output of
        ``clean_wiki_markup(raw_text)``; the remaining entries are produced
        by the ``find_*`` helpers, each fed either the raw or the cleaned
        text as listed in the table below.
    """
    cleaned_text = clean_wiki_markup(raw_text)
    # (feature name, extractor, text handed to the extractor), listed in the
    # order in which the keys are inserted into the result.
    extraction_plan = [
        ('num_web_citations', find_num_web_citations, raw_text),
        ('num_book_citations', find_num_book_citations, raw_text),
        ('num_news_citations', find_num_news_citations, raw_text),
        ('num_quotes', find_num_quotes, raw_text),
        ('num_h3_headers', find_num_h3_headers, raw_text),
        ('num_internal_links', find_num_internal_links, raw_text),
        ('num_h2_headers', find_num_h2_headers, raw_text),
        ('has_infobox', find_infobox, raw_text),
        ('num_categories', find_num_categories, raw_text),
        ('num_images', find_num_images, raw_text),
        ('num_ISBN', find_num_ISBN, raw_text),
        ('num_references', find_num_references, raw_text),
        # Length is measured on the raw markup, not on the cleaned text.
        ('article_length', find_article_length, raw_text),
        ('num_difficult_words', find_num_difficult_words, cleaned_text),
        ('dale_chall_readability_score', find_dale_chall_readability_score, cleaned_text),
        ('readability_index', find_automated_readability_index, cleaned_text),
        ('linsear_write_formula', find_linsear_write_formula, cleaned_text),
        ('gunning_fog_index', find_gunning_fog_index, cleaned_text),
        ('smog_index', find_smog_index, cleaned_text),
        ('syllable_count', find_syllable_count, cleaned_text),
        ('lexicon_count', find_lexicon_count, cleaned_text),
        ('sentence_count', find_sentence_count, cleaned_text),
        ('num_footnotes', find_num_footnotes, raw_text),
        ('num_note_tags', find_num_note_tags, raw_text),
        ('num_underlines', find_num_underlines, raw_text),
        ('num_journal_citations', find_num_journal_citations, raw_text),
        ('num_about_links', find_num_about_links, raw_text),
        ('num_wikitables', find_num_wikitables, raw_text),
    ]
    features = {'cleaned_text': cleaned_text}
    for feature_name, extractor, source_text in extraction_plan:
        features[feature_name] = extractor(source_text)
    return features

def get_engineered_dataframe_no_label(raw_dataframe):
    """Build a feature DataFrame from the ``'text'`` column of ``raw_dataframe``.

    ``create_engineered_features`` is applied to every entry of
    ``raw_dataframe['text']`` and the resulting dicts become the rows of the
    returned frame.
    """
    feature_dicts = raw_dataframe['text'].apply(create_engineered_features)
    feature_rows = feature_dicts.tolist()
    return pd.DataFrame(feature_rows)

In [204]:
# (new column, extractor, source column), in the order the columns are added.
# 'cleaned_text' has to come first because the readability features read it.
feature_column_plan = [
    ('cleaned_text', clean_wiki_markup, 'text'),
    ('num_web_citations', find_num_web_citations, 'text'),
    ('num_book_citations', find_num_book_citations, 'text'),
    ('num_news_citations', find_num_news_citations, 'text'),
    ('num_quotes', find_num_quotes, 'text'),
    ('num_h3_headers', find_num_h3_headers, 'text'),
    ('num_internal_links', find_num_internal_links, 'text'),
    ('num_h2_headers', find_num_h2_headers, 'text'),
    ('has_infobox', find_infobox, 'text'),
    ('num_categories', find_num_categories, 'text'),
    ('num_images', find_num_images, 'text'),
    ('num_ISBN', find_num_ISBN, 'text'),
    ('num_references', find_num_references, 'text'),
    # Length is measured on the raw markup, not on the cleaned text.
    ('article_length', find_article_length, 'text'),
    ('num_difficult_words', find_num_difficult_words, 'cleaned_text'),
    ('dale_chall_readability_score', find_dale_chall_readability_score, 'cleaned_text'),
    ('readability_index', find_automated_readability_index, 'cleaned_text'),
    ('linsear_write_formula', find_linsear_write_formula, 'cleaned_text'),
    ('gunning_fog_index', find_gunning_fog_index, 'cleaned_text'),
    ('smog_index', find_smog_index, 'cleaned_text'),
    ('syllable_count', find_syllable_count, 'cleaned_text'),
    ('lexicon_count', find_lexicon_count, 'cleaned_text'),
    ('sentence_count', find_sentence_count, 'cleaned_text'),
    ('num_footnotes', find_num_footnotes, 'text'),
    ('num_note_tags', find_num_note_tags, 'text'),
    ('num_underlines', find_num_underlines, 'text'),
    ('num_journal_citations', find_num_journal_citations, 'text'),
    ('num_about_links', find_num_about_links, 'text'),
    ('num_wikitables', find_num_wikitables, 'text'),
]

# The frame was produced by boolean filtering; work on an explicit copy so the
# column assignments below do not write into a slice of another frame.
popular_dataframe = popular_dataframe.copy()
for feature_name, extractor, source_column in feature_column_plan:
    popular_dataframe[feature_name] = popular_dataframe[source_column].apply(extractor)

In [67]:
def clean_wiki_markup(raw_article):
    """Strip wiki markup from ``raw_article`` and remove leftover fragments.

    The text is first passed through ``wikicorpus.filter_wiki``; afterwards
    every occurrence of the fragments listed below is deleted, one fragment
    after another in the listed order.
    """
    text = wikicorpus.filter_wiki(raw_article)
    leftover_fragments = ("\n", "\'", "()", "=", "|alt", "\xa0")
    for fragment in leftover_fragments:
        text = text.replace(fragment, "")
    return text
def find_num_categories(raw_article):
    """Count occurrences of ``[[Category:`` in the text."""
    marker = "[[Category:"
    return raw_article.count(marker)


def find_num_images(raw_article):
    """Count occurrences of ``[[Image:`` in the text."""
    marker = "[[Image:"
    return raw_article.count(marker)


def find_num_ISBN(raw_article):
    """Count occurrences of the substring ``ISBN`` in the text."""
    marker = "ISBN"
    return raw_article.count(marker)


def find_num_references(raw_article):
    """Count closing ``</ref>`` tags in the text."""
    marker = "</ref>"
    return raw_article.count(marker)


def find_article_length(cleaned_article):
    """Return ``len()`` of the given text."""
    num_characters = len(cleaned_article)
    return num_characters
def find_num_difficult_words(cleaned_article):
    """Return ``textstat.difficult_words`` for the text."""
    result = textstat.difficult_words(cleaned_article)
    return result


def find_dale_chall_readability_score(cleaned_article):
    """Return ``textstat.dale_chall_readability_score`` for the text."""
    result = textstat.dale_chall_readability_score(cleaned_article)
    return result


def find_automated_readability_index(cleaned_article):
    """Return ``textstat.automated_readability_index`` for the text."""
    result = textstat.automated_readability_index(cleaned_article)
    return result


def find_linsear_write_formula(cleaned_article):
    """Return ``textstat.linsear_write_formula`` for the text."""
    result = textstat.linsear_write_formula(cleaned_article)
    return result


def find_gunning_fog_index(cleaned_article):
    """Return ``textstat.gunning_fog`` for the text."""
    result = textstat.gunning_fog(cleaned_article)
    return result


def find_syllable_count(cleaned_article):
    """Return ``textstat.syllable_count`` for the text."""
    result = textstat.syllable_count(cleaned_article)
    return result


def find_lexicon_count(cleaned_article):
    """Return ``textstat.lexicon_count`` for the text, with ``removepunct=True``."""
    result = textstat.lexicon_count(cleaned_article, removepunct=True)
    return result


def find_sentence_count(cleaned_article):
    """Return ``textstat.sentence_count`` for the text."""
    result = textstat.sentence_count(cleaned_article)
    return result


def find_smog_index(cleaned_article):
    """Return ``textstat.smog_index`` for the text."""
    result = textstat.smog_index(cleaned_article)
    return result
def find_num_web_citations(raw_article):
    """Count occurrences of ``{{cite web`` in the text."""
    marker = "{{cite web"
    return raw_article.count(marker)


def find_num_book_citations(raw_article):
    """Count occurrences of ``{{cite book`` in the text."""
    marker = "{{cite book"
    return raw_article.count(marker)


def find_num_news_citations(raw_article):
    """Count occurrences of ``{{cite news`` in the text."""
    marker = "{{cite news"
    return raw_article.count(marker)


def find_num_quotes(raw_article):
    """Count occurrences of ``quote=`` in the text."""
    marker = "quote="
    return raw_article.count(marker)


def find_num_h3_headers(raw_article):
    """Count occurrences of a newline followed by ``===``."""
    marker = "\n==="
    return raw_article.count(marker)


def find_num_internal_links(raw_article):
    """Return the number of ``[[`` occurrences, floor-divided by two."""
    opening_brackets = raw_article.count("[[")
    return opening_brackets // 2


def find_num_h2_headers(raw_article):
    """Count newline-``==`` occurrences minus the newline-``===`` ones."""
    two_or_more_equals = raw_article.count("\n==")
    three_or_more_equals = find_num_h3_headers(raw_article)
    return two_or_more_equals - three_or_more_equals


def find_num_note_tags(raw_article):
    """Count occurrences of ``{{note`` in the text."""
    marker = "{{note"
    return raw_article.count(marker)


def find_num_bullet_points(raw_article):
    """Count ``*`` characters in the text."""
    marker = "*"
    return raw_article.count(marker)


def find_num_underlines(raw_article):
    """Count opening ``<u>`` tags in the text."""
    marker = "<u>"
    return raw_article.count(marker)


def find_num_journal_citations(raw_article):
    """Count occurrences of ``{{cite journal`` in the text."""
    marker = "{{cite journal"
    return raw_article.count(marker)


def find_num_about_links(raw_article):
    """Count occurrences of ``{{About`` in the text."""
    marker = "{{About"
    return raw_article.count(marker)


def find_num_wikitables(raw_article):
    """Count occurrences of ``class="wikitable`` in the text."""
    marker = 'class="wikitable'
    return raw_article.count(marker)


def find_num_footnotes(raw_article):
    """Count occurrences of ``{{`` in the text."""
    marker = "{{"
    return raw_article.count(marker)


def find_infobox(raw_article):
    """Return 1 if ``{{Infobox`` occurs in the text, otherwise 0."""
    if '{{Infobox' in raw_article:
        return 1
    return 0

In [94]:
# Rebind to the NaN-free result instead of mutating in place: the frame was
# derived from earlier filtering steps, and re-running the cell stays safe.
popular_dataframe = popular_dataframe.dropna()
popular_dataframe.shape

(2970, 32)

# Make backup dataframe

In [91]:
# A plain assignment would only create a second name for the same object;
# .copy() gives an independent frame that later edits to `data` cannot touch.
backup_data = data.copy()

# Transform data for Random Forest

In [205]:
# Feature columns handed to the random forest, in exactly this order.
rf_feature_columns = [
    'has_infobox',
    'num_categories',
    'num_images',
    'num_ISBN',
    'num_references',
    'article_length',
    'num_difficult_words',
    'dale_chall_readability_score',
    'readability_index',
    'linsear_write_formula',
    'gunning_fog_index',
    'num_web_citations',
    'num_book_citations',
    'num_news_citations',
    'num_quotes',
    'num_h3_headers',
    'num_internal_links',
    'num_h2_headers',
    'syllable_count',
    'lexicon_count',
    'sentence_count',
    'num_footnotes',
    'num_note_tags',
    'num_underlines',
    'num_journal_citations',
    'num_about_links',
    'num_wikitables',
    'smog_index',
]
rf_X = popular_dataframe.loc[:, rf_feature_columns]

In [206]:
# Score every row of the feature matrix with the unpickled `rf_model`.
rf_preds = rf_model.predict(rf_X)

In [207]:
# Store the predictions next to the pages they were computed for.
popular_dataframe['random_forest_preds'] = rf_preds

In [213]:
# Narrow the frame down to category and prediction.
category_pred_columns = ['Category', 'random_forest_preds']
category_df = popular_dataframe.loc[:, category_pred_columns]

In [218]:
# Mean prediction per category.
rows_by_category = category_df.groupby(by='Category')
categories = rows_by_category.mean()

In [222]:
# Rows belonging to the '3rd-millennium people' category.
is_third_millennium = popular_dataframe['Category'] == '3rd-millennium people'
third_mil_pages = popular_dataframe[is_third_millennium]

In [224]:
# Keep just the page title and its prediction.
page_pred_columns = ['Page', 'random_forest_preds']
third_mil_pages = third_mil_pages.loc[:, page_pred_columns]

In [228]:
# Round the predictions to two decimals. Series.round removes the dependency
# on the helper `r`, which is only defined in a later cell, and .assign builds
# a new frame instead of writing into a slice of `popular_dataframe`.
third_mil_pages = third_mil_pages.assign(
    random_forest_preds=third_mil_pages['random_forest_preds'].round(2)
)

In [227]:
def r(num):
    """Return ``num`` rounded to two decimal places via the built-in ``round``."""
    decimal_places = 2
    return round(num, decimal_places)

In [229]:
# Display the pages of the selected category with their rounded predictions.
third_mil_pages

Unnamed: 0,Page,random_forest_preds
384,Willow Smith,3.13
385,Maddie Ziegler,4.06
386,Millie Bobby Brown,3.07
387,Noah Cyrus,3.12
388,Prince George of Cambridge,3.05
389,Jackie Evancho,4.2
390,Lady Louise Windsor,1.83
391,Mackenzie Foy,2.78
392,Gaten Matarazzo,1.52
393,Finn Wolfhard,2.15


In [230]:
def get_wiki_category(title):
    """Return the revision text of ``title`` by delegating to ``get_wiki_xml``.

    This function used to be a line-for-line copy of ``get_wiki_xml``; it now
    calls that function so the request logic exists in one place only.

    NOTE(review): despite its name, nothing category-specific is requested or
    extracted here - confirm what this helper is meant to return.
    """
    return get_wiki_xml(title)

'http://en.wikipedia.org/w/api.php?format=xml&action=query&prop=revisions&rvprop=timestamp|user|comment|content&titles=API|Jazz'