# Imports

In [2]:
import pandas as pd
import numpy as np
import torch
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from ast import literal_eval
import matplotlib.pyplot as plt
from textstat.textstat import textstat
from gensim.corpora import wikicorpus
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import nltk
from collections import defaultdict
%matplotlib inline
# Make it pretty
plt.style.use('ggplot')

# Import file

In [3]:
# Tab-separated dump of labelled articles, addressed relative to this notebook.
file = '../data/enwiki.observations.text_wp10.30k.tsv'
# header=None: the file is read without a header row, so columns are numbered
# from 0; only column 0 is used below.
raw_data = pd.read_csv(file, sep='\t', header=None)
# Every cell of column 0 is parsed with ast.literal_eval and the parsed
# records become the rows of the working DataFrame.
parsed_records = raw_data[0].apply(literal_eval).tolist()
data = pd.DataFrame(data=parsed_records)

# Drop all rows with incorrect labels (empty pages, redirects, disambiguation pages, match lists, pages under construction)

In [5]:
# Literal substrings whose presence marks a row to be dropped.
# They are matched with regex=False so that no character is interpreted as
# regex syntax. The original pattern "may refer to:\n\n*" was parsed as a
# regex, where the trailing "\n*" means "zero or more newlines", so it matched
# every text containing "may refer to:\n". That effective behaviour is kept
# here, spelled out as a literal, so exactly the same rows are removed.
non_article_markers = [
    "#redirect",
    "may refer to:\n",
    "can refer to:\n",
    "could refer to:\n",
    "#REDIRECT",
    "== Matches ==\n:",
    "{{underconstruction",
]

article_text = data['text']
# Start with the empty-text rows, then OR in one literal match per marker.
rows_to_drop = article_text == ""
for marker in non_article_markers:
    # na=True: rows whose text is missing/non-string are dropped, which is what
    # the original `str.contains(...) == False` comparison did (NaN == False
    # evaluates to False, so those rows were filtered out).
    rows_to_drop = rows_to_drop | article_text.str.contains(marker, regex=False, na=True)

data = data[~rows_to_drop]

# Make the labels (y) numeric

In [6]:
# Class names listed in the order of their integer codes (0..5).
class_order = ["stub", "start", "c", "b", "ga", "fa"]
classes = {class_name: code for code, class_name in enumerate(class_order)}
# Series.map with a dict yields NaN for any label that is not a key of
# `classes` (this includes labels that were already converted to integers,
# so re-running this cell on converted data blanks the column).
data["label"] = data['label'].map(classes)

# Feature Engineering Functions

In [7]:
def clean_wiki_markup(raw_article, newline_replacement=""):
    """Strip wiki markup from ``raw_article`` and remove leftover noise characters.

    The text is first passed through ``wikicorpus.filter_wiki``; afterwards
    newlines are replaced by ``newline_replacement`` and the substrings
    ``'``, ``()``, ``=``, ``|alt`` and the non-breaking space ``\\xa0`` are
    removed, in that order.

    Parameters
    ----------
    raw_article : str
        Raw wikitext of one article.
    newline_replacement : str, optional
        Text substituted for every newline. The default ``""`` reproduces the
        historical behaviour, which joins adjacent lines without any separator
        (e.g. a caption line and the following paragraph become one run of
        characters). Pass ``" "`` to keep a word boundary between lines.

    Returns
    -------
    str
        The cleaned article text.
    """
    cleaned_article = wikicorpus.filter_wiki(raw_article)
    cleaned_article = cleaned_article.replace("\n", newline_replacement)
    # Removal order is kept as-is: a later pattern may only appear once an
    # earlier one has been removed (e.g. "(\n)" becomes "()" when newlines
    # are deleted).
    for leftover in ("'", "()", "=", "|alt", "\xa0"):
        cleaned_article = cleaned_article.replace(leftover, "")
    return cleaned_article
def find_num_categories(raw_article):
    """Count the occurrences of ``[[Category:`` in the raw wikitext."""
    category_marker = "[[Category:"
    return raw_article.count(category_marker)
def find_num_images(raw_article):
    """Count the occurrences of ``[[Image:`` in the raw wikitext.

    Only this exact prefix is counted; other spellings (for example
    ``[[File:``) do not contribute.
    """
    image_marker = "[[Image:"
    return raw_article.count(image_marker)
def find_num_ISBN(raw_article):
    """Count the (case-sensitive) occurrences of ``ISBN`` in the raw wikitext."""
    isbn_marker = "ISBN"
    return raw_article.count(isbn_marker)
def find_num_references(raw_article):
    """Count the closing ``</ref>`` tags in the raw wikitext."""
    closing_ref_tag = "</ref>"
    return raw_article.count(closing_ref_tag)
def find_article_length(cleaned_article):
    """Return ``len()`` of the given article text."""
    num_characters = len(cleaned_article)
    return num_characters
def find_num_difficult_words(cleaned_article):
    """Return ``textstat.difficult_words`` evaluated on the cleaned text."""
    difficult_word_total = textstat.difficult_words(cleaned_article)
    return difficult_word_total
def find_dale_chall_readability_score(cleaned_article):
    """Return ``textstat.dale_chall_readability_score`` for the cleaned text."""
    dale_chall_score = textstat.dale_chall_readability_score(cleaned_article)
    return dale_chall_score
def find_automated_readability_index(cleaned_article):
    """Return ``textstat.automated_readability_index`` for the cleaned text."""
    ari_score = textstat.automated_readability_index(cleaned_article)
    return ari_score
def find_linsear_write_formula(cleaned_article):
    """Return ``textstat.linsear_write_formula`` for the cleaned text."""
    linsear_score = textstat.linsear_write_formula(cleaned_article)
    return linsear_score
def find_gunning_fog_index(cleaned_article):
    """Return ``textstat.gunning_fog`` for the cleaned text."""
    gunning_fog_score = textstat.gunning_fog(cleaned_article)
    return gunning_fog_score
def find_syllable_count(cleaned_article):
    """Return ``textstat.syllable_count`` for the cleaned text."""
    syllable_total = textstat.syllable_count(cleaned_article)
    return syllable_total
def find_lexicon_count(cleaned_article):
    """Return ``textstat.lexicon_count`` for the cleaned text, called with ``removepunct=True``."""
    word_total = textstat.lexicon_count(cleaned_article, removepunct=True)
    return word_total
def find_sentence_count(cleaned_article):
    """Return ``textstat.sentence_count`` for the cleaned text."""
    sentence_total = textstat.sentence_count(cleaned_article)
    return sentence_total
def find_smog_index(cleaned_article):
    """Return ``textstat.smog_index`` for the cleaned text."""
    smog_score = textstat.smog_index(cleaned_article)
    return smog_score
def find_num_web_citations(raw_article):
    """Count the (case-sensitive) occurrences of ``{{cite web`` in the raw wikitext."""
    web_citation_marker = "{{cite web"
    return raw_article.count(web_citation_marker)
def find_num_book_citations(raw_article):
    """Count the (case-sensitive) occurrences of ``{{cite book`` in the raw wikitext."""
    book_citation_marker = "{{cite book"
    return raw_article.count(book_citation_marker)
def find_num_news_citations(raw_article):
    """Count the (case-sensitive) occurrences of ``{{cite news`` in the raw wikitext."""
    news_citation_marker = "{{cite news"
    return raw_article.count(news_citation_marker)
def find_num_quotes(raw_article):
    """Count the occurrences of ``quote=`` in the raw wikitext."""
    quote_marker = "quote="
    return raw_article.count(quote_marker)
def find_num_h3_headers(raw_article):
    """Count the lines that start with at least three ``=`` characters.

    Implemented as the number of occurrences of a newline followed by
    ``===``; deeper headers (``====`` and more) are therefore included, and a
    header on the very first line (no preceding newline) is not.
    """
    h3_line_start = "\n==="
    return raw_article.count(h3_line_start)
def find_num_internal_links(raw_article):
    """Return half (rounded down) of the number of ``[[`` occurrences.

    NOTE(review): every ``[[...]]`` link contributes a single ``[[``, so the
    halving looks unintended. It is kept unchanged because it defines the
    existing ``num_internal_links`` feature; confirm before altering it.
    """
    opening_bracket_total = raw_article.count("[[")
    return opening_bracket_total // 2
def find_num_h2_headers(raw_article):
    """Count the lines that start with exactly two ``=`` characters.

    Lines starting with ``==`` or more are counted first, then the lines
    starting with ``===`` or more (as reported by ``find_num_h3_headers``)
    are subtracted.
    """
    level2_or_deeper = raw_article.count("\n==")
    level3_or_deeper = find_num_h3_headers(raw_article)
    return level2_or_deeper - level3_or_deeper
def find_num_note_tags(raw_article):
    """Count the (case-sensitive) occurrences of ``{{note`` in the raw wikitext."""
    note_marker = "{{note"
    return raw_article.count(note_marker)
def find_num_bullet_points(raw_article):
    """Count every ``*`` character in the raw wikitext.

    The count is not restricted to asterisks at the start of a line.
    """
    asterisk = "*"
    return raw_article.count(asterisk)
def find_num_underlines(raw_article):
    """Count the opening ``<u>`` tags in the raw wikitext."""
    underline_open_tag = "<u>"
    return raw_article.count(underline_open_tag)
def find_num_journal_citations(raw_article):
    """Count the (case-sensitive) occurrences of ``{{cite journal`` in the raw wikitext."""
    journal_citation_marker = "{{cite journal"
    return raw_article.count(journal_citation_marker)
def find_num_about_links(raw_article):
    """Count the (case-sensitive) occurrences of ``{{About`` in the raw wikitext."""
    about_marker = "{{About"
    return raw_article.count(about_marker)
def find_num_wikitables(raw_article):
    """Count the occurrences of ``class="wikitable`` in the raw wikitext.

    The closing quote is not part of the pattern, so class lists that merely
    begin with ``wikitable`` (e.g. ``"wikitable sortable"``) are counted too.
    """
    wikitable_marker = 'class="wikitable'
    return raw_article.count(wikitable_marker)
def find_num_footnotes(raw_article):
    """Count every ``{{`` occurrence in the raw wikitext.

    NOTE(review): ``{{`` is matched wherever it appears, so this counts all
    double-brace openings rather than footnotes specifically, despite the
    name; confirm that this is the intended feature.
    """
    double_brace = "{{"
    return raw_article.count(double_brace)
def find_infobox(raw_article):
    """Return 1 if the raw wikitext contains ``{{Infobox`` (case-sensitive), else 0."""
    return 1 if '{{Infobox' in raw_article else 0

In [8]:
# One row per engineered column: (new column, source column, extractor).
# The rows are applied top to bottom, which keeps the resulting column order
# identical to the previous hand-written assignments. 'cleaned_text' must stay
# first because the readability features further down read it.
feature_specs = [
    ('cleaned_text', 'text', clean_wiki_markup),
    ('num_web_citations', 'text', find_num_web_citations),
    ('num_book_citations', 'text', find_num_book_citations),
    ('num_news_citations', 'text', find_num_news_citations),
    ('num_quotes', 'text', find_num_quotes),
    ('num_h3_headers', 'text', find_num_h3_headers),
    ('num_internal_links', 'text', find_num_internal_links),
    ('num_h2_headers', 'text', find_num_h2_headers),
    # Uses the find_infobox helper (plain substring test) instead of a
    # separate regex-based str.contains call, so there is a single definition
    # of this feature.
    ('has_infobox', 'text', find_infobox),
    ('num_categories', 'text', find_num_categories),
    ('num_images', 'text', find_num_images),
    ('num_ISBN', 'text', find_num_ISBN),
    ('num_references', 'text', find_num_references),
    # NOTE(review): the length is taken from the raw wikitext even though the
    # helper's parameter is named `cleaned_article` - confirm which is meant.
    ('article_length', 'text', find_article_length),
    ('num_difficult_words', 'cleaned_text', find_num_difficult_words),
    ('dale_chall_readability_score', 'cleaned_text', find_dale_chall_readability_score),
    ('readability_index', 'cleaned_text', find_automated_readability_index),
    ('linsear_write_formula', 'cleaned_text', find_linsear_write_formula),
    ('gunning_fog_index', 'cleaned_text', find_gunning_fog_index),
    ('smog_index', 'cleaned_text', find_smog_index),
    ('syllable_count', 'cleaned_text', find_syllable_count),
    ('lexicon_count', 'cleaned_text', find_lexicon_count),
    ('sentence_count', 'cleaned_text', find_sentence_count),
    ('num_footnotes', 'text', find_num_footnotes),
    ('num_note_tags', 'text', find_num_note_tags),
    ('num_underlines', 'text', find_num_underlines),
    ('num_journal_citations', 'text', find_num_journal_citations),
    ('num_about_links', 'text', find_num_about_links),
    ('num_wikitables', 'text', find_num_wikitables),
]

# The readability extractors print "Error(...)" lines for some rows while this
# loop runs (presumably rows whose cleaned text has no words or sentences -
# TODO confirm).
for new_column, source_column, extractor in feature_specs:
    data[new_column] = data[source_column].apply(extractor)

Error(DCRS): Word Count is zero cannot divide
Error(DCRS): Word Count is zero cannot divide
Error(DCRS): Word Count is zero cannot divide
Error(DCRS): Word Count is zero cannot divide
Error(DCRS): Word Count is zero cannot divide
Error(DCRS): Word Count is zero cannot divide
Error(DCRS): Word Count is zero cannot divide
Error(DCRS): Word Count is zero cannot divide
Error(DCRS): Word Count is zero cannot divide
Error(ARI) : Sentence count is zero, cannot divide
Error(ARI) : Sentence count is zero, cannot divide
Error(ARI) : Sentence count is zero, cannot divide
Error(ARI) : Sentence count is zero, cannot divide
Error(ARI) : Sentence count is zero, cannot divide
Error(ARI) : Sentence count is zero, cannot divide
Error(ARI) : Sentence count is zero, cannot divide
Error(ARI) : Sentence count is zero, cannot divide
Error(ARI) : Sentence count is zero, cannot divide
Error(GF): Word Count is Zero, cannot divide
Error(GF): Word Count is Zero, cannot divide
Error(GF): Word Count is Zero, cannot

# Save DataFrame

In [None]:
# Write the frame with all engineered columns to the working directory as
# tab-separated text. `index` is spelled out at its default (True): the row
# index is written as a first column without a header name.
data.to_csv(path_or_buf='wiki_train.tsv', sep='\t', index=True)

In [30]:
# Take a real copy. A plain `backup_data = data` only binds a second name to
# the same DataFrame, so the in-place dropna() applied to backup_data in the
# next cell would silently remove the rows from `data` as well.
backup_data = data.copy()

In [31]:
# Remove, in place, every row that has a missing value in any column
# (axis/how are written out at their defaults). The call returns None and
# modifies the object itself, so any other name bound to that same DataFrame
# sees the rows disappear too.
backup_data.dropna(axis=0, how='any', inplace=True)

In [24]:
# Columns kept for the random-forest table: the target first, then the
# engineered numeric features. The order below is the order of the columns in
# the resulting frame.
random_forest_columns = [
    'label',
    'has_infobox',
    'num_categories',
    'num_images',
    'num_ISBN',
    'num_references',
    'article_length',
    'num_difficult_words',
    'dale_chall_readability_score',
    'readability_index',
    'linsear_write_formula',
    'gunning_fog_index',
    'num_web_citations',
    'num_book_citations',
    'num_news_citations',
    'num_quotes',
    'num_h3_headers',
    'num_internal_links',
    'num_h2_headers',
    'syllable_count',
    'lexicon_count',
    'sentence_count',
    'num_footnotes',
    'num_note_tags',
    'num_underlines',
    'num_journal_citations',
    'num_about_links',
    'num_wikitables',
    'smog_index',
]
random_forest_data = backup_data.loc[:, random_forest_columns]

In [26]:
# Write the label + feature table to the working directory as comma-separated
# text. `index` is spelled out at its default (True): the row index is written
# as a first column without a header name.
random_forest_data.to_csv(path_or_buf='random_forest_data.csv', index=True)

# Train first two models

In [35]:
import pickle


def _load_pickle(path):
    """Return the object unpickled from ``path``.

    The file is opened in a ``with`` block so the handle is closed as soon as
    loading finishes (or fails), instead of being left open.

    SECURITY: ``pickle.load`` can execute arbitrary code contained in the
    file; only load pickles that come from a trusted source.
    """
    with open(path, "rb") as pickle_file:
        return pickle.load(pickle_file)


hash_vec_rf_model = _load_pickle("../src/hash_vec2_aug23.pkl")
rf_model = _load_pickle("../src/random_forest_aug22.pkl")
hash_vec_fitter = _load_pickle("../src/hash_vec_fitter.pkl")

# Train / Test split

# 1) Random Forest w/ hand engineered features

In [41]:
# Number of leading rows taken for the first round.
# NOTE(review): 14672 is hard-coded; presumably a split point derived from the
# data size at the time - confirm and derive it from the data if possible.
round1_num_rows = 14672
round1_data = data[:round1_num_rows]

In [42]:
# Preview only the first rows: the frame holds thousands of rows including the
# full article text, which is too much to render as cell output.
round1_data.head(10)

Unnamed: 0,label,page_title,project,text,timestamp,cleaned_text,num_web_citations,num_book_citations,num_news_citations,num_quotes,...,smog_index,syllable_count,lexicon_count,sentence_count,num_footnotes,num_note_tags,num_underlines,num_journal_citations,num_about_links,num_wikitables
0,5,Funerary art,visual arts,[[Image:GD-FR-Paris-Louvre-Sculptures034.JPG|3...,20100504203659,Burgundy under Louis XIA large sculpture of s...,0,0,1,0,...,9.9,16808.4,10385,525,10,0,0,0,0,1
1,5,Battle of Warsaw (1920),russia,{{Infobox Military Conflict\n|conflict=Battle ...,20070111175847,The Battle of Warsaw (sometimes referred to as...,0,3,0,0,...,9.4,5728.5,3680,176,20,0,0,2,0,0
2,2,Henry Digby (Royal Navy officer),biography,{{Use British English|date=August 2011}}\n{{Us...,20111205102850,Admiral of the Blue Sir Henry Digby GCB (20 Ja...,0,3,0,0,...,8.6,2756.7,1914,81,29,0,0,0,0,0
3,2,Ottawa Redblacks,canadian football,{{Use mdy dates|date=May 2013}}\n{{Infobox CFL...,20140513220942,TD Place StadiumThe Ottawa RedBlacks (stylized...,17,0,11,1,...,9.0,2190.6,1524,64,50,0,0,0,0,0
4,2,Qimonda,germany,{{Infobox_Company |\n company_name = Qimond...,20090315202104,"Qimonda AG , (pronounced ""key-MON-da"") is a m...",8,0,0,0,...,10.4,4231.8,2570,127,14,0,0,0,0,0
5,1,"Ocracoke, North Carolina",project north carolina,{{Infobox Settlement\n|official_name ...,20080911150022,Ocracoke IslandOcracoke is a census-designated...,1,0,0,0,...,8.3,2359.8,1515,106,12,0,0,0,0,0
6,2,Erik Wilhelm,biography,{{construction}}\n\n{{Infobox NFL player\n| im...,20141125183748,"Erik Bradley Wilhelm (born November 16, 1965 i...",0,0,0,0,...,9.3,825.3,552,22,10,0,0,0,0,0
7,0,Kolibite,bulgaria,{{Infobox settlement \n|official_name =Kolibit...,20120524063028,Kolibite is a village in the municipality of S...,0,0,0,0,...,0.0,35.1,15,1,6,0,0,0,0,0
8,3,Edouard Borovansky,ballet,{{Infobox Person\n| name = Edouard Borovansk...,20110208153146,Edouard Borovansky (24 February 1902 – 18 Dece...,4,0,0,0,...,9.7,2823.3,1755,94,12,0,0,1,0,0
9,0,Jacek Wiśniewski,poland,{{Football player infobox\n| playername= Jacek...,20090105062145,"Jacek Wiśniewski (born June 8, 1974 in Gliwice...",0,0,0,0,...,6.4,47.7,30,3,10,0,0,0,0,0
