# Imports

In [127]:
import pandas as pd
import numpy as np
import json
from ast import literal_eval
import matplotlib.pyplot as plt
from textstat.textstat import textstat
from gensim.corpora import wikicorpus
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
%matplotlib inline
# Make it pretty
plt.style.use('ggplot')

# Read in data

In [128]:
file = '../data/enwiki.observations.text_wp10.30k.tsv'
data = pd.read_csv(file, sep='\t', header=None)

In [129]:
data = pd.DataFrame(data=list(data[0].apply(literal_eval)))

In [113]:
data.isnull().sum()

label         0
page_title    0
project       0
text          0
timestamp     0
dtype: int64

# Practice pipeline with part of data

In [130]:
data1000 = data[:3000]

In [87]:
data1000

Unnamed: 0,label,page_title,project,text,timestamp
0,fa,Funerary art,visual arts,[[Image:GD-FR-Paris-Louvre-Sculptures034.JPG|3...,20100504203659
1,fa,Battle of Warsaw (1920),russia,{{Infobox Military Conflict\n|conflict=Battle ...,20070111175847
2,c,Henry Digby (Royal Navy officer),biography,{{Use British English|date=August 2011}}\n{{Us...,20111205102850
3,c,Ottawa Redblacks,canadian football,{{Use mdy dates|date=May 2013}}\n{{Infobox CFL...,20140513220942
4,c,Qimonda,germany,{{Infobox_Company |\n company_name = Qimond...,20090315202104
5,start,"Ocracoke, North Carolina",project north carolina,{{Infobox Settlement\n|official_name ...,20080911150022
6,c,Erik Wilhelm,biography,{{construction}}\n\n{{Infobox NFL player\n| im...,20141125183748
7,stub,Kolibite,bulgaria,{{Infobox settlement \n|official_name =Kolibit...,20120524063028
8,b,Edouard Borovansky,ballet,{{Infobox Person\n| name = Edouard Borovansk...,20110208153146
9,stub,Jacek Wiśniewski,poland,{{Football player infobox\n| playername= Jacek...,20090105062145


In [19]:
p.split('\n')

['{{Infobox baseball player ',
 '| name=Cobb, Tyrus Raymond ',
 '| image name=TyCobb.jpg',
 '| birthdate=[[December 18]], [[1886]]',
 '| birthplace=[[Narrows, Georgia]]',
 '| dead=dead',
 '| deathdate=[[July 17]], [[1961]]',
 '| deathplace=[[Atlanta, Georgia]]',
 '| debutdate=[[August 30]], [[1905]]',
 '| debutteam=[[Detroit Tigers]]',
 '| debutopponent=[[New York Yankees|New York Highlanders]] ',
 '| debutstadium=[[Bennett Park]] ',
 "| teams='''As Player'''<BR>",
 '[[Detroit Tigers]] ([[1905 in sports|1905]] - [[1926 in sports|1926]])<BR>',
 "[[Oakland Athletics|Philadelphia A's]] ([[1927 in sports|1927]] - [[1928 in sports|1928]])<BR>",
 "'''As Manager'''<BR>",
 '[[Detroit Tigers]] ([[1921 in sports|1921]] - [[1926 in sports|1926]])<BR>',
 '| HOFer=HOFer',
 '| inductiondate=[[1936 in sports|1936]]',
 '| careerhighlights=<br>',
 ';All-Time Records:',
 '* Career batting average (.367)',
 '* Career steals of home (54)',
 '* Career batting titles (11)',
 ';Notable Achievements',
 '* Bat

# Random Forest Test

In [131]:
def clean_wiki_markup(raw_article):
    semi_cleaned_article = wikicorpus.filter_wiki(raw_article)
    cleaned_article = semi_cleaned_article.replace("\n", "").replace("\'", "").replace("()", "").replace("=", "").replace("|alt","").replace("\xa0","")
    return cleaned_article

def find_num_categories(raw_article):
    return raw_article.count("[[Category:")

def find_num_images(raw_article):
    return raw_article.count("[[Image:")

def find_num_ISBN(raw_article):
    return raw_article.count("ISBN")

def find_num_references(raw_article):
    return raw_article.count("</ref>")

def find_article_length(cleaned_article):
    return len(cleaned_article)

def find_num_difficult_words(cleaned_article):
    return textstat.difficult_words(cleaned_article)

def find_dale_chall_readability_score(cleaned_article):
    return textstat.dale_chall_readability_score(cleaned_article)

def find_automated_readability_index(cleaned_article):
    return textstat.automated_readability_index(cleaned_article)

def find_linsear_write_formula(cleaned_article):
    return textstat.linsear_write_formula(cleaned_article)

def find_gunning_fog_index(cleaned_article):
    return textstat.gunning_fog(cleaned_article)

def find_syllable_count(cleaned_article):
    return textstat.syllable_count(cleaned_article)

def find_lexicon_count(cleaned_article):
    return textstat.lexicon_count(cleaned_article, removepunct=True)

def find_sentence_count(cleaned_article):
    return textstat.sentence_count(cleaned_article)

def find_smog_index(cleaned_article):
    return textstat.smog_index(cleaned_article)

def find_num_web_citations(raw_article):
    return raw_article.count("{{cite web")

def find_num_book_citations(raw_article):
    return raw_article.count("{{cite book")

def find_num_news_citations(raw_article):
    return raw_article.count("{{cite news")

def find_num_quotes(raw_article):
    return raw_article.count("quote=")

def find_num_h3_headers(raw_article):
    return raw_article.count("\n===")

def find_num_internal_links(raw_article):
    return (raw_article.count("[[") // 2)

def find_num_h2_headers(raw_article):
    return (raw_article.count("\n==") - find_num_h3_headers(raw_article))

In [132]:
data1000['num_web_citations'] = data1000['text'].apply(find_num_web_citations)
data1000['num_book_citations'] = data1000['text'].apply(find_num_book_citations)
data1000['num_news_citations'] = data1000['text'].apply(find_num_news_citations)
data1000['num_quotes'] = data1000['text'].apply(find_num_quotes)
data1000['num_h3_headers'] = data1000['text'].apply(find_num_h3_headers)
data1000['num_internal_links'] = data1000['text'].apply(find_num_internal_links)
data1000['num_h2_headers'] = data1000['text'].apply(find_num_h2_headers)
data1000['cleaned_text'] = data1000['text'].apply(clean_wiki_markup)
data1000['has_infobox'] = data1000['text'].str.contains('{{Infobox').astype(int)
data1000['num_categories'] = data1000['text'].apply(find_num_categories)
data1000['num_images'] = data1000['text'].apply(find_num_images)
data1000['num_ISBN'] = data1000['text'].apply(find_num_ISBN)
data1000['num_references'] = data1000['text'].apply(find_num_references)
data1000['article_length'] = data1000['text'].apply(find_article_length)
data1000['num_difficult_words'] = data1000['cleaned_text'].apply(find_num_difficult_words)
data1000['dale_chall_readability_score'] = data1000['cleaned_text'].apply(find_dale_chall_readability_score)
data1000['readability_index'] = data1000['cleaned_text'].apply(find_automated_readability_index)
data1000['linsear_write_formula'] = data1000['cleaned_text'].apply(find_linsear_write_formula)
data1000['gunning_fog_index'] = data1000['cleaned_text'].apply(find_gunning_fog_index)
data1000['smog_index'] = data1000['cleaned_text'].apply(find_smog_index)
data1000['syllable_count'] = data1000['cleaned_text'].apply(find_syllable_count)
data1000['lexicon_count'] = data1000['cleaned_text'].apply(find_lexicon_count)
data1000['sentence_count'] = data1000['cleaned_text'].apply(find_sentence_count)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See

Error(DCRS): Word Count is zero cannot divide


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()


Error(ARI) : Sentence count is zero, cannot divide


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Error(GF): Word Count is Zero, cannot divide


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is tryin

In [133]:
data1000.dropna(inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [49]:
# TfidfVectorizer

import nltk
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()

## Fit Vectorizer

vectorizer.fit(data1000['cleaned_text'])

## Transform Vectorizer

X_transformed = vectorizer.transform(data1000['cleaned_text'])

tfidf_df = pd.DataFrame(X_transformed.todense())

df1 = data1000.loc[:, ['has_infobox','num_categories','num_images','num_ISBN','num_references','article_length',
                 'num_difficult_words','dale_chall_readability_score','readability_index','linsear_write_formula',
                 'gunning_fog_index']]

X = pd.concat([df1, tfidf_df], axis=1)

y = data1000.label.values

# Test/Train Split

In [143]:
df1 = data1000.loc[data1000['label'].isin(['fa','b','stub'])]
y = df1.label.values
df1 = df1.loc[:, ['has_infobox','num_categories','num_images','num_ISBN','num_references','article_length',
                 'num_difficult_words','dale_chall_readability_score','readability_index','linsear_write_formula',
                 'gunning_fog_index', 'num_web_citations','num_book_citations','num_news_citations',
                'num_quotes','num_h3_headers','num_internal_links', 'num_h2_headers', 'syllable_count',
                 'lexicon_count', 'sentence_count']]

In [144]:
X_train, X_test, y_train, y_test = train_test_split(df1.values, y, test_size=0.20, random_state=910)

# Random Forrest

In [145]:
clf = RandomForestClassifier(n_estimators=2000, random_state=910, )

In [146]:
clf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=2000, n_jobs=1,
            oob_score=False, random_state=910, verbose=0, warm_start=False)

In [147]:
predictions = clf.predict(X_test)

In [148]:
accuracy_score(y_test, predictions)

0.8504983388704319

In [29]:
# TfidfVectorizer

import nltk
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()

## Fit Vectorizer

vectorizer.fit(data1000['text'])

## Transform Vectorizer

X_transformed = vectorizer.transform(data1000['text'])

tfidf_df = X_transformed.todense()

y = data1000.label.values

In [60]:
feature_importances = pd.DataFrame(clf.feature_importances_,
                                   index = X.columns,
                                    columns=['importance']).sort_values('importance',                                                                 ascending=False)

In [None]:
feature_importances

# Random Forest Model for full data

## Read & transform original data

In [73]:
file = '../data/enwiki.observations.text_wp10.30k.tsv'
data = pd.read_csv(file, sep='\t', header=None)
data = pd.DataFrame(data=list(data[0].apply(literal_eval)))

## Change y to ints

In [5]:
classes = {"stub": 0, "start": 1, "c": 2, "b": 3, "ga": 4, "fa": 5} 
data["label"] = data['label'].map(classes)

## Feature Engineer Data

In [None]:
data['cleaned_text'] = data['text'].apply(clean_wiki_markup)
data['has_infobox'] = data['text'].str.contains('{{Infobox').astype(int)
data['num_categories'] = data['text'].apply(find_num_categories)
data['num_images'] = data['text'].apply(find_num_images)
data['num_ISBN'] = data['text'].apply(find_num_ISBN)
data['num_references'] = data['text'].apply(find_num_references)
data['article_length'] = data['text'].apply(find_article_length)
data['num_difficult_words'] = data['cleaned_text'].apply(find_num_difficult_words)
data['dale_chall_readability_score'] = data['cleaned_text'].apply(find_dale_chall_readability_score)
data['readability_index'] = data['cleaned_text'].apply(find_automated_readability_index)
data['linsear_write_formula'] = data['cleaned_text'].apply(find_linsear_write_formula)
data['gunning_fog_index'] = data['cleaned_text'].apply(find_gunning_fog_index)
data['smog_index'] = data['cleaned_text'].apply(find_smog_index)

## Fill NaNs

In [16]:
data.fillna(value=0, inplace=True)

In [3]:
data = pd.read_csv('../data/wiki_transformed_data.csv')
data.dropna(axis=0, inplace=True)

## TFIDF Vectorize Cleaned Text And Concatenate Hand-Selected Features

In [3]:
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()

## Fit Vectorizer

In [4]:
vectorizer.fit(data['cleaned_text'])

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [5]:
## Transform Vectorizer
X_transformed = vectorizer.transform(data['cleaned_text'])

In [6]:
tfidf_df = pd.DataFrame(X_transformed.todense())

In [7]:
df1 = data.loc[:, ['has_infobox','num_categories','num_images','num_ISBN','num_references','article_length',
                 'num_difficult_words','dale_chall_readability_score','readability_index','linsear_write_formula',
                 'gunning_fog_index', 'smog_index']]

In [None]:
X = pd.concat([df1, tfidf_df], axis=1)

In [104]:
y = data.label.values

## Test/Train Split

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X.values, y, test_size=0.20, random_state=910)

## Random Forest

In [14]:
clf = RandomForestClassifier(n_estimators=1000, random_state=910)
clf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=1,
            oob_score=False, random_state=910, verbose=0, warm_start=False)

In [15]:
predictions = clf.predict(X_test)

In [16]:
accuracy_score(y_test, predictions)

0.5633276740237692

In [15]:
with open('wiki_transformed_data.csv', 'w', encoding='utf8', errors='replace') as f:
    data.to_csv(f)

In [126]:
X_train.shape

(1201, 18)

In [151]:
data1000.dtypes

label                            object
page_title                       object
project                          object
text                             object
timestamp                        object
num_web_citations                 int64
num_book_citations                int64
num_news_citations                int64
num_quotes                        int64
num_h3_headers                    int64
num_internal_links                int64
num_h2_headers                    int64
cleaned_text                     object
has_infobox                       int64
num_categories                    int64
num_images                        int64
num_ISBN                          int64
num_references                    int64
article_length                    int64
num_difficult_words               int64
dale_chall_readability_score    float64
readability_index               float64
linsear_write_formula           float64
gunning_fog_index               float64
smog_index                      float64
