# Imports

In [127]:
import pandas as pd
import numpy as np
from ast import literal_eval
import matplotlib.pyplot as plt
from textstat.textstat import textstat
from gensim.corpora import wikicorpus
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
%matplotlib inline
# Make it pretty
plt.style.use('ggplot')

# Read in data

In [301]:
file = '../data/enwiki.observations.text_wp10.30k.tsv'
raw_data = pd.read_csv(file, sep='\t', header=None)

In [302]:
data = pd.DataFrame(data=list(raw_data[0].apply(literal_eval)))
data = data[data['text'] != ""]
data = data[data['text'].str.contains("#redirect") == False]
data = data[data['text'].str.contains("may refer to:\n\n*") == False]
data = data[data['text'].str.contains("can refer to:\n") == False]
data = data[data['text'].str.contains("could refer to:\n") == False]
data = data[data['text'].str.contains("#REDIRECT") == False]
data = data[data['text'].str.contains("== Matches ==\n:") == False]
data = data[data['text'].str.contains("{{underconstruction") == False]

In [303]:
data.shape

(29353, 5)

# Practice pipeline with part of data

In [304]:
data1000 = data[:3000]

In [305]:
classes = {"stub": 0, "start": 1, "c": 2, "b": 3, "ga": 4, "fa": 5} 
data1000["label"] = data1000['label'].map(classes)
data1000

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


Unnamed: 0,label,page_title,project,text,timestamp
0,5,Funerary art,visual arts,[[Image:GD-FR-Paris-Louvre-Sculptures034.JPG|3...,20100504203659
1,5,Battle of Warsaw (1920),russia,{{Infobox Military Conflict\n|conflict=Battle ...,20070111175847
2,2,Henry Digby (Royal Navy officer),biography,{{Use British English|date=August 2011}}\n{{Us...,20111205102850
3,2,Ottawa Redblacks,canadian football,{{Use mdy dates|date=May 2013}}\n{{Infobox CFL...,20140513220942
4,2,Qimonda,germany,{{Infobox_Company |\n company_name = Qimond...,20090315202104
5,1,"Ocracoke, North Carolina",project north carolina,{{Infobox Settlement\n|official_name ...,20080911150022
6,2,Erik Wilhelm,biography,{{construction}}\n\n{{Infobox NFL player\n| im...,20141125183748
7,0,Kolibite,bulgaria,{{Infobox settlement \n|official_name =Kolibit...,20120524063028
8,3,Edouard Borovansky,ballet,{{Infobox Person\n| name = Edouard Borovansk...,20110208153146
9,0,Jacek Wiśniewski,poland,{{Football player infobox\n| playername= Jacek...,20090105062145


# Random Forest Test

In [306]:
def clean_wiki_markup(raw_article):
    semi_cleaned_article = wikicorpus.filter_wiki(raw_article)
    cleaned_article = semi_cleaned_article.replace("\n", "").replace("\'", "").replace("()", "").replace("=", "").replace("|alt","").replace("\xa0","")
    return cleaned_article

def find_num_categories(raw_article):
    return raw_article.count("[[Category:")

def find_num_images(raw_article):
    return raw_article.count("[[Image:")

def find_num_ISBN(raw_article):
    return raw_article.count("ISBN")

def find_num_references(raw_article):
    return raw_article.count("</ref>")

def find_article_length(cleaned_article):
    return len(cleaned_article)

def find_num_difficult_words(cleaned_article):
    return textstat.difficult_words(cleaned_article)

def find_dale_chall_readability_score(cleaned_article):
    return textstat.dale_chall_readability_score(cleaned_article)

def find_automated_readability_index(cleaned_article):
    return textstat.automated_readability_index(cleaned_article)

def find_linsear_write_formula(cleaned_article):
    return textstat.linsear_write_formula(cleaned_article)

def find_gunning_fog_index(cleaned_article):
    return textstat.gunning_fog(cleaned_article)

def find_syllable_count(cleaned_article):
    return textstat.syllable_count(cleaned_article)

def find_lexicon_count(cleaned_article):
    return textstat.lexicon_count(cleaned_article, removepunct=True)

def find_sentence_count(cleaned_article):
    return textstat.sentence_count(cleaned_article)

def find_smog_index(cleaned_article):
    return textstat.smog_index(cleaned_article)

def find_num_web_citations(raw_article):
    return raw_article.count("{{cite web")

def find_num_book_citations(raw_article):
    return raw_article.count("{{cite book")

def find_num_news_citations(raw_article):
    return raw_article.count("{{cite news")

def find_num_quotes(raw_article):
    return raw_article.count("quote=")

def find_num_h3_headers(raw_article):
    return raw_article.count("\n===")

def find_num_internal_links(raw_article):
    return (raw_article.count("[[") // 2)

def find_num_h2_headers(raw_article):
    return (raw_article.count("\n==") - find_num_h3_headers(raw_article))

In [None]:
data1000['num_web_citations'] = data1000['text'].apply(find_num_web_citations)
data1000['num_book_citations'] = data1000['text'].apply(find_num_book_citations)
data1000['num_news_citations'] = data1000['text'].apply(find_num_news_citations)
data1000['num_quotes'] = data1000['text'].apply(find_num_quotes)
data1000['num_h3_headers'] = data1000['text'].apply(find_num_h3_headers)
data1000['num_internal_links'] = data1000['text'].apply(find_num_internal_links)
data1000['num_h2_headers'] = data1000['text'].apply(find_num_h2_headers)
data1000['cleaned_text'] = data1000['text'].apply(clean_wiki_markup)
data1000['has_infobox'] = data1000['text'].str.contains('{{Infobox').astype(int)
data1000['num_categories'] = data1000['text'].apply(find_num_categories)
data1000['num_images'] = data1000['text'].apply(find_num_images)
data1000['num_ISBN'] = data1000['text'].apply(find_num_ISBN)
data1000['num_references'] = data1000['text'].apply(find_num_references)
data1000['article_length'] = data1000['text'].apply(find_article_length)
data1000['num_difficult_words'] = data1000['cleaned_text'].apply(find_num_difficult_words)
data1000['dale_chall_readability_score'] = data1000['cleaned_text'].apply(find_dale_chall_readability_score)
data1000['readability_index'] = data1000['cleaned_text'].apply(find_automated_readability_index)
data1000['linsear_write_formula'] = data1000['cleaned_text'].apply(find_linsear_write_formula)
data1000['gunning_fog_index'] = data1000['cleaned_text'].apply(find_gunning_fog_index)
data1000['smog_index'] = data1000['cleaned_text'].apply(find_smog_index)
data1000['syllable_count'] = data1000['cleaned_text'].apply(find_syllable_count)
data1000['lexicon_count'] = data1000['cleaned_text'].apply(find_lexicon_count)
data1000['sentence_count'] = data1000['cleaned_text'].apply(find_sentence_count)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See

In [None]:
data1000.dropna(inplace=True)

In [157]:
# TfidfVectorizer

import nltk
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()

## Fit Vectorizer

vectorizer.fit(data1000['cleaned_text'])

## Transform Vectorizer

X_transformed = vectorizer.transform(data1000['cleaned_text'])

tfidf_df = pd.DataFrame(X_transformed.todense())

df1 = data1000.loc[:, ['has_infobox','num_categories','num_images','num_ISBN','num_references','article_length',
                 'num_difficult_words','dale_chall_readability_score','readability_index','linsear_write_formula',
                 'gunning_fog_index', 'num_web_citations','num_book_citations','num_news_citations',
                'num_quotes','num_h3_headers','num_internal_links', 'num_h2_headers', 'syllable_count',
                 'lexicon_count', 'sentence_count']]

X = pd.concat([df1, tfidf_df], axis=1)

y = data1000.label.values

# Test/Train Split

In [143]:
df1 = data1000.loc[data1000['label'].isin(['fa','b','stub'])]
y = df1.label.values
df1 = df1.loc[:, ['has_infobox','num_categories','num_images','num_ISBN','num_references','article_length',
                 'num_difficult_words','dale_chall_readability_score','readability_index','linsear_write_formula',
                 'gunning_fog_index', 'num_web_citations','num_book_citations','num_news_citations',
                'num_quotes','num_h3_headers','num_internal_links', 'num_h2_headers', 'syllable_count',
                 'lexicon_count', 'sentence_count']]

In [158]:
X_train, X_test, y_train, y_test = train_test_split(X.values, y, test_size=0.20, random_state=910)

ValueError: Found input variables with inconsistent numbers of samples: [3000, 2999]

# Random Forrest

In [145]:
clf = RandomForestClassifier(n_estimators=2000, random_state=910, )

In [146]:
clf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=2000, n_jobs=1,
            oob_score=False, random_state=910, verbose=0, warm_start=False)

In [147]:
predictions = clf.predict(X_test)

In [148]:
accuracy_score(y_test, predictions)

0.8504983388704319

In [153]:
feature_importances = pd.DataFrame(clf.feature_importances_,
                                   index = df1.columns,
                                    columns=['importance']).sort_values('importance',                                                                 ascending=False)

In [154]:
feature_importances

Unnamed: 0,importance
article_length,0.140706
syllable_count,0.122803
lexicon_count,0.121088
num_difficult_words,0.101117
sentence_count,0.079604
num_references,0.078659
num_internal_links,0.062164
num_h2_headers,0.047499
dale_chall_readability_score,0.039493
gunning_fog_index,0.030601


In [287]:
data['cleaned_text'] = data['text'].apply(clean_wiki_markup)
data['lexicon_count'] = data['cleaned_text'].apply(find_lexicon_count)
data['sentence_count'] = data['cleaned_text'].apply(find_sentence_count)
data['article_length'] = data['text'].apply(find_article_length)
data['num_difficult_words'] = data['cleaned_text'].apply(find_num_difficult_words)

In [222]:
from gensim.scripts.glove2word2vec import glove2word2vec
glove_input_file = '../data/glove.6B.50d.txt'
word2vec_output_file = 'glove.6B.50d.txt.word2vec'
glove2word2vec(glove_input_file, word2vec_output_file)

(400001, 50)

In [None]:
from gensim.models import KeyedVectors
# load the Stanford GloVe model
filename = 'glove.6B.50d.txt.word2vec'
model = KeyedVectors.load_word2vec_format(filename, binary=False)

In [230]:
# calculate: (X - Z) + Y = ?
#result = model.most_similar(positive=['Y', 'X'], negative=['Z'], topn=1)
result = model.most_similar(positive=['woman', 'king'], negative=['man'], topn=1)
print(result)

[('queen', 0.8523603677749634)]


In [266]:
from collections import Counter
article = Counter(wikicorpus.tokenize(data1000['cleaned_text'][0]))

In [263]:
from itertools import dropwhile
for key, count in dropwhile(lambda key_count: key_count[1] >= 20, article.most_common()):
    del article[key]

In [267]:
article2 - article

Counter()