# Imports

In [370]:
import pandas as pd
import numpy as np
import json
from ast import literal_eval
import matplotlib.pyplot as plt
from textstat.textstat import textstat
from gensim.corpora import wikicorpus
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from sklearn.ensemble import RandomForestClassifier
%matplotlib inline
# Make it pretty
plt.style.use('ggplot')

  from numpy.core.umath_tests import inner1d


# Read in data

In [283]:
# Location of the tab-separated observations dump, relative to this notebook.
file = '../data/enwiki.observations.text_wp10.30k.tsv'
data = pd.read_csv(
    file,
    sep='\t',
    header=None,  # columns are addressed by position (data[0]) in the next cell
)

In [284]:
# Column 0 holds one Python-literal string per row; parse each one and build
# the working frame from the resulting records.
data = pd.DataFrame(data=[literal_eval(row) for row in data[0]])

# Practice pipeline with part of data

In [258]:
# Work on an independent copy of the first 1000 rows: feature columns are
# added to this frame later, and assigning into a bare slice of `data`
# triggers pandas' SettingWithCopyWarning and ambiguous write semantics.
data1000 = data[:1000].copy()

In [26]:
def remove_wiki_markup(article):
    """Return ``article`` passed through gensim's ``wikicorpus.filter_wiki``.

    Parameters
    ----------
    article : str
        Raw wikitext of one article.

    Returns
    -------
    Whatever ``wikicorpus.filter_wiki`` returns for ``article``.
    """
    filtered = wikicorpus.filter_wiki(article)
    return filtered

In [30]:
def tokenize(article):
    """Return ``article`` split into tokens by gensim's ``wikicorpus.tokenize``.

    Parameters
    ----------
    article : str
        Article text to tokenize.

    Returns
    -------
    Whatever ``wikicorpus.tokenize`` returns for ``article``.
    """
    tokens = wikicorpus.tokenize(article)
    return tokens

In [56]:
data[data['text'].str.contains('==Notes')].shape

(5413, 5)

In [60]:
data[data['text'].str.contains('==References')].shape

(19443, 5)

In [64]:
data[data['text'].str.contains('== References')].shape

(4223, 5)

# Feature Engineering

## Has infobox or not

In [76]:
# '{{Infobox' is a literal marker, not a pattern: disable regex matching so the
# braces can never be interpreted as a quantifier.
data[data['text'].str.contains('{{Infobox', regex=False)].shape

(17181, 5)

## Number of level{n} headings
https://en.wikibooks.org/wiki/Editing_Wikitext/Headings

In [91]:
data['text'][0].count("\n===")

3

In [None]:
data['text'][0]

## Article length
Here I am finding the article length of the parsed raw text. This includes references but appears to get rid of infoboxes and images.

In [92]:
def find_article_length(raw_article):
    """Return the length of ``raw_article`` after wiki-markup filtering.

    Parameters
    ----------
    raw_article : str
        Raw wikitext of one article.

    Returns
    -------
    int
        ``len`` of the text produced by ``wikicorpus.filter_wiki``.
    """
    # The original body read an undefined name ``article``; it only worked
    # when a global of that name happened to exist in the kernel.
    return len(wikicorpus.filter_wiki(raw_article))

In [102]:
art = data['text'][1]
find_article_length(art)

23717

In [101]:
len(wikicorpus.filter_wiki(data['text'][1]))

23717

In [103]:
len(data['text'][1])

34682

## Number of Categories
They seem to be denoted by the wikipedia notation ```[[Category:```

In [120]:
def find_num_categories(raw_article):
    """Count how many times the ``[[Category:`` marker occurs in ``raw_article``.

    Parameters
    ----------
    raw_article : str
        Raw wikitext of one article.

    Returns
    -------
    int
        Number of non-overlapping occurrences of ``[[Category:``.
    """
    category_marker = "[[Category:"
    return raw_article.count(category_marker)

In [121]:
find_num_categories(data['text'][1])

4

## Number of Images
They seem to be denoted by the wikipedia notation ```[[Image:```

In [125]:
def find_num_images(raw_article):
    """Count how many times the ``[[Image:`` marker occurs in ``raw_article``.

    Parameters
    ----------
    raw_article : str
        Raw wikitext of one article.

    Returns
    -------
    int
        Number of non-overlapping occurrences of ``[[Image:``.
    """
    image_marker = "[[Image:"
    return raw_article.count(image_marker)

In [200]:
find_num_images(data['text'][5])

7

## Number of ISBN References
Number of times ```ISBN``` appears

In [202]:
def find_num_ISBN(raw_article):
    """Count how many times the substring ``ISBN`` occurs in ``raw_article``.

    Parameters
    ----------
    raw_article : str
        Raw wikitext of one article.

    Returns
    -------
    int
        Number of non-overlapping occurrences of ``ISBN`` (case-sensitive).
    """
    isbn_marker = "ISBN"
    return raw_article.count(isbn_marker)

In [218]:
find_num_ISBN(data['text'][29444])

29

## Number of references
They seem to be denoted with the wikipedia **end** notation ```</ref>```

In [127]:
def find_num_references(raw_article):
    """Count how many closing ``</ref>`` tags occur in ``raw_article``.

    Parameters
    ----------
    raw_article : str
        Raw wikitext of one article.

    Returns
    -------
    int
        Number of non-overlapping occurrences of ``</ref>``.
    """
    closing_ref_tag = "</ref>"
    return raw_article.count(closing_ref_tag)

In [199]:
find_num_references(data['text'][5])

1

## Number of Difficult Words
The difficult words score is calculated based on how many difficult words appear in the text. A word is considered difficult if it does not appear in a list of 3000 common English words that groups of fourth-grade American students could reliably understand.

In [295]:
def find_num_difficult_words(raw_article):
    """Return textstat's difficult-word count for a raw wikitext article.

    Parameters
    ----------
    raw_article : str
        Raw wikitext of one article; markup is stripped before counting.

    Returns
    -------
    The value returned by ``textstat.difficult_words`` for the stripped text.
    """
    # The original called ``un_mark_wikis``, which is not defined anywhere in
    # this notebook; use the markup-stripping helper that is defined here.
    text = remove_wiki_markup(raw_article)
    return textstat.difficult_words(text)

In [296]:
find_num_difficult_words(data['text'][4])

693

## Dale-Chall Readability Score
Another measure for comprehension
difficulty when reading a text. This score takes into
account the percentage of difficult words in the text as well
as the ratio between the number of words and the number of
sentences.

In [309]:
def find_dale_chall_readability_score(cleaned_article):
    """Return textstat's Dale-Chall readability score for ``cleaned_article``.

    Parameters
    ----------
    cleaned_article : str
        Article text to score.

    Returns
    -------
    The value returned by ``textstat.dale_chall_readability_score``.
    """
    score = textstat.dale_chall_readability_score(cleaned_article)
    return score

In [310]:
find_dale_chall_readability_score(data['text'][1])

8.87

## Automated Readability Index
Another measure for comprehension
difficulty when reading a text. It approximates the US grade level
needed to understand the text and, unlike the Dale-Chall score, does not
use a word list: it is computed from the number of characters per word
and the number of words per sentence.

In [314]:
def find_automated_readability_index(cleaned_article):
    """Return textstat's Automated Readability Index for ``cleaned_article``.

    Parameters
    ----------
    cleaned_article : str
        Article text to score.

    Returns
    -------
    The value returned by ``textstat.automated_readability_index``.
    """
    index_value = textstat.automated_readability_index(cleaned_article)
    return index_value

In [315]:
find_automated_readability_index(data['text'][1])

19.8

## Linsear Write Formula
Score initially designed
for the United States Air Force to compute the readability of
their technical manuals. This score corresponds to the US
grade level of a text sample based on sentence length and the
number of words used that have three or more syllables

In [318]:
def find_linsear_write_formula(cleaned_article):
    """Return textstat's Linsear Write score for ``cleaned_article``.

    Parameters
    ----------
    cleaned_article : str
        Article text to score.

    Returns
    -------
    The value returned by ``textstat.linsear_write_formula``.
    """
    grade = textstat.linsear_write_formula(cleaned_article)
    return grade

In [144]:
find_linsear_write_formula(data['text'][1])

19.0

## Gunning-Fog index
Readability score to measure the difficulty of a
given text in terms of the years of formal education needed to
understand the text on a first reading. 

In [None]:
def find_gunning_fog_index(cleaned_article):
    """Return textstat's Gunning-Fog index for ``cleaned_article``.

    Parameters
    ----------
    cleaned_article : str
        Article text to score.

    Returns
    -------
    The value returned by ``textstat.gunning_fog``.
    """
    fog_index = textstat.gunning_fog(cleaned_article)
    return fog_index

find_gunning_fog_index(data['text'][1])

## Turn y labels into ints

In [250]:
# Ordinal encoding of the article quality classes, lowest to highest.
classes = {"stub": 0, "start": 1, "c": 2, "b": 3, "ga": 4, "fa": 5}

# Only map while the column still holds the string labels. Re-running this
# cell after the labels are already integers would otherwise map every value
# to NaN, because the integers are not keys of `classes`.
if not pd.api.types.is_numeric_dtype(data["label"]):
    data["label"] = data["label"].map(classes)

# Random Forest Test

In [None]:
def clean_wiki_markup(raw_article):
    """Strip wiki markup from ``raw_article`` and delete leftover fragments.

    The text is first passed through ``wikicorpus.filter_wiki``; afterwards
    newlines, apostrophes, empty parentheses, equals signs, ``|alt`` and
    non-breaking spaces are removed, in that order.

    Parameters
    ----------
    raw_article : str
        Raw wikitext of one article.

    Returns
    -------
    The filtered text with the fragments above removed.
    """
    text = wikicorpus.filter_wiki(raw_article)
    # Removal order is kept identical to the original chained replace calls.
    for fragment in ("\n", "'", "()", "=", "|alt", "\xa0"):
        text = text.replace(fragment, "")
    return text

def find_num_categories(raw_article):
    """Return the number of ``[[Category:`` markers found in ``raw_article``.

    ``raw_article`` is the raw wikitext of one article; occurrences are
    counted without overlap.
    """
    marker = "[[Category:"
    n_categories = raw_article.count(marker)
    return n_categories

def find_num_images(raw_article):
    """Return the number of ``[[Image:`` markers found in ``raw_article``.

    ``raw_article`` is the raw wikitext of one article; occurrences are
    counted without overlap.
    """
    marker = "[[Image:"
    n_images = raw_article.count(marker)
    return n_images

def find_num_ISBN(raw_article):
    """Return the number of times ``ISBN`` appears in ``raw_article``.

    ``raw_article`` is the raw wikitext of one article; the match is
    case-sensitive and occurrences are counted without overlap.
    """
    marker = "ISBN"
    n_isbn = raw_article.count(marker)
    return n_isbn

def find_num_references(raw_article):
    """Return the number of ``</ref>`` closing tags in ``raw_article``.

    ``raw_article`` is the raw wikitext of one article; occurrences are
    counted without overlap.
    """
    marker = "</ref>"
    n_references = raw_article.count(marker)
    return n_references

def find_article_length(cleaned_article):
    """Return the number of characters in ``cleaned_article``.

    Parameters
    ----------
    cleaned_article : str
        Article text whose length is measured.

    Returns
    -------
    int
        ``len(cleaned_article)``.
    """
    # The original returned ``len(article)``: an undefined name that raised
    # NameError, or silently measured an unrelated global, on every call.
    return len(cleaned_article)

def find_num_difficult_words(cleaned_article):
    """Return textstat's difficult-word count for ``cleaned_article``.

    ``cleaned_article`` is passed unchanged to ``textstat.difficult_words``.
    """
    n_difficult = textstat.difficult_words(cleaned_article)
    return n_difficult

def find_dale_chall_readability_score(cleaned_article):
    """Return textstat's Dale-Chall readability score for ``cleaned_article``.

    ``cleaned_article`` is passed unchanged to
    ``textstat.dale_chall_readability_score``.
    """
    dale_chall = textstat.dale_chall_readability_score(cleaned_article)
    return dale_chall

def find_automated_readability_index(cleaned_article):
    """Return textstat's Automated Readability Index for ``cleaned_article``.

    ``cleaned_article`` is passed unchanged to
    ``textstat.automated_readability_index``.
    """
    ari = textstat.automated_readability_index(cleaned_article)
    return ari

def find_linsear_write_formula(cleaned_article):
    """Return textstat's Linsear Write score for ``cleaned_article``.

    ``cleaned_article`` is passed unchanged to
    ``textstat.linsear_write_formula``.
    """
    linsear = textstat.linsear_write_formula(cleaned_article)
    return linsear

def find_gunning_fog_index(cleaned_article):
    """Return textstat's Gunning-Fog index for ``cleaned_article``.

    ``cleaned_article`` is passed unchanged to ``textstat.gunning_fog``.
    """
    gunning_fog = textstat.gunning_fog(cleaned_article)
    return gunning_fog

def transform_dataframe(raw_dataframe):
    """Add the engineered feature columns to ``raw_dataframe``.

    Markup-based counts are computed from the ``text`` column; the length and
    readability features are computed from a ``cleaned_text`` column that this
    function creates with ``clean_wiki_markup``.

    Parameters
    ----------
    raw_dataframe : pandas.DataFrame
        Frame with a ``text`` column holding raw wikitext. It is modified in
        place.

    Returns
    -------
    pandas.DataFrame
        The same ``raw_dataframe`` object, returned for convenience (the
        original implementation returned ``None``).
    """
    raw_text = raw_dataframe['text']

    # Features read straight from the raw wikitext.
    # '{{Infobox' is a literal marker, so regex matching is switched off.
    raw_dataframe['has_infobox'] = raw_text.str.contains('{{Infobox', regex=False).astype(int)
    raw_dataframe['num_categories'] = raw_text.apply(find_num_categories)
    raw_dataframe['num_images'] = raw_text.apply(find_num_images)
    # This column was missing before, although the modelling cell selects it.
    raw_dataframe['num_ISBN'] = raw_text.apply(find_num_ISBN)
    raw_dataframe['num_references'] = raw_text.apply(find_num_references)

    # Features computed on the markup-free text.
    raw_dataframe['cleaned_text'] = raw_text.apply(clean_wiki_markup)
    cleaned_text = raw_dataframe['cleaned_text']
    # Vectorised character count; does not depend on find_article_length.
    raw_dataframe['article_length'] = cleaned_text.str.len()
    raw_dataframe['num_difficult_words'] = cleaned_text.apply(find_num_difficult_words)
    raw_dataframe['readability_index'] = cleaned_text.apply(find_automated_readability_index)
    raw_dataframe['dale_chall_readability_score'] = cleaned_text.apply(find_dale_chall_readability_score)
    raw_dataframe['linsear_write_formula'] = cleaned_text.apply(find_linsear_write_formula)
    raw_dataframe['gunning_fog_index'] = cleaned_text.apply(find_gunning_fog_index)

    return raw_dataframe
    

In [358]:
#data['has_infobox'] = data['text'].str.contains('{{Infobox').astype(int)
#data['num_categories'] = data['text'].apply(find_num_categories)
#data['num_images'] = data['text'].apply(find_num_images)
#data['num_ISBN'] = data['text'].apply(find_num_ISBN)
#data['num_references'] = data['text'].apply(find_num_references)
#data1000['article_length'] = data1000['text'].apply(find_article_length)
#data1000['num_difficult_words'] = data1000['cleaned_text'].apply(find_num_difficult_words)
#data1000['cleaned_text'] = data1000['text'].apply(clean_wiki_markup)
#data1000['dale_chall_readability_score'] = data1000['cleaned_text'].apply(find_dale_chall_readability_score)
#data1000['readability_index'] = data1000['cleaned_text'].apply(find_automated_readability_index)
#data1000['linsear_write_formula'] = data1000['cleaned_text'].apply(find_linsear_write_formula)
#data1000['gunning_fog_index'] = data1000['cleaned_text'].apply(find_gunning_fog_index)

In [None]:
# TfidfVectorizer

import nltk
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()

## Fit and transform the vectorizer
## (one pass; equivalent to fit() followed by transform() on the same corpus)

X_transformed = vectorizer.fit_transform(data1000['cleaned_text'])

# Give the TF-IDF frame the same index as data1000. pd.concat(axis=1) aligns
# rows by index label, so a fresh 0..n-1 RangeIndex would misalign the rows
# (and introduce NaNs) whenever data1000 is not a default-indexed frame.
tfidf_df = pd.DataFrame(X_transformed.todense(), index=data1000.index)

# Hand-engineered features; these columns must already exist on data1000.
df1 = data1000.loc[:, ['has_infobox','num_categories','num_images','num_ISBN','num_references','article_length',
                 'num_difficult_words','dale_chall_readability_score','readability_index','linsear_write_formula',
                 'gunning_fog_index']]

X = pd.concat([df1, tfidf_df], axis=1)

y = data1000.label.values

# Test/Train Split

In [359]:
from sklearn.model_selection import train_test_split

In [389]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X.values,
    y,
    test_size=0.20,
    random_state=910,
)

# Random Forest

In [373]:
# 1000-tree forest with a fixed seed so repeated runs build the same model.
clf = RandomForestClassifier(
    n_estimators=1000,
    random_state=910,
)

In [390]:
clf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=1,
            oob_score=False, random_state=910, verbose=0, warm_start=False)

In [391]:
predictions = clf.predict(X_test)

In [392]:
from sklearn.metrics import accuracy_score

In [393]:
accuracy_score(y_test, predictions)

0.48

In [394]:
from keras.models import Sequential
from keras.layers import Dense

  return f(*args, **kwds)
  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [396]:
# The target has one integer label per quality class, so this is a
# multi-class problem. The original network ended in a single sigmoid unit
# trained with binary cross-entropy, which treats labels 2..5 as invalid
# probabilities and cannot predict more than two classes. The output layer and
# the loss have to change together, so they are defined in one cell.
n_features = X_train.shape[1]  # was hard-coded to 96669; now follows the data
n_classes = len(classes)       # one output unit per entry in the label mapping

model = Sequential()
model.add(Dense(2000, input_dim=n_features, activation='relu'))
model.add(Dense(1000, activation='relu'))
model.add(Dense(500, activation='relu'))
model.add(Dense(200, activation='relu'))
# Softmax yields a probability distribution over the quality classes.
model.add(Dense(n_classes, activation='softmax'))

# The sparse variant accepts the integer labels in y_train directly, so no
# one-hot encoding is needed.
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
model.fit(X_train, y_train, epochs=10, batch_size=10)

Epoch 1/10