# Imports

In [3]:
import pandas as pd
import numpy as np
import json
from ast import literal_eval
import matplotlib.pyplot as plt
from textstat.textstat import textstat
from gensim.corpora import wikicorpus
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
%matplotlib inline
# Make it pretty: apply matplotlib's 'ggplot' style sheet to figures drawn after this point
plt.style.use('ggplot')

# Read in data

In [5]:
# Tab-separated dump with no header row; each line is parsed further in the next cell.
file = '../data/enwiki.observations.text_wp10.30k.tsv'
data = pd.read_csv(
    file,
    header=None,
    sep='\t',
)

In [6]:
# Column 0 holds one Python-literal record per row; evaluate each and build a frame from them.
# NOTE(review): this rebinds `data`, so the cell cannot be re-run without re-running the load cell.
data = pd.DataFrame(
    data=data[0].apply(literal_eval).tolist()
)

# Practice pipeline with part of data

In [154]:
# First 1000 rows (by position) used to prototype the pipeline.
data1000 = data.iloc[:1000]

In [26]:
def remove_wiki_markup(article):
    """Return ``article`` after passing it through ``wikicorpus.filter_wiki``.

    Parameters
    ----------
    article : str
        Raw article source (wikitext).

    Returns
    -------
    str
        Whatever ``wikicorpus.filter_wiki`` produces for the input.
    """
    plain_text = wikicorpus.filter_wiki(article)
    return plain_text

In [30]:
def tokenize(article):
    """Split ``article`` into tokens by delegating to ``wikicorpus.tokenize``.

    Parameters
    ----------
    article : str
        Text to tokenize.

    Returns
    -------
    The token collection returned by ``wikicorpus.tokenize`` (passed through unchanged).
    """
    tokens = wikicorpus.tokenize(article)
    return tokens

In [56]:
# How many articles contain a "==Notes" heading marker (rows, columns)
data.loc[data['text'].str.contains('==Notes')].shape

(5413, 5)

In [60]:
# How many articles contain a "==References" heading marker (no space after "==")
data.loc[data['text'].str.contains('==References')].shape

(19443, 5)

In [64]:
# Same check for the spaced variant "== References"
data.loc[data['text'].str.contains('== References')].shape

(4223, 5)

# Feature Engineering

## Has infobox or not

In [76]:
# Count articles that contain an infobox template.
# regex=False: '{{Infobox' is a literal marker; by default str.contains would
# interpret it as a regular expression, where '{' is a metacharacter.
data[data['text'].str.contains('{{Infobox', regex=False)].shape

(17181, 5)

## Number of level{n} headings
https://en.wikibooks.org/wiki/Editing_Wikitext/Headings

In [91]:
# Occurrences of a line starting with "===" in the first article.
# NOTE(review): deeper headings ("====" and beyond) also start with "===" and are counted too.
data.loc[0, 'text'].count("\n===")

3

In [None]:
# Display the raw wikitext of the first article
data.loc[0, 'text']

## Article length
Here I am finding the article length of the parsed raw text. This includes references but appears to get rid of infoboxes and images.

In [92]:
def find_article_length(raw_article):
    """Return the number of characters left after wiki markup is filtered out.

    Parameters
    ----------
    raw_article : str
        Raw article source (wikitext).

    Returns
    -------
    int
        Length of the string returned by ``wikicorpus.filter_wiki(raw_article)``.
    """
    # Measure the argument itself. The previous body referenced ``article``,
    # a name that is not a parameter, so it either raised NameError or silently
    # measured whatever global ``article`` happened to exist in the kernel.
    return len(wikicorpus.filter_wiki(raw_article))

In [102]:
# Sanity-check the helper on the second article
art = data.loc[1, 'text']
find_article_length(raw_article=art)

23717

In [101]:
# Same computation written inline, for comparison with the helper's result
len(wikicorpus.filter_wiki(data.loc[1, 'text']))

23717

In [103]:
# Length of the unfiltered wikitext, to compare against the filtered length
len(data.loc[1, 'text'])

34682

## Number of References
There doesn't seem to be an algorithmic way to find only the reference section for each article because they do not all start the reference section the same way.

In [None]:
# Display the raw wikitext of the second article
data.loc[1, 'text']

In [None]:
# Boolean mask: does each article contain a {{Reflist template?
# regex=False: '{{Reflist' is a literal marker; by default str.contains would
# interpret it as a regular expression, where '{' is a metacharacter.
data['text'].str.contains('{{Reflist', regex=False)

## Number of Categories
They seem to be denoted by the Wikipedia notation ```[[Category:```

In [120]:
def find_num_categories(raw_article):
    """Count category links in an article's raw wikitext.

    Parameters
    ----------
    raw_article : str
        Raw article source (wikitext).

    Returns
    -------
    int
        Number of non-overlapping occurrences of ``[[Category:`` (case-sensitive).
    """
    category_marker = "[[Category:"
    return raw_article.count(category_marker)

In [121]:
# Try the helper on the second article
find_num_categories(data.loc[1, 'text'])

4

## Number of Images
They seem to be denoted by the Wikipedia notation ```[[Image:```

In [125]:
def find_num_images(raw_article, prefixes=("[[Image:",)):
    """Count image embeds in an article's raw wikitext.

    Parameters
    ----------
    raw_article : str
        Raw article source (wikitext).
    prefixes : str or iterable of str, optional
        Link prefixes that mark an image embed. Defaults to ``("[[Image:",)``,
        which reproduces the original behaviour. Pass
        ``("[[Image:", "[[File:")`` to also count embeds written with the
        ``File:`` prefix.

    Returns
    -------
    int
        Total number of non-overlapping, case-sensitive occurrences of the
        given prefixes.
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(prefixes, str):
        prefixes = (prefixes,)
    return sum(raw_article.count(prefix) for prefix in prefixes)

In [200]:
# Try the helper on the sixth article
find_num_images(data.loc[5, 'text'])

7

## Number of references
They seem to be denoted with the Wikipedia **end** notation ```</ref>```

In [127]:
def find_num_references(raw_article):
    """Count references in an article's raw wikitext by their closing tag.

    Parameters
    ----------
    raw_article : str
        Raw article source (wikitext).

    Returns
    -------
    int
        Number of non-overlapping occurrences of ``</ref>``.
    """
    # NOTE(review): only the closing tag is counted, so any reference written
    # without a literal "</ref>" is not included - confirm this is acceptable.
    closing_tag = "</ref>"
    return raw_article.count(closing_tag)

In [199]:
# Try the helper on the sixth article
find_num_references(data.loc[5, 'text'])

1

## Number of Difficult Words
The difficult words score is calculated based on how many difficult words appear in text. A word is considered difficult if it does not appear in a list of 3000 common English words that groups of fourth-grade American students could reliably understand

In [136]:
def find_num_difficult_words(raw_article):
    """Return textstat's difficult-word count for an article.

    The raw wikitext is first passed through ``wikicorpus.filter_wiki``; then
    newlines and apostrophes are deleted before scoring.

    Parameters
    ----------
    raw_article : str
        Raw article source (wikitext).

    Returns
    -------
    The value returned by ``textstat.difficult_words`` for the cleaned text.
    """
    cleaned = wikicorpus.filter_wiki(raw_article)
    # NOTE(review): newlines are deleted rather than replaced by a space, so the
    # last word of one line is joined to the first word of the next - confirm intended.
    for unwanted in ("\n", "\'"):
        cleaned = cleaned.replace(unwanted, "")
    return textstat.difficult_words(cleaned)

In [197]:
# Try the helper on the fifth article
find_num_difficult_words(data.loc[4, 'text'])

693

## Dale-Chall Readability Score
Another measure for comprehension
difficulty when reading a text. This score takes into
account the percentage of difficult words in the text as well
as the ratio between the number of words and the number of
sentences.

In [138]:
def find_dale_chall_readability_score(raw_article):
    """Return textstat's Dale-Chall readability score for an article.

    The raw wikitext is first passed through ``wikicorpus.filter_wiki``; then
    newlines and apostrophes are deleted before scoring.

    Parameters
    ----------
    raw_article : str
        Raw article source (wikitext).

    Returns
    -------
    The value returned by ``textstat.dale_chall_readability_score``.
    """
    stripped = wikicorpus.filter_wiki(raw_article)
    # NOTE(review): deleting newlines without inserting a space fuses words
    # across line breaks - confirm this is intended for a sentence-based score.
    without_newlines = stripped.replace("\n", "")
    without_apostrophes = without_newlines.replace("\'", "")
    return textstat.dale_chall_readability_score(without_apostrophes)

In [139]:
# Try the helper on the second article
find_dale_chall_readability_score(data.loc[1, 'text'])

8.88

## Automated Readability Index
Another measure of comprehension
difficulty when reading a text. Unlike Dale-Chall, this score does not
use a list of difficult words: it is computed from the average number
of characters per word and the average number of words per
sentence.

In [140]:
def find_automated_readability_index(raw_article):
    """Return textstat's Automated Readability Index for an article.

    The raw wikitext is first passed through ``wikicorpus.filter_wiki``; then
    newlines and apostrophes are deleted before scoring.

    Parameters
    ----------
    raw_article : str
        Raw article source (wikitext).

    Returns
    -------
    The value returned by ``textstat.automated_readability_index``.
    """
    body = wikicorpus.filter_wiki(raw_article)
    # NOTE(review): deleting newlines without inserting a space fuses words
    # across line breaks - confirm this is intended.
    for unwanted in ("\n", "\'"):
        body = body.replace(unwanted, "")
    return textstat.automated_readability_index(body)

In [141]:
# Try the helper on the second article
find_automated_readability_index(data.loc[1, 'text'])

14.3

## Linsear Write Formula
Score initially designed
for the United States Air Force to compute the readability of
their technical manuals. This score corresponds to the US
grade level of a text sample based on sentence length and the
number of words used that have three or more syllables

In [143]:
def find_linsear_write_formula(raw_article):
    """Return textstat's Linsear Write score for an article.

    The raw wikitext is first passed through ``wikicorpus.filter_wiki``; then
    newlines and apostrophes are deleted before scoring.

    Parameters
    ----------
    raw_article : str
        Raw article source (wikitext).

    Returns
    -------
    The value returned by ``textstat.linsear_write_formula``.
    """
    filtered = wikicorpus.filter_wiki(raw_article)
    # NOTE(review): deleting newlines without inserting a space fuses words
    # across line breaks - confirm this is intended.
    no_breaks = filtered.replace("\n", "")
    scored_text = no_breaks.replace("\'", "")
    return textstat.linsear_write_formula(scored_text)

In [144]:
# Try the helper on the second article
find_linsear_write_formula(data.loc[1, 'text'])

19.0

## Gunning-Fog index
Readability score to measure the difficulty of a
given text in terms of the years of formal education needed to
understand the text on a first reading. 

In [145]:
def find_gunning_fog_index(raw_article):
    """Return textstat's Gunning-Fog index for an article.

    The raw wikitext is first passed through ``wikicorpus.filter_wiki``; then
    newlines and apostrophes are deleted before scoring.

    Parameters
    ----------
    raw_article : str
        Raw article source (wikitext).

    Returns
    -------
    The value returned by ``textstat.gunning_fog``.
    """
    prose = wikicorpus.filter_wiki(raw_article)
    # NOTE(review): deleting newlines without inserting a space fuses words
    # across line breaks - confirm this is intended.
    for unwanted in ("\n", "\'"):
        prose = prose.replace(unwanted, "")
    return textstat.gunning_fog(prose)

In [146]:
# Try the helper on the second article
find_gunning_fog_index(data.loc[1, 'text'])

21.01217391304348

## TfidfVectorizer

In [150]:
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer

In [151]:
# TF-IDF vectorizer constructed with no arguments, i.e. every option is left at its default
vectorizer = TfidfVectorizer()

In [163]:
def un_mark_wikis(raw_article):
    """Convert raw wikitext into plain text for vectorizing.

    Runs ``wikicorpus.filter_wiki`` and then deletes, in this order: newlines,
    apostrophes, empty parentheses ``()`` and ``==`` heading markers.

    Parameters
    ----------
    raw_article : str
        Raw article source (wikitext).

    Returns
    -------
    str
        The cleaned text.
    """
    cleaned = wikicorpus.filter_wiki(raw_article)
    # Order matters: removing "\n" first can create "()" pairs that the later
    # pass then removes.
    # NOTE(review): newlines are deleted without inserting a space, so words on
    # adjacent lines are fused into one token - confirm intended.
    for junk in ("\n", "\'", "()", "=="):
        cleaned = cleaned.replace(junk, "")
    return cleaned

In [167]:
# Plain-text version of each article in the 1000-row practice subset
X = data1000.loc[:, 'text'].apply(un_mark_wikis)

## Fit Vectorizer

In [172]:
# Learn the vocabulary and IDF weights from the cleaned articles.
# X is passed directly; the former `X[:,]` selected every element anyway.
vectorizer.fit(X)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

## Transform Vectorizer

In [173]:
# Project the same articles into the fitted TF-IDF space.
# X is passed directly; the former `X[:,]` selected every element anyway.
X_transformed = vectorizer.transform(X)

In [186]:
def un_mark_wikis(raw_article):
    """Convert raw wikitext into plain text, with extra clean-up.

    This redefinition replaces the earlier ``un_mark_wikis`` in this notebook
    from this cell onward. It runs ``wikicorpus.filter_wiki`` and then deletes,
    in this order: newlines, apostrophes, empty parentheses ``()``, every ``=``
    character, non-breaking spaces (``\\xa0``) and the literal ``|altA``.

    Parameters
    ----------
    raw_article : str
        Raw article source (wikitext).

    Returns
    -------
    str
        The cleaned text.
    """
    cleaned = wikicorpus.filter_wiki(raw_article)
    # Order matters: earlier removals can create the substrings matched later.
    # NOTE(review): newlines and non-breaking spaces are deleted without
    # inserting a space, so neighbouring words are fused - confirm intended.
    for junk in ("\n", "\'", "()", "=", "\xa0", "|altA"):
        cleaned = cleaned.replace(junk, "")
    return cleaned

In [189]:
# Inspect the cleaned text of the third article
un_mark_wikis(data.loc[2, 'text'])

'Admiral of the Blue Sir Henry Digby GCB (20 January 1770– 19 August 1842) was a senior British naval officer, who served in the French Revolutionary and Napoleonic Wars in the Royal Navy. Born into a long established naval family, his uncle was the famous Admiral Robert Digby, Henry went to sea at the end of the American Revolutionary War aged fourteen.As a Lieutenant aboard HMS Pallas, he received a commendation for rescuing the crew of a burning ship. Promoted to Commander in August 1795 and Captain in December 1796, Digby established a reputation as an aggressive prize taker, capturing 57 ships in less than twenty months. His richest capture came in October 1799 when he assisted in the taking of the treasure ship, the Santa Brigida. He commanded HMS Africa at the Battle of Trafalgar, manoeuvering her into the French and Spanish fleet against orders, having been instructed by Nelson to avoid battle, fearing Digbys small ship of the line would be overwhelmed.In 1806 Digby married Lad