# Stage One
1. __Model 2:__ Bigram probability from single training corpus

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from collections import defaultdict
from nltk.tokenize import word_tokenize
import textstat
from sklearn.metrics import accuracy_score

In [2]:
import utilities as ut

In [3]:
training_path = './data/WikiLarge_Train.csv'

In [4]:
%%time
main_df = ut.produce_dataframe(training_path)

CPU times: user 56.5 s, sys: 483 ms, total: 56.9 s
Wall time: 57 s


In [5]:
main_df.shape

(416768, 4)

In [6]:
main_df.head()

Unnamed: 0,original_text,label,tokens,sentence_length
0,There is manuscript evidence that Austen conti...,1,"[there, is, manuscript, evidence, that, austen...",43
1,"In a remarkable comparative analysis , Mandaea...",1,"[in, a, remarkable, comparative, analysis, ,, ...",23
2,"Before Persephone was released to Hermes , who...",1,"[before, persephone, was, released, to, hermes...",46
3,Cogeneration plants are commonly found in dist...,1,"[cogeneration, plants, are, commonly, found, i...",39
4,"Geneva -LRB- , ; , ; , ; ; -RRB- is the second...",1,"[geneva, -lrb-, ,, ;, ,, ;, ,, ;, ;, -rrb-, is...",36


### Produce Closed Vocabulary

In [7]:
threshold = 12

In [8]:
def build_counts(token_list, d_dict):
    for token in token_list:
         d_dict[token] += 1
    return d_dict

In [9]:
total_word_counts = defaultdict(int)

In [10]:
main_df['tokens'].apply(lambda x: build_counts(x, total_word_counts))

0         {'there': 8739, 'is': 164001, 'manuscript': 78...
1         {'there': 8739, 'is': 164001, 'manuscript': 78...
2         {'there': 8739, 'is': 164001, 'manuscript': 78...
3         {'there': 8739, 'is': 164001, 'manuscript': 78...
4         {'there': 8739, 'is': 164001, 'manuscript': 78...
                                ...                        
416763    {'there': 8739, 'is': 164001, 'manuscript': 78...
416764    {'there': 8739, 'is': 164001, 'manuscript': 78...
416765    {'there': 8739, 'is': 164001, 'manuscript': 78...
416766    {'there': 8739, 'is': 164001, 'manuscript': 78...
416767    {'there': 8739, 'is': 164001, 'manuscript': 78...
Name: tokens, Length: 416768, dtype: object

In [11]:
# replace low frequency words with '<unk>'
# input is the list of tokens from the original sentence tokens
replace_low_f = lambda x: [t if total_word_counts[t] >= threshold else '<unk>' for t in x]

In [12]:
main_df['closed_tokens_'] = main_df['tokens'].apply(lambda x: replace_low_f(x))
main_df['closed_tokens'] = main_df['closed_tokens_'].apply(lambda x: ['<s>'] + x + ['<e>'])

In [13]:
main_df.head()

Unnamed: 0,original_text,label,tokens,sentence_length,closed_tokens_,closed_tokens
0,There is manuscript evidence that Austen conti...,1,"[there, is, manuscript, evidence, that, austen...",43,"[there, is, manuscript, evidence, that, austen...","[<s>, there, is, manuscript, evidence, that, a..."
1,"In a remarkable comparative analysis , Mandaea...",1,"[in, a, remarkable, comparative, analysis, ,, ...",23,"[in, a, remarkable, comparative, analysis, ,, ...","[<s>, in, a, remarkable, comparative, analysis..."
2,"Before Persephone was released to Hermes , who...",1,"[before, persephone, was, released, to, hermes...",46,"[before, persephone, was, released, to, hermes...","[<s>, before, persephone, was, released, to, h..."
3,Cogeneration plants are commonly found in dist...,1,"[cogeneration, plants, are, commonly, found, i...",39,"[<unk>, plants, are, commonly, found, in, dist...","[<s>, <unk>, plants, are, commonly, found, in,..."
4,"Geneva -LRB- , ; , ; , ; ; -RRB- is the second...",1,"[geneva, -lrb-, ,, ;, ,, ;, ,, ;, ;, -rrb-, is...",36,"[geneva, -lrb-, ,, ;, ,, ;, ,, ;, ;, -rrb-, is...","[<s>, geneva, -lrb-, ,, ;, ,, ;, ,, ;, ;, -rrb..."


### Produce Bigrams

In [34]:
def produce_bigrams(token_list, bigram_dict):
    sentence_tuple = tuple(token_list)

    for i in range(len(sentence_tuple) - 1):
        bigram = sentence_tuple[i:i + 2]
        bigram_dict[bigram] += 1
    return bigram_dict

## Model 2: Bigram model Probability models for each distribution 

In [15]:
%%time
# sentences that DO NOT need to be simplified; label = 0
good_df = main_df[ main_df['label'] == 0]
good_df.shape

CPU times: user 68.2 ms, sys: 9.97 ms, total: 78.2 ms
Wall time: 75.6 ms


(208384, 6)

In [16]:
# sentences that need to be simplified; label = 1
need_simp_ones_df = main_df[ main_df['label'] == 1]
need_simp_ones_df.shape

(208384, 6)

### Class 0: No Simplification needed
Word distribution and probability from the class of sentences that do not need simplification

In [17]:
%%time
# class zero are the terms in the sentences that do not need simplification

# number of times each term appears in Class 0
class_zero_unigram_count = defaultdict(int)
good_df['closed_tokens'].apply(lambda x: build_counts(x, class_zero_unigram_count))

CPU times: user 396 ms, sys: 3.32 ms, total: 399 ms
Wall time: 400 ms


208384    {'<s>': 208384, 'there': 4965, 'is': 78927, 's...
208385    {'<s>': 208384, 'there': 4965, 'is': 78927, 's...
208386    {'<s>': 208384, 'there': 4965, 'is': 78927, 's...
208387    {'<s>': 208384, 'there': 4965, 'is': 78927, 's...
208388    {'<s>': 208384, 'there': 4965, 'is': 78927, 's...
                                ...                        
416763    {'<s>': 208384, 'there': 4965, 'is': 78927, 's...
416764    {'<s>': 208384, 'there': 4965, 'is': 78927, 's...
416765    {'<s>': 208384, 'there': 4965, 'is': 78927, 's...
416766    {'<s>': 208384, 'there': 4965, 'is': 78927, 's...
416767    {'<s>': 208384, 'there': 4965, 'is': 78927, 's...
Name: closed_tokens, Length: 208384, dtype: object

In [18]:
len(class_zero_unigram_count)

30829

In [19]:
# total number of terms in Class 0
class_0_total_unigrams = sum(class_zero_unigram_count.values())
class_0_total_unigrams

4274639

In [20]:
no_simp_bigram_dict = defaultdict(int)

In [21]:
%%time
good_df['closed_tokens'].apply(lambda x:produce_bigrams(x, no_simp_bigram_dict) )

CPU times: user 1.54 s, sys: 16.7 ms, total: 1.56 s
Wall time: 1.56 s


208384    {('<s>', 'there'): 1929, ('there', 'is'): 968,...
208385    {('<s>', 'there'): 1929, ('there', 'is'): 968,...
208386    {('<s>', 'there'): 1929, ('there', 'is'): 968,...
208387    {('<s>', 'there'): 1929, ('there', 'is'): 968,...
208388    {('<s>', 'there'): 1929, ('there', 'is'): 968,...
                                ...                        
416763    {('<s>', 'there'): 1929, ('there', 'is'): 968,...
416764    {('<s>', 'there'): 1929, ('there', 'is'): 968,...
416765    {('<s>', 'there'): 1929, ('there', 'is'): 968,...
416766    {('<s>', 'there'): 1929, ('there', 'is'): 968,...
416767    {('<s>', 'there'): 1929, ('there', 'is'): 968,...
Name: closed_tokens, Length: 208384, dtype: object

### Class 1: Needs Simplification

In [22]:
%%time
# class 1 are the term frequency for the sentences that do need simplification

# number of times each term appears in Class 1 -- unigrams

class_one_unigram_count = defaultdict(int)
need_simp_ones_df['closed_tokens'].apply(lambda x: build_counts(x, class_one_unigram_count))

CPU times: user 517 ms, sys: 10 µs, total: 517 ms
Wall time: 516 ms


0         {'<s>': 208384, 'there': 3774, 'is': 85074, 'm...
1         {'<s>': 208384, 'there': 3774, 'is': 85074, 'm...
2         {'<s>': 208384, 'there': 3774, 'is': 85074, 'm...
3         {'<s>': 208384, 'there': 3774, 'is': 85074, 'm...
4         {'<s>': 208384, 'there': 3774, 'is': 85074, 'm...
                                ...                        
208379    {'<s>': 208384, 'there': 3774, 'is': 85074, 'm...
208380    {'<s>': 208384, 'there': 3774, 'is': 85074, 'm...
208381    {'<s>': 208384, 'there': 3774, 'is': 85074, 'm...
208382    {'<s>': 208384, 'there': 3774, 'is': 85074, 'm...
208383    {'<s>': 208384, 'there': 3774, 'is': 85074, 'm...
Name: closed_tokens, Length: 208384, dtype: object

In [23]:
len(class_one_unigram_count)

30892

In [24]:
# total number of terms in Class 1
class_1_total_unigrams = sum(class_one_unigram_count.values())
class_1_total_unigrams

5666317

In [25]:
# bigrams in the require simplification class 
simp_bigram_dict = defaultdict(int)

In [26]:
%%time
need_simp_ones_df['closed_tokens'].apply(lambda x: produce_bigrams(x, simp_bigram_dict))

CPU times: user 2.09 s, sys: 16.6 ms, total: 2.1 s
Wall time: 2.1 s


0         {('<s>', 'there'): 1323, ('there', 'is'): 859,...
1         {('<s>', 'there'): 1323, ('there', 'is'): 859,...
2         {('<s>', 'there'): 1323, ('there', 'is'): 859,...
3         {('<s>', 'there'): 1323, ('there', 'is'): 859,...
4         {('<s>', 'there'): 1323, ('there', 'is'): 859,...
                                ...                        
208379    {('<s>', 'there'): 1323, ('there', 'is'): 859,...
208380    {('<s>', 'there'): 1323, ('there', 'is'): 859,...
208381    {('<s>', 'there'): 1323, ('there', 'is'): 859,...
208382    {('<s>', 'there'): 1323, ('there', 'is'): 859,...
208383    {('<s>', 'there'): 1323, ('there', 'is'): 859,...
Name: closed_tokens, Length: 208384, dtype: object

## Sentence Bigram probability model

In [27]:
def word_probability(word, prior_word, distribution_unigrams, distribution_bigrams):
    # the probability of a given word in the class distribution
    vocabulary_length = len(distribution_unigrams)
    prior_bigram = (word, prior_word)
    word_bigram = distribution_bigrams[prior_bigram]
    word_count = distribution_unigrams[prior_word]
    P_word = (word_bigram + 1)/(word_count + vocabulary_length)
    return np.log(P_word)

In [28]:
def sentence_probability(closed_token_list, distribution_unigrams, distribution_bigrams):
    sent_prob = 0
    for i in range(1, len(closed_token_list)):
        sent_prob += word_probability(closed_token_list[i], closed_token_list[i-1], distribution_unigrams, distribution_bigrams)

    return sent_prob

In [29]:
def classify_sentences(closed_token_list, class_zero_unigram_count, class_one_unigram_count, no_simp_bigram_dict, simp_bigram_dict):
    class_0_prob = sentence_probability(closed_token_list,class_zero_unigram_count, no_simp_bigram_dict)
    class_1_prob = sentence_probability(closed_token_list,class_one_unigram_count, simp_bigram_dict)

    if class_0_prob > class_1_prob:
        return 0
    else:
        return 1

In [30]:
%%time
main_df['predict'] = main_df['closed_tokens'].apply(lambda x:classify_sentences(x, class_zero_unigram_count, class_one_unigram_count, no_simp_bigram_dict, simp_bigram_dict) )

CPU times: user 23.5 s, sys: 52.6 ms, total: 23.6 s
Wall time: 23.7 s


In [31]:
main_df.head()

Unnamed: 0,original_text,label,tokens,sentence_length,closed_tokens_,closed_tokens,predict
0,There is manuscript evidence that Austen conti...,1,"[there, is, manuscript, evidence, that, austen...",43,"[there, is, manuscript, evidence, that, austen...","[<s>, there, is, manuscript, evidence, that, a...",1
1,"In a remarkable comparative analysis , Mandaea...",1,"[in, a, remarkable, comparative, analysis, ,, ...",23,"[in, a, remarkable, comparative, analysis, ,, ...","[<s>, in, a, remarkable, comparative, analysis...",1
2,"Before Persephone was released to Hermes , who...",1,"[before, persephone, was, released, to, hermes...",46,"[before, persephone, was, released, to, hermes...","[<s>, before, persephone, was, released, to, h...",1
3,Cogeneration plants are commonly found in dist...,1,"[cogeneration, plants, are, commonly, found, i...",39,"[<unk>, plants, are, commonly, found, in, dist...","[<s>, <unk>, plants, are, commonly, found, in,...",1
4,"Geneva -LRB- , ; , ; , ; ; -RRB- is the second...",1,"[geneva, -lrb-, ,, ;, ,, ;, ,, ;, ;, -rrb-, is...",36,"[geneva, -lrb-, ,, ;, ,, ;, ,, ;, ;, -rrb-, is...","[<s>, geneva, -lrb-, ,, ;, ,, ;, ,, ;, ;, -rrb...",1


In [32]:
accuracy_score(main_df['label'], main_df['predict'])

0.5469085918304668

In [33]:
# threshold 13 = 0.5469085918304668
#  threshold 11 = 0.5472181165540541
# threshold 5 = 0.5422777180589681
