# Stage One
1. __Model 1:__ Unigram probability from a single training corpus

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from collections import defaultdict
from nltk.tokenize import word_tokenize
import textstat
from sklearn.metrics import accuracy_score

In [2]:
import utilities as ut

In [3]:
# Relative path of the training CSV; passed to ut.produce_dataframe in the next cell.
training_path = './data/WikiLarge_Train.csv'

In [4]:
%%time
# Build the working DataFrame from the training CSV with the project's `utilities` module.
# Per the preview below, the result carries original_text, label, tokens and sentence_length.
# NOTE(review): presumably the helper reads the CSV and tokenizes each sentence (this is the
# slow step, ~1 min in the recorded run) -- confirm against utilities.produce_dataframe.
main_df = ut.produce_dataframe(training_path)

CPU times: user 55.3 s, sys: 396 ms, total: 55.7 s
Wall time: 55.8 s


In [5]:
# (rows, columns) of the loaded frame
main_df.shape

(416768, 4)

In [6]:
# Preview the first rows of the loaded frame
main_df.head()

Unnamed: 0,original_text,label,tokens,sentence_length
0,There is manuscript evidence that Austen conti...,1,"[there, is, manuscript, evidence, that, austen...",43
1,"In a remarkable comparative analysis , Mandaea...",1,"[in, a, remarkable, comparative, analysis, ,, ...",23
2,"Before Persephone was released to Hermes , who...",1,"[before, persephone, was, released, to, hermes...",46
3,Cogeneration plants are commonly found in dist...,1,"[cogeneration, plants, are, commonly, found, i...",39
4,"Geneva -LRB- , ; , ; , ; ; -RRB- is the second...",1,"[geneva, -lrb-, ,, ;, ,, ;, ,, ;, ;, -rrb-, is...",36


In [7]:
%%time
# sentences that DO NOT need to be simplified; label = 0
good_df = main_df[ main_df['label'] == 0]
good_df.shape

CPU times: user 17.2 ms, sys: 35 µs, total: 17.2 ms
Wall time: 19 ms


(208384, 4)

In [8]:
# Class 1 subset: rows labelled 1, i.e. sentences that DO need to be simplified
need_simp_ones_df = main_df.loc[main_df['label'].eq(1)]
need_simp_ones_df.shape

(208384, 4)

## Model 1: Simple Unigram model

In [9]:
# Assumption: stop words are left in, because their distribution is expected to help
# signal whether a sentence needs simplification.

In [10]:
def build_counts(token_list, d_dict):
    """Add the occurrences of every token in ``token_list`` to ``d_dict``.

    Parameters
    ----------
    token_list : iterable of hashable
        Tokens of one sentence.
    d_dict : dict-like
        Running ``token -> count`` mapping. It is updated in place. Any mutable
        mapping that provides ``get`` works (``dict``, ``defaultdict``,
        ``Counter``), not only a ``defaultdict``.

    Returns
    -------
    dict-like
        The same ``d_dict`` object that was passed in.
    """
    for token in token_list:
        # .get() instead of += so a plain dict does not raise KeyError on a new token
        d_dict[token] = d_dict.get(token, 0) + 1
    return d_dict

### Class 0: No Simplification needed
Word distribution and probability from the class of sentences that do not need simplification

In [11]:
%%time
# class zero are the terms in the sentences that do not need simplification

# number of times each term appears in Class 0
class_zero_count = defaultdict(int)
good_df['tokens'].apply(lambda x: build_counts(x, class_zero_count))

CPU times: user 494 ms, sys: 3.3 ms, total: 497 ms
Wall time: 496 ms


208384    {'there': 4965, 'is': 78927, 'some': 3981, 'pr...
208385    {'there': 4965, 'is': 78927, 'some': 3981, 'pr...
208386    {'there': 4965, 'is': 78927, 'some': 3981, 'pr...
208387    {'there': 4965, 'is': 78927, 'some': 3981, 'pr...
208388    {'there': 4965, 'is': 78927, 'some': 3981, 'pr...
                                ...                        
416763    {'there': 4965, 'is': 78927, 'some': 3981, 'pr...
416764    {'there': 4965, 'is': 78927, 'some': 3981, 'pr...
416765    {'there': 4965, 'is': 78927, 'some': 3981, 'pr...
416766    {'there': 4965, 'is': 78927, 'some': 3981, 'pr...
416767    {'there': 4965, 'is': 78927, 'some': 3981, 'pr...
Name: tokens, Length: 208384, dtype: object

In [12]:
# Number of distinct terms seen in Class 0
len(class_zero_count)

127093

In [13]:
# Total number of term occurrences in Class 0 (sum of all per-term counts);
# used as the denominator of the Class 0 term probabilities.
class_0_total_terms = sum(class_zero_count.values())
class_0_total_terms

3857871

In [14]:
%%time
# log probability of terms in Class 0
class_0_log_prob_terms = {k:np.log(v/class_0_total_terms) for k, v in class_zero_count.items()}

CPU times: user 75.2 ms, sys: 3.35 ms, total: 78.5 ms
Wall time: 78.3 ms


In [15]:
# class_0_log_prob_terms

### Class 1: Needs Simplification

In [16]:
%%time
# class 1 are the term frequency for the sentences that do need simplification

# number of times each term appears in Class 1
class_one_count = defaultdict(int)
need_simp_ones_df['tokens'].apply(lambda x: build_counts(x, class_one_count))

CPU times: user 612 ms, sys: 6.65 ms, total: 618 ms
Wall time: 619 ms


0         {'there': 3774, 'is': 85074, 'manuscript': 42,...
1         {'there': 3774, 'is': 85074, 'manuscript': 42,...
2         {'there': 3774, 'is': 85074, 'manuscript': 42,...
3         {'there': 3774, 'is': 85074, 'manuscript': 42,...
4         {'there': 3774, 'is': 85074, 'manuscript': 42,...
                                ...                        
208379    {'there': 3774, 'is': 85074, 'manuscript': 42,...
208380    {'there': 3774, 'is': 85074, 'manuscript': 42,...
208381    {'there': 3774, 'is': 85074, 'manuscript': 42,...
208382    {'there': 3774, 'is': 85074, 'manuscript': 42,...
208383    {'there': 3774, 'is': 85074, 'manuscript': 42,...
Name: tokens, Length: 208384, dtype: object

In [17]:
# Number of distinct terms seen in Class 1
len(class_one_count)

152583

In [18]:
# Total number of term occurrences in Class 1 (sum of all per-term counts);
# used as the denominator of the Class 1 term probabilities.
class_1_total_terms = sum(class_one_count.values())
class_1_total_terms

5249549

In [19]:
%%time
# log probability of the terms in the Class 1 distribution
class_1_log_prob_terms = {k:np.log(v/class_1_total_terms) for k, v in class_one_count.items()}

CPU times: user 91.6 ms, sys: 15 µs, total: 91.6 ms
Wall time: 91.3 ms


In [20]:
# class_1_log_prob_terms

### Simple Unigram probability model

In [21]:
def probability_prediction(token_list, c_1p = class_1_log_prob_terms, c_zp = class_0_log_prob_terms,
                           unseen_log_prob = 0.0):
    """Classify one tokenized sentence with the unigram log-probability model.

    Each class score is the sum, over the sentence's tokens, of the token's log
    probability under that class. Log probabilities are negative, so the larger
    (closer to zero) score marks the more likely class.

    Parameters
    ----------
    token_list : iterable of str
        Tokens of the sentence to classify.
    c_1p : dict
        ``token -> log probability`` under Class 1 (needs simplification).
    c_zp : dict
        ``token -> log probability`` under Class 0 (no simplification needed).
    unseen_log_prob : float, default 0.0
        Value added to a class score for a token that is missing from that
        class's table. The default of 0.0 keeps the notebook's existing results,
        but it treats an unseen token as if it had probability 1, which favours
        the class that has never seen the word. Pass a negative penalty (for
        example the log of a small smoothed probability) to avoid that bias.

    Returns
    -------
    int
        1 when the Class 1 score is strictly greater than the Class 0 score,
        otherwise 0 (ties and empty sentences resolve to 0).
    """
    class_0_score = 0
    class_1_score = 0

    for token in token_list:
        # Look tokens up in the tables that were passed in (not the notebook globals),
        # and use .get() rather than a bare except so only a missing key is tolerated.
        class_0_score += c_zp.get(token, unseen_log_prob)
        class_1_score += c_1p.get(token, unseen_log_prob)

    return 1 if class_1_score > class_0_score else 0

In [22]:
%%time
main_df['predicted'] = main_df.apply(lambda x: probability_prediction(x['tokens']), axis = 1)

CPU times: user 4.76 s, sys: 16.6 ms, total: 4.78 s
Wall time: 4.78 s


In [23]:
# Preview the frame again, now with the added `predicted` column next to `label`
main_df.head()

Unnamed: 0,original_text,label,tokens,sentence_length,predicted
0,There is manuscript evidence that Austen conti...,1,"[there, is, manuscript, evidence, that, austen...",43,1
1,"In a remarkable comparative analysis , Mandaea...",1,"[in, a, remarkable, comparative, analysis, ,, ...",23,1
2,"Before Persephone was released to Hermes , who...",1,"[before, persephone, was, released, to, hermes...",46,1
3,Cogeneration plants are commonly found in dist...,1,"[cogeneration, plants, are, commonly, found, i...",39,0
4,"Geneva -LRB- , ; , ; , ; ; -RRB- is the second...",1,"[geneva, -lrb-, ,, ;, ,, ;, ,, ;, ;, -rrb-, is...",36,0


In [24]:
# Fraction of sentences whose predicted class equals the true label.
# NOTE(review): main_df is the same data the unigram counts were built from, so this is
# training-set accuracy, not a held-out estimate.
accuracy_score(main_df['label'], main_df['predicted'])

0.583067797911548