# Stage One: Closed Vocabulary
1. __Model 1:__ Closed Vocabulary Unigram probability from single training corpus: thresholds

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from collections import defaultdict
from nltk.tokenize import word_tokenize
import textstat
from sklearn.metrics import accuracy_score

In [2]:
import utilities as ut

In [3]:
training_path = './data/WikiLarge_Train.csv'

In [4]:
%%time
main_df = ut.produce_dataframe(training_path)

CPU times: user 55.6 s, sys: 403 ms, total: 56 s
Wall time: 56.1 s


In [5]:
main_df.shape

(416768, 4)

In [6]:
main_df.head()

Unnamed: 0,original_text,label,tokens,sentence_length
0,There is manuscript evidence that Austen conti...,1,"[there, is, manuscript, evidence, that, austen...",43
1,"In a remarkable comparative analysis , Mandaea...",1,"[in, a, remarkable, comparative, analysis, ,, ...",23
2,"Before Persephone was released to Hermes , who...",1,"[before, persephone, was, released, to, hermes...",46
3,Cogeneration plants are commonly found in dist...,1,"[cogeneration, plants, are, commonly, found, i...",39
4,"Geneva -LRB- , ; , ; , ; ; -RRB- is the second...",1,"[geneva, -lrb-, ,, ;, ,, ;, ,, ;, ;, -rrb-, is...",36


### Produce Closed Vocabulary

In [7]:
threshold = 11

In [8]:
def build_counts(token_list, d_dict):
    for token in token_list:
         d_dict[token] += 1
    return d_dict

In [9]:
total_word_counts = defaultdict(int)

In [10]:
main_df['tokens'].apply(lambda x: build_counts(x, total_word_counts))

0         {'there': 8739, 'is': 164001, 'manuscript': 78...
1         {'there': 8739, 'is': 164001, 'manuscript': 78...
2         {'there': 8739, 'is': 164001, 'manuscript': 78...
3         {'there': 8739, 'is': 164001, 'manuscript': 78...
4         {'there': 8739, 'is': 164001, 'manuscript': 78...
                                ...                        
416763    {'there': 8739, 'is': 164001, 'manuscript': 78...
416764    {'there': 8739, 'is': 164001, 'manuscript': 78...
416765    {'there': 8739, 'is': 164001, 'manuscript': 78...
416766    {'there': 8739, 'is': 164001, 'manuscript': 78...
416767    {'there': 8739, 'is': 164001, 'manuscript': 78...
Name: tokens, Length: 416768, dtype: object

In [11]:
# replace low frequency words with '<unk>'
# input is the list of tokens from the original sentence tokens
replace_low_f = lambda x: [t if total_word_counts[t] >= threshold else '<unk>' for t in x]

In [12]:
main_df['closed_tokens'] = main_df['tokens'].apply(lambda x: replace_low_f(x))

In [13]:
main_df.head()

Unnamed: 0,original_text,label,tokens,sentence_length,closed_tokens
0,There is manuscript evidence that Austen conti...,1,"[there, is, manuscript, evidence, that, austen...",43,"[there, is, manuscript, evidence, that, austen..."
1,"In a remarkable comparative analysis , Mandaea...",1,"[in, a, remarkable, comparative, analysis, ,, ...",23,"[in, a, remarkable, comparative, analysis, ,, ..."
2,"Before Persephone was released to Hermes , who...",1,"[before, persephone, was, released, to, hermes...",46,"[before, persephone, was, released, to, hermes..."
3,Cogeneration plants are commonly found in dist...,1,"[cogeneration, plants, are, commonly, found, i...",39,"[<unk>, plants, are, commonly, found, in, dist..."
4,"Geneva -LRB- , ; , ; , ; ; -RRB- is the second...",1,"[geneva, -lrb-, ,, ;, ,, ;, ,, ;, ;, -rrb-, is...",36,"[geneva, -lrb-, ,, ;, ,, ;, ,, ;, ;, -rrb-, is..."


In [14]:
%%time
# sentences that DO NOT need to be simplified; label = 0
good_df = main_df[ main_df['label'] == 0]
good_df.shape

CPU times: user 51 ms, sys: 3.34 ms, total: 54.4 ms
Wall time: 51.9 ms


(208384, 5)

In [15]:
# sentences that need to be simplified; label = 1
need_simp_ones_df = main_df[ main_df['label'] == 1]
need_simp_ones_df.shape

(208384, 5)

## Model 2: Closed Vocabulary Unigram model

In [16]:
# assumptions: differences in the distributions will remain if low frequency words are identified 
# assumptions: stop words left in, distribution will contribute to the need for simplification

### Class 0: No Simplification needed
Word distribution and probability from the class of sentences that do not need simplification

In [17]:
%%time
# class zero are the terms in the sentences that do not need simplification

# number of times each term appears in Class 0
class_zero_count = defaultdict(int)
good_df['closed_tokens'].apply(lambda x: build_counts(x, class_zero_count))

CPU times: user 363 ms, sys: 1 µs, total: 363 ms
Wall time: 363 ms


208384    {'there': 4965, 'is': 78927, 'some': 3981, 'pr...
208385    {'there': 4965, 'is': 78927, 'some': 3981, 'pr...
208386    {'there': 4965, 'is': 78927, 'some': 3981, 'pr...
208387    {'there': 4965, 'is': 78927, 'some': 3981, 'pr...
208388    {'there': 4965, 'is': 78927, 'some': 3981, 'pr...
                                ...                        
416763    {'there': 4965, 'is': 78927, 'some': 3981, 'pr...
416764    {'there': 4965, 'is': 78927, 'some': 3981, 'pr...
416765    {'there': 4965, 'is': 78927, 'some': 3981, 'pr...
416766    {'there': 4965, 'is': 78927, 'some': 3981, 'pr...
416767    {'there': 4965, 'is': 78927, 'some': 3981, 'pr...
Name: closed_tokens, Length: 208384, dtype: object

In [18]:
len(class_zero_count)

32687

In [19]:
# total number of terms in Class 0
class_0_total_terms = sum(class_zero_count.values())
class_0_total_terms

3857871

In [20]:
%%time
# log probability of terms in Class 0
class_0_log_prob_terms = {k:np.log(v/class_0_total_terms) for k, v in class_zero_count.items()}

CPU times: user 20.4 ms, sys: 0 ns, total: 20.4 ms
Wall time: 20.2 ms


In [21]:
# class_0_log_prob_terms

### Class 1: Needs Simplification

In [22]:
%%time
# class 1 are the term frequency for the sentences that do need simplification

# number of times each term appears in Class 1
class_one_count = defaultdict(int)
need_simp_ones_df['closed_tokens'].apply(lambda x: build_counts(x, class_one_count))

CPU times: user 490 ms, sys: 1 µs, total: 490 ms
Wall time: 489 ms


0         {'there': 3774, 'is': 85074, 'manuscript': 42,...
1         {'there': 3774, 'is': 85074, 'manuscript': 42,...
2         {'there': 3774, 'is': 85074, 'manuscript': 42,...
3         {'there': 3774, 'is': 85074, 'manuscript': 42,...
4         {'there': 3774, 'is': 85074, 'manuscript': 42,...
                                ...                        
208379    {'there': 3774, 'is': 85074, 'manuscript': 42,...
208380    {'there': 3774, 'is': 85074, 'manuscript': 42,...
208381    {'there': 3774, 'is': 85074, 'manuscript': 42,...
208382    {'there': 3774, 'is': 85074, 'manuscript': 42,...
208383    {'there': 3774, 'is': 85074, 'manuscript': 42,...
Name: closed_tokens, Length: 208384, dtype: object

In [23]:
len(class_one_count)

32779

In [24]:
# total number of terms in Class 1
class_1_total_terms = sum(class_one_count.values())
class_1_total_terms

5249549

In [25]:
%%time
# log probability of the terms in the Class 1 distribution
class_1_log_prob_terms = {k:np.log(v/class_1_total_terms) for k, v in class_one_count.items()}

CPU times: user 20.2 ms, sys: 2 µs, total: 20.2 ms
Wall time: 19.9 ms


In [26]:
# class_1_log_prob_terms

### Closed Vocabulary Unigram probability model

In [27]:
def probability_prediction(token_list, c_1p = class_1_log_prob_terms, c_zp = class_0_log_prob_terms):
    # probabilities are negative numbers since they are on the log scale --> higher number = greater probability

    # class 0, no need for simplification likelihood:
    class_0_prob = 0

    # class 1, need simplification likelihood:
    class_1_prob = 0
  
    for token in token_list:
        try:
            class_0_prob += class_0_log_prob_terms[token]
        except:
            class_0_prob += 0

        try:
            class_1_prob += class_1_log_prob_terms[token]
        except: 
            class_1_prob += 0

    predicted_class = 1 if class_1_prob > class_0_prob else 0

    return predicted_class

In [28]:
%%time
main_df['predicted'] = main_df.apply(lambda x: probability_prediction(x['tokens']), axis = 1)

CPU times: user 4.63 s, sys: 9.93 ms, total: 4.64 s
Wall time: 4.65 s


In [29]:
# main_df.head()

In [30]:
accuracy_score(main_df['label'], main_df['predicted'])

0.6817749923218673

In [31]:
# threshold 15 = 0.68106476504914
# threshold 13 = 0.6814894617628993
# threshold 12 = 0.6816214296683046
# threshold 11 = 0.6817749923218673 ***
# threshold 10 = 0.681525453009828

In [32]:
diff = 0.6817749923218673 - 0.66919
np.round(diff, 4)

0.0126