# Stage One: Closed Vocabulary
1. __Model 1:__ Closed-vocabulary unigram probability model built from a single training corpus, evaluated at several frequency thresholds

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from collections import defaultdict
from nltk.tokenize import word_tokenize
import textstat
from sklearn.metrics import accuracy_score

In [2]:
import utilities as ut

In [3]:
training_path = './data/WikiLarge_Train.csv'

In [None]:
%%time
# Load the training data through the project helper. Later cells rely on the
# result having a 'tokens' column (token list per sentence) and a 'label' column.
main_df = ut.produce_dataframe(training_path)

In [None]:
main_df.shape

In [None]:
main_df.head()

### Produce Closed Vocabulary

In [None]:
# Minimum corpus-wide count a token needs to stay in the closed vocabulary;
# tokens counted fewer times are replaced by '<unk>' further down.
threshold = 11

In [None]:
def build_counts(token_list, d_dict):
    """Add one to ``d_dict[token]`` for every token in ``token_list``.

    ``d_dict`` is updated in place and the same object is returned. The
    mapping must supply a starting value for tokens it has not seen yet;
    the notebook passes ``defaultdict(int)`` for that reason.
    """
    for tok in token_list:
        d_dict[tok] = d_dict[tok] + 1
    return d_dict

In [None]:
# Corpus-wide token -> occurrence count; defaultdict(int) starts unseen tokens at 0.
total_word_counts = defaultdict(int)

In [None]:
# Accumulate token counts over every sentence. A plain loop is used because
# build_counts is called only for its side effect on total_word_counts;
# Series.apply would additionally build (and display) a Series in which every
# element is that same dict.
for sentence_tokens in main_df['tokens']:
    build_counts(sentence_tokens, total_word_counts)

In [None]:
# replace low frequency words with '<unk>'
# input is the list of tokens from the original sentence tokens
def replace_low_f(x):
    """Return a copy of token list ``x`` with rare tokens replaced by '<unk>'.

    A token is kept when its count in ``total_word_counts`` is at least
    ``threshold``. Counts are read with ``.get`` so that looking up a token
    that was never counted does not insert a zero entry into the
    ``defaultdict`` as a side effect.
    """
    return [t if total_word_counts.get(t, 0) >= threshold else '<unk>' for t in x]

In [None]:
# Closed-vocabulary version of each sentence: rare tokens become '<unk>'.
main_df['closed_tokens'] = main_df['tokens'].apply(replace_low_f)

In [None]:
main_df.head()

In [None]:
%%time
# sentences that DO NOT need to be simplified; label = 0
good_df = main_df[ main_df['label'] == 0]
good_df.shape

In [None]:
# Class 1 rows (label == 1): sentences that need to be simplified.
need_simp_ones_df = main_df[main_df['label'].eq(1)]
need_simp_ones_df.shape

## Model 1: Closed Vocabulary Unigram model

In [None]:
# Assumption: the word distributions of the two classes still differ after low-frequency words are collapsed into '<unk>'.
# Assumption: stop words are left in, because their distribution may also contribute to the need for simplification.

### Class 0: No Simplification needed
Word distribution and probability from the class of sentences that do not need simplification

In [None]:
%%time
# class zero are the terms in the sentences that do not need simplification

# number of times each term appears in Class 0
class_zero_count = defaultdict(int)
good_df['closed_tokens'].apply(lambda x: build_counts(x, class_zero_count))

In [None]:
len(class_zero_count)

In [None]:
# total number of terms in Class 0
class_0_total_terms = sum(class_zero_count.values())
class_0_total_terms

In [None]:
%%time
# log probability of terms in Class 0
class_0_log_prob_terms = {k:np.log(v/class_0_total_terms) for k, v in class_zero_count.items()}

In [None]:
# class_0_log_prob_terms

### Class 1: Needs Simplification

In [None]:
%%time
# class 1 are the term frequency for the sentences that do need simplification

# number of times each term appears in Class 1
class_one_count = defaultdict(int)
need_simp_ones_df['closed_tokens'].apply(lambda x: build_counts(x, class_one_count))

In [None]:
len(class_one_count)

In [None]:
# total number of terms in Class 1
class_1_total_terms = sum(class_one_count.values())
class_1_total_terms

In [None]:
%%time
# log probability of the terms in the Class 1 distribution
class_1_log_prob_terms = {k:np.log(v/class_1_total_terms) for k, v in class_one_count.items()}

In [None]:
# class_1_log_prob_terms

### Closed Vocabulary Unigram probability model

In [None]:
def probability_prediction(token_list, c_1p = class_1_log_prob_terms, c_zp = class_0_log_prob_terms):
    """Predict the class of one sentence from its unigram log-likelihoods.

    Parameters
    ----------
    token_list : iterable
        Tokens of a single sentence.
    c_1p : mapping, optional
        token -> log probability under Class 1 (needs simplification).
    c_zp : mapping, optional
        token -> log probability under Class 0 (no simplification needed).

    Returns
    -------
    int
        1 when the Class 1 total is strictly greater than the Class 0 total,
        otherwise 0 (ties go to Class 0).

    Notes
    -----
    The tables passed as ``c_1p`` / ``c_zp`` are the ones actually used; the
    previous body ignored both parameters and read the globals directly.
    Default values are bound when this ``def`` runs, so re-run this cell after
    rebuilding the log-probability tables (e.g. with a new threshold).

    A token missing from a table adds 0 to that class's total, which on the
    log scale treats it as probability 1 (no smoothing is applied).
    """
    # log-scale totals: values are negative, a higher total = more likely
    class_0_prob = 0
    class_1_prob = 0

    for token in token_list:
        # .get replaces the bare try/except: only a missing key falls back to
        # 0; any other error (e.g. an unhashable token) is no longer hidden.
        class_0_prob += c_zp.get(token, 0)
        class_1_prob += c_1p.get(token, 0)

    return 1 if class_1_prob > class_0_prob else 0

In [None]:
%%time
main_df['predicted'] = main_df.apply(lambda x: probability_prediction(x['tokens']), axis = 1)

In [None]:
# main_df.head()

In [None]:
accuracy_score(main_df['label'], main_df['predicted'])

In [None]:
# threshold 15 = 0.68106476504914
# threshold 13 = 0.6814894617628993
# threshold 12 = 0.6816214296683046
# threshold 11 = 0.6817749923218673 ***
# threshold 10 = 0.681525453009828