In [192]:
import nltk
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('universal_tagset')
import time
import numpy as np
import custom_lemmatizer
import pandas as pd
from scipy.stats import binom
import math
from scipy.stats import chi2

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ataka\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ataka\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\ataka\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package universal_tagset to
[nltk_data]     C:\Users\ataka\AppData\Roaming\nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!


In [2]:
print('The nltk version is {}.'.format(nltk.__version__)) # check version

The nltk version is 3.8.1.


Part 1

In [2]:
with open('Fyodor Dostoyevski Processed.txt', 'r') as file:
    CorpusData = file.read()
words = nltk.word_tokenize(CorpusData)
tagged_Corpus = nltk.pos_tag(words, tagset="universal")

In [264]:
#what is the total number of words in the corpus?
total_words = len(words)
print('The total number of words in the corpus is {}.'.format(total_words))

The total number of words in the corpus is 1425758.


In [256]:
lemmatizer = custom_lemmatizer.custom_lemmatizer()
lemmaized_tokens = []

start_time = time.time()
for t in tagged_Corpus:

    lemmaized_tokens.append((lemmatizer.lemmatize(t), t[1]))

print("%s seconds lemmatization time!" % (time.time() - start_time))

1.989999771118164 seconds lemmatization time!


In [269]:
that_count = 0
for t in lemmaized_tokens:
    if t[0] == 'that':
        that_count += 1
print('The count of "that" after lemmatization is {}.'.format(that_count))

that_count = 0
for t in lemmaized_tokens:
    if t[0] == 'the':
        that_count += 1
print('The count of "the" after lemmatization is {}.'.format(that_count))

that_count = 0
for t in lemmaized_tokens:
    if t[0] == 'abject':
        that_count += 1
print('The count of "abject" after lemmatization is {}.'.format(that_count))

that_count = 0
for t in lemmaized_tokens:
    if t[0] == 'london':
        that_count += 1
print('The count of "london" after lemmatization is {}.'.format(that_count))
that_count = 0
for t in lemmaized_tokens:
    if t[0] == '.':
        that_count += 1
print('The count of "." after lemmatization is {}.'.format(that_count))

The count of "that" after lemmatization is 19429.
The count of "the" after lemmatization is 48392.
The count of "abject" after lemmatization is 21.
The count of "london" after lemmatization is 2.
The count of "." after lemmatization is 51738.


In [272]:
import time

start_time = time.time()

bigrams_size1 = [(lemmaized_tokens[i], lemmaized_tokens[i + 1]) for i in range(len(lemmaized_tokens) - 1)]

bigrams_size2 = [(lemmaized_tokens[i], lemmaized_tokens[i + j]) for i in range(len(lemmaized_tokens) - 3) for j in range(1, 4)]
bigrams_size2 += [(lemmaized_tokens[-3], lemmaized_tokens[-2]), (lemmaized_tokens[-3], lemmaized_tokens[-1]), 
              (lemmaized_tokens[-2], lemmaized_tokens[-1])]

print(f"{time.time() - start_time} seconds bigram creation time!")
print(f"Size of bigrams_size1: {len(bigrams_size1)}")
print(f"Size of bigrams_size2: {len(bigrams_size2)}")

1.3456499576568604 seconds bigram creation time!
Size of bigrams_size1: 1425757
Size of bigrams_size2: 4277268


In [275]:
bigram_count = 0
for i in range(len(bigrams_size1)):
    if bigrams_size1[i][0][0] == 'magnificent' and bigrams_size1[i][1][0] == 'capital':
        bigram_count += 1
print('The count of ("magnificent","capital") in windows of size 1 is {}.'.format(bigram_count))


bigram_count = 0
for i in range(len(bigrams_size2)):
    if bigrams_size2[i][0][0] == 'bright' and bigrams_size2[i][1][0] == 'fire':
        bigram_count += 1
print('The count of ("bright","fire") in windows of size 1 is {}.'.format(bigram_count))

The count of ("magnificent","capital") in windows of size 1 is 1.
The count of ("bright","fire") in windows of size 1 is 1.


In [276]:
#Eliminate all bigrams except those with POS tags NOUN-NOUN or ADJ-NOUN.
unigrams=[b for b in lemmaized_tokens if b[1] == 'NOUN' or b[1] == 'ADJ']
bigrams_size1 = [b for b in bigrams_size1 if (b[0][1] == 'NOUN' and b[1][1] == 'NOUN') or (b[0][1] == 'ADJ' and b[1][1] == 'NOUN')]
bigrams_size2 = [b for b in bigrams_size2 if (b[0][1] == 'NOUN' and b[1][1] == 'NOUN') or (b[0][1] == 'ADJ' and b[1][1] == 'NOUN')]

#Eliminate bigrams that include stopwords
stopwords = ["i", "me", "my", "myself", "we", "our", "ours", "ourselves", "you", "your", "yours", "yourself", "yourselves", "he", "him", "his", "himself", "she", "her", "hers", "herself", "it", "its", "itself", "they", "them", "their", "theirs", "themselves", "what", "which", "who", "whom", "this", "that", "these", "those", "am", "is", "are", "was", "were", "be", "been", "being", "have", "has", "had", "having", "do", "does", "did", "doing", "a", "an", "the", "and", "but", "if", "or", "because", "as", "until", "while", "of", "at", "by", "for", "with", "about", "against", "between", "into", "through", "during", "before", "after", "above", "below", "to", "from", "up", "down", "in", "out", "on", "off", "over", "under", "again", "further", "then", "once", "here", "there", "when", "where", "why", "how", "all", "any", "both", "each", "few", "more", "most", "other", "some", "such", "no", "nor", "not", "only", "own", "same", "so", "than", "too", "very", "s", "t", "can", "will", "just", "don", "should", "now"]
#Eliminate bigrams that include stopwords
unigrams = [b for b in unigrams if b[0] not in stopwords]
bigrams_size1 = [b for b in bigrams_size1 if b[0][0] not in stopwords and b[1][0] not in stopwords]
bigrams_size2 = [b for b in bigrams_size2 if b[0][0] not in stopwords and b[1][0] not in stopwords]
#Eliminate bigrams including any punctuation marks. (Hint: You can use the isalpha() function).

unigrams = [b for b in unigrams if b[0].isalpha()]
bigrams_size1 = [b for b in bigrams_size1 if b[0][0].isalpha() and b[1][0].isalpha()]
bigrams_size2 = [b for b in bigrams_size2 if b[0][0].isalpha() and b[1][0].isalpha()]

#remove POS tags from bigrams
bigrams_size1 = [(b[0][0], b[1][0]) for b in bigrams_size1]
bigrams_size2 = [(b[0][0], b[1][0]) for b in bigrams_size2]


#create panda dataframe for unigrams with single column
unigrams = pd.DataFrame(unigrams, columns=['words', 'POS'])
#convert bigrams to a pandas dataframe
bigrams_size1 = pd.DataFrame(bigrams_size1, columns=['word1', 'word2'])
bigrams_size2 = pd.DataFrame(bigrams_size2, columns=['word1', 'word2'])

#count the words in uni-grams. Reorder from most frequent to least frequent.
unigrams = unigrams.groupby(['words']).size().reset_index(name='counts')
unigrams = unigrams.sort_values(by=['counts'], ascending=False)
unigrams = unigrams[unigrams['counts'] >= 10]
#Eliminate bigrams that occur less than 10 times.
bigrams_size1 = bigrams_size1.groupby(['word1', 'word2']).size().reset_index(name='counts')
bigrams_size1 = bigrams_size1.sort_values(by=['counts'], ascending=False)
bigrams_size1 = bigrams_size1[bigrams_size1['counts'] >= 10]
bigrams_size2 = bigrams_size2.groupby(['word1', 'word2']).size().reset_index(name='counts')
bigrams_size2 = bigrams_size2.sort_values(by=['counts'], ascending=False)
bigrams_size2 = bigrams_size2[bigrams_size2['counts'] >= 10]    


In [281]:
bigram_count = 0
for i in range(len(bigrams_size1)):
    if bigrams_size1.iloc[i][0] == 'mr.' and bigrams_size1.iloc[i][1] == 'skimpole':
        bigram_count += 1
print('The count of ("mr.","skimpole") in windows of size 1 is {}.'.format(bigram_count))


bigram_count = 0
for i in range(len(bigrams_size2)):
    if bigrams_size2.iloc[i][0] == 'spontaneous' and bigrams_size2.iloc[i][1] == 'combustion':
        bigram_count += 1
print('The count of ("spontaneous.","combustion") in windows of size 3 is {}.'.format(bigram_count))

The count of ("mr.","skimpole") in windows of size 1 is 0.
The count of ("spontaneous.","combustion") in windows of size 3 is 0.


Part 2

In [300]:
import pandas as pd
import numpy as np

def student_t_test(bigram_counts, unigrams, alpha=0.005):
    unigram_total = unigrams['counts'].sum()
    for row in bigram_counts.iterrows():
        
        H0 = unigrams[unigrams['words'] == row[1]['word1']]['counts'].values[0] * unigrams[unigrams['words'] == row[1]['word2']]['counts'].values[0] / (unigram_total * unigram_total)
        MLE = row[1]['counts'] / unigram_total

        t = (MLE - H0) / np.sqrt(MLE / unigram_total)
        #save the t value in the bigram_counts dataframe
        bigram_counts.loc[row[0], 't-score'] = t

    return bigram_counts

bigrams_size1_t_test = student_t_test(bigrams_size1, unigrams)
bigrams_size2_t_test = student_t_test(bigrams_size2, unigrams)

In [301]:
bigrams_size1_t_test = bigrams_size1_t_test.sort_values(by=['t-score'], ascending=False)
print('The top 20 bigrams in windows of size 1 are:')
print(bigrams_size1_t_test.head(20))

The top 20 bigrams in windows of size 1 are:
           word1            word2  counts    t-score
17254      pyotr     stepanovitch     427  20.596348
20396     stepan     trofimovitch     412  20.255550
23195    varvara         petrovna     331  18.152786
15141        old              man     289  16.193509
7435      fyodor       pavlovitch     246  15.654434
10934   katerina         ivanovna     229  15.092046
24386      young              man     232  14.716428
14260   nastasia       philipovna     215  14.638661
14724    nikolay  vsyevolodovitch     198  14.041475
15247        old            woman     208  14.031270
8321       great             deal     164  12.724990
17253      pyotr       petrovitch     143  11.882136
12244  lizabetha      prokofievna     136  11.652786
12567       long             time     143  11.458258
4401      dmitri     fyodorovitch     126  11.185020
14637       next              day     117  10.608096
15127        old             lady     119  10.573032
6

In [302]:
#find the occurance of word1 and word2 in the unigrams dataframe
bigrams_size1_t_test['word1_count'] = bigrams_size1_t_test.apply(lambda row: unigrams[unigrams['words'] == row['word1']]['counts'].values[0], axis=1)
bigrams_size1_t_test['word2_count'] = bigrams_size1_t_test.apply(lambda row: unigrams[unigrams['words'] == row['word2']]['counts'].values[0], axis=1)
print('The top 20 bigrams in windows of size 1 are:')
print(bigrams_size1_t_test.head(20))

The top 20 bigrams in windows of size 1 are:
           word1            word2  counts    t-score  word1_count  word2_count
17254      pyotr     stepanovitch     427  20.596348          701          502
20396     stepan     trofimovitch     412  20.255550          430          502
23195    varvara         petrovna     331  18.152786          379          491
15141        old              man     289  16.193509         1356         2546
7435      fyodor       pavlovitch     246  15.654434          260          455
10934   katerina         ivanovna     229  15.092046          253          613
24386      young              man     232  14.716428          776         2546
14260   nastasia       philipovna     215  14.638661          362          247
14724    nikolay  vsyevolodovitch     198  14.041475          354          298
15247        old            woman     208  14.031270         1356         1047
8321       great             deal     164  12.724990         1202          218
17253  

In [303]:
#do the same for bigrams in windows of size 3
bigrams_size2_t_test = bigrams_size2_t_test.sort_values(by=['t-score'], ascending=False)
bigrams_size2_t_test['word1_count'] = bigrams_size2_t_test.apply(lambda row: unigrams[unigrams['words'] == row['word1']]['counts'].values[0], axis=1)
bigrams_size2_t_test['word2_count'] = bigrams_size2_t_test.apply(lambda row: unigrams[unigrams['words'] == row['word2']]['counts'].values[0], axis=1)
print('The top 20 bigrams in windows of size 3 are:')
print(bigrams_size2_t_test.head(20))


The top 20 bigrams in windows of size 3 are:
           word1            word2  counts    t-score  word1_count  word2_count
58985      pyotr     stepanovitch     427  20.596348          701          502
70667     stepan     trofimovitch     412  20.255550          430          502
79235    varvara         petrovna     331  18.152786          379          491
51320        old              man     290  16.224287         1356         2546
26366     fyodor       pavlovitch     246  15.654434          260          455
37238   katerina         ivanovna     229  15.092046          253          613
83930      young              man     234  14.784147          776         2546
48499   nastasia       philipovna     215  14.638661          362          247
51481        old            woman     215  14.278359         1356         1047
49751    nikolay  vsyevolodovitch     198  14.041475          354          298
28920      great             deal     165  12.764221         1202          218
50655  

In [304]:


def pearson_chi_test(bigram_counts, unigrams, alpha=0.005):
    unigram_total = unigrams['counts'].sum()
    bigram_counts['expected'] = bigram_counts.apply(lambda row: 
                                        unigrams[unigrams['words'] == row['word1']]['counts'].values[0] * 
                                        unigrams[unigrams['words'] == row['word2']]['counts'].values[0] / 
                                        unigram_total, axis=1)
    
    bigram_counts['chi-square'] = np.power((bigram_counts['counts'] - bigram_counts['expected']), 2) / bigram_counts['expected']
    
        
    return bigram_counts



In [305]:
bigrams_size1_chi = pearson_chi_test(bigrams_size1_t_test, unigrams)
bigrams_size2_chi = pearson_chi_test(bigrams_size2_t_test, unigrams)

In [308]:
#reorder the bigrams by chi-square value
bigrams_size1_chi = bigrams_size1_chi.sort_values(by=['chi-square'], ascending=False)
bigrams_size2_chi = bigrams_size2_chi.sort_values(by=['chi-square'], ascending=False)


In [309]:
print('The top 20 bigrams in windows of size 1 are:')
print(bigrams_size1_chi.head(20))

The top 20 bigrams in windows of size 1 are:
           word1           word2  counts    t-score  word1_count  word2_count  \
13538     mihail     makarovitch      20   4.471726           22           21   
18163     rodion     romanovitch      80   8.940855           95           81   
11554        lef   nicolaievitch      33   5.743619           35           39   
22084     trifon    borissovitch      35   5.915033           40           39   
20396     stepan    trofimovitch     412  20.255550          430          502   
819      avdotya       romanovna      86   9.269403           92          107   
14711    nikodim         fomitch      19   4.358449           19           26   
7480     gavrila  ardalionovitch      49   6.998099           50           67   
10662    ippolit    kirillovitch      31   5.566789           38           36   
12244  lizabetha     prokofievna     136  11.652786          153          175   
18803     semyon    yakovlevitch      33   5.743437           44

In [310]:
print('The top 20 bigrams in windows of size 3 are:')
print(bigrams_size2_chi.head(20))


The top 20 bigrams in windows of size 3 are:
           word1           word2  counts    t-score  word1_count  word2_count  \
45464     mihail     makarovitch      20   4.471726           22           21   
62183     rodion     romanovitch      80   8.940855           95           81   
39460        lef   nicolaievitch      33   5.743619           35           39   
76729     trifon    borissovitch      35   5.915033           40           39   
70667     stepan    trofimovitch     412  20.255550          430          502   
3791     avdotya       romanovna      86   9.269403           92          107   
49655    nikodim         fomitch      19   4.358449           19           26   
26705    gavrila  ardalionovitch      49   6.998099           50           67   
36251    ippolit    kirillovitch      31   5.566789           38           36   
41331  lizabetha     prokofievna     136  11.652786          153          175   
82319       wisp             tow      14   3.741352           18

In [314]:
def likelihood_ratio_test(bigram_counts, unigrams, alpha=0.05):
    # Compute total counts
    unigram_total = unigrams['counts'].sum()
    bigram_total = bigram_counts['counts'].sum()

    for idx, row in bigram_counts.iterrows():
        # Extract counts and calculate probabilities for hypothesis tests
        c_12, c_1, c_2 = row['counts'], unigrams.loc[unigrams['words'] == row['word1'], 'counts'].values[0], unigrams.loc[unigrams['words'] == row['word2'], 'counts'].values[0]
        N = unigram_total
        H1_p = c_2 / N
        H2_p1 = c_12 / c_1
        H2_p2 = (c_2 - c_12) / (N - c_1)

        # Calculate the terms in the likelihood ratio test
        H1_term_1 = binom.pmf(c_12, c_1, H1_p)
        H1_term_2 = binom.pmf(c_2 - c_12, N - c_1, H1_p)
        H2_term_1 = binom.pmf(c_12, c_1, H2_p1)
        H2_term_2 = binom.pmf(c_2 - c_12, N - c_1, H2_p2)

        # Check for values of zero and use a small value instead
        H1_term_1 = H1_term_1 if H1_term_1 != 0 else math.ulp(0.0)
        H1_term_2 = H1_term_2 if H1_term_2 != 0 else math.ulp(0.0)
        H2_term_1 = H2_term_1 if H2_term_1 != 0 else math.ulp(0.0)
        H2_term_2 = H2_term_2 if H2_term_2 != 0 else math.ulp(0.0)

        # Calculate the likelihood ratio and the corresponding score
        log_L_H1 = math.log(H1_term_1) + math.log(H1_term_2)
        log_L_H2 = math.log(H2_term_1) + math.log(H2_term_2)
        log_likelihood_ratio = log_L_H1 - log_L_H2
        likelihood_ratio_score = -2 * log_likelihood_ratio

        # Store the likelihood ratio score in the bigram_counts dataframe
        bigram_counts.loc[idx, 'likelihood_ratio_score'] = likelihood_ratio_score

    return bigram_counts


In [315]:
bigrams_size1_likelihood=likelihood_ratio_test(bigrams_size1_chi, unigrams)
bigrams_size2_likelihood=likelihood_ratio_test(bigrams_size2_chi, unigrams)

In [316]:
#reorder according to likelihood ratio score
bigrams_size1_likelihood=bigrams_size1_likelihood.sort_values(by='likelihood_ratio_score', ascending=False)
bigrams_size2_likelihood=bigrams_size2_likelihood.sort_values(by='likelihood_ratio_score', ascending=False)

In [317]:
print('The top 20 bigrams in windows of size 1 are:')
print(bigrams_size1_likelihood.head(20))

The top 20 bigrams in windows of size 1 are:
           word1            word2  counts    t-score  word1_count  \
17254      pyotr     stepanovitch     427  20.596348          701   
20396     stepan     trofimovitch     412  20.255550          430   
23195    varvara         petrovna     331  18.152786          379   
14260   nastasia       philipovna     215  14.638661          362   
14724    nikolay  vsyevolodovitch     198  14.041475          354   
7435      fyodor       pavlovitch     246  15.654434          260   
12244  lizabetha      prokofievna     136  11.652786          153   
10934   katerina         ivanovna     229  15.092046          253   
8321       great             deal     164  12.724990         1202   
24450      yulia       mihailovna     105  10.238928          115   
13277    mavriky     nikolaevitch      96   9.792194          112   
819      avdotya        romanovna      86   9.269403           92   
18163     rodion      romanovitch      80   8.940855      

In [319]:
print('The top 20 bigrams in windows of size 3 are:')
print(bigrams_size2_likelihood.head(20))

The top 20 bigrams in windows of size 3 are:
           word1            word2  counts    t-score  word1_count  \
58985      pyotr     stepanovitch     427  20.596348          701   
70667     stepan     trofimovitch     412  20.255550          430   
79235    varvara         petrovna     331  18.152786          379   
48499   nastasia       philipovna     215  14.638661          362   
50655          o            clock     157  12.518405          194   
49751    nikolay  vsyevolodovitch     198  14.041475          354   
26366     fyodor       pavlovitch     246  15.654434          260   
41331  lizabetha      prokofievna     136  11.652786          153   
29888         ha               ha     144  11.980779          241   
37238   katerina         ivanovna     229  15.092046          253   
28920      great             deal     165  12.764221         1202   
84079      yulia       mihailovna     105  10.238928          115   
44546    mavriky     nikolaevitch      96   9.792194      

Part 3

In [320]:
bigrams_size1_likelihood[(bigrams_size1_likelihood['word1']=='head') & (bigrams_size1_likelihood['word2']=='clerk')]

Unnamed: 0,word1,word2,counts,t-score,word1_count,word2_count,expected,chi-square,likelihood_ratio_score
8917,head,clerk,22,4.598527,798,136,0.430995,1079.413746,134.120965


In [321]:
bigrams_size1_likelihood[(bigrams_size1_likelihood['word1']=='great') & (bigrams_size1_likelihood['word2']=='man')]

Unnamed: 0,word1,word2,counts,t-score,word1_count,word2_count,expected,chi-square,likelihood_ratio_score
8462,great,man,18,1.378086,1202,2546,12.153276,2.812755,2.488797


In [262]:
import dataframe_image as dfi
dfi.export(bigrams_size1_likelihood.head(),"mytableFinal.png")