In [2]:
import pandas as pd
from collections import Counter
# this imports the SimpleTokenize function from the simple_tokenize.py file that you uploaded
from simple_tokenize import simple_tokenize
# the log function for computing PMI
# for the sake of consistency across solutions, please use log base 10
from math import log10

# Tokenize Shakespeare's plays and gather line-level co-occurrence counts.
# Within a single line every token -- and every ordered pair of distinct
# tokens -- is counted at most once, however often it is repeated there.
words = Counter()    # token -> number of lines that contain it
tpairs = Counter()   # (x, y) -> number of lines that contain both x and y
numberlines = 0      # total number of lines read from the corpus
with open('Shakespeare.txt') as f:
    for line in f:
        numberlines += 1
        # de-duplicate so a token repeated on one line is only counted once
        line_tokens = set(simple_tokenize(line))
        words.update(line_tokens)
        # Feed the pairs straight into the counter; collecting every pair of
        # the whole corpus in an intermediate list first only costs memory.
        # Both (x, y) and (y, x) are counted; a word is never paired with itself.
        tpairs.update(
            (x, y) for x in line_tokens for y in line_tokens if x != y
        )

# Tabulate the pair counts for the query code below:
#   column 0   -> the (x, y) tuple
#   column 1   -> number of lines on which x and y co-occur
#   column 'a' -> x (first token of the pair)
#   column 'b' -> y (second token of the pair)
# Building the columns explicitly also works when no pair was found at all.
df = pd.DataFrame({
    0: list(tpairs.keys()),
    1: list(tpairs.values()),
    'a': [x for x, _ in tpairs],
    'b': [y for _, y in tpairs],
})
###################################################################################################################
#  the user interface below defines the types of PMI queries that users can ask
#  you will need to modify it - where indicated - to access the results of your PMI analysis (above)
#  so that the queries can be answered
###################################################################################################################

def _pmi(n_xy, n_x, n_y, n_lines):
    """Return the base-10 pointwise mutual information of a token pair.

    n_xy    -- number of lines on which both tokens occur
    n_x     -- number of lines containing the first token
    n_y     -- number of lines containing the second token
    n_lines -- total number of lines in the corpus

    A pair that never co-occurs has probability 0, whose logarithm is not
    defined; -inf is returned in that case instead of letting log10 raise.
    """
    if n_xy == 0:
        return float("-inf")
    return log10((n_xy / n_lines) / ((n_x / n_lines) * (n_y / n_lines)))


# Interactive query loop: an empty input line ends the session.
while True:
    q = input("Input 1 or 2 space-separated tokens (return to quit): ")
    if not q:
        break
    q_tokens = simple_tokenize(q)

    if len(q_tokens) == 1:
        # One-Token Query: the five co-occurring tokens with the highest PMI
        # among the pairs seen on at least `threshold` lines.
        token = q_tokens[0]
        if words[token] == 0:
            print("Word not found!")
            continue

        threshold = 0
        while threshold <= 0:
            try:
                threshold = int(input("Input a positive integer frequency threshold: "))
            except ValueError:
                threshold = 0
            # explain the re-prompt for non-numeric AND non-positive input
            if threshold <= 0:
                print("Threshold must be a positive integer!")

        # Pairs that start with the queried token and meet the threshold.
        # .copy() gives an independent frame, so adding the 'pmi' column
        # below never writes into a view of df.
        matches = df.loc[(df['a'] == token) & (df[1] >= threshold)].copy()
        if len(matches) < 5:
            print("Not enough pairs, select lower threshold!")
            continue

        # Assign the whole column at once (no chained, row-by-row assignment).
        matches['pmi'] = [
            _pmi(n_xy, words[token], words[other], numberlines)
            for other, n_xy in zip(matches['b'], matches[1])
        ]
        top = matches.sort_values('pmi', ascending=False).head(5)

        print("  n({0}) = {1}".format(token, words[token]))
        print("  high PMI tokens with respect to {0} (threshold: {1}):".format(token, threshold))
        # every co-occurrence count printed here is at least `threshold`
        for other, n_xy, pmi in zip(top['b'], top[1], top['pmi']):
            print("    n({0},{1}) = {2},  PMI({0},{1}) = {3}".format(token, other, n_xy, pmi))

    elif len(q_tokens) == 2:
        # Two-Token Query: co-occurrence count and PMI of the given pair.
        first, second = q_tokens
        if first == second:
            print("Choose different words!")
            continue
        if words[first] == 0 or words[second] == 0:
            print("Word not found!")
            continue
        # Looking up a missing key in a Counter yields 0 rather than raising.
        n_xy = tpairs[(first, second)]
        pmi = _pmi(n_xy, words[first], words[second], numberlines)
        print("  n({0},{1}) = {2}".format(first, second, n_xy))
        print("  PMI({0},{1}) = {2}".format(first, second, pmi))
    else:
        print("Input must consist of 1 or 2 space-separated tokens!")


  n(love) = 2020
  high PMI tokens with respect to love (threshold: 40):
    n(love,true) = 48,  PMI(love,true) = 0.5607870431406314
    n(love,thee) = 126,  PMI(love,thee) = 0.3989913752069878
    n(love,her) = 137,  PMI(love,her) = 0.3810340266819545
    n(love,do) = 130,  PMI(love,do) = 0.3371513453043346
    n(love,if) = 122,  PMI(love,if) = 0.33130242906434526
  n(love,hate) = 34
  PMI(love,hate) = 1.0735654615166432
