In [1]:
# Module for predicting top n best possible word matches given a word

# Context:
# A -> True word
# B -> Typed word

# Make dictionary, key (true word), value (list of incorrect words)
# tru_in dict is A -> [B1,B2,...,Bn]
# True word -> list of typed words; starts empty and is filled by add_to_dict.
tru_in = {}

# Locations of the three data files that are loaded into tru_in.
missp_dat_path = "/Users/daveistanto/Documents/project_normal/advanced_functions/predictive_search/dataset/missp.dat"
wiki_dat_path = "/Users/daveistanto/Documents/project_normal/advanced_functions/predictive_search/dataset/wikipedia.dat"
aspell_dat_path = "/Users/daveistanto/Documents/project_normal/advanced_functions/predictive_search/dataset/aspell.dat"

def add_to_dict(data_path, input_dict):
    """Merge one misspelling data file into ``input_dict`` and return it.

    The file is read line by line. A line starting with ``$`` introduces a
    true word (the ``$`` is dropped); every following line is recorded as a
    typed form of that word until the next ``$`` line. All text is
    upper-cased and blank lines are ignored.

    Args:
        data_path: Path of the text file to read.
        input_dict: Dict mapping true word -> list of typed words. It is
            updated in place, so entries from earlier files are kept.

    Returns:
        The same ``input_dict`` object, with every value list de-duplicated
        (first-seen order is kept).

    Raises:
        ValueError: If a typed-word line appears before any ``$`` line.
    """
    curr_key = None
    with open(data_path, 'r') as df:
        for line in df:
            # Strip only the line terminator: slicing off the last character
            # would truncate a final line that has no trailing newline.
            lns = line.rstrip("\r\n").upper()
            if not lns:
                continue
            if lns[0] == "$":
                curr_key = lns[1:]
                # Keep the existing list when the word was already loaded.
                input_dict.setdefault(curr_key, [])
            elif curr_key is None:
                raise ValueError(
                    "typed word %r in %s appears before any '$' true-word line"
                    % (lns, data_path)
                )
            else:
                input_dict[curr_key].append(lns)

    # De-duplicate the dict that was passed in, not the module-level tru_in,
    # so the function works for any dict the caller supplies.
    for k, v in input_dict.items():
        input_dict[k] = list(dict.fromkeys(v))
    return input_dict

# Load the three data files into tru_in, in the same order as before.
for _dat_path in (missp_dat_path, wiki_dat_path, aspell_dat_path):
    tru_in = add_to_dict(_dat_path, tru_in)

In [2]:
# Get the P(B|A) distribution for a specific B. The values do not sum to 1 because each one is conditioned on a different A.

def get_P_BA(B):
    """Return P(B|A) for the typed word ``B`` under every true word A.

    For each true word A in the module-level ``tru_in`` mapping, the value is
    the share of A's recorded typed forms that equal ``B``.

    Args:
        B: The typed word; it is upper-cased before comparison.

    Returns:
        Dict mapping every key of ``tru_in`` to a float. A true word with no
        recorded typed forms gets 0.0.
    """
    PBA_dict = dict()
    B = B.upper()
    for k, v in tru_in.items():
        if v:
            PBA_dict[k] = v.count(B) / len(v)
        else:
            # An empty list would otherwise raise ZeroDivisionError.
            PBA_dict[k] = 0.0

    return PBA_dict

In [8]:
# Get P(A) distribution

import pandas as pd
import numpy as np
word_freq_path = "/Users/daveistanto/Documents/project_normal/advanced_functions/predictive_search/dataset/top_5k_words.csv"
word_freq_df = pd.read_csv(word_freq_path)

def normalize_word(input_word):
    """Upper-case ``input_word`` and drop its first three characters."""
    # NOTE(review): the [3:] slice presumably strips a fixed 3-character
    # prefix present in the CSV's "Word" column -- confirm against the file.
    input_word = input_word.upper()[3:]
    return input_word

# Element-wise version, so it can be applied to a whole array of words.
normalize_word = np.vectorize(normalize_word)

# The vectorized callable already maps over the array, so no
# np.apply_along_axis wrapper is needed.
word_freq_df["Normalized word"] = normalize_word(word_freq_df["Word"].values)
word_freq_df = word_freq_df.drop(columns=["Word"])

# Add dummy frequency of 1000 if word not found.
# Rows are collected first and concatenated once: DataFrame.append was removed
# in pandas 2.0, and appending row by row copies the whole frame each time.
known_words = set(word_freq_df["Normalized word"].values)
missing_rows = []
for k in tru_in.keys():
    if k.upper() not in known_words:
        missing_rows.append({"Frequency": 1000, "Normalized word": k})
        # Mirror the original loop, where an appended word became visible to
        # the membership check of later keys.
        known_words.add(k)
if missing_rows:
    word_freq_df = pd.concat(
        [word_freq_df, pd.DataFrame(missing_rows)], ignore_index=True
    )

# Add relative frequency (P(A)) for each A.
# Columns are addressed by name rather than by position, and the division is
# done for the whole column at once.
word_sum = word_freq_df["Frequency"].sum()
word_freq_df["Relative Frequency"] = word_freq_df["Frequency"] / word_sum

# Sort word_freq_df by frequency
word_freq_df = word_freq_df.sort_values(by="Frequency", ascending=False)


FIRE


In [23]:
# Get P(A) given specific A:

def get_PA(A):
    """Look up P(A), the relative frequency stored for the word ``A``.

    Reads the module-level ``word_freq_df`` and returns the
    "Relative Frequency" of the first row whose "Normalized word" equals
    ``A``. Raises IndexError when no row matches.
    """
    is_match = word_freq_df["Normalized word"] == A
    matching_freqs = word_freq_df.loc[is_match, "Relative Frequency"]
    return matching_freqs.values[0]
    

In [42]:
# Get relative (unnormalized) P(A|B), i.e. P(B|A) * P(A) without dividing by P(B)

def get_PAB(B):
    """Rank candidate true words for the typed word ``B``.

    Every true word A with a non-zero P(B|A) is scored with P(B|A) * P(A).
    The upper-cased typed word itself is always added with a score of 1.

    Returns:
        List of ``(word, score)`` tuples sorted by score, highest first.
    """
    typed = B.upper()
    likelihoods = get_P_BA(typed)

    # Keep only the true words for which B was actually recorded.
    scored = [
        (word, p_ba * get_PA(word))
        for word, p_ba in likelihoods.items()
        if p_ba != 0
    ]
    scored.append((typed, 1))

    # Highest score first; ties keep their insertion order.
    return sorted(scored, key=lambda pair: pair[1], reverse=True)

# Demo: print the ranked (word, score) candidates for the typed word "are".
print(get_PAB("are"))


[('ARE', 1), ('A', 0.004335377292933662), ('AND', 0.0032133261164172633), ('I', 0.0009917897109013784), ('OR', 0.000515801002713679), ('AREA', 0.00024802365183412275), ('AIR', 7.922720154781406e-05), ('OUR', 4.909134884072801e-05), ('EYE', 1.94628210576491e-05), ('FIRE', 4.935017633029245e-06), ('ERE', 6.79914744173913e-08)]
