# <center><font color = "green">SPELL CORRECTION</font></center>

In [None]:
### Table Content 

1. [Importing Libraries](#import)
2. [Reading Data](#read)
3. [Data Cleaning](#clean)
4. [String Functions](#function)
5. [Prediction](#predict)

# <font color = "green">Importing Libraries</font><a class = "anchor" id = "import"></a>

In [75]:
import re
from collections import Counter
import nltk
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords
import pandas as pd

In [2]:
df = pd.read_csv(r"C:\Users\CHAITANYA\Documents\NLP-PROJECTS\Correct+spelling+preiction+code+&+data\dataset\papers.csv")

# <font color = "green">Reading the Data </font><a class = "anchor" id = "read"></a>

In [3]:
df.head()

Unnamed: 0,id,year,title,event_type,pdf_name,abstract,paper_text
0,1,1987,Self-Organization of Associative Database and ...,,1-self-organization-of-associative-database-an...,Abstract Missing,767\n\nSELF-ORGANIZATION OF ASSOCIATIVE DATABA...
1,10,1987,A Mean Field Theory of Layer IV of Visual Cort...,,10-a-mean-field-theory-of-layer-iv-of-visual-c...,Abstract Missing,683\n\nA MEAN FIELD THEORY OF LAYER IV OF VISU...
2,100,1988,Storing Covariance by the Associative Long-Ter...,,100-storing-covariance-by-the-associative-long...,Abstract Missing,394\n\nSTORING COVARIANCE BY THE ASSOCIATIVE\n...
3,1000,1994,Bayesian Query Construction for Neural Network...,,1000-bayesian-query-construction-for-neural-ne...,Abstract Missing,Bayesian Query Construction for Neural\nNetwor...
4,1001,1994,"Neural Network Ensembles, Cross Validation, an...",,1001-neural-network-ensembles-cross-validation...,Abstract Missing,"Neural Network Ensembles, Cross\nValidation, a..."


In [4]:
df.shape

(7241, 7)

In [5]:
df.columns

Index(['id', 'year', 'title', 'event_type', 'pdf_name', 'abstract',
       'paper_text'],
      dtype='object')

In [6]:
df.isnull().sum()

id               0
year             0
title            0
event_type    4819
pdf_name         0
abstract         0
paper_text       0
dtype: int64

In [7]:
df.event_type.value_counts()

Poster       2146
Spotlight     181
Oral           95
Name: event_type, dtype: int64

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

sns.histplot(df.event_type.value_counts() , labels =['df.event_type.value_counts().index[0]','df.event_type.value_counts().index[1]','df.event_type.value_counts().index[2]'])

# <font color = "green">Data Cleaning</font><a class = "anchor" id = "clean"></a>

In [8]:
# nltk.download('stopwords')
# nltk.download('stopwords')
# nltk.download('wordnet')

In [9]:
stop_words = set(stopwords.words('english'))
stop_words

{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'only',
 'or',
 'other',
 'our',
 'ours',
 'ourselves',
 'out',
 'over',
 'own',
 'r

In [10]:
##Creating a list of custom stopwords

new_words = ["fig","figure","image","sample","using", 
             "show", "result", "large", 
             "also", "one", "two", "three", 
             "four", "five", "seven","eight","nine"]

In [11]:
stop_words = list(stop_words.union(new_words))

In [12]:
# 2.) remove tags

# &lt; = This part matches the HTML entity for the less-than sign ("<").
# /?   = This part matches zero or one occurrence of the forward slash ("/").
# .*?  = This part matches any character (represented by .) zero or more times (denoted by *), but the ? makes it non-greedy, so it will match as few characters as possible.
# &gt; = This part matches the HTML entity for the greater-than sign (">").

# 3.) remove special characters and digits

# \\d = This part of the pattern matches any digit (0-9).
# \\W = This part of the pattern matches any non-word character.
# +   = The plus sign means "one or more occurrences."

In [13]:
def pre_process(text):
    
    # 1.) lowercase
    text=text.lower()
    
    # 2.) remove tags
    text=re.sub("&lt;/?.*?&gt;"," &lt;&gt; ",text)
    
    # 3.) remove special characters and digits
    text=re.sub("(\\d|\\W)+"," ",text)
    
    # 4.) Convert to list from string
    text = text.split()
    
    # 5.) remove stopwords
    text = [word for word in text if word not in stop_words]

    # 6.) remove words less than three letters
    text = [word for word in text if len(word) >= 3]

    # 7.) lemmatize
    lemmatizer = WordNetLemmatizer()
    text = [lemmatizer.lemmatize(word) for word in text]
    
    return ' '.join(text)

In [14]:
df['paper_text']

0       767\n\nSELF-ORGANIZATION OF ASSOCIATIVE DATABA...
1       683\n\nA MEAN FIELD THEORY OF LAYER IV OF VISU...
2       394\n\nSTORING COVARIANCE BY THE ASSOCIATIVE\n...
3       Bayesian Query Construction for Neural\nNetwor...
4       Neural Network Ensembles, Cross\nValidation, a...
                              ...                        
7236    Single Transistor Learning Synapses\n\nPaul Ha...
7237    Bias, Variance and the Combination of\nLeast S...
7238    A Real Time Clustering CMOS\nNeural Engine\nT....
7239    Learning direction in global motion: two\nclas...
7240    Correlation and Interpolation Networks for\nRe...
Name: paper_text, Length: 7241, dtype: object

In [15]:
df['paper_text'].shape

(7241,)

In [16]:
df['paper_text'][0]

'767\n\nSELF-ORGANIZATION OF ASSOCIATIVE DATABASE\nAND ITS APPLICATIONS\nHisashi Suzuki and Suguru Arimoto\nOsaka University, Toyonaka, Osaka 560, Japan\nABSTRACT\nAn efficient method of self-organizing associative databases is proposed together with\napplications to robot eyesight systems. The proposed databases can associate any input\nwith some output. In the first half part of discussion, an algorithm of self-organization is\nproposed. From an aspect of hardware, it produces a new style of neural network. In the\nlatter half part, an applicability to handwritten letter recognition and that to an autonomous\nmobile robot system are demonstrated.\n\nINTRODUCTION\nLet a mapping f : X -+ Y be given. Here, X is a finite or infinite set, and Y is another\nfinite or infinite set. A learning machine observes any set of pairs (x, y) sampled randomly\nfrom X x Y. (X x Y means the Cartesian product of X and Y.) And, it computes some\nestimate j : X -+ Y of f to make small, the estimation erro

In [17]:
docs = df['paper_text'].iloc[:3000].apply(lambda x:pre_process(x))

In [18]:
docs.shape

(3000,)

In [20]:
sentences = docs.tolist()
len(sentences)

3000

In [21]:
sentences[:2]

['self organization associative database application hisashi suzuki suguru arimoto osaka university toyonaka osaka japan abstract efficient method self organizing associative database proposed together application robot eyesight system proposed database associate input output first half part discussion algorithm self organization proposed aspect hardware produce new style neural network latter half part applicability handwritten letter recognition autonomous mobile robot system demonstrated introduction let mapping given finite infinite set another finite infinite set learning machine observes set pair sampled randomly mean cartesian product computes estimate make small estimation error measure usually say faster decrease estimation error increase number sample better learning machine however expression performance incomplete since lack consideration candidate assumed preliminarily find good learning machine clarify conception let discus type learning machine let advance understanding 

In [22]:
# combines all sentences and storing in text_data

text_data = ' '.join(sentences)
text_data[:1000]

'self organization associative database application hisashi suzuki suguru arimoto osaka university toyonaka osaka japan abstract efficient method self organizing associative database proposed together application robot eyesight system proposed database associate input output first half part discussion algorithm self organization proposed aspect hardware produce new style neural network latter half part applicability handwritten letter recognition autonomous mobile robot system demonstrated introduction let mapping given finite infinite set another finite infinite set learning machine observes set pair sampled randomly mean cartesian product computes estimate make small estimation error measure usually say faster decrease estimation error increase number sample better learning machine however expression performance incomplete since lack consideration candidate assumed preliminarily find good learning machine clarify conception let discus type learning machine let advance understanding s

In [23]:
# \$\\w = dollar($) + word => ex: $A,$variable or $123, but not $@special since @ is not a word character.

# ^RT[\\s]+ = This regular expression is used to match text that starts with "RT" followed by one or more whitespace characters.
# ^         = at the beginning of the pattern indicates that the match should occur at the start of a line.

# 'https?:\\/\\/.*[\\r\\n]*'= This regular expression appears to be designed to match URLs that start with "http://" or "https://" and
#                             continue until the end of the line or until a newline character. 

#  \w = All letters , 0-9 , _

In [24]:
def process_tweet(tweet):
    
      tweet = re.sub(r'\$\w*', '', tweet) # $
      tweet = re.sub(r'^RT[\s]+', '', tweet) # RT
      tweet = re.sub(r'https?:\/\/.*[\r\n]*', '', tweet) # urls
      tweet = re.sub(r'#', '', tweet)
      return tweet    
    
def misc(file_name):
    words = []
    file_name = process_tweet(file_name)
    words = re.findall(r'\w+', file_name)
    return words


In [25]:
words = misc(text_data)
words

['self',
 'organization',
 'associative',
 'database',
 'application',
 'hisashi',
 'suzuki',
 'suguru',
 'arimoto',
 'osaka',
 'university',
 'toyonaka',
 'osaka',
 'japan',
 'abstract',
 'efficient',
 'method',
 'self',
 'organizing',
 'associative',
 'database',
 'proposed',
 'together',
 'application',
 'robot',
 'eyesight',
 'system',
 'proposed',
 'database',
 'associate',
 'input',
 'output',
 'first',
 'half',
 'part',
 'discussion',
 'algorithm',
 'self',
 'organization',
 'proposed',
 'aspect',
 'hardware',
 'produce',
 'new',
 'style',
 'neural',
 'network',
 'latter',
 'half',
 'part',
 'applicability',
 'handwritten',
 'letter',
 'recognition',
 'autonomous',
 'mobile',
 'robot',
 'system',
 'demonstrated',
 'introduction',
 'let',
 'mapping',
 'given',
 'finite',
 'infinite',
 'set',
 'another',
 'finite',
 'infinite',
 'set',
 'learning',
 'machine',
 'observes',
 'set',
 'pair',
 'sampled',
 'randomly',
 'mean',
 'cartesian',
 'product',
 'computes',
 'estimate',
 'make

In [28]:
vocab = set(words)
print(f"There are {len(vocab)} unique words in the vocabulary.")

There are 90669 unique words in the vocabulary.


In [27]:
 #words = re.findall(r'\w+', text_data)
# print(len(words))
# vocab = set(words)
# print(len(vocab))

5482203


In [29]:
def get_count(word_l):
    """
    Input:
        word_l: a set of words representing the corpus.
    Output:
        word_count_dict: The wordcount dictionary where key is the word and value is its frequency.
    """
    word_count_dict = {}
    word_count_dict = Counter(word_l)

    return word_count_dict

In [30]:
word_count_dict = get_count(words)

In [31]:
word_count_dict['infinite']

1484

In [32]:
word_count_dict['self']

1608

In [34]:
for word in words:
    print(words : word_count_dict[word])
    


SyntaxError: invalid syntax (2097997549.py, line 2)

In [33]:
word_count_dict.values()

dict_values([1608, 708, 680, 1864, 5620, 6, 22, 1, 8, 10, 5207, 1, 287, 3592, 3172, 21670, 343, 4547, 1603, 1885, 3, 16767, 345, 18959, 11032, 13657, 1103, 4764, 2497, 35358, 1199, 581, 2697, 9601, 553, 15291, 26341, 1075, 216, 522, 1270, 5618, 246, 281, 1049, 3490, 8605, 2888, 16863, 3032, 1484, 32799, 2767, 33856, 7859, 99, 4414, 1377, 2283, 11553, 123, 3482, 652, 8894, 5293, 6890, 5229, 16202, 5616, 1605, 1038, 1247, 1719, 3460, 20103, 4625, 4372, 9717, 2189, 11208, 350, 9168, 561, 503, 1131, 1842, 2, 5093, 3668, 78, 15, 1111, 4670, 2650, 800, 17717, 350, 682, 7183, 196, 106, 816, 8372, 2489, 4041, 451, 2866, 20416, 14647, 4128, 1881, 6128, 8241, 14011, 1742, 15395, 928, 2, 8289, 23258, 218, 1483, 449, 1538, 3522, 15, 789, 6794, 1582, 2192, 6679, 190, 2009, 4811, 1472, 223, 26, 689, 58, 205, 2127, 24, 1379, 1543, 2815, 1879, 60, 1375, 566, 2064, 8632, 4883, 4999, 1, 674, 2277, 2886, 863, 1948, 5166, 7493, 5884, 32, 53, 2185, 437, 1544, 648, 2183, 2486, 430, 492, 8036, 919, 394, 1408

In [34]:
word_count_dict.keys()



In [35]:
def get_probs(word_count_dict):
    """
    Input:
        word_count_dict: The wordcount dictionary where key is the word and value is its frequency.
    Output:
        probs: A dictionary where keys are the words and the values are the probability that a word will occur.
    """
    probs = {}  # return this variable correctly
    m = sum(word_count_dict.values())
    for key in word_count_dict.keys():
        probs[key] = word_count_dict.get(key, 0) / m # gives count / m if key not found in dictionary then return 0

    return probs

In [36]:
probs = get_probs(word_count_dict)

# <font color = "green">String Functions </font><a class = "anchor" id = "function"></a>

In [37]:
# Part 2: String Manipulation

In [39]:
# delete_letter()
def delete_letter(word, verbose=False):
    delete_l = []
    split_l = []

    split_l = [(word[:i], word[i:]) for i in range(len(word))]
    delete_l = [L + R[1:] for L, R in split_l if R]

    if verbose:
        print(f"input word {word}, \nsplit_l = {split_l}, \ndelete_l = {delete_l}")# printing implicitly.

    return delete_l

In [40]:
# checking the function
print(delete_letter(word="cans", verbose=True))

input word cans, 
split_l = [('', 'cans'), ('c', 'ans'), ('ca', 'ns'), ('can', 's')], 
delete_l = ['ans', 'cns', 'cas', 'can']
['ans', 'cns', 'cas', 'can']


In [41]:
# switch_letter()
def switch_letter(word, verbose=False):
    def swap(c, i, j):
        c = list(c)
        c[i], c[j] = c[j], c[i]
        return ''.join(c)

    switch_l = []
    split_l = []
    split_l = [(word[:i], word[i:]) for i in range(len(word))]
    switch_l = [a + b[1] + b[0] + b[2:] for a, b in split_l if len(b) >= 2]

    if verbose:
        print(f"Input word = {word} \nsplit_l = {split_l} \nswitch_l = {switch_l}")

    return switch_l

In [44]:
print(switch_letter(word="eta", verbose=True))

Input word = eta 
split_l = [('', 'eta'), ('e', 'ta'), ('et', 'a')] 
switch_l = ['tea', 'eat']
['tea', 'eat']


In [49]:
# replace_letter()
def replace_letter(word, verbose=False):
    letters = 'abcdefghijklmnopqrstuvwxyz'
    replace_l = []
    split_l = []

    split_l = [(word[:i], word[i:]) for i in range(len(word))]
    replace_l = [a + l + (b[1:] if len(b) > 1 else '') for a, b in split_l if b for l in letters]
    replace_set = set(replace_l)
    replace_set.remove(word)
    # turn the set back into a list and sort it, for easier viewing
    # replace_l = sorted(list(replace_set))

    if verbose:
        print(f"Input word = {word} \nsplit_l = {split_l} \nreplace_l {replace_l}")

    return replace_l

In [50]:
print(replace_letter(word='can', verbose=True))

Input word = can 
split_l = [('', 'can'), ('c', 'an'), ('ca', 'n')] 
replace_l ['aan', 'ban', 'can', 'dan', 'ean', 'fan', 'gan', 'han', 'ian', 'jan', 'kan', 'lan', 'man', 'nan', 'oan', 'pan', 'qan', 'ran', 'san', 'tan', 'uan', 'van', 'wan', 'xan', 'yan', 'zan', 'can', 'cbn', 'ccn', 'cdn', 'cen', 'cfn', 'cgn', 'chn', 'cin', 'cjn', 'ckn', 'cln', 'cmn', 'cnn', 'con', 'cpn', 'cqn', 'crn', 'csn', 'ctn', 'cun', 'cvn', 'cwn', 'cxn', 'cyn', 'czn', 'caa', 'cab', 'cac', 'cad', 'cae', 'caf', 'cag', 'cah', 'cai', 'caj', 'cak', 'cal', 'cam', 'can', 'cao', 'cap', 'caq', 'car', 'cas', 'cat', 'cau', 'cav', 'caw', 'cax', 'cay', 'caz']
['aan', 'ban', 'can', 'dan', 'ean', 'fan', 'gan', 'han', 'ian', 'jan', 'kan', 'lan', 'man', 'nan', 'oan', 'pan', 'qan', 'ran', 'san', 'tan', 'uan', 'van', 'wan', 'xan', 'yan', 'zan', 'can', 'cbn', 'ccn', 'cdn', 'cen', 'cfn', 'cgn', 'chn', 'cin', 'cjn', 'ckn', 'cln', 'cmn', 'cnn', 'con', 'cpn', 'cqn', 'crn', 'csn', 'ctn', 'cun', 'cvn', 'cwn', 'cxn', 'cyn', 'czn', 'caa', 'c

In [52]:
#  insert_letter()
def insert_letter(word, verbose=False):
    letters = 'abcdefghijklmnopqrstuvwxyz'
    insert_l = []
    split_l = []
    split_l = [(word[:i], word[i:]) for i in range(len(word))]
    insert_l = [ a + l + b for a, b in split_l for l in letters]

    if verbose:
        print(f"Input word {word} \nsplit_l = {split_l} \ninsert_l = {insert_l}")

    return insert_l


In [55]:
print(insert_letter(word='at', verbose=True))

Input word at 
split_l = [('', 'at'), ('a', 't')] 
insert_l = ['aat', 'bat', 'cat', 'dat', 'eat', 'fat', 'gat', 'hat', 'iat', 'jat', 'kat', 'lat', 'mat', 'nat', 'oat', 'pat', 'qat', 'rat', 'sat', 'tat', 'uat', 'vat', 'wat', 'xat', 'yat', 'zat', 'aat', 'abt', 'act', 'adt', 'aet', 'aft', 'agt', 'aht', 'ait', 'ajt', 'akt', 'alt', 'amt', 'ant', 'aot', 'apt', 'aqt', 'art', 'ast', 'att', 'aut', 'avt', 'awt', 'axt', 'ayt', 'azt']
['aat', 'bat', 'cat', 'dat', 'eat', 'fat', 'gat', 'hat', 'iat', 'jat', 'kat', 'lat', 'mat', 'nat', 'oat', 'pat', 'qat', 'rat', 'sat', 'tat', 'uat', 'vat', 'wat', 'xat', 'yat', 'zat', 'aat', 'abt', 'act', 'adt', 'aet', 'aft', 'agt', 'aht', 'ait', 'ajt', 'akt', 'alt', 'amt', 'ant', 'aot', 'apt', 'aqt', 'art', 'ast', 'att', 'aut', 'avt', 'awt', 'axt', 'ayt', 'azt']


In [56]:
# Combining the edits:
# Now that you have implemented the string manipulations, you will create two functions that,
#  given a string, will return all the possible single and double edits on that string. These will
#  be edit_one_letter() and edit_two_letters().

In [58]:
#  Edit one letter
def edit_one_letter(word, allow_switches=True): # allow_switches=True => allowing or not ?

    edit_one_set = set()
    edit_one_set.update(delete_letter(word))
    if allow_switches:
        edit_one_set.update(switch_letter(word))
    edit_one_set.update(replace_letter(word))
    edit_one_set.update(insert_letter(word))

    return edit_one_set

1.) You first call the edit_one_letter function on the original word to obtain a set of words that are one edit distance away from the original word. This set is stored in edit_one.

2.) Then, you iterate through the words in the edit_one set, and for each word w in that set, you call the edit_one_letter function again. This generates a set of words that are one edit distance away from the word w.

3.) You update the edit_two_set with the words generated in step 2.

In [59]:
# Edit two letters
def edit_two_letters(word, allow_switches=True):

    edit_two_set = set()
    edit_one = edit_one_letter(word, allow_switches=allow_switches)
    for w in edit_one:
        if w:
            edit_two = edit_one_letter(w, allow_switches=allow_switches)
            edit_two_set.update(edit_two)

    return edit_two_set


In [67]:
edit_one_letter('cat')

{'aat',
 'acat',
 'act',
 'at',
 'bat',
 'bcat',
 'ca',
 'caa',
 'caat',
 'cab',
 'cabt',
 'cac',
 'cact',
 'cad',
 'cadt',
 'cae',
 'caet',
 'caf',
 'caft',
 'cag',
 'cagt',
 'cah',
 'caht',
 'cai',
 'cait',
 'caj',
 'cajt',
 'cak',
 'cakt',
 'cal',
 'calt',
 'cam',
 'camt',
 'can',
 'cant',
 'cao',
 'caot',
 'cap',
 'capt',
 'caq',
 'caqt',
 'car',
 'cart',
 'cas',
 'cast',
 'cat',
 'catt',
 'cau',
 'caut',
 'cav',
 'cavt',
 'caw',
 'cawt',
 'cax',
 'caxt',
 'cay',
 'cayt',
 'caz',
 'cazt',
 'cbat',
 'cbt',
 'ccat',
 'cct',
 'cdat',
 'cdt',
 'ceat',
 'cet',
 'cfat',
 'cft',
 'cgat',
 'cgt',
 'chat',
 'cht',
 'ciat',
 'cit',
 'cjat',
 'cjt',
 'ckat',
 'ckt',
 'clat',
 'clt',
 'cmat',
 'cmt',
 'cnat',
 'cnt',
 'coat',
 'cot',
 'cpat',
 'cpt',
 'cqat',
 'cqt',
 'crat',
 'crt',
 'csat',
 'cst',
 'ct',
 'cta',
 'ctat',
 'ctt',
 'cuat',
 'cut',
 'cvat',
 'cvt',
 'cwat',
 'cwt',
 'cxat',
 'cxt',
 'cyat',
 'cyt',
 'czat',
 'czt',
 'dat',
 'dcat',
 'eat',
 'ecat',
 'fat',
 'fcat',
 'gat',
 'g

In [64]:
# proposed
edit_two_letters('cat')

{'clzat',
 'zcal',
 'yzt',
 'yawt',
 'ccak',
 'xkt',
 'has',
 'vwt',
 'cydat',
 'cgau',
 'cyad',
 'ncrat',
 'zet',
 'cgac',
 'ctha',
 'cqqat',
 'nht',
 'ccab',
 'cfay',
 'yfcat',
 'ecax',
 'cpft',
 'wcgat',
 'ftat',
 'caszt',
 'aot',
 'yam',
 'iart',
 'lkcat',
 'cdtt',
 'cmbt',
 'cdst',
 'naht',
 'vcyt',
 'cqao',
 'wcaet',
 'pcav',
 'nccat',
 'catet',
 'cthat',
 'vtcat',
 'sctat',
 'ccast',
 'cxaut',
 'cddt',
 'hcaat',
 'gag',
 'cqaet',
 'ceapt',
 'ecaqt',
 'ekt',
 'onat',
 'iuat',
 'fxat',
 'yact',
 'ceak',
 'jak',
 'aadt',
 'caks',
 'exat',
 'agcat',
 'chan',
 'moat',
 'akcat',
 'lgat',
 'qcgat',
 'rcaet',
 'jwt',
 'mcfat',
 'cdas',
 'ao',
 'clbat',
 'vxcat',
 'cvhat',
 'rca',
 'caxr',
 'gcht',
 'icad',
 'cadmt',
 'rcbt',
 'fajt',
 'fcft',
 'cnar',
 'cagk',
 'acvt',
 'nabt',
 'oadt',
 'cpaqt',
 'ocac',
 'bhcat',
 'bap',
 'cabi',
 'cacs',
 'cbzat',
 'cdamt',
 'ycpt',
 'jyat',
 'uakt',
 'icjt',
 'czyt',
 'vapt',
 'oa',
 'cmaut',
 'ktat',
 'ncit',
 'craqt',
 'xpt',
 'lcbat',
 'mau',
 'a

# <font color = "green">Prediction</font><a class = "anchor" id = "predict"></a>

In [68]:
# suggest spelling suggestions
def get_corrections(word, probs, vocab, verbose=False):
    """
    Input:
        word: a user entered string to check for suggestions
        probs: a dictionary that maps each word to its probability in the corpus
        vocab: a set containing all the vocabulary
        n: number of possible word corrections you want returned in the dictionary
    Output:
        n_best: a list of tuples with the most probable n corrected words and their probabilities.
    """

    suggestions = []
    n_best = []
    #suggestions = list((word in vocab) or edit_one_letter(word).intersection(vocab) or
    #                   edit_two_letters(word).intersection(vocab))
    suggestions = list(edit_two_letters(word).intersection(vocab))
    # suggestions = list(edit_two_letters(word, False).intersection(vocab))
    n_best = [[s, probs.get(s, -1)] for s in list(reversed(suggestions))]
    n_best.sort(key=lambda x: x[1], reverse=True)

    if verbose:
        print("suggestions = ", suggestions)

    return n_best

In [72]:
# Testing
my_word = 'cast'
tmp_corrections = get_corrections(my_word, probs, vocab, verbose=False)
for i, word_prob in enumerate(tmp_corrections):
    print(f"word {i}: {word_prob[0]}, probability {word_prob[1]:.6f}")

word 0: case, probability 0.002808
word 1: class, probability 0.002200
word 2: test, probability 0.001770
word 3: task, probability 0.001724
word 4: cost, probability 0.000984
word 5: part, probability 0.000869
word 6: best, probability 0.000862
word 7: must, probability 0.000685
word 8: fact, probability 0.000657
word 9: least, probability 0.000623
word 10: fast, probability 0.000400
word 11: base, probability 0.000355
word 12: call, probability 0.000343
word 13: last, probability 0.000323
word 14: cause, probability 0.000259
word 15: easy, probability 0.000251
word 16: cut, probability 0.000232
word 17: pas, probability 0.000177
word 18: want, probability 0.000174
word 19: list, probability 0.000163
word 20: mass, probability 0.000157
word 21: rest, probability 0.000153
word 22: past, probability 0.000141
word 23: art, probability 0.000136
word 24: post, probability 0.000132
word 25: act, probability 0.000126
word 26: car, probability 0.000111
word 27: cat, probability 0.000084
word 

# SAVING THE FILE

In [70]:
import pickle

with open('word-probability-spellings.pkl' , 'wb') as file1:
    pickle.dump(probs,file1)

In [71]:
with open('vocab-spellings.pkl' , 'wb') as file2:
    pickle.dump(vocab,file2)

In [None]:
# Saving the dictionary and vocabulary:

# import pickle

# pickle.dump(probs, open('drive/MyDrive/Dataset/DataScience-Pianalytix-Models/word-probability-spellings.pkl', 'wb'))
# pickle.dump(vocab, open('drive/MyDrive/Dataset/DataScience-Pianalytix-Models/vocab-spellings.pkl', 'wb'))