<a href="https://colab.research.google.com/github/AhmedOsama45/CODXO/blob/main/AutoCorrect_tool.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Autocorrect tool trained on the text of _Romeo and Juliet_, the play by William Shakespeare**


In [1]:
import re #for regular expressions
import string #for strings
from collections import Counter #specialized container datatypes
import numpy as np #for numerical computations

In [6]:
def read_corpus(filename):
  """Tokenize a text file into a flat list of lowercase words.

  Each line is lowercased and every maximal run of ``\\w`` characters
  (letters, digits, underscore) becomes one token; tokens keep their order
  of appearance and duplicates are preserved.

  Args:
    filename: path of the file to read. It is opened in text mode with the
      platform's default encoding.

  Returns:
    list[str]: all tokens found in the file.
  """
  tokens = []
  with open(filename, "r") as handle:
    for raw_line in handle.readlines():
      # Lowercase first so the corpus is case-insensitive.
      tokens.extend(re.findall(r'\w+', raw_line.lower()))
  return tokens

In [22]:
# Tokenize the corpus file into a flat list of lowercase word tokens.
# NOTE(review): the path is an absolute Colab location; the file must be uploaded there before this cell runs.
words = read_corpus('/content/romeo and juliet.html')
print(f"There are {len(words)} total words in the corpus") # total token count, duplicates included

There are 39554 total words in the corpus


In [23]:
vocabs = set(words) # vocabulary: the distinct tokens of the corpus, used for fast membership tests
print(f"There are {len(vocabs)} unique words in the vocabulary") # size of the vocabulary

There are 4170 unique words in the vocabulary


In [24]:
word_counts = Counter(words) # maps each distinct token to the number of times it occurs in the corpus
print(word_counts["romeo"]) # sanity check: occurrences of 'romeo'

324


In [25]:
# Relative frequency of every distinct word: its count divided by the corpus size.
total_word_count = float(sum(word_counts.values())) # corpus size as a float
word_probas = {token: occurrences / total_word_count for token, occurrences in word_counts.items()}

In [26]:
print(word_probas["romeo"]) # relative frequency of 'romeo' (its count divided by the total word count)

0.008191333367042523


In [27]:
def split(word):
  """Return every (prefix, suffix) cut of ``word``.

  The cuts run from ``('', word)`` to ``(word, '')``, so a word of length
  ``n`` yields ``n + 1`` pairs, in order of increasing prefix length.
  """
  pairs = []
  for cut in range(len(word) + 1):
    pairs.append((word[:cut], word[cut:]))
  return pairs

In [28]:
print(split("Juliet")) # show every (prefix, suffix) cut of 'Juliet'

[('', 'Juliet'), ('J', 'uliet'), ('Ju', 'liet'), ('Jul', 'iet'), ('Juli', 'et'), ('Julie', 't'), ('Juliet', '')]


In [29]:
def delete(word):
  """Return the strings obtained by removing exactly one character from ``word``.

  Results are ordered by the position of the removed character; an empty
  word yields an empty list.
  """
  shortened = []
  for head, tail in split(word):
    if tail: # the final cut has an empty suffix, so there is nothing to drop
      shortened.append(head + tail[1:])
  return shortened

In [30]:
print(delete("juliet")) # show every string obtained by deleting one character from 'juliet'

['uliet', 'jliet', 'juiet', 'julet', 'julit', 'julie']


In [33]:
def swap(word):
  """Return the strings obtained by transposing one pair of adjacent characters.

  Results are ordered by the position of the swapped pair; words shorter
  than two characters yield an empty list.
  """
  transposed = []
  for head, tail in split(word):
    if len(tail) < 2:
      continue # fewer than two characters remain, so there is no pair to swap
    transposed.append(head + tail[1] + tail[0] + tail[2:])
  return transposed

In [34]:
print(swap("juliet")) # show every string obtained by swapping two adjacent characters of 'juliet'

['ujliet', 'jluiet', 'juilet', 'juleit', 'julite']


In [35]:
string.ascii_lowercase # the 26 lowercase ASCII letters; the alphabet used by replace() and insert() below

'abcdefghijklmnopqrstuvwxyz'

In [36]:
def replace(word):
  """Return the strings obtained by substituting one character of ``word``.

  Every position is replaced by each of the 26 lowercase ASCII letters in
  turn, so the result has ``26 * len(word)`` entries and contains ``word``
  itself whenever a letter is replaced by the same letter.
  """
  alphabet = string.ascii_lowercase
  substituted = []
  for head, tail in split(word):
    if not tail:
      continue # the final cut has no character left to replace
    rest = tail[1:]
    for letter in alphabet:
      substituted.append(head + letter + rest)
  return substituted

In [37]:
print(replace("juliet")) # show every single-letter substitution of 'juliet' (the list includes 'juliet' itself)

['auliet', 'buliet', 'culiet', 'duliet', 'euliet', 'fuliet', 'guliet', 'huliet', 'iuliet', 'juliet', 'kuliet', 'luliet', 'muliet', 'nuliet', 'ouliet', 'puliet', 'quliet', 'ruliet', 'suliet', 'tuliet', 'uuliet', 'vuliet', 'wuliet', 'xuliet', 'yuliet', 'zuliet', 'jaliet', 'jbliet', 'jcliet', 'jdliet', 'jeliet', 'jfliet', 'jgliet', 'jhliet', 'jiliet', 'jjliet', 'jkliet', 'jlliet', 'jmliet', 'jnliet', 'joliet', 'jpliet', 'jqliet', 'jrliet', 'jsliet', 'jtliet', 'juliet', 'jvliet', 'jwliet', 'jxliet', 'jyliet', 'jzliet', 'juaiet', 'jubiet', 'juciet', 'judiet', 'jueiet', 'jufiet', 'jugiet', 'juhiet', 'juiiet', 'jujiet', 'jukiet', 'juliet', 'jumiet', 'juniet', 'juoiet', 'jupiet', 'juqiet', 'juriet', 'jusiet', 'jutiet', 'juuiet', 'juviet', 'juwiet', 'juxiet', 'juyiet', 'juziet', 'julaet', 'julbet', 'julcet', 'juldet', 'juleet', 'julfet', 'julget', 'julhet', 'juliet', 'juljet', 'julket', 'jullet', 'julmet', 'julnet', 'juloet', 'julpet', 'julqet', 'julret', 'julset', 'jultet', 'juluet', 'julvet',

In [38]:
def insert(word):
  """Return the strings obtained by inserting one lowercase letter into ``word``.

  Each of the 26 lowercase ASCII letters is inserted at every position,
  including before the first and after the last character, giving
  ``26 * (len(word) + 1)`` entries.
  """
  alphabet = string.ascii_lowercase
  lengthened = []
  for head, tail in split(word):
    for letter in alphabet:
      lengthened.append(head + letter + tail)
  return lengthened

In [39]:
print(insert("juliet")) # show every string obtained by inserting one lowercase letter into 'juliet'

['ajuliet', 'bjuliet', 'cjuliet', 'djuliet', 'ejuliet', 'fjuliet', 'gjuliet', 'hjuliet', 'ijuliet', 'jjuliet', 'kjuliet', 'ljuliet', 'mjuliet', 'njuliet', 'ojuliet', 'pjuliet', 'qjuliet', 'rjuliet', 'sjuliet', 'tjuliet', 'ujuliet', 'vjuliet', 'wjuliet', 'xjuliet', 'yjuliet', 'zjuliet', 'jauliet', 'jbuliet', 'jculiet', 'jduliet', 'jeuliet', 'jfuliet', 'jguliet', 'jhuliet', 'jiuliet', 'jjuliet', 'jkuliet', 'jluliet', 'jmuliet', 'jnuliet', 'jouliet', 'jpuliet', 'jquliet', 'jruliet', 'jsuliet', 'jtuliet', 'juuliet', 'jvuliet', 'jwuliet', 'jxuliet', 'jyuliet', 'jzuliet', 'jualiet', 'jubliet', 'jucliet', 'judliet', 'jueliet', 'jufliet', 'jugliet', 'juhliet', 'juiliet', 'jujliet', 'jukliet', 'julliet', 'jumliet', 'junliet', 'juoliet', 'jupliet', 'juqliet', 'jurliet', 'jusliet', 'jutliet', 'juuliet', 'juvliet', 'juwliet', 'juxliet', 'juyliet', 'juzliet', 'julaiet', 'julbiet', 'julciet', 'juldiet', 'juleiet', 'julfiet', 'julgiet', 'julhiet', 'juliiet', 'juljiet', 'julkiet', 'julliet', 'julmiet'

In [40]:
def edit1(word):
  """Return the set of strings one edit away from ``word``.

  An edit is a single deletion, adjacent transposition, substitution, or
  insertion; the four candidate lists are merged and de-duplicated.
  """
  candidates = set()
  for make_edits in (delete, swap, replace, insert):
    candidates.update(make_edits(word))
  return candidates

In [41]:
print(edit1("juliet")) # show the set of all strings exactly one edit (delete, swap, replace, insert) away from 'juliet'

{'julieet', 'jzuliet', 'julijet', 'julielt', 'qjuliet', 'juluiet', 'julbet', 'fjuliet', 'jrliet', 'quliet', 'jkuliet', 'jupiet', 'jeuliet', 'sjuliet', 'gjuliet', 'juliext', 'julieot', 'jpliet', 'jukliet', 'wjuliet', 'jaliet', 'jhuliet', 'pjuliet', 'julies', 'juliety', 'jiuliet', 'jouliet', 'juxliet', 'juliem', 'suliet', 'julietb', 'julcet', 'djuliet', 'mjuliet', 'juliket', 'juliek', 'juldet', 'juliht', 'julijt', 'julivet', 'jujliet', 'julilt', 'julient', 'jkliet', 'jcliet', 'juliset', 'julieq', 'juiet', 'julietw', 'julietc', 'julyet', 'julietl', 'jgliet', 'juoiet', 'julivt', 'ljuliet', 'jxuliet', 'julaet', 'jsuliet', 'julitt', 'juliep', 'juxiet', 'julwiet', 'julzet', 'julnet', 'julciet', 'culiet', 'julit', 'juliebt', 'julxet', 'juliejt', 'julietz', 'julist', 'juliew', 'julieo', 'xuliet', 'julilet', 'juliezt', 'julier', 'juliev', 'julie', 'jueiet', 'juloet', 'julieat', 'ajuliet', 'julifet', 'jyuliet', 'jlliet', 'jugiet', 'julpiet', 'buliet', 'juqiet', 'julievt', 'jmuliet', 'huliet', 'ul

In [42]:
def edit2(word):
  """Return the set of strings reachable from ``word`` by applying ``edit1`` twice."""
  twice_edited = set()
  for once_edited in edit1(word):
    twice_edited.update(edit1(once_edited))
  return twice_edited

In [43]:
print(edit2("juliet")) # show the set of all strings obtained by applying two successive single edits to 'juliet'

{'rjkuliet', 'jmuliget', 'euiiet', 'dnuliet', 'jlhuliet', 'jiulilet', 'julxietp', 'julqitt', 'juluinet', 'jkljiet', 'vjullet', 'jilipt', 'jiciet', 'jurlnet', 'jusnliet', 'jfuliyet', 'jclbet', 'julqft', 'julxietg', 'jfuloiet', 'jufljet', 'julkeht', 'julqest', 'julzviet', 'julnetb', 'mvuliet', 'jrljiet', 'jutlipt', 'jeleit', 'julyibet', 'jnlietm', 'ruliset', 'julntt', 'jlicet', 'jrlwet', 'sulieto', 'njulieut', 'jjuloiet', 'jruiet', 'julietly', 'juxleit', 'ujulivet', 'jrliei', 'nzjuliet', 'jhlimet', 'jmquliet', 'jsukiet', 'juliettu', 'julietrj', 'jculiev', 'jusvet', 'judkiet', 'julssiet', 'jxlijt', 'julbext', 'vuuiet', 'julietfq', 'fijuliet', 'juuiyt', 'julieuz', 'wulliet', 'jubjet', 'julifj', 'ptuliet', 'jslietc', 'wjuliee', 'julioeti', 'julievct', 'jdzliet', 'mjuzliet', 'jnlijt', 'jflieti', 'ujuliete', 'jumietg', 'julfee', 'julihpt', 'juliytx', 'gjulitet', 'jufliey', 'mugiet', 'julirr', 'ztuliet', 'julcgt', 'julfiek', 'guliwt', 'juliextq', 'jliec', 'juxlifet', 'iufiet', 'jzgliet', 'cjul

In [44]:
def correct_spelling(word, vocabulary, word_probabilities):
  """Suggest in-vocabulary corrections for ``word``.

  Candidates one edit away from ``word`` are tried first; only when none of
  them is in ``vocabulary`` are candidates two edits away considered.

  Args:
    word: the (possibly misspelt) word to correct.
    vocabulary: container of known words, used for membership tests.
    word_probabilities: mapping from every known word to its probability.

  Returns:
    ``None`` (after printing a message) when ``word`` is already in
    ``vocabulary``; otherwise a list of ``(candidate, probability)`` tuples,
    which is empty when no known word lies within two edits.
  """
  if word in vocabulary: # nothing to correct
    print(f"{word} is already correctly spelt")
    return

  # The vocabulary filter has to run *before* the fallback: edit1() is never
  # empty (it always contains insertions), so `edit1(word) or edit2(word)`
  # would never reach the two-edit candidates.
  best_guesses = [w for w in edit1(word) if w in vocabulary]
  if not best_guesses:
    best_guesses = [w for w in edit2(word) if w in vocabulary]
  return [(w, word_probabilities[w]) for w in best_guesses]

In [65]:
word = "sh" # word to be corrected
corrections = correct_spelling(word, vocabs, word_probas) # list of (candidate, probability), or None if already correct

if corrections: # at least one in-vocabulary candidate was found
  print(corrections) # show every candidate with its corpus probability
  probs = np.array([c[1] for c in corrections]) # corpus probabilities of the candidates
  best_ix = np.argmax(probs) # index of the most frequent candidate
  best_guess = corrections[best_ix][0] # most frequent candidate
  # Raw corpus frequencies are far below 0.7 for any single word, so comparing
  # them with the threshold would never accept a correction. Compare the best
  # candidate's share of the candidates' total probability instead.
  correct = best_guess if probs[best_ix] / probs.sum() > 0.7 else word
  print(f"{correct} is suggested for {word}") # report the accepted correction (or the word itself)
elif corrections is not None: # empty list: no known word within two edits
  print(f"{word} is not in the vocabulary")
# corrections is None: the word is in the vocabulary and correct_spelling already reported it

[('s', 0.008191333367042523), ('she', 0.0031602366385194923), ('ah', 0.00040451028973049504), ('h', 5.056378621631188e-05), ('th', 0.00010112757243262376), ('st', 0.0010365576174343936), ('so', 0.003767002073115235)]
sh is suggested for sh


In [66]:
class SpellChecker(object):
  """Edit-distance spell checker backed by word frequencies from a corpus file.

  Attributes are documented where they are assigned in ``__init__``.
  """

  def __init__(self, corpus_file_path):
    """Build the vocabulary and word probabilities from ``corpus_file_path``.

    The file is opened in text mode with the platform's default encoding;
    every line is lowercased and split into maximal ``\\w`` runs.
    """
    self.corpus_file_path = corpus_file_path # kept for reference
    words = []
    with open(corpus_file_path, "r") as file:
      for line in file:
        words += re.findall(r'\w+', line.lower())

    self.vocabs = set(words) # distinct words, for fast membership tests
    self.word_counts = Counter(words) # word -> number of occurrences
    total_words = float(sum(self.word_counts.values()))
    # word -> relative frequency in the corpus
    self.word_probas = {word: count / total_words for word, count in self.word_counts.items()}

  def _level_one_edits(self, word):
    """Return the set of strings one deletion, swap, substitution or insertion away from ``word``."""
    letters = string.ascii_lowercase
    splits = [(word[:i], word[i:]) for i in range(len(word) + 1)] # every (prefix, suffix) cut
    deletes = [l + r[1:] for l, r in splits if r] # drop one character
    swaps = [l + r[1] + r[0] + r[2:] for l, r in splits if len(r) > 1] # transpose adjacent characters
    replaces = [l + c + r[1:] for l, r in splits if r for c in letters] # substitute one character
    inserts = [l + c + r for l, r in splits for c in letters] # insert one letter

    return set(deletes + swaps + replaces + inserts)

  def _level_two_edits(self, word):
    """Return the set of strings reachable by two successive level-one edits."""
    return set(e2 for e1 in self._level_one_edits(word) for e2 in self._level_one_edits(e1))

  def _known(self, candidates):
    """Return the candidates that occur in the vocabulary."""
    return [w for w in candidates if w in self.vocabs]

  def check(self, word):
    """Return known words near ``word`` as ``(word, probability)`` tuples.

    Words one edit away are preferred; words two edits away are used only
    when no one-edit candidate is in the vocabulary. The result is sorted by
    descending probability, ties broken alphabetically, and is empty when
    nothing is found within two edits.
    """
    # Filter by vocabulary before falling back: the raw level-one set is never
    # empty, so `level_one or level_two` would never reach level two.
    valid_candidates = self._known(self._level_one_edits(word)) or self._known(self._level_two_edits(word))
    return sorted(
      ((c, self.word_probas[c]) for c in valid_candidates),
      key=lambda tup: (-tup[1], tup[0]),
    )


In [67]:
checker = SpellChecker("/content/romeo and juliet.html") # build the vocabulary and word probabilities from the corpus file

In [69]:
checker.check("sh") # in-vocabulary candidates for 'sh' with their probabilities, most probable first

[('s', 0.008191333367042523),
 ('so', 0.003767002073115235),
 ('she', 0.0031602366385194923),
 ('st', 0.0010365576174343936),
 ('ah', 0.00040451028973049504),
 ('th', 0.00010112757243262376),
 ('h', 5.056378621631188e-05)]