<a href="https://colab.research.google.com/github/AYA0HASSAN/AutoCorrect/blob/main/NoisyChannel_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import nltk
import numpy as np
import random
import re
import string
import sys
import pandas as pd
from collections import Counter
# Read the whole corpus into memory. The context manager closes the file
# handle as soon as the text has been read instead of leaving it open.
with open(r"F:\7sabat\NLP\datasetTRY.txt") as corpus_file:
    corpus = corpus_file.read()

# Get Words to make Vocabulary

In [None]:
def get_all_words(corpus):
    """Split *corpus* into a list of lower-cased word tokens.

    A token is a maximal run of regex word characters; anything else
    (spaces, punctuation) only separates tokens.
    """
    lowered = corpus.lower()
    return [match.group() for match in re.finditer(r'\w+', lowered)]

In [None]:
# Vocabulary: maps each token of the corpus to how often it occurs.
Words = Counter()
Words.update(get_all_words(corpus))

# Calculate the probability of each word

In [None]:
def probability(word):
    """Return the relative frequency of *word* in the ``Words`` vocabulary.

    The count stored for *word* is divided by the sum of all counts in
    ``Words``. With an empty vocabulary there is nothing to divide by, so
    0.0 is returned instead of raising ZeroDivisionError.
    """
    total_tokens = sum(Words.values())
    if not total_tokens:
        return 0.0
    return Words[word] / total_tokens

In [None]:
def known(words):
    """Return, in iteration order, the items of *words* found in ``Words``."""
    return [candidate for candidate in words if candidate in Words]

# Minimum Edit Distance of Only One Change

In [None]:
def edits_1(word, letters=None):
    """Return the set of strings that are one edit away from *word*.

    An edit is one of: deleting a character, inserting a letter, replacing
    a character with a letter, or swapping two adjacent characters.
    Replacing a character with itself counts, so *word* is part of the
    result whenever one of its characters is in *letters*.

    Args:
        word: The string to edit.
        letters: Alphabet used for insertions and replacements. Defaults to
            the 26 lower-case English letters; pass another alphabet (for
            example the Arabic one noted below) for non-English words.

    Returns:
        A set of edited strings.
    """
    if letters is None:
        letters = 'abcdefghijklmnopqrstuvwxyz'
    # Arabic alphabet, available to callers as ``letters=...``:
    #   'ابتثجحخدذرزسشصضطظعغفقكلمنهويةىء'
    # Every way of cutting the word in two, including the empty halves.
    splits = [(word[:i], word[i:]) for i in range(len(word) + 1)]
    deletes = [left + right[1:] for left, right in splits if right]
    inserts = [left + center + right
               for left, right in splits
               for center in letters]
    replaces = [left + center + right[1:]
                for left, right in splits if right
                for center in letters]
    transposes = [left + right[1] + right[0] + right[2:]
                  for left, right in splits if len(right) > 1]
    return set(deletes + inserts + replaces + transposes)

    

# Minimum Edit Distance of Two Changes

In [None]:
def edits_2(word):
    """Return the set of strings that are two edits away from *word*.

    Each string produced by ``edits_1(word)`` is edited once more with
    ``edits_1`` and all results are collected. Returning from inside the
    loops would stop at the very first result and hand back a single
    string, so the whole set is built before returning.
    """
    return {
        second
        for first in edits_1(word)
        for second in edits_1(first)
    }

# Returning Candidates for the misspelled word

In [None]:
def candidates(word):
    """Return the possible corrections for *word*.

    The first non-empty group wins, tried in this order: known words one
    edit away, known words two edits away, *word* itself if it is known,
    and finally *word* unchanged.
    """
    one_edit_away = known(edits_1(word))
    if one_edit_away:
        return one_edit_away
    two_edits_away = known(edits_2(word))
    if two_edits_away:
        return two_edits_away
    return known([word]) or [word]


In [None]:
def best_correction(word):
    """Return the candidate for *word* that ``probability`` scores highest."""
    options = candidates(word)
    return max(options, key=probability)

In [None]:
len(Words)

968

In [None]:
max(Words , key=probability)

'presbyterian'

In [None]:
probability('presbyterian')

0.002034587995930824

In [None]:
known(edits_1('wen'))

['won', 'wan']

In [None]:
candidates('wen')

['won', 'wan']

In [None]:
best_correction('deu')

'due'

# Making Combinations for N-Gram Model 

In [None]:
def creat_Ngram(n, words):
    """Return every run of *n* consecutive items of *words* as a tuple.

    Tuples are listed in order of their starting position. Nothing is
    produced when *n* exceeds the number of items or is negative; for
    ``n == 0`` one empty tuple is produced per item.
    """
    if n < 0:
        return []
    total = len(words)
    # A window may start at any index that leaves n items to its right,
    # and never past the last item.
    window_starts = range(min(total, total - n + 1))
    return [tuple(words[start:start + n]) for start in window_starts]

# Calculate Count of Uni-Grams and Bi-Grams

In [None]:
def get_counts(N):
    """Count how often each N-gram occurs in the corpus.

    Returns a Counter keyed by N-tuples of consecutive corpus words.
    """
    return Counter(creat_Ngram(N, get_all_words(corpus)))

In [None]:
get_counts(2)

Counter({('nevada', 'nevade'): 1,
         ('nevade', 'presbyterian'): 1,
         ('presbyterian', 'presbyterian'): 1,
         ('presbyterian', 'rsx'): 1,
         ('rsx', 'rsx'): 1,
         ('rsx', 'stephen'): 1,
         ('stephen', 'steffen'): 1,
         ('steffen', 'susan'): 1,
         ('susan', 'susan'): 1,
         ('susan', 'ability'): 1,
         ('ability', 'abilitey'): 1,
         ('abilitey', 'about'): 1,
         ('about', 'abouy'): 1,
         ('abouy', 'absorption'): 1,
         ('absorption', 'absorbtion'): 1,
         ('absorbtion', 'accidentally'): 1,
         ('accidentally', 'accidently'): 1,
         ('accidently', 'accommodate'): 1,
         ('accommodate', 'accomodate'): 1,
         ('accomodate', 'acommadate'): 1,
         ('acommadate', 'accord'): 1,
         ('accord', 'acord'): 1,
         ('acord', 'acquaintance'): 1,
         ('acquaintance', 'aquantance'): 1,
         ('aquantance', 'acquire'): 1,
         ('acquire', 'equire'): 1,
         ('equire', 

# Calculate the probabilities of all Candidates for a given word

In [None]:
def get_previous(word):
    """Return the word that precedes the last occurrence of *word* in the corpus.

    Returns None when *word* has no predecessor, i.e. it does not occur in
    the corpus or occurs only as the very first token.
    """
    words = get_all_words(corpus)
    # Pair each token, from the second one on, with the token before it.
    # Starting at the second token keeps the first word from being paired
    # with the last word of the corpus (index -1). Later occurrences
    # overwrite earlier ones.
    previous = dict(zip(words[1:], words))
    return previous.get(word)

In [None]:
get_previous('coaln')

'colon'

In [None]:
def Bigram_prob(word):
    """Score each correction candidate of *word* with an interpolated model.

    A candidate ``c`` is scored as ``0.4 * P(c) + 0.6 * P(c | prev)`` where
    ``P(c)`` is the share of corpus tokens equal to ``c``,
    ``P(c | prev) = count(prev, c) / count(prev)``, and ``prev`` is the
    word ``get_previous`` reports for *word*.

    Args:
        word: The (possibly misspelled) word whose candidates are scored.

    Returns:
        A dict mapping every candidate with a non-zero score to its score.
    """
    uni_lambda = 0.4
    bi_lambda = 0.6
    unigram_counts = get_counts(1)
    bigram_counts = get_counts(2)
    # Normalise by the number of tokens, not the number of distinct words,
    # so the unigram term is a real relative frequency.
    total_tokens = sum(unigram_counts.values())
    # A word that is absent from the corpus has no recorded predecessor;
    # score on the unigram term alone rather than failing.
    try:
        prev_word = get_previous(word)
    except KeyError:
        prev_word = None
    prev_count = unigram_counts[(prev_word,)]

    candidate_prob_all = dict()
    for candidate in candidates(word):
        # Score the candidate itself, not the input word.
        if total_tokens:
            unigram_prob = unigram_counts[(candidate,)] / total_tokens
        else:
            unigram_prob = 0
        pair_count = bigram_counts[(prev_word, candidate)]
        if pair_count and prev_count:
            bigram_prob = pair_count / prev_count
        else:
            bigram_prob = 0
        candidate_prob = (unigram_prob * uni_lambda) + (bigram_prob * bi_lambda)
        if candidate_prob != 0:
            candidate_prob_all[candidate] = candidate_prob
    return candidate_prob_all

In [None]:
Bigram_prob('coaln')

{'coaln': 0.6004132231404958}

In [None]:
def correct(word):
    """Map each candidate of *word* to the result of ``Bigram_prob`` for it."""
    return {option: Bigram_prob(option) for option in candidates(word)}

In [None]:
correct('colan')

{'colon': {'colon': 0.6004132231404958},
 'coaln': {'coaln': 0.6004132231404958}}