In [1]:
import re
from collections import Counter

In [2]:
# function to tokenise words
def words(document):
    "Convert text to lower case and tokenise the document"
    return re.findall(r'\w+', document.lower())

In [5]:
# create a frequency table of all the words of the document
all_words = Counter(words(open('seed_document.txt').read()))

In [6]:
# check frequency of a random word, say, 'chair'
all_words['chair']

135

In [8]:
# look at top 10 frequent words
all_words.most_common(10)

[('the', 79809),
 ('of', 40024),
 ('and', 38312),
 ('to', 28765),
 ('in', 22023),
 ('a', 21124),
 ('that', 12512),
 ('he', 12401),
 ('was', 11410),
 ('it', 10681)]

In [9]:
def edits_one(word):
    "Create all edits that are one edit away from `word`."
    alphabets    = 'abcdefghijklmnopqrstuvwxyz'
    splits     = [(word[:i], word[i:])                   for i in range(len(word) + 1)]
    deletes    = [left + right[1:]                       for left, right in splits if right]
    inserts    = [left + c + right                       for left, right in splits for c in alphabets]
    replaces   = [left + c + right[1:]                   for left, right in splits if right for c in alphabets]
    transposes = [left + right[1] + right[0] + right[2:] for left, right in splits if len(right)>1]
    return set(deletes + inserts + replaces + transposes)

In [19]:
def edits_two(word):
    "Create all edits that are two edits away from `word`."
    return (e2 for e1 in edits_one(word) for e2 in edits_one(e1))

In [11]:
def known(words):
    "The subset of `words` that appear in the `all_words`."
    return set(word for word in words if word in all_words)

In [12]:
def possible_corrections(word):
    "Generate possible spelling corrections for word."
    return (known([word]) or known(edits_one(word)) or known(edits_two(word)) or [word])

In [13]:
def prob(word, N=sum(all_words.values())): 
    "Probability of `word`: Number of appearances of 'word' / total number of tokens"
    return all_words[word] / N

In [14]:
print(len(set(edits_one("monney"))))
print(edits_one("monney"))

336
{'mokney', 'monneo', 'motnney', 'mfonney', 'monvney', 'money', 'moqnney', 'moneey', 'mmnney', 'mohnney', 'umonney', 'rmonney', 'konney', 'monkey', 'monneyb', 'monnedy', 'monxey', 'mozney', 'amonney', 'monnecy', 'monneyt', 'xmonney', 'moynney', 'monnevy', 'monnley', 'mkonney', 'tmonney', 'oonney', 'monrney', 'monneyg', 'monpey', 'monfney', 'monnoy', 'monneyy', 'monbey', 'myonney', 'monnez', 'mqonney', 'monnemy', 'monny', 'moonney', 'bmonney', 'mocnney', 'onney', 'monnew', 'monnety', 'monnvy', 'monneqy', 'monneyd', 'pmonney', 'monneya', 'monnej', 'moncney', 'mdnney', 'maonney', 'monnee', 'mdonney', 'munney', 'monnepy', 'monnjey', 'zonney', 'monnjy', 'monneyj', 'monndey', 'monniey', 'vonney', 'emonney', 'monneyi', 'moinney', 'monnney', 'mcnney', 'monncey', 'monneey', 'monnxy', 'mlnney', 'monlney', 'momney', 'monneyq', 'mouney', 'mynney', 'mxonney', 'manney', 'moiney', 'monndy', 'mnonney', 'monnhey', 'monnsy', 'monzey', 'mhonney', 'monnef', 'monjey', 'monneoy', 'mzonney', 'mmonney', 'm

In [None]:
print(known(edits_one("monney")))

In [23]:
len(set(edits_two("emfasize")))

90902

In [21]:
# Let's look at words that are two edits away
print(len(set(edits_two("monney"))))
print(known(edits_one("monney")))

51013
{'money', 'monkey'}


In [24]:
# Let's look at possible corrections of a word
print(possible_corrections("emfasize"))

{'emphasize'}


In [25]:
# Let's look at probability of a word
print(prob("money"))
print(prob("emfasize"))

0.0002922233626303688
0.0


In [26]:
def spell_check(word):
    "Print the most probable spelling correction for `word` out of all the `possible_corrections`"
    correct_word = max(possible_corrections(word), key=prob)
    if correct_word != word:
        return "Did you mean " + correct_word + "?"
    else:
        return "Correct spelling."

In [30]:
# test spell check
print(spell_check("emfasize"))

Did you mean emphasize?


ModuleNotFoundError: No module named 'spell_corrector'

In [36]:
text=f"""
The Nobel Prize is a set of five annual international awards bestowed in several categories by Swedish and Norwegian institutions in recognition of academic, cultural, or scientific advances. In the 19th century, the Nobel family who were known for their innovations to the oil industry in Azerbaijan was the leading representative of foreign capital in Baku. The Nobel Prize was funded by personal fortune of Alfred Nobel. The Board of the Nobel Foundation decided that after this addition, it would allow no further new prize.
"""


In [33]:
5/85

0.058823529411764705

In [34]:
from nltk.corpus import stopwords

In [38]:
text_split=text.split(" ")

text_cleaned=[word for word in text_split if word not in stopwords.words("english")]

# stopwords.words("english")

In [39]:
len(text_cleaned)

52

In [40]:
5/52

0.09615384615384616

In [43]:
from nltk.tokenize import word_tokenize
words = word_tokenize(text)

In [44]:
words

['The',
 'Nobel',
 'Prize',
 'is',
 'a',
 'set',
 'of',
 'five',
 'annual',
 'international',
 'awards',
 'bestowed',
 'in',
 'several',
 'categories',
 'by',
 'Swedish',
 'and',
 'Norwegian',
 'institutions',
 'in',
 'recognition',
 'of',
 'academic',
 ',',
 'cultural',
 ',',
 'or',
 'scientific',
 'advances',
 '.',
 'In',
 'the',
 '19th',
 'century',
 ',',
 'the',
 'Nobel',
 'family',
 'who',
 'were',
 'known',
 'for',
 'their',
 'innovations',
 'to',
 'the',
 'oil',
 'industry',
 'in',
 'Azerbaijan',
 'was',
 'the',
 'leading',
 'representative',
 'of',
 'foreign',
 'capital',
 'in',
 'Baku',
 '.',
 'The',
 'Nobel',
 'Prize',
 'was',
 'funded',
 'by',
 'personal',
 'fortune',
 'of',
 'Alfred',
 'Nobel',
 '.',
 'The',
 'Board',
 'of',
 'the',
 'Nobel',
 'Foundation',
 'decided',
 'that',
 'after',
 'this',
 'addition',
 ',',
 'it',
 'would',
 'allow',
 'no',
 'further',
 'new',
 'prize',
 '.']

In [49]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer=CountVectorizer()
feats=vectorizer.fit_transform(text_cleaned)

In [62]:
import pandas as pd
df=pd.DataFrame(feats.toarray(),columns=vectorizer.get_feature_names_out())
df.columns
df["Nobel"]

KeyError: 'Nobel'

In [56]:
text_cleaned

['\nThe',
 'Nobel',
 'Prize',
 'set',
 'five',
 'annual',
 'international',
 'awards',
 'bestowed',
 'several',
 'categories',
 'Swedish',
 'Norwegian',
 'institutions',
 'recognition',
 'academic,',
 'cultural,',
 'scientific',
 'advances.',
 'In',
 '19th',
 'century,',
 'Nobel',
 'family',
 'known',
 'innovations',
 'oil',
 'industry',
 'Azerbaijan',
 'leading',
 'representative',
 'foreign',
 'capital',
 'Baku.',
 'The',
 'Nobel',
 'Prize',
 'funded',
 'personal',
 'fortune',
 'Alfred',
 'Nobel.',
 'The',
 'Board',
 'Nobel',
 'Foundation',
 'decided',
 'addition,',
 'would',
 'allow',
 'new',
 'prize.\n']