<a href="https://colab.research.google.com/github/Aayush360/data_science/blob/master/Spelling_corrector_word_suggestion.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

fastText is a very convenient technique for building word representations using character- level features.

In [11]:
import nltk
import re
nltk.download('stopwords')
nltk.download('wordnet')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [10]:
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from gensim.models import FastText
import io
import collections

import pandas as pd

In [1]:
# reading the data into basic structure

In [4]:
words=[]
data=[]

with open('comments.txt','r') as file:
    for entry in file:
        entry = entry.strip()
        data.append(entry)
        words.extend(entry.split())
        
       

In [5]:
len(words) # there are about 10 million words 

10737835

In [6]:
len(data)

561808

In [7]:
# most common word in the corpus
# dictionary of words in the form of key as the words and their frequency as the value

In [12]:
unique_words = []
unique_words = collections.Counter(words)
unique_words.most_common(10)


[('the', 445892),
 ('to', 288753),
 ('of', 219279),
 ('and', 207335),
 ('a', 201765),
 ('I', 182618),
 ('is', 164602),
 ('you', 157025),
 ('that', 140495),
 ('in', 130244)]

In [13]:
#  data is dominated by stopwords
# preprocessing in terms of keeping only alphanumeric data, case-folding, and removing stopwords.

# we dont use stemming or lemmatizing because we want the model to understand incorrect spellings as well

In [14]:
# let us preprocess the data using the preprocessing pipleline

In [15]:
def text_clean(corpus):
    '''
    Purpose : Function to keep only alphabets, digits and certain words (punctuations, qmarks, tabs etc. removed)
    
    Input : Takes a text corpus, 'corpus' to be cleaned along with a list of words, 'keep_list', which have to be retained
            even after the cleaning process
    
    Output : Returns the cleaned text corpus
    
    '''
    cleaned_corpus = []
    for row in corpus:
        qs = []
        for word in row.split():
            p1 = re.sub(pattern='[^a-zA-Z0-9]',repl=' ',string=word)
            p1 = p1.lower()
            qs.append(p1)
        cleaned_corpus.append(' '.join(qs))
    return cleaned_corpus

In [16]:
def stopwords_removal(corpus):
    wh_words = ['who', 'what', 'when', 'why', 'how', 'which', 'where', 'whom']
    stop = set(stopwords.words('english'))
    for word in wh_words:
        stop.remove(word) # removing wh words from the set of stopwords
    corpus = [[x for x in x.split() if x not in stop] for x in corpus]
    return corpus

In [17]:
def lemmatize(corpus):
    lem = WordNetLemmatizer()
    corpus = [[lem.lemmatize(x, pos = 'v') for x in x] for x in corpus]
    return corpus

In [18]:
def stem(corpus, stem_type = None):
    if stem_type == 'snowball':
        stemmer = SnowballStemmer(language = 'english')
        corpus = [[stemmer.stem(x) for x in x] for x in corpus]
    else :
        stemmer = PorterStemmer()
        corpus = [[stemmer.stem(x) for x in x] for x in corpus]
    return corpus

In [19]:
def preprocess(corpus, cleaning = True, stemming = False, stem_type = None, lemmatization = False, remove_stopwords = True):
    '''
    Purpose : Function to perform all pre-processing tasks (cleaning, stemming, lemmatization, stopwords removal etc.)
    
    Input : 
    'corpus' - Text corpus on which pre-processing tasks will be performed
    'keep_list' - List of words to be retained during cleaning process
    'cleaning', 'stemming', 'lemmatization', 'remove_stopwords' - Boolean variables indicating whether a particular task should 
                                                                  be performed or not
    'stem_type' - Choose between Porter stemmer or Snowball(Porter2) stemmer. Default is "None", which corresponds to Porter
                  Stemmer. 'snowball' corresponds to Snowball Stemmer
    
    Note : Either stemming or lemmatization should be used. There's no benefit of using both of them together
    
    Output : Returns the processed text corpus
    
    '''
    
    if cleaning == True:
        corpus = text_clean(corpus)
    
    if remove_stopwords == True:
        corpus = stopwords_removal(corpus)
    else :
        corpus = [[x for x in x.split()] for x in corpus]
    
    if lemmatization == True:
        corpus = lemmatize(corpus)
        
        
    if stemming == True:
        corpus = stem(corpus, stem_type)
    
    corpus = [' '.join(x) for x in corpus]        

    return corpus

In [20]:
data = preprocess(data)

In [21]:
# data conversion into the format expected by fastText

In [22]:
preprocessed_data = []
for line in data:
    if line !="":
        preprocessed_data.append(line.split())


In [None]:
# contains list of list of tokens for each sentence 
preprocessed_data

In [24]:
## Initialize fastText Model


In [25]:
# min_n and max_n -- helps us by setting the minimum and maximum lengths of the character
# n-grams so that we can build representations

In [27]:
model = FastText(size=300, window=3, min_count=1, min_n=1, max_n=5)

In [28]:
# let us build our vocabulary and check the size of the build vocab

In [29]:
model.build_vocab(sentences=preprocessed_data)

In [31]:
len(model.wv.vocab)

182228

In [32]:
# the size would have been smaller if we would have appiled lemmatization or stemming

In [33]:
# let us train our model

In [35]:
model.train(sentences=preprocessed_data,total_examples=len(preprocessed_data),epochs=10)

In [None]:
# check whether our model can actually predict the correct spelling for the incorrect words as part of top 5 similar
# suggestion

In [36]:
# took about 11m 29s to train the model

In [37]:
model.wv.most_similar('pnhoe',topn=5)

[('jjphoto', 0.5202710032463074),
 ('jpn', 0.5185641050338745),
 ('phj', 0.5159940719604492),
 ('phospho', 0.5111843943595886),
 ('pgj', 0.5102100968360901)]

In [38]:
model.wv.most_similar('eplain',topn=5)

[('xplain', 0.8854570388793945),
 ('eexplain', 0.8442478179931641),
 ('explain', 0.8437426090240479),
 ('plain', 0.8350042104721069),
 ('reexplain', 0.8279873132705688)]

In [39]:
model.wv.most_similar('phon',topn=5)

[('phonon', 0.9068917632102966),
 ('phonton', 0.8983222246170044),
 ('xiphon', 0.8712285757064819),
 ('zephon', 0.8708957433700562),
 ('photon', 0.8643317222595215)]

In [40]:
model.wv.most_similar('mosue',topn=5)

[('mosque', 0.899060845375061),
 ('mose', 0.86293625831604),
 ('mosul', 0.8495413064956665),
 ('misue', 0.8446767330169678),
 ('mordue', 0.8430222272872925)]

In [41]:
model.wv.most_similar('reminder',topn=5)

[('remainder', 0.9125209450721741),
 ('rejoinder', 0.9104687571525574),
 ('reindeer', 0.9011728763580322),
 ('reminde', 0.901120126247406),
 ('reminders', 0.8968104720115662)]

In [42]:
model.wv.most_similar('relevnt',topn=5)

[('relevant', 0.8275952935218811),
 ('relev', 0.8186317682266235),
 ('relevanmt', 0.7933294177055359),
 ('releveant', 0.7883942127227783),
 ('releant', 0.7858115434646606)]

In [43]:
# Our fastText model does a good job in terms of suggesting corrections and potential alternatives for input text

### FastText and document distances

In [45]:
# built for spelling correction to check for document distances using the Word Mover's Distance (WMD) algorithm.

In [46]:
sentence_1 = 'Obama speaks to the media in Illinios.'
sentence_2 = 'President greets the press in Chicago'
sentence_3 ='Apple is my favorite comapany'

In [48]:
# let us compute the distance between the doucment pairs using WMD
# using fastText based vectors

In [49]:
word_mover_distance = model.wmdistance(sentence_1,sentence_2)
word_mover_distance

  """Entry point for launching an IPython kernel.


13.7337269612388

In [50]:
word_mover_distance = model.wmdistance(sentence_2,sentence_3)
word_mover_distance

  """Entry point for launching an IPython kernel.


17.90574322915053

In [51]:
# sentence 1 and sentence 2 as smaller distance compared to 2 and 3

In [52]:
# The results that we obtained in the spelling correction and distance calculations would be potentially better 
# if pre-trained fastText models were used since those are mostly built on Wikipedia text corpora and are more 
# generalized to understand different data points.

In [53]:
import pickle

In [58]:
pickle.dump(model, open('fasttext_model.sav','wb'))