# Auto-complete model using n-grams


In this exercise, your mission, should you choose to accept it, is to create a program that predicts the next word of a given user input, based on a corpus of text.

After applying the necessary pre-processing to the provided text, build 3-grams (trigrams) from it and use them to predict the next word of a user input.

The notebook below guides you by providing a framework for the necessary steps.

In [24]:
import re
import unicodedata
import string
import random
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk import word_tokenize, pos_tag
# nltk.download('punkt')
# nltk.download('averaged_perceptron_tagger')
# nltk.download('wordnet')
# nltk.download('omw-1.4')

In [25]:
def get_wordnet_pos(treebank_tag):
    """Translate a treebank-style POS tag into the matching WordNet POS constant.

    Tags starting with 'J', 'V', 'N' or 'R' map to the adjective, verb, noun
    and adverb constants of ``wordnet`` respectively. Any other tag yields an
    empty string, which callers treat as "no WordNet category".
    """
    prefix_to_attr = (
        ('J', 'ADJ'),
        ('V', 'VERB'),
        ('N', 'NOUN'),
        ('R', 'ADV'),
    )
    for prefix, attr_name in prefix_to_attr:
        if treebank_tag.startswith(prefix):
            # Looked up lazily so `wordnet` is only touched for a matching tag.
            return getattr(wordnet, attr_name)
    return ''

In [26]:
# Tokenize and perform lemmatization
lemmatizer = WordNetLemmatizer()

def clean(text):
    """Tokenize ``text`` and return its lower-cased, lemmatized alphabetic words.

    Args:
        text: String to process (raw, or already passed through ``filter``).

    Returns:
        list[str]: One entry per purely alphabetic token, in input order.
        A token whose POS tag maps to a WordNet category (see
        ``get_wordnet_pos``) is lemmatized with that category; any other
        token is kept unchanged apart from lower-casing.
    """
    # str.isalpha() keeps letter-only tokens, so numbers and punctuation
    # tokens are dropped here.
    tokens = [token.lower() for token in word_tokenize(text) if token.isalpha()]
    # Tag the whole token list in one call rather than one call per word:
    # the tagger then sees neighbouring words as context, and the per-call
    # set-up cost of pos_tag is paid once instead of once per token.
    tagged_tokens = pos_tag(tokens)
    output = []  # our list of cleaned words
    for word, tag in tagged_tokens:
        pos = get_wordnet_pos(tag)
        output.append(lemmatizer.lemmatize(word, pos) if pos != '' else word)
    return output

In [27]:
# remove punctuation and turn tabs/newlines into spaces (no case change here)
def filter(text):
    """Strip punctuation from ``text`` and replace tab/newline runs with a space.

    Args:
        text: Input string.

    Returns:
        str: ``text`` with the characters . , / # ! $ % ^ & * ; : { } = - _ ` ~ ( )
        removed, and every run of tabs, newlines or carriage returns replaced
        by a single space. Letter case is left untouched.

    Note:
        The name shadows the built-in ``filter``; it is kept because other
        cells in this notebook call it under this name.
    """
    # Punctuation to delete. The previous pattern started with an escaped "\["
    # and therefore was not a character class at all: it could never match,
    # so no punctuation was ever removed.
    text = re.sub(r"[.,/#!$%^&*;:{}=\-_`~()]", "", text)
    # replace each run of tab / newline / carriage return with one space
    text = re.sub(r"[\t\n\r]+", " ", text)
    return text

In [28]:
# Generate predictions from the created 3-grams
def predict(model, user_input):
    """Print the most likely next word(s) for ``user_input`` from a trigram model.

    Args:
        model: Mapping from a 2-token context tuple to a frequency counter of
            the tokens that followed it, as returned by ``n_gram_model``.
        user_input: Phrase typed by the user.

    The phrase goes through ``filter`` and ``clean`` first. Up to five
    candidates are printed as (word, count) pairs, followed by the filtered
    phrase with the top candidate appended. If the context was never seen,
    a short message is printed instead. Returns None.
    """
    print("Filtering user input processing...")
    text = filter(user_input)
    print("Cleaning user input processing...")
    words = clean(text)
    # Left-pad with the '<s>' start symbol that n_gram_model pads with, so a
    # phrase with fewer than two usable words still forms a valid context
    # (previously: IndexError on empty input, wrap-around on a single word).
    context = tuple((['<s>', '<s>'] + words)[-2:])
    # Check membership before indexing: a ConditionalFreqDist creates missing
    # conditions on access, which would add an empty entry to the model.
    trigram = model[context].most_common(5) if context in model else []
    print(trigram)
    if not trigram:
        # Unseen context: there is no top candidate to append.
        print("No prediction available for: " + text)
        return
    print(text + " " + str(trigram[0][0]))

In [29]:
from nltk.util import ngrams

# Make a language model using a dictionary, trigrams

def n_gram_model(list_of_tokenized_text):
    """Build a trigram language model from a flat sequence of tokens.

    The sequence is padded with '<s>' on the left and '</s>' on the right and
    split into 3-grams. Each 3-gram is counted as (first token, second token)
    -> third token, and the resulting nltk.ConditionalFreqDist is returned.
    """
    padded_trigrams = ngrams(
        list_of_tokenized_text,
        3,
        pad_left=True,
        pad_right=True,
        left_pad_symbol='<s>',
        right_pad_symbol='</s>',
    )
    # condition = the two leading tokens, sample = the token that follows them
    context_and_next = (
        ((first, second), third) for first, second, third in padded_trigrams
    )
    return nltk.ConditionalFreqDist(context_and_next)

In [30]:
# file = open('n-gram-data.txt', 'r')
    
# text = ""
# while True:
#     line = file.readline()
#     text += line
#     if not line:
#         break

# # pre-process text
# print("Filtering...")
# words = filter(text)
# print("Cleaning...")
# words = clean(words)

Filtering...
Cleaning...


In [31]:
# from nltk.tokenize import sent_tokenize

# text_clean_sent_tokenized = sent_tokenize(text)
# text_clean_sent_tokenized[:10]

['A few words about Dostoevsky himself may help the English reader to\nunderstand his work.',
 'Dostoevsky was the son of a doctor.',
 'His parents were very hard-\nworking and deeply religious people, but so poor that they lived with\ntheir five children in only two rooms.',
 'The father and mother spent\ntheir evenings in reading aloud to their children, generally from\nbooks of a serious character.',
 'Though always sickly and delicate Dostoevsky came out third in the\nfinal examination of the Petersburg school of Engineering.',
 'There he\nhad already begun his first work, "Poor Folk."',
 'This story was published by the poet Nekrassov in his review and was\nreceived with acclamations.',
 'The shy, unknown youth found himself\ninstantly something of a celebrity.',
 'A brilliant and successful career\nseemed to open before him, but those hopes were soon dashed.',
 'In 1849\nhe was arrested.']

In [32]:
def main():
    """Train a trigram model on 'n-gram-data.txt' and predict from a typed phrase.

    Reads the corpus file, runs it through ``filter`` and ``clean``, builds the
    model with ``n_gram_model``, then reads one phrase from stdin and passes it
    to ``predict``. Returns None.
    """
    # The context manager closes the file even if reading fails; read()
    # yields the same text the earlier readline loop accumulated.
    with open('n-gram-data.txt', 'r') as file:
        text = file.read()

    # pre-process text
    print("Filtering...")
    filtered_text = filter(text)
    print("Cleaning...")
    words = clean(filtered_text)
    print("Text cleaned.", '\n')

    # make language model
    print("Making model...")
    model = n_gram_model(words)

    print("Enter a phrase: ")
    user_input = input()
    predict(model, user_input)

# Run the demo end to end; this waits for a phrase to be typed via input().
main()

Filtering...
Cleaning...
Text cleaned. 

Making model...
Enter a phrase: 
Filtering user input processing...
Cleaning user input processing...
[('sickly', 1)]
Though always sickly
