In [89]:
import numpy as np
import re
import pandas as pd

#### Loading file

In [90]:
# Read the corpus into a list with one entry per line of text
# (each entry keeps its trailing newline).
with open('./big.txt','r') as corpus_file:
    lines = list(corpus_file)

lines

['The Project Gutenberg EBook of The Adventures of Sherlock Holmes\n',
 'by Sir Arthur Conan Doyle\n',
 '(#15 in our series by Sir Arthur Conan Doyle)\n',
 '\n',
 'Copyright laws are changing all over the world. Be sure to check the\n',
 'copyright laws for your country before downloading or redistributing\n',
 'this or any other Project Gutenberg eBook.\n',
 '\n',
 'This header should be the first thing seen when viewing this Project\n',
 'Gutenberg file.  Please do not remove it.  Do not change or edit the\n',
 'header without written permission.\n',
 '\n',
 'Please read the "legal small print," and other information about the\n',
 'eBook and Project Gutenberg at the bottom of this file.  Included is\n',
 'important information about your specific rights and restrictions in\n',
 'how the file may be used.  You can also find out about how to make a\n',
 'donation to Project Gutenberg, and how to get involved.\n',
 '\n',
 '\n',
 '**Welcome To The World of Free Plain Vanilla Electronic 

In [91]:
# Lower-case every line and collect its runs of word characters into one
# flat list of tokens, in reading order.
words = [
    token
    for text_line in lines
    for token in re.findall(r'[\w]+',text_line.lower())
]

words

['the',
 'project',
 'gutenberg',
 'ebook',
 'of',
 'the',
 'adventures',
 'of',
 'sherlock',
 'holmes',
 'by',
 'sir',
 'arthur',
 'conan',
 'doyle',
 '15',
 'in',
 'our',
 'series',
 'by',
 'sir',
 'arthur',
 'conan',
 'doyle',
 'copyright',
 'laws',
 'are',
 'changing',
 'all',
 'over',
 'the',
 'world',
 'be',
 'sure',
 'to',
 'check',
 'the',
 'copyright',
 'laws',
 'for',
 'your',
 'country',
 'before',
 'downloading',
 'or',
 'redistributing',
 'this',
 'or',
 'any',
 'other',
 'project',
 'gutenberg',
 'ebook',
 'this',
 'header',
 'should',
 'be',
 'the',
 'first',
 'thing',
 'seen',
 'when',
 'viewing',
 'this',
 'project',
 'gutenberg',
 'file',
 'please',
 'do',
 'not',
 'remove',
 'it',
 'do',
 'not',
 'change',
 'or',
 'edit',
 'the',
 'header',
 'without',
 'written',
 'permission',
 'please',
 'read',
 'the',
 'legal',
 'small',
 'print',
 'and',
 'other',
 'information',
 'about',
 'the',
 'ebook',
 'and',
 'project',
 'gutenberg',
 'at',
 'the',
 'bottom',
 'of',
 'th

#### Converting the word list into two-word pairs (bigrams)

In [92]:
def make_pair(words):
    """Join every two adjacent words into one space-separated string.

    Parameters
    ----------
    words : sequence of str
        Tokens in their original order.

    Returns
    -------
    list of str
        One "first second" string per adjacent pair, in order; empty when
        ``words`` has fewer than two items.
    """
    # Pair each word with its successor by zipping the sequence against
    # itself shifted by one position.
    return [' '.join((first, second)) for first, second in zip(words, words[1:])]

In [93]:
# Build the list of adjacent word pairs for the whole tokenised corpus.
pair_words = make_pair(words)

# Display the resulting pairs.
pair_words

['the project',
 'project gutenberg',
 'gutenberg ebook',
 'ebook of',
 'of the',
 'the adventures',
 'adventures of',
 'of sherlock',
 'sherlock holmes',
 'holmes by',
 'by sir',
 'sir arthur',
 'arthur conan',
 'conan doyle',
 'doyle 15',
 '15 in',
 'in our',
 'our series',
 'series by',
 'by sir',
 'sir arthur',
 'arthur conan',
 'conan doyle',
 'doyle copyright',
 'copyright laws',
 'laws are',
 'are changing',
 'changing all',
 'all over',
 'over the',
 'the world',
 'world be',
 'be sure',
 'sure to',
 'to check',
 'check the',
 'the copyright',
 'copyright laws',
 'laws for',
 'for your',
 'your country',
 'country before',
 'before downloading',
 'downloading or',
 'or redistributing',
 'redistributing this',
 'this or',
 'or any',
 'any other',
 'other project',
 'project gutenberg',
 'gutenberg ebook',
 'ebook this',
 'this header',
 'header should',
 'should be',
 'be the',
 'the first',
 'first thing',
 'thing seen',
 'seen when',
 'when viewing',
 'viewing this',
 'this pr

#### Get frequency distribution of word pairs

In [94]:
# Tally how many times each distinct pair occurs: `unique_pair` holds the
# distinct pairs and `count` the matching number of occurrences.
a = np.array(pair_words)
unique_pair, count = np.unique(a, return_counts=True)

# Total number of pairs, then the number of distinct pairs.
print(len(pair_words), len(unique_pair), sep='\n')

1115584
390694


In [95]:
# One row per distinct pair: [pair, occurrence count, second word of the pair].
prob_dist = [
    [pair, occurrences, pair.split(' ')[-1]]
    for pair, occurrences in zip(unique_pair, count)
]

# Peek at the last ten rows.
prob_dist[-10:]

[['zubov over', 1, 'over'],
 ['zubov rampart', 1, 'rampart'],
 ['zubova and', 2, 'and'],
 ['zubova with', 1, 'with'],
 ['zubovski rampart', 2, 'rampart'],
 ['zueblin _american', 1, '_american'],
 ['zum henker', 1, 'henker'],
 ['zweck ist', 1, 'ist'],
 ['zygoma in', 1, 'in'],
 ['zygomatic and', 1, 'and']]

#### Create dataframe

In [96]:
# Tabulate the pair statistics, one row per distinct pair.
df = pd.DataFrame(prob_dist,columns=['pair','frequency','output'])

# Keep only the pairs that occur at least five times.
is_frequent = df['frequency'].ge(5)
df_freq_5 = df.loc[is_frequent]

df_freq_5.tail(10)

Unnamed: 0,pair,frequency,output
390370,yourself i,5,i
390372,yourself in,11,in
390394,yourself said,6,said
390401,yourself that,8,that
390404,yourself to,8,to
390408,yourself up,6,up
390429,youth and,10,and
390563,zeal for,7,for
390659,znaim road,5,road
390675,zone of,12,of


#### Prediction

In [97]:
def predict(word, top_n=5, table=None):
    """Suggest the most frequent words that follow ``word``.

    Parameters
    ----------
    word : str
        The word to continue from. It is compared against the first word
        of every pair in the table.
    top_n : int, optional
        Maximum number of suggestions to return (default 5). Values of
        zero or less yield an empty list.
    table : pandas.DataFrame, optional
        Rows of (pair, frequency, output) in that column order. When
        omitted, the notebook-level ``df_freq_5`` table is used.

    Returns
    -------
    list of str
        Up to ``top_n`` following words, most frequent first. Among pairs
        with equal frequency, the one appearing later in the table comes
        first. The list is empty when no pair starts with ``word``.
    """
    if table is None:
        # Fall back to the bigram table built earlier in the notebook.
        table = df_freq_5

    matches = [
        (frequency, next_word)
        for pair, frequency, next_word in table.values
        if pair.partition(' ')[0] == word
    ]

    # Stable ascending sort followed by a reversal puts the highest
    # frequency first and orders ties by reverse table position.
    matches.sort(key=lambda match: match[0])
    best_first = matches[::-1][:max(top_n, 0)]

    return [next_word for _, next_word in best_first]

In [98]:
# one word at a time: list the suggested next words for a single input word
predict('the')

['same', 'french', 'first', 'old', 'emperor']

In [99]:
# automatic

length = 20
line = 'my'

# Greedy generation: keep appending the top suggestion for the last word
# until the line holds `length` words or no suggestion is available.
while line.count(' ') + 1 < length:
    word = line.rsplit(' ', 1)[-1]
    pred_word = predict(word)
    if not pred_word:
        break
    line = ' '.join((line, pred_word[0]))

print(line)

my dear fellow said the same time to the same time to the same time to the same time to


In [107]:
# manual

length = 10
line = 'my'

# Interactive generation: show suggestions for the last word and let the
# user extend the line until it holds `length` words or no suggestion exists.
while len(line.split(' ')) < length:
    print(line)
    word = line.split(' ')[-1]
    pred_word = predict(word)
    if len(pred_word) == 0:
        break

    print("Suggested words:",pred_word)

    # Enter an index from the suggestions, or type your own word instead.
    selected_word = input().strip()
    if not selected_word:
        # A blank entry would append an empty word to the line, which has
        # no suggestions and would end the session; ask again instead.
        continue
    try:
        # Normal list indexing applies, so negative indices count from the end.
        selected_word = pred_word[int(selected_word)]
    except (ValueError, IndexError):
        # Not a usable index: keep the typed text as the next word.
        pass

    line = ' '.join([line,selected_word])

print('-'*50)
print(line)

my
Suggested words: ['dear', 'own', 'friend', 'father', 'wife']


my friend
Suggested words: ['s', 'of', 'and', 'said', 'i']
my friend from
Suggested words: ['the', 'a', 'his', 'which', 'her']
my friend from india
Suggested words: ['company']
my friend from india is
Suggested words: ['a', 'the', 'not', 'to', 'it']
my friend from india is a
Suggested words: ['man', 'few', 'long', 'little', 'very']
my friend from india is a very
Suggested words: ['well', 'much', 'good', 'glad', 'important']
my friend from india is a very rich
Suggested words: ['and', 'in', 'man']
my friend from india is a very rich man
Suggested words: ['who', 's', 'of', 'in', 'and']
--------------------------------------------------
my friend from india is a very rich man and
