In [1]:
import re
from tqdm import tqdm
import numpy as np
import pandas as pd

with open('big.txt' , 'r') as fd:
    lines = fd.readlines()
    words = []
    for word in lines:
        words += re.findall('\w+' , word.lower())

In [2]:
words

['the',
 'project',
 'gutenberg',
 'ebook',
 'of',
 'the',
 'adventures',
 'of',
 'sherlock',
 'holmes',
 'by',
 'sir',
 'arthur',
 'conan',
 'doyle',
 '15',
 'in',
 'our',
 'series',
 'by',
 'sir',
 'arthur',
 'conan',
 'doyle',
 'copyright',
 'laws',
 'are',
 'changing',
 'all',
 'over',
 'the',
 'world',
 'be',
 'sure',
 'to',
 'check',
 'the',
 'copyright',
 'laws',
 'for',
 'your',
 'country',
 'before',
 'downloading',
 'or',
 'redistributing',
 'this',
 'or',
 'any',
 'other',
 'project',
 'gutenberg',
 'ebook',
 'this',
 'header',
 'should',
 'be',
 'the',
 'first',
 'thing',
 'seen',
 'when',
 'viewing',
 'this',
 'project',
 'gutenberg',
 'file',
 'please',
 'do',
 'not',
 'remove',
 'it',
 'do',
 'not',
 'change',
 'or',
 'edit',
 'the',
 'header',
 'without',
 'written',
 'permission',
 'please',
 'read',
 'the',
 'legal',
 'small',
 'print',
 'and',
 'other',
 'information',
 'about',
 'the',
 'ebook',
 'and',
 'project',
 'gutenberg',
 'at',
 'the',
 'bottom',
 'of',
 'th

# Getting the Pairs

In [3]:
def get_pairs(words, n):
    
    data = []
    for i in range(len(words) - n):
        data.append(' '.join(words[i:i+n]))
    return data

In [4]:
get_pairs(words , 5)

['the project gutenberg ebook of',
 'project gutenberg ebook of the',
 'gutenberg ebook of the adventures',
 'ebook of the adventures of',
 'of the adventures of sherlock',
 'the adventures of sherlock holmes',
 'adventures of sherlock holmes by',
 'of sherlock holmes by sir',
 'sherlock holmes by sir arthur',
 'holmes by sir arthur conan',
 'by sir arthur conan doyle',
 'sir arthur conan doyle 15',
 'arthur conan doyle 15 in',
 'conan doyle 15 in our',
 'doyle 15 in our series',
 '15 in our series by',
 'in our series by sir',
 'our series by sir arthur',
 'series by sir arthur conan',
 'by sir arthur conan doyle',
 'sir arthur conan doyle copyright',
 'arthur conan doyle copyright laws',
 'conan doyle copyright laws are',
 'doyle copyright laws are changing',
 'copyright laws are changing all',
 'laws are changing all over',
 'are changing all over the',
 'changing all over the world',
 'all over the world be',
 'over the world be sure',
 'the world be sure to',
 'world be sure to ch

# Probability Distribution

In [5]:
def get_prob_dist(data):
    
    prob_dist = []    
    a = np.array(data)
    pairs, counts = np.unique(a, return_counts = True)
    
    for i in range(len(pairs)):
        
        left_seq   = ' '.join(pairs[i].split(' ')[:len(pairs[i].split(' '))//2])
        right_seq  = ' '.join(pairs[i].split(' ')[len(pairs[i].split(' '))//2 + 1: ])
        middle_seq = pairs[i].split(' ')[len(pairs[i].split(' '))//2]
        
        prob_dist.append([pairs[i],left_seq, right_seq, middle_seq, counts[i]])
        
    return prob_dist
        
    
data = get_pairs(words,5)
prob_dist = get_prob_dist(data)

In [6]:
df = pd.DataFrame(prob_dist, columns = ['seq','left_seq','right_seq','output','freq'])
df.head()

Unnamed: 0,seq,left_seq,right_seq,output,freq
0,0 05 grm novarsenbillon injected,0 05,novarsenbillon injected,grm,1
1,0 25 u and is,0 25,and is,u,1
2,0 45 grm given intra,0 45,given intra,grm,1
3,0 5 to 2 per,0 5,2 per,to,1
4,0 6 grm all these,0 6,all these,grm,1


# Getting the Predictions

In [21]:
def predict(word):
    left_seq = word.split('_')[0].strip()
    right_seq = word.split('_')[1].strip()
    
    df_ = df[df['left_seq'] == left_seq]
    df_ = df_[df_['right_seq'] == right_seq]
    return list(df_.sort_values(by = 'freq' , ascending = False).head()['output'].values)

In [26]:
predict('in the _ of the')

['region', 'middle', 'case', 'vicinity', 'hands']

In [24]:
df.sort_values(by = 'freq')

Unnamed: 0,seq,left_seq,right_seq,output,freq
0,0 05 grm novarsenbillon injected,0 05,novarsenbillon injected,grm,1
1089057,zoology and so on just,zoology and,on just,so,1
1089058,zoology for in a frog,zoology for,a frog,in,1
1089059,zoology was not merely acknowledged,zoology was,merely acknowledged,not,1
1089060,zu schwachen so kann man,zu schwachen,kann man,so,1
...,...,...,...,...,...
627289,of the united states_ pp,of the,states_ pp,united,31
893586,the project gutenberg literary archive,the project,literary archive,gutenberg,32
701067,project gutenberg literary archive foundation,project gutenberg,archive foundation,literary,36
450975,in the middle of the,in the,of the,middle,38
