In [1]:
import re


### 1. Finding the Pairs.

In [4]:
# Read the corpus; the file is closed as soon as the lines are in memory.
with open('big.txt') as fd:
    lines=fd.readlines()

# Tokenise into lowercase word tokens.
# The pattern is a raw string: '\w' inside a normal literal is an invalid
# escape sequence, which newer Python versions warn about.
words=[]
for line in lines:
    words.extend(re.findall(r'\w+',line.lower()))

def get_pairs(text):
    """Return every pair of adjacent tokens in `text` as a 'w1 w2' string.

    Parameters:
        text: sequence of word tokens.

    Returns:
        A list with len(text)-1 space-joined pairs; empty when `text` has
        fewer than two tokens.
    """
    # Use the argument itself rather than the notebook-global `words`, so the
    # function works on any token list it is given.
    return [' '.join(text[i:i+2]) for i in range(len(text)-1)]

# Build the list of adjacent word pairs for the whole corpus.
data=get_pairs(words)
    

In [5]:
# Preview the first ten pairs.
data[:10]

['the project',
 'project gutenberg',
 'gutenberg ebook',
 'ebook of',
 'of the',
 'the adventures',
 'adventures of',
 'of sherlock',
 'sherlock holmes',
 'holmes by']

### 2. Find Occurrence Probabilities.

In [6]:
# Count one pair by scanning the entire list (list.count is a linear search).
data.count('the project')

95

In [12]:
# Summarise the corpus: total number of pairs and number of distinct pairs.
print('--'*10)
print("Total pairs:",len(data))
unique=list(set(data))  # set() drops duplicates; the resulting order is arbitrary
print("Unique pairs:",len(unique))
print('--'*10)

--------------------
Total pairs: 1115584
Unique pairs: 390694
--------------------


In [13]:
from tqdm import tqdm
# Naive counting, kept as a demonstration of why it does not scale:
# data.count(pair) rescans the whole list for every unique pair, so the total
# work is len(unique) * len(data) comparisons.
prob_dist=[]
for pair in tqdm(unique):
    prob_dist.append([pair,data.count(pair)])

  1%|â–ˆ                                                                         | 5365/390694 [02:59<3:35:02, 29.87it/s]


KeyboardInterrupt: 

##### NOTE: As we can see, counting each pair with a plain Python loop would take hours, whereas library routines such as NumPy's `np.unique` can do the same job far faster.

In [17]:
import numpy as np

# Count every distinct pair with a single np.unique call.
a=np.array(data)

# uniq holds the distinct pairs; counts[i] is the number of occurrences of uniq[i].
uniq , counts=np.unique(a,return_counts=True)

# Both arrays must have the same length.
print(len(uniq),len(counts))

390694 390694


In [19]:
# First eleven distinct pairs.
uniq[:11]

array(['0 05', '0 25', '0 45', '0 5', '0 6', '0 7', '0 9', '0 i', '00 99',
       '00 went', '000 000'], dtype='<U30')

In [20]:
# Occurrence counts for the same eleven pairs, index-aligned with uniq.
counts[:11]

array([ 1,  1,  1,  1,  4,  1,  1,  1,  2,  1, 43], dtype=int64)

In [22]:
# Each row is [pair, number of occurrences, predicted word], where the
# predicted word is the second word of the pair.
prob_dist=[
    [pair_text,pair_count,pair_text.split()[1]]
    for pair_text,pair_count in zip(uniq,counts)
]

In [23]:
# Preview the first ten [pair, count, output] rows.
prob_dist[:10]

[['0 05', 1, '05'],
 ['0 25', 1, '25'],
 ['0 45', 1, '45'],
 ['0 5', 1, '5'],
 ['0 6', 4, '6'],
 ['0 7', 1, '7'],
 ['0 9', 1, '9'],
 ['0 i', 1, 'i'],
 ['00 99', 2, '99'],
 ['00 went', 1, 'went']]

### 3. Making the Dataframe

In [25]:
import pandas as pd 

In [32]:
df=pd.DataFrame(prob_dist,columns=['Pair','Freq','Output'])
# Keep only pairs that occur at least 5 times; rarer pairs are dropped.
df=df[df['Freq']>=5]
df.head(7)

Unnamed: 0,Pair,Freq,Output
10,000 000,43,000
14,000 acres,5,acres
17,000 against,6,against
21,000 and,5,and
37,000 in,31,in
38,000 inhabitants,7,inhabitants
42,000 men,5,men


#### 3.1 Predicting the pair 

In [33]:
word='this'

# Print every retained pair whose first word equals `word`.
for pair in df['Pair']:
    if(pair.split(' ')[0]==word):
        print(pair)
    

this act
this action
this affair
this affection
this agreement
this aim
this ancient
this and
this article
this as
this at
this battle
this business
this but
this campaign
this case
this chapter
this character
this class
this condition
this connection
this constitution
this continent
this could
this country
this crisis
this day
this difference
this direction
this disease
this does
this ebook
this electronic
this end
this evening
this expression
this extraordinary
this fact
this fails
this feeling
this fellow
this file
this for
this force
this form
this fresh
this general
this gentleman
this good
this great
this group
this had
this happened
this happiness
this has
this he
this his
this house
this i
this idea
this in
this information
this is
this it
this kind
this last
this law
this letter
this life
this little
this long
this made
this man
this manner
this marriage
this matter
this may
this mean
this means
this measure
this method
this moment
this morning
this most
this movement
this new

In [40]:
def predict(word):
    """Suggest up to five next words for `word`, most frequent pair first.

    Reads the notebook-global `df` (columns Pair, Freq, Output) and returns a
    list of Output values; the list is empty when no pair starts with `word`.
    """
    # Collect the rows whose pair begins with the requested word.
    candidates=[
        [row[0],row[1],row[2]]
        for row in df.values
        if row[0].split(' ')[0]==word
    ]
    ranked=pd.DataFrame(candidates,columns=['Input','Freq','Output']).sort_values(by='Freq',ascending=False)
    top_outputs=ranked.head()['Output'].values
    return list(top_outputs)
    

##### 3.1.1 Predicting one word 

In [41]:
# Most frequent followers of 'the' among the retained pairs.
predict('the')

['same', 'french', 'first', 'old', 'emperor']

##### 3.1.2 Predicting with auto sequencing.

In [42]:
word = 'one'

# Greedy generation: always follow the most frequent continuation.
for i in range(20):
    pred = predict(word)
    # Stop cleanly when the current word has no known continuation instead of
    # failing on pred[0].
    if pred is None or len(pred) == 0:
        break
    word = pred[0]
    print(word, end = ' ')

of the same time to the same time to the same time to the same time to the same time 

##### 3.1.3  Prediction with manual Sequencing

In [43]:
# Interactive generation: at each step show the candidates and let the user
# pick one by its index in the printed list.
word = 'this'

preds = []
preds.append(word)

for i in range(5):

    pred = predict(word)
    print(pred)
    # NOTE(review): a non-integer or out-of-range entry raises here; there is no validation.
    word = pred[int(input('Enter the Index : '))]
    preds.append(word)
    
# Show the sentence assembled from the seed word and the chosen words.
print('-'*20)
print(' '.join(preds))
print('-'*20)

['is', 'was', 'way', 'and', 'time']


Enter the Index :  1


['a', 'the', 'not', 'in', 'to']


Enter the Index :  1


['same', 'french', 'first', 'old', 'emperor']


Enter the Index :  1


['and', 'army', 'had', 'were', 'revolution']


Enter the Index :  1


['and', 'was', 'of', 'to', 'he']


Enter the Index :  1


--------------------
this was the french army was
--------------------


### Working with the N-Gram.

In [44]:
import re
import numpy as np
import pandas as pd
from tqdm import tqdm 


# Read the corpus; the file is closed as soon as the lines are in memory.
with open('big.txt','r') as fd:
    lines = fd.readlines()

# Tokenise into lowercase word tokens.
# The pattern is a raw string: '\w' inside a normal literal is an invalid
# escape sequence, which newer Python versions warn about.
words = []
for line in lines:
    words.extend(re.findall(r'\w+', line.lower()))

### 1. Finding the Pairs


In [45]:
def get_pairs(words, n):
    """Return every window of n+1 consecutive tokens as a space-joined string.

    Parameters:
        words: sequence of word tokens.
        n: number of context words; each window holds the n context words
           followed by one more word.

    Returns:
        A list of len(words)-n windows, including the final one; empty when
        `words` has fewer than n+1 tokens.
    """
    size = n + 1
    # There are len(words) - size + 1 windows; stopping one earlier would
    # silently drop the last window of the corpus.
    return [' '.join(words[i:i+size]) for i in range(len(words) - size + 1)]

### 2. Finding Occurrence Probabilities

In [47]:
def get_prob_dist(data):
    """Count each distinct sequence and split it into context and next word.

    Parameters:
        data: list of space-joined word sequences.

    Returns:
        A list of [sequence, input, output, count] rows, one per distinct
        sequence, in the order np.unique returns them. `input` is every word
        but the last, `output` is the last word.
    """
    pair, count = np.unique(np.array(data), return_counts = True)

    prob_dist = []

    # Walk the sequences and counts together so each count stays attached to
    # its own sequence. Taking the sequences from set(data) instead would pair
    # them with counts in an unrelated order.
    for seq, freq in zip(pair, count):
        seq = str(seq)
        tokens = seq.split(' ')
        prob_dist.append([seq, ' '.join(tokens[:-1]), tokens[-1], freq])

    return prob_dist

In [48]:
# n=4 makes get_pairs emit five-word windows: four context words plus the word to predict.
data = get_pairs(words,4)
prob_dist = get_prob_dist(data)

### 3. Predicting the Words

In [49]:
# One row per distinct window: full sequence, context, next word, count.
df = pd.DataFrame(prob_dist, columns = ['Seq','Input','Output','Freq'])
# NOTE(review): this is not the last expression of the cell, so Jupyter does not display it.
df.head()

def predict(word):
    """Return up to five next words for the context `word`, most frequent first.

    Parameters:
        word: space-joined context, compared for equality with df['Input'].

    Returns:
        A NumPy array of Output values ordered by descending Freq, or None
        (after printing a message) when the context is not present in `df`.
    """
    # Filter once and reuse the result.
    matches = df[df['Input'] == word]
    if len(matches) == 0:
        print('Seq is not present')
        return None
    # Descending sort: the default ascending order would return the least
    # frequent continuations first.
    return matches.sort_values(by = 'Freq', ascending = False).head()['Output'].values
        
# Look up the continuation of a four-word context.
predict('this is a beautiful')

array(['country'], dtype=object)

#### 3.1 Predicting the one word


In [50]:
# A context that is not in the table takes the 'Seq is not present' branch.
predict('the is a beautiful')

Seq is not present


### 3.2 Predicting the word with auto sequencing

In [55]:
def pred_seq(seq,n):
    """Extend `seq` by up to `n` predicted words, sliding the context one word at a time.

    Parameters:
        seq: space-joined starting context passed to predict().
        n: maximum number of words to append.

    Returns:
        The starting context followed by the generated words, joined with
        spaces. Generation stops early when predict() has no continuation for
        the current context.
    """
    output = [seq]

    for _ in range(n):

        pred = predict(seq)
        # predict() returns None for an unseen context; stop instead of
        # failing on pred[0].
        if pred is None or len(pred) == 0:
            break

        next_word = pred[0]
        # Drop the oldest context word and append the predicted one.
        seq = ' '.join(seq.split(' ')[1:]) + ' ' + next_word
        output.append(next_word)

    return ' '.join(output)
# Generate up to 50 words from a four-word seed.
pred_seq('of the united states',50)

'of the united states shall guarantee to every state in this union a republican form of government and shall protect each of them against invasion and on application of the legislature or of the executive when the legislature cannot be convened against domestic violence article v the congress whenever two thirds of both houses'

### <b>Note:</b> Working with n-grams for prediction is quite interesting, but a context of 3 or 4 words seems to work best for sequence generation.

## Context Word Prediction.

In [56]:
# Read the corpus; the file is closed as soon as the lines are in memory.
with open('big.txt','r') as fd:
    lines = fd.readlines()

# Tokenise into lowercase word tokens.
# The pattern is a raw string: '\w' inside a normal literal is an invalid
# escape sequence, which newer Python versions warn about.
words = []
for line in lines:
    words.extend(re.findall(r'\w+', line.lower()))

def get_pairs(words, n):
    """Return every window of `n` consecutive tokens as a space-joined string.

    Parameters:
        words: sequence of word tokens.
        n: window length in words.

    Returns:
        A list of len(words)-n+1 windows, including the final one; empty when
        `words` has fewer than `n` tokens.
    """
    # There are len(words) - n + 1 windows; stopping one earlier would
    # silently drop the last window of the corpus.
    return [' '.join(words[i:i+n]) for i in range(len(words) - n + 1)]

#### Get Probability Distribution.

In [57]:
def get_prob_dist(data):
    """Count each distinct window and split it around its middle word.

    Returns a list of [window, left context, right context, middle word, count]
    rows, one per distinct window, in the order np.unique yields them. The
    middle word sits at index len(tokens)//2.
    """
    pairs, counts = np.unique(np.array(data), return_counts = True)

    rows = []
    for window, freq in zip(pairs, counts):
        # Split once and slice around the middle position.
        tokens = window.split(' ')
        mid = len(tokens)//2
        left_seq = ' '.join(tokens[:mid])
        right_seq = ' '.join(tokens[mid + 1:])
        rows.append([window, left_seq, right_seq, tokens[mid], freq])

    return rows
        

In [59]:
# Windows of length 3; get_prob_dist splits each into left part, middle word and right part.
data = get_pairs(words,3)
prob_dist = get_prob_dist(data)

# The middle word is stored in the 'Output' column.
df = pd.DataFrame(prob_dist, columns = ['Sequence','Left_seq','Right_seq','Output','Freq'])
df.head()

Unnamed: 0,Sequence,Left_seq,Right_seq,Output,Freq
0,0 05 grm,0,grm,5,1
1,0 25 u,0,u,25,1
2,0 45 grm,0,grm,45,1
3,0 5 to,0,to,5,1
4,0 6 grm,0,grm,6,4


#### Getting the Predictions

In [61]:
def predict(word):
    """Fill the blank in a 'left _ right' query with up to five candidates.

    The text before the first '_' is the left context and the text after it is
    the right context (both stripped). Candidates come from the notebook-global
    `df` and are returned as a list ordered by descending Freq.
    """
    parts = word.split('_')
    left_seq, right_seq = parts[0].strip(), parts[1].strip()

    # Select rows matching both contexts in one step.
    both_match = (df['Left_seq'] == left_seq) & (df['Right_seq'] == right_seq)
    ranked = df[both_match].sort_values(by = 'Freq', ascending = False)

    return list(ranked.head()['Output'].values)


# '_' marks the missing word between the left and right context.
predict('the _ states')

['united', 'several', 'southern', 'northern', 'planting']

In [64]:
# Ascending sort, so this shows ten of the least frequent windows.
df.sort_values(by = 'Freq').head(10)

Unnamed: 0,Sequence,Left_seq,Right_seq,Output,Freq
0,0 05 grm,0,grm,05,1
532804,ponderous maketh repaid,ponderous,repaid,maketh,1
532805,ponderous steps up,ponderous,up,steps,1
532806,pondicherry in a,pondicherry,a,in,1
532807,pondicherry in january,pondicherry,january,in,1
532808,pondicherry postmark what,pondicherry,what,postmark,1
532809,pondicherry seven weeks,pondicherry,weeks,seven,1
532810,pondicherry the second,pondicherry,second,the,1
532811,ponds and ditches,ponds,ditches,and,1
532812,ponds and lakes,ponds,lakes,and,1
