## This notebook consists of :

1) wmd implementation as direct transportation problem.(along with functions for datapreprocessing)<br>
2) our experiment of using max cost<br>
3) file loaders to load dataset/embedding of glove.<br>

We may use keyedvectors word2vec as otherwise the gensim loader is crashing the jupyter kernel on ada. 

### Following experimentations :
1) direct wmd for the three original paper sentences.<br>
2) our max cost experimentation.<br>
Repeated with glove vector of 200d and 50d.

Steps :
1) Enter two sentences <br>
2) Preprocess : tokenize and lemmatize to get a list of words. <br>
3) Create a dictionary for count of every word in each sentence.<br>
4) Sort alphabetically<br>
5) Pick one dictionary at a time, create embedding matrix for each sentence by picking one word from the sentence, find its embedding and append in the list. At the same time append its count in the p/q matrix ie nbow.<br>
6) Proceed to find min/max cost

In [1]:
#imports:

# file imports

import numpy as np
import matplotlib.pyplot as plt
import gensim
import gensim.downloader
import os
from scipy.optimize import linprog
import nltk
from collections import defaultdict
from gensim.models import KeyedVectors
import sklearn

#nltk.download('stopwords')
#nltk.download('punkt')
# nltk.download('wordnet')
from nltk.corpus import stopwords



In [2]:
os.listdir("../")

['files', '.ipynb_checkpoints', 'src', 'Miniconda3-latest-Linux-x86_64.sh']

In [53]:
def sentence_preprocess(sentence,lowercase = 1, strip_punctuation = 1, remove_stopwords = 1, embed_dict, removedigit = 1):
    ''' 1 : True, 0 : False : Lowercase, Strip puncutation, Remove Stopwords, removedigit'''

    stop_words = list(stopwords.words('english'))

    if lowercase == 1:
        sentence = sentence.lower()

    sentence_words = nltk.word_tokenize(sentence)

    if strip_punctuation == 1 and removedigit == 1:
        sentence_words = [word for word in sentence_words if word.isalpha()] 
        


    if remove_stopwords == 1:
        sentence_words = [word for word in sentence_words if not word in stop_words]
    
    
    sentence_words = [word for word in sentence_words if word in embed_dict.keys()]



    return sentence_words

In [54]:
embeddingtype = None
embd_model = None




In [55]:
## to load from embedding text files:
## have used this to load glove vectors and not word2vec

def load_glove(embeddingtype):
    
    if embeddingtype == 3:
        i = 300
    if embeddingtype == 4:
        i = 200
    if embeddingtype == 5:
        i = 100
    if embeddingtype == 6:
        i = 50
    
    
    embeddings_dict = defaultdict(lambda:np.zeros(i)) 
    # defaultdict to take care of OOV words.
    
    with open(f"../test/glove.6B.{i}d.txt",'r') as f:
        for line in f:
            values = line.split()
            word = values[0]
            vector = np.asarray(values[1:], "float32")
            embeddings_dict[word] = vector
        
    return embeddings_dict

In [56]:
def embeddings_setup(embeddingtype):
    '''to avoid loading all the embeddings in the memory.'''
    
    if embeddingtype == 1:
        embedding = KeyedVectors.load('google300w2v.kv', mmap='r')
        ## This will be slower but will prevent kernel from crashing.
        
        ## comment the above line and uncomment this if you have sufficient RAM:
        
        #w2v_emb = gensim.downloader.load('word2vec-google-news-300')
        
    if embeddingtype == 2:
        print('Normalised word2vec not loaded, will get it soon')
        embedding = None
    
    if embeddingtype in (3,4,5,6):
        embedding = load_glove(embeddingtype)
        
    
    return embedding
        
    

In [57]:
def find_embdMatrix(sentence, newembtype):
    global embeddingtype
    global embd_model
    print(" global embedding type being passed is :", embeddingtype,"\n")
    print("embedding type received by the find emb matrix is :", newembtype,"\n")
    print("embd model type is :", type(embd_model),"\n")
    
    sent_mtx = []
    ## Note : we are finding the embd matrix two times, ie once for each sentence in
    ## the pair of sentences.
    ## so this happens that embedding type is changed when find_embmatrix is called
    ## by the first sentence 
    if ( embeddingtype != newembtype):
        print("if embdtype part entered :", embeddingtype != newembtype,"\n")
        
        embeddingtype = newembtype
        embd_model = embeddings_setup(embeddingtype)
        
        print("embd_model type changed to :", type(embd_model),"\n" )
    #to make sure that we don't download the embeddings again and again,
    # we will check if the embedding type is same as the old one
    # and update global embd_model, vrna next time vo use hi nhi ho payega.
    
    print("embd_model type changed to :", type(embd_model),"\n" )
    for word in sentence:
        word_emb = embd_model[word]
        sent_mtx.append(word_emb)
    
    sent_mtx = np.array(sent_mtx).reshape(len(sentence),-1)

    return sent_mtx

In [58]:
 def wasserstein_distance(pi, qj, D, cost = 'min'):
        """Find Wasserstein distance through linear programming
        p.shape=[m], q.shape=[n], D.shape=[m, n]
    
        suppose doc1 has m words and doc2 has n words, then an mxn array would be formed, 
        having distance of each word in doc1 to that of doc2.
    
    
    
        p.sum()=1, q.sum()=1, p∈[0,1], q∈[0,1]
        """
        A_eq = [] # a list which will later be converted to array after appending.
        for i in range(len(pi)): # len = number of words.
            A = np.zeros_like(D) # a 2d array made with the shape of D.  
            A[i, :] = 1 
            print("Dshape, len pi till here :",D.shape,len(pi),"\n")
            
            # to make summation over "i" of Tij = pi, ie total / sum of outflow
            ## from one word is equal to its pi (normalized bag of word/ frequency/density)
            ## ex : if 2x3 D:
            ##T1,1 + T1,2 + T1,3 + 0 T2,1 + 0 T2,2 + 0 T2,3 = P1 and so on for every i,
            ## ie for each word in the doc1
            print("A.shape", A.shape,"\n")
            A_eq.append(A.reshape(-1)) ## reshape(-1) flatens and then appending in A_eq.
            print(A_eq,"Aeq\n")
            ## A_eq will be (m+n)x(m.n)
    
        for i in range(len(qj)):
            A = np.zeros_like(D)
            A[:, i] = 1 ## summation over "j" this time, so this time for different rows, 
            ## over a column "j" which refers to doc2, ie total incoming flow = qj density
            A_eq = list(A_eq)
            A_eq.append(A.reshape(-1))
            A_eq = np.array(A_eq)
        
        print(A_eq.shape,A_eq)
       
        b_eq = np.concatenate([pi, qj])
        D = D.reshape(-1)
        #print("Dshape:",D.shape)
        if cost == 'max':
            D = D*(-1)
        
        result = linprog(D, A_eq=A_eq[:-1], b_eq=b_eq[:-1]) ## removing redundant to make 
        ## solution more robust.
        return np.absolute(result.fun), result.x , D.reshape((len(pi),len(qj)))  ## fun returns the final optimized value, x returns each value of xi,j that is the array


In [84]:
def relaxed_distance(pi,qj,D,cost='min'):
    
    # to find relaxed we just add the min/max cost directly using the least distance for pi to qj.
    
    # D is calculated from P to Q ie P in rows and Q in columns, To find Q to P we will transpose 
    if cost == 'min':
        p_to_q = np.dot(D.min(axis=1),pi)
        q_to_p = np.dot(D.T.min(axis=1),qj)
        
        return max(p_to_q,q_to_p)
    
    if cost == 'max':
        
        p_to_q = np.dot(D.max(axis=1),pi)
        q_to_p = np.dot(D.T.max(axis=1),qj)
        
        return min(p_to_q,q_to_p)
        
        
    

In [37]:
class WMD:
    
    ''' Enter Two sentence strings, cost = max if you want to try 
    max cost max flow version, embeddingtype = 1 for word2vec, 2 = normalized
    word2vec, 3 = glove300d, 4 = glove200d, 5 = glove100d 6 = glove50d'''
    
    def __init__(self,embeddingtype, wmd_type = 'normal', costtype='min'):
        
        
        self.cost = costtype
        
        self.embeddingtype = embeddingtype 
        self.wmd_type = wmd_type
        
    #def word_count(self):
#         self.sent1_dic = defaultdict(int)
#         self.sent2_dic = defaultdict(int)
        
#         for word in sorted(sentence_preprocess(self.sent1)):
#             self.sent1_dic[word] += 1
            
#         for word in sorted(sentence_preprocess(self.sent2)):
#             self.sent2_dic[word] += 1
        
#         return dict(self.sent1_dic), dict(self.sent2_dic)



#     def wasserstein_distance(self, pi, qj, D):
#         """Find Wasserstein distance through linear programming
#         p.shape=[m], q.shape=[n], D.shape=[m, n]
    
#         suppose doc1 has m words and doc2 has n words, then an mxn array would be formed, 
#         having distance of each word in doc1 to that of doc2.
    
    
    
#         p.sum()=1, q.sum()=1, p∈[0,1], q∈[0,1]
#         """
#         A_eq = [] # a list which will later be converted to array after appending.
#         for i in range(len(pi)): # len = number of words.
#             A = np.zeros_like(D) # a 2d array made with the shape of D.  
#             A[i, :] = 1 
#             # to make summation over "i" of Tij = pi, ie total / sum of outflow
            ## from one word is equal to its pi (normalized bag of word/ frequency/density)
            ## ex : if 2x3 D:
            ##T1,1 + T1,2 + T1,3 + 0 T2,1 + 0 T2,2 + 0 T2,3 = P1 and so on for every i,
            ## ie for each word in the doc1
        
#             A_eq.append(A.reshape(-1)) ## reshape(-1) flatens and then appending in A_eq.
            ## A_eq will be (m+n)x(m.n)
    
#         for i in range(len(qj)):
#             A = np.zeros_like(D)
#             A[:, i] = 1 ## summation over "j" this time, so this time for different rows, 
#             ## over a column "j" which refers to doc2, ie total incoming flow = qj density
#             A_eq.append(A.reshape(-1))
#             A_eq = np.array(A_eq)
        
#         print(A_eq.shape,A_eq)
       
#         b_eq = np.concatenate([pi, qj])
#         D = D.reshape(-1)
#         if self.cost == 'max':
#             D = D*(-1)
        
#         result = linprog(D, A_eq=A_eq[:-1], b_eq=b_eq[:-1]) ## removing redundant to make 
#         ## solution more robust.
#         return result.fun, result.x  ## fun returns the final optimized value, x returns each value of xi,j that is the array

    
    def word_mover_distance(self,sentence1,sentence2, ):
        
        self.sent1 = sentence1
        print(self.sent1 ,"\n")
        self.sent2 = sentence2
        print(self.sent2 ,"\n")
        
        
        self.sent1_dic = defaultdict(int)
        self.sent2_dic = defaultdict(int)
        
        for word in sorted(sentence_preprocess(self.sent1)): # sorted to have better
            self.sent1_dic[word] += 1 # idea of the sequence of the words.
            
        for word in sorted(sentence_preprocess(self.sent2)):
            self.sent2_dic[word] += 1
        
        
        self.sent1_dic = dict(self.sent1_dic) # converted from default dict to dict.
        self.sent2_dic = dict(self.sent2_dic) # because following operations work on dict
        
        
        print(self.sent1_dic ,"\n")
        print(self.sent2_dic ,"\n")
        
        self.sent1_words = np.array(list(self.sent1_dic.keys()))
        self.sent1_counts = np.array(list(self.sent1_dic.values()))
        
        self.sent2_words = np.array(list(self.sent2_dic.keys()))
        self.sent2_counts = np.array(list(self.sent2_dic.values()))
        
        
        print(self.sent1_words ,"\n")
        print(self.sent1_counts ,"\n")
        
        print(self.sent2_words ,"\n")
        print(self.sent2_counts ,"\n")
        
        #dictionary values cant be converted into an array directly, hence the
        #list step.
        
        print("embedding type being passed is :", self.embeddingtype,"\n")
        self.sent1_embmtx = find_embdMatrix(self.sent1_words, self.embeddingtype)
        print(self.sent1_embmtx.shape,"sent1emb\n")
        self.sent2_embmtx = find_embdMatrix(self.sent2_words, self.embeddingtype)
        print(self.sent2_embmtx.shape,"sent2emb\n")
        
        self.pi = self.sent1_counts/np.sum(self.sent1_counts)
        print(self.pi,"self.pi\n")
        self.qj = self.sent2_counts/np.sum(self.sent2_counts)
        print(self.qj,"self.pi\n")
        
        self.D = np.sqrt(np.square(self.sent1_embmtx[:, None] - self.sent2_embmtx[None, :]).sum(axis=2)) 
        print(self.D.shape,"Dshape \n")
        ## programmers sought used mean instead of sum.
        ## scipy cdist can be used as well.
        
        if self.wmd_type == 'normal':
            return wasserstein_distance(self.pi, self.qj, self.D, self.cost)
        
        
        if self.wmd_type == 'relaxed':
            return relaxed_distance(self.pi,self.qj,self.D,self.cost)
 
             
    



   
 

 


In [98]:
pi = [0.25,0.25,0.25,0.25]
qi = [0.25,0.25,0.25,0.25]
d = np.array([[1,1,1,1],[1,1,1,1],[1,1,0,1],[1,1,1,0]]).reshape((4,4))
a,b,c = wasserstein_distance(pi,qi,d)
a,b,c

Dshape, len pi till here : (4, 4) 4 

A.shape (4, 4) 

[array([1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])] Aeq

Dshape, len pi till here : (4, 4) 4 

A.shape (4, 4) 

[array([1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]), array([0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0])] Aeq

Dshape, len pi till here : (4, 4) 4 

A.shape (4, 4) 

[array([1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]), array([0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]), array([0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0])] Aeq

Dshape, len pi till here : (4, 4) 4 

A.shape (4, 4) 

[array([1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]), array([0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]), array([0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0]), array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1])] Aeq

(8, 16) [[1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 1 1 1 1 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 1 1 1 1 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1]
 [1 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0]
 [0

(0.500000000120658,
 array([1.25000000e-01, 1.25000000e-01, 8.10264749e-12, 8.10301845e-12,
        1.25000000e-01, 1.25000000e-01, 8.10264749e-12, 8.10301845e-12,
        8.10233249e-12, 8.10233249e-12, 2.50000000e-01, 1.19639048e-11,
        8.10196153e-12, 8.10196153e-12, 1.19633571e-11, 2.50000000e-01]),
 array([[1, 1, 1, 1],
        [1, 1, 1, 1],
        [1, 1, 0, 1],
        [1, 1, 1, 0]]))

In [16]:
sent1 = "Obama speaks to the media in Illinois."
sent2 = "The President greets the press in Chicago."
sent3 = "The band gave a concert in Japan."

In [38]:
model = WMD(sent1,sent2,1)

Obama speaks to the media in Illinois. 

The President greets the press in Chicago. 



In [40]:
diss, T,mtx = model.word_mover_distance()
diss,T,mtx

{'illinois': 1, 'media': 1, 'obama': 1, 'speaks': 1} 

{'chicago': 1, 'greets': 1, 'president': 1, 'press': 1} 

['illinois' 'media' 'obama' 'speaks'] 

[1 1 1 1] 

['chicago' 'greets' 'president' 'press'] 

[1 1 1 1] 

embedding type being passed is : 1 

 global embedding type being passed is : 1 

embedding type received by the find emb matrix is : 1 

embd model type is : <class 'gensim.models.keyedvectors.KeyedVectors'> 

embd_model type changed to : <class 'gensim.models.keyedvectors.KeyedVectors'> 

(4, 300) sent1emb

 global embedding type being passed is : 1 

embedding type received by the find emb matrix is : 1 

embd model type is : <class 'gensim.models.keyedvectors.KeyedVectors'> 

embd_model type changed to : <class 'gensim.models.keyedvectors.KeyedVectors'> 

(4, 300) sent2emb

[0.25 0.25 0.25 0.25] self.pi

[0.25 0.25 0.25 0.25] self.pi

(4, 4) Dshape 

Dshape, len pi till here : (4, 4) 4 

A.shape (4, 4) 

[array([1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.

(3.3741232173489615,
 array([2.50000000e-01, 3.18531834e-11, 2.14184326e-11, 2.52857048e-11,
        4.70801233e-12, 1.81424091e-11, 5.88324554e-11, 2.50000000e-01,
        6.85027770e-11, 3.21133789e-11, 2.50000000e-01, 2.74871660e-11,
        5.34556353e-12, 2.50000000e-01, 4.78527543e-11, 2.89110126e-11]),
 array([[3.169699 , 4.937704 , 4.357834 , 4.119726 ],
        [4.4567947, 4.215941 , 3.366701 , 2.1343176],
        [4.0367475, 5.5453153, 4.782619 , 4.6934934],
        [4.7602735, 3.4098573, 3.7898932, 3.376263 ]], dtype=float32))

In [67]:
model2 = WMD(sent3,sent2,1)

The band gave a concert in Japan. 

The President greets the press in Chicago. 



In [68]:
## NOTE : IMPORTANT

## Do not rerun this as the code has been modified,
## now the code returns three values. here we just using two values

diss2,t2 = model2.word_mover_distance()
diss2,t2

{'band': 1, 'concert': 1, 'gave': 1, 'japan': 1} 

{'chicago': 1, 'greets': 1, 'president': 1, 'press': 1} 

['band' 'concert' 'gave' 'japan'] 

[1 1 1 1] 

['chicago' 'greets' 'president' 'press'] 

[1 1 1 1] 

(4, 300) sent1emb

(4, 300) sent2emb

[0.25 0.25 0.25 0.25] self.pi

[0.25 0.25 0.25 0.25] self.pi

(4, 4) Dshape 

Dshape, len pi till here : (4, 4) 4 

A.shape (4, 4) 

[array([1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
      dtype=float32)] Aeq

Dshape, len pi till here : (4, 4) 4 

A.shape (4, 4) 

[array([1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
      dtype=float32), array([0., 0., 0., 0., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0.],
      dtype=float32)] Aeq

Dshape, len pi till here : (4, 4) 4 

A.shape (4, 4) 

[array([1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
      dtype=float32), array([0., 0., 0., 0., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0.],
      dtype=float32), array([0., 0., 0., 0., 0

(3.883809542596161,
 array([3.48501400e-10, 1.65290340e-09, 2.49999999e-01, 8.68767734e-10,
        3.28668785e-10, 3.40554971e-09, 8.32333805e-11, 2.49999998e-01,
        3.36383594e-10, 2.49999997e-01, 2.45479080e-09, 2.60203117e-09,
        2.50000001e-01, 3.34751925e-10, 3.32148598e-10, 3.46652947e-10]))

In [None]:
## These values we got with modified code and they are same as the 
## previous one, so the code is working smoothly.


# (3.883809542596161,
#  array([3.48501400e-10, 1.65290340e-09, 2.49999999e-01, 8.68767734e-10,
#         3.28668785e-10, 3.40554971e-09, 8.32333805e-11, 2.49999998e-01,
#         3.36383594e-10, 2.49999997e-01, 2.45479080e-09, 2.60203117e-09,
#         2.50000001e-01, 3.34751925e-10, 3.32148598e-10, 3.46652947e-10]),
#  array([[4.5288696, 4.5337734, 3.711798 , 3.5188146],
#         [4.813723 , 4.4849935, 3.7947109, 3.5508246],
#         [4.6724405, 4.218369 , 3.5704765, 3.4494445],
#         [4.0542464, 5.3404236, 4.6758094, 4.4132013]], dtype=float32))

In [41]:
## Just retrying with modified code aimed at making it faster:

model2 = WMD(sent3,sent2,1)
diss2,t2,t2distancematrix = model2.word_mover_distance()
diss2,t2,t2distancematrix

The band gave a concert in Japan. 

The President greets the press in Chicago. 

{'band': 1, 'concert': 1, 'gave': 1, 'japan': 1} 

{'chicago': 1, 'greets': 1, 'president': 1, 'press': 1} 

['band' 'concert' 'gave' 'japan'] 

[1 1 1 1] 

['chicago' 'greets' 'president' 'press'] 

[1 1 1 1] 

embedding type being passed is : 1 

 global embedding type being passed is : 1 

embedding type received by the find emb matrix is : 1 

embd model type is : <class 'gensim.models.keyedvectors.KeyedVectors'> 

embd_model type changed to : <class 'gensim.models.keyedvectors.KeyedVectors'> 

(4, 300) sent1emb

 global embedding type being passed is : 1 

embedding type received by the find emb matrix is : 1 

embd model type is : <class 'gensim.models.keyedvectors.KeyedVectors'> 

embd_model type changed to : <class 'gensim.models.keyedvectors.KeyedVectors'> 

(4, 300) sent2emb

[0.25 0.25 0.25 0.25] self.pi

[0.25 0.25 0.25 0.25] self.pi

(4, 4) Dshape 

Dshape, len pi till here : (4, 4) 4 

A.shap

(3.883809542596161,
 array([3.48501400e-10, 1.65290340e-09, 2.49999999e-01, 8.68767734e-10,
        3.28668785e-10, 3.40554971e-09, 8.32333805e-11, 2.49999998e-01,
        3.36383594e-10, 2.49999997e-01, 2.45479080e-09, 2.60203117e-09,
        2.50000001e-01, 3.34751925e-10, 3.32148598e-10, 3.46652947e-10]),
 array([[4.5288696, 4.5337734, 3.711798 , 3.5188146],
        [4.813723 , 4.4849935, 3.7947109, 3.5508246],
        [4.6724405, 4.218369 , 3.5704765, 3.4494445],
        [4.0542464, 5.3404236, 4.6758094, 4.4132013]], dtype=float32))

In [69]:
model3 = WMD(sent2,sent1,1)
diss3,t3 = model3.word_mover_distance()
diss3,t3

The President greets the press in Chicago. 

Obama speaks to the media in Illinois. 

{'chicago': 1, 'greets': 1, 'president': 1, 'press': 1} 

{'illinois': 1, 'media': 1, 'obama': 1, 'speaks': 1} 

['chicago' 'greets' 'president' 'press'] 

[1 1 1 1] 

['illinois' 'media' 'obama' 'speaks'] 

[1 1 1 1] 

(4, 300) sent1emb

(4, 300) sent2emb

[0.25 0.25 0.25 0.25] self.pi

[0.25 0.25 0.25 0.25] self.pi

(4, 4) Dshape 

Dshape, len pi till here : (4, 4) 4 

A.shape (4, 4) 

[array([1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
      dtype=float32)] Aeq

Dshape, len pi till here : (4, 4) 4 

A.shape (4, 4) 

[array([1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
      dtype=float32), array([0., 0., 0., 0., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0.],
      dtype=float32)] Aeq

Dshape, len pi till here : (4, 4) 4 

A.shape (4, 4) 

[array([1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
      dtype=float32), array([0., 0., 0., 0., 1

(3.3741232173489397,
 array([2.50000000e-01, 4.70799925e-12, 6.85028414e-11, 5.34553639e-12,
        3.18531061e-11, 1.81424167e-11, 3.21133748e-11, 2.50000000e-01,
        2.14181898e-11, 5.88325714e-11, 2.50000000e-01, 4.78522987e-11,
        2.52854272e-11, 2.50000000e-01, 2.74869074e-11, 2.89106866e-11]))

In [70]:
model4 = WMD(sent2,sent3,1)
diss4,t4 = model4.word_mover_distance()
diss4,t4

The President greets the press in Chicago. 

The band gave a concert in Japan. 

{'chicago': 1, 'greets': 1, 'president': 1, 'press': 1} 

{'band': 1, 'concert': 1, 'gave': 1, 'japan': 1} 

['chicago' 'greets' 'president' 'press'] 

[1 1 1 1] 

['band' 'concert' 'gave' 'japan'] 

[1 1 1 1] 

(4, 300) sent1emb

(4, 300) sent2emb

[0.25 0.25 0.25 0.25] self.pi

[0.25 0.25 0.25 0.25] self.pi

(4, 4) Dshape 

Dshape, len pi till here : (4, 4) 4 

A.shape (4, 4) 

[array([1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
      dtype=float32)] Aeq

Dshape, len pi till here : (4, 4) 4 

A.shape (4, 4) 

[array([1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
      dtype=float32), array([0., 0., 0., 0., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0.],
      dtype=float32)] Aeq

Dshape, len pi till here : (4, 4) 4 

A.shape (4, 4) 

[array([1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
      dtype=float32), array([0., 0., 0., 0., 1., 1., 1., 1.

(3.883809542596191,
 array([3.48500876e-10, 3.28668429e-10, 3.36383262e-10, 2.50000001e-01,
        1.65290346e-09, 3.40554969e-09, 2.49999997e-01, 3.34752324e-10,
        2.49999999e-01, 8.32333712e-11, 2.45479057e-09, 3.32148980e-10,
        8.68768132e-10, 2.49999998e-01, 2.60203119e-09, 3.46653379e-10]))

In [71]:
model5 = WMD(sent1,sent3,1)
diss5,t5 = model5.word_mover_distance()
diss5,t5

Obama speaks to the media in Illinois. 

The band gave a concert in Japan. 

{'illinois': 1, 'media': 1, 'obama': 1, 'speaks': 1} 

{'band': 1, 'concert': 1, 'gave': 1, 'japan': 1} 

['illinois' 'media' 'obama' 'speaks'] 

[1 1 1 1] 

['band' 'concert' 'gave' 'japan'] 

[1 1 1 1] 

(4, 300) sent1emb

(4, 300) sent2emb

[0.25 0.25 0.25 0.25] self.pi

[0.25 0.25 0.25 0.25] self.pi

(4, 4) Dshape 

Dshape, len pi till here : (4, 4) 4 

A.shape (4, 4) 

[array([1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
      dtype=float32)] Aeq

Dshape, len pi till here : (4, 4) 4 

A.shape (4, 4) 

[array([1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
      dtype=float32), array([0., 0., 0., 0., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0.],
      dtype=float32)] Aeq

Dshape, len pi till here : (4, 4) 4 

A.shape (4, 4) 

[array([1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
      dtype=float32), array([0., 0., 0., 0., 1., 1., 1., 1., 0., 0., 

(3.9959894418802078,
 array([2.50000000e-01, 4.91648393e-13, 1.25024155e-13, 5.08493061e-13,
        7.17848100e-13, 2.50000000e-01, 4.95096865e-15, 6.11554198e-14,
        2.56308321e-13, 2.52526604e-13, 1.13908655e-13, 2.50000000e-01,
        1.51104141e-13, 4.01038173e-14, 2.50000000e-01, 5.26588932e-14]))

In [None]:
[2.50000000e-01, 3.03720784e-11, 1.86845495e-11, 1.87096356e-11,
        1.25400619e-11, 6.25733973e-10, 1.24664083e-11, 2.49999999e-01,
        2.33762480e-11, 1.92005097e-11, 2.50000000e-01, 5.69153838e-13,
        3.18499400e-11, 2.49999999e-01, 1.19951772e-11, 6.31456647e-10]))

In [72]:
model6 = WMD(sent3,sent1,1)
diss6,t6 = model5.word_mover_distance()
diss6,t6

The band gave a concert in Japan. 

Obama speaks to the media in Illinois. 

{'illinois': 1, 'media': 1, 'obama': 1, 'speaks': 1} 

{'band': 1, 'concert': 1, 'gave': 1, 'japan': 1} 

['illinois' 'media' 'obama' 'speaks'] 

[1 1 1 1] 

['band' 'concert' 'gave' 'japan'] 

[1 1 1 1] 

(4, 300) sent1emb

(4, 300) sent2emb

[0.25 0.25 0.25 0.25] self.pi

[0.25 0.25 0.25 0.25] self.pi

(4, 4) Dshape 

Dshape, len pi till here : (4, 4) 4 

A.shape (4, 4) 

[array([1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
      dtype=float32)] Aeq

Dshape, len pi till here : (4, 4) 4 

A.shape (4, 4) 

[array([1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
      dtype=float32), array([0., 0., 0., 0., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0.],
      dtype=float32)] Aeq

Dshape, len pi till here : (4, 4) 4 

A.shape (4, 4) 

[array([1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
      dtype=float32), array([0., 0., 0., 0., 1., 1., 1., 1., 0., 0., 

(3.9959894418802078,
 array([2.50000000e-01, 4.91648393e-13, 1.25024155e-13, 5.08493061e-13,
        7.17848100e-13, 2.50000000e-01, 4.95096865e-15, 6.11554198e-14,
        2.56308321e-13, 2.52526604e-13, 1.13908655e-13, 2.50000000e-01,
        1.51104141e-13, 4.01038173e-14, 2.50000000e-01, 5.26588932e-14]))

In [93]:
#using glove now

glovemodel1 = WMD(sent1,sent3,3)
glovediss1,glovet1,dmat13 = glovemodel1.word_mover_distance()
glovediss1,glovet1,dmat13

Obama speaks to the media in Illinois. 

The band gave a concert in Japan. 

{'illinois': 1, 'media': 1, 'obama': 1, 'speaks': 1} 

{'band': 1, 'concert': 1, 'gave': 1, 'japan': 1} 

['illinois' 'media' 'obama' 'speaks'] 

[1 1 1 1] 

['band' 'concert' 'gave' 'japan'] 

[1 1 1 1] 

(4, 300) sent1emb

(4, 300) sent2emb

[0.25 0.25 0.25 0.25] self.pi

[0.25 0.25 0.25 0.25] self.pi

(4, 4) Dshape 

Dshape, len pi till here : (4, 4) 4 

A.shape (4, 4) 

[array([1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
      dtype=float32)] Aeq

Dshape, len pi till here : (4, 4) 4 

A.shape (4, 4) 

[array([1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
      dtype=float32), array([0., 0., 0., 0., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0.],
      dtype=float32)] Aeq

Dshape, len pi till here : (4, 4) 4 

A.shape (4, 4) 

[array([1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
      dtype=float32), array([0., 0., 0., 0., 1., 1., 1., 1., 0., 0., 

(8.413724306355522,
 array([2.50000000e-01, 3.03720784e-11, 1.86845495e-11, 1.87096356e-11,
        1.25400619e-11, 6.25733973e-10, 1.24664083e-11, 2.49999999e-01,
        2.33762480e-11, 1.92005097e-11, 2.50000000e-01, 5.69153838e-13,
        3.18499400e-11, 2.49999999e-01, 1.19951772e-11, 6.31456647e-10]),
 array([[8.9826   , 9.047554 , 7.807904 , 9.878976 ],
        [8.741273 , 8.166015 , 6.9354115, 8.927074 ],
        [9.665114 , 9.106033 , 7.3337817, 9.572257 ],
        [8.857818 , 8.411441 , 7.132688 , 9.221346 ]], dtype=float32))

In [92]:
#using glove now

glovemodel2 = WMD(sent1,sent2,3)
glovediss2,glovet2,Dmatx = glovemodel2.word_mover_distance()
glovediss2,glovet2,Dmatx

Obama speaks to the media in Illinois. 

The President greets the press in Chicago. 

{'illinois': 1, 'media': 1, 'obama': 1, 'speaks': 1} 

{'chicago': 1, 'greets': 1, 'president': 1, 'press': 1} 

['illinois' 'media' 'obama' 'speaks'] 

[1 1 1 1] 

['chicago' 'greets' 'president' 'press'] 

[1 1 1 1] 

(4, 300) sent1emb

(4, 300) sent2emb

[0.25 0.25 0.25 0.25] self.pi

[0.25 0.25 0.25 0.25] self.pi

(4, 4) Dshape 

Dshape, len pi till here : (4, 4) 4 

A.shape (4, 4) 

[array([1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
      dtype=float32)] Aeq

Dshape, len pi till here : (4, 4) 4 

A.shape (4, 4) 

[array([1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
      dtype=float32), array([0., 0., 0., 0., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0.],
      dtype=float32)] Aeq

Dshape, len pi till here : (4, 4) 4 

A.shape (4, 4) 

[array([1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
      dtype=float32), array([0., 0., 0., 0., 1

(6.713788986251661,
 array([2.50000000e-01, 2.22835912e-13, 2.78691205e-13, 9.45355661e-14,
        1.53777156e-13, 2.07076598e-13, 2.18497826e-13, 2.50000000e-01,
        2.46322668e-13, 2.32627133e-14, 2.50000000e-01, 2.30461774e-13,
        1.95720392e-13, 2.50000000e-01, 3.09153046e-15, 2.54647804e-13]),
 array([[6.490648 , 9.131097 , 9.054297 , 8.658126 ],
        [8.138092 , 8.493923 , 8.3068075, 5.7176623],
        [8.488325 , 8.662736 , 7.8315964, 8.067997 ],
        [8.160118 , 6.8152494, 8.358283 , 7.697754 ]], dtype=float32))

In [None]:
## To check performance with normalisation later.


from nltk.stem import WordNetLemmatizer
  
lemmatizer = WordNetLemmatizer()
  
print("rocks :", lemmatizer.lemmatize("rocks"))
print("corpora :", lemmatizer.lemmatize("corpora"))
  
# a denotes adjective in "pos"
print("better :", lemmatizer.lemmatize("better", pos ="a"))

In [7]:
def test(a,b):
    print('function called')
    return a*b

class Test:
    
    def __init__(self,a,b):
        print(test(a,b))
        
        
        

    

In [9]:
testobj = Test(3,4)

function called
12


In [9]:
#import os
os.path.exists("../test/glove.6B.50d.txt")

True

In [13]:
sent = ' I am a good boy good good'

#list11 = sentence_preprocess(sent)
#list11.sort()

count_dict = defaultdict(int)

for word in sorted(sentence_preprocess(sent)):
    count_dict[word] += 1
    
dict(count_dict)

{'boy': 1, 'good': 3}

In [71]:
dict(count_dict)

{'a': 2,
 'b': 2,
 'c': 1,
 'd': 1,
 'e': 3,
 'f': 1,
 'g': 1,
 'h': 1,
 'j': 1,
 'v': 1}

In [72]:
count_dict.values()

dict_values([2, 2, 1, 1, 3, 1, 1, 1, 1, 1])

In [19]:
a = np.array(list(count_dict.values()))
a/np.sum(a)

array([0.25, 0.75])

10

In [82]:
embeddings_dict = defaultdict(lambda:np.zeros(300))
with open(f"../test/glove.6B.300d.txt",'r') as f:
        for line in f:
            values = line.split()
            word = values[0]
            vector = np.asarray(values[1:], "float32")
            embeddings_dict[word] = vector

In [86]:
obama = embeddings_dict['media']
president = embeddings_dict['concert']
eucdist = np.linalg.norm(president-obama)
eucdist

8.166015

## Preprocessing bbc sport raw file

In [47]:
len(os.listdir("../test/bbcsport/athletics"))

101

In [48]:
len(os.listdir("../test/bbcsport/rugby"))

147

In [49]:
len(os.listdir("../test/bbcsport/cricket"))

124

In [50]:
os.listdir("../test/bbcsport")

['README.TXT', 'rugby', 'athletics', 'tennis', 'cricket', 'football']

In [67]:
len(os.listdir("../test/bbcsport/football"))

265

In [14]:
import glob

In [15]:
sports_path = glob.glob("../test/bbcsport/*")
sports_path.pop(0)
sports_path

['../test/bbcsport/rugby',
 '../test/bbcsport/athletics',
 '../test/bbcsport/tennis',
 '../test/bbcsport/cricket',
 '../test/bbcsport/football']

In [16]:
import re

In [77]:
string = "../test/bbcsport/rugby"
print(len(string))
name = re.match('(../test/bbcsport/)(\w*)',string)
name.groups()

22


('../test/bbcsport/', 'rugby')

### Note here: How to handle such files:

with open open() function returns a file object. And for file object, there is no method like splitlines() or split(). You could use dir(f) to see all the methods of file object. _ioTextwrapper object. 

the following things will have following op:

path = "text_file_path.txt"
for txt in path:
    with open ( path, "r") as f :

In [21]:
d = os.listdir("../test/bbcsport/football/")
d

['188.txt',
 '003.txt',
 '153.txt',
 '217.txt',
 '027.txt',
 '031.txt',
 '161.txt',
 '143.txt',
 '112.txt',
 '126.txt',
 '104.txt',
 '167.txt',
 '195.txt',
 '237.txt',
 '223.txt',
 '141.txt',
 '086.txt',
 '262.txt',
 '109.txt',
 '226.txt',
 '091.txt',
 '106.txt',
 '148.txt',
 '001.txt',
 '002.txt',
 '046.txt',
 '227.txt',
 '193.txt',
 '197.txt',
 '119.txt',
 '243.txt',
 '151.txt',
 '258.txt',
 '127.txt',
 '245.txt',
 '220.txt',
 '010.txt',
 '057.txt',
 '162.txt',
 '207.txt',
 '133.txt',
 '087.txt',
 '192.txt',
 '150.txt',
 '103.txt',
 '210.txt',
 '168.txt',
 '121.txt',
 '231.txt',
 '017.txt',
 '125.txt',
 '194.txt',
 '248.txt',
 '117.txt',
 '068.txt',
 '191.txt',
 '131.txt',
 '081.txt',
 '171.txt',
 '169.txt',
 '019.txt',
 '174.txt',
 '085.txt',
 '241.txt',
 '145.txt',
 '030.txt',
 '149.txt',
 '101.txt',
 '094.txt',
 '242.txt',
 '206.txt',
 '042.txt',
 '128.txt',
 '093.txt',
 '115.txt',
 '190.txt',
 '014.txt',
 '043.txt',
 '154.txt',
 '107.txt',
 '203.txt',
 '216.txt',
 '113.txt',
 '23

In [76]:
d = os.listdir("../test/bbcsport/athletics/")


for txt in d:
    with open(f"../test/bbcsport/football/{txt}", 'r') as f:
        
        lines = f.read()
        preprocesedline = sentence_preprocess(lines)
        print(preprocesedline)
        
        
        #for line in f:
            #print(line.split())
        break

['moyes', 'beattie', 'dismissal', 'everton', 'manager', 'david', 'moyes', 'discipline', 'striker', 'james', 'beattie', 'headbutt', 'chelsea', 'defender', 'william', 'gallas', 'scot', 'initially', 'defended', 'beattie', 'whose', 'dismissal', 'put', 'everton', 'back', 'foot', 'game', 'ultimately', 'lost', 'saying', 'gallas', 'overreacted', 'rethink', 'looking', 'video', 'evidence', 'said', 'believe', 'set', 'record', 'straight', 'conceding', 'dismissal', 'right', 'correct', 'moyes', 'added', 'comments', 'saturday', 'came', 'immediately', 'final', 'whistle', 'point', 'opportunity', 'see', 'one', 'quick', 'incident', 'club', 'website', 'also', 'reported', 'beattie', 'seemed', 'unrepentant', 'saturday', 'match', 'insisting', 'gallas', 'would', 'stayed', 'lot', 'longer', 'headbutted', 'apologised', 'moyes', 'continued', 'although', 'incident', 'totally', 'character', 'james', 'never', 'even', 'suspended', 'career', 'actions', 'unacceptable', 'detrimental', 'effect', 'james', 'issue', 'formal

In [30]:

embedding = KeyedVectors.load('google300w2v.kv', mmap='r')

In [36]:
glove_test = defaultdict(int)
with open(f"../test/glove.6B.300d.txt",'r') as f:
        for line in f:
            values = line.split()
            word = values[0]
            vector = np.asarray(values[1:], "float32")
            glove_test[word] = vector
        


In [39]:
glove_test['giggs']

array([-0.38914  ,  0.54391  , -0.37051  ,  0.11197  ,  0.52875  ,
       -0.48597  , -0.81167  , -0.49878  , -0.42569  ,  0.34979  ,
       -0.59059  , -0.45513  , -0.12591  , -0.35044  , -0.33486  ,
        0.1549   , -0.265    , -0.18403  , -0.086644 ,  0.32109  ,
        0.68264  ,  0.38181  , -0.14069  , -0.30722  ,  0.26674  ,
        0.081064 ,  0.10712  ,  0.48131  ,  0.022878 , -0.1875   ,
        0.65655  , -0.43314  ,  0.71209  , -0.15094  , -0.44511  ,
       -0.37371  , -0.17895  ,  0.3753   , -0.38329  ,  0.25674  ,
        0.0017494, -0.20797  , -0.0259   ,  0.18894  , -0.044307 ,
        0.58492  , -0.052792 , -0.0034005,  0.64318  , -0.31403  ,
       -0.55783  , -0.33489  ,  0.13992  ,  0.28535  ,  0.025751 ,
       -0.63934  ,  0.40252  , -0.18133  , -0.68885  ,  0.38223  ,
       -0.16864  ,  0.048815 , -0.75052  , -0.26309  , -0.32622  ,
       -0.079043 ,  0.56622  , -0.52637  ,  0.12174  ,  0.016791 ,
       -0.60541  , -0.50993  ,  0.2132   ,  1.1645   ,  0.3378

In [35]:
embedding['giggs']

memmap([-0.05004883,  0.01330566,  0.01586914,  0.13183594,  0.12255859,
        -0.04589844,  0.01202393, -0.22363281,  0.00939941, -0.01116943,
        -0.03881836, -0.00193787, -0.20507812, -0.04492188,  0.02355957,
         0.20214844,  0.04223633,  0.15722656, -0.06982422, -0.08837891,
        -0.13574219, -0.04003906,  0.26953125, -0.11132812,  0.00280762,
         0.03955078, -0.09814453, -0.00674438,  0.06738281, -0.09228516,
        -0.09619141,  0.13183594,  0.06494141, -0.11962891, -0.07910156,
         0.171875  , -0.17382812,  0.09716797,  0.06298828,  0.14941406,
         0.04711914, -0.17871094,  0.22851562, -0.05737305,  0.18554688,
        -0.19921875,  0.14453125, -0.21191406,  0.03857422,  0.07617188,
        -0.02514648, -0.01428223, -0.08789062, -0.06835938, -0.13671875,
        -0.00415039, -0.03686523,  0.03686523, -0.05859375,  0.00527954,
        -0.08642578,  0.16699219, -0.10107422, -0.1328125 ,  0.03222656,
        -0.11328125, -0.04541016, -0.00263977,  0.0

In [10]:
categories = ['athletics','rugby','tennis','cricket','football']


In [24]:
os.listdir("../test/bbcsport/rugby")

['003.txt',
 '027.txt',
 '031.txt',
 '143.txt',
 '112.txt',
 '126.txt',
 '104.txt',
 '141.txt',
 '086.txt',
 '109.txt',
 '091.txt',
 '106.txt',
 '001.txt',
 '002.txt',
 '046.txt',
 '119.txt',
 '127.txt',
 '010.txt',
 '057.txt',
 '133.txt',
 '087.txt',
 '103.txt',
 '121.txt',
 '017.txt',
 '125.txt',
 '117.txt',
 '068.txt',
 '131.txt',
 '081.txt',
 '019.txt',
 '085.txt',
 '145.txt',
 '030.txt',
 '101.txt',
 '094.txt',
 '042.txt',
 '128.txt',
 '093.txt',
 '115.txt',
 '014.txt',
 '043.txt',
 '107.txt',
 '113.txt',
 '118.txt',
 '090.txt',
 '011.txt',
 '054.txt',
 '029.txt',
 '047.txt',
 '080.txt',
 '036.txt',
 '096.txt',
 '065.txt',
 '025.txt',
 '061.txt',
 '077.txt',
 '058.txt',
 '116.txt',
 '045.txt',
 '095.txt',
 '076.txt',
 '120.txt',
 '147.txt',
 '139.txt',
 '098.txt',
 '124.txt',
 '082.txt',
 '132.txt',
 '041.txt',
 '039.txt',
 '097.txt',
 '013.txt',
 '129.txt',
 '064.txt',
 '026.txt',
 '020.txt',
 '092.txt',
 '049.txt',
 '111.txt',
 '099.txt',
 '102.txt',
 '059.txt',
 '067.txt',
 '05

In [18]:
## Important Important Important, handling file.

# I created a dictionary named bbcsport_dataset, then stored it using json, and now read
# that into system as loaded_bbcdataset.


categories = ['athletics','rugby','tennis','cricket','football']

bbcsport_dataset = {}

main_path = "../test/bbcsport/"

i = 1

for category in categories:
    category_path = main_path+category # "../test/bbcsport/category", go to each category, make a list of 
    elements_incategory = os.listdir(category_path) #path of all the text files in that category folder.
    print(category)
    
    for element in elements_incategory: #read each file one by one and store the sentence string in dictionary.
            
            #print(i)
            #i+= 1
            with open(f"../test/bbcsport/{category}/{element}", encoding="utf8", errors='ignore') as f:
            
                lines = f.read()
                #lines.decode("utf-8")
            
            try:
                print(bbcsport_dataset[category])
            
            except KeyError:
                #print('triggered')
                bbcsport_dataset[category] = []
            
            bbcsport_dataset[category].append(lines) 
            
            
        

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [29]:
bbcsport_dataset['football'][2]

'Real will finish abandoned match\n\nReal Madrid and Real Socieded will play the final six minutes of their match, which was abandoned on Sunday because of a bomb scare.\n\nThe Bernabeu was evacuated with the score at 1-1 and two minutes of normal time remaining in the game. The teams will now play the final two minutes, plus four minutes of injury time, on 5 January. Brazilian Ronaldo and England captain David Beckham had to wait in the street in their kit after the abandonment. Real Sociedad president Jose Luis Astiazaran said: "We thought the best thing was to play the time remaining."\n\nHundreds of fans streamed across the pitch on their way to the exits after the game was called off. Tourists and fans took advantage of the opportunity for a photograph between the famous stadium\'s goalposts. The two clubs met the Spanish FA on Monday and Astiazaran added: "We thought about giving the game as concluded but after talking with the FA we decided there was no precedent for that and th

In [None]:
## will enter path one by one, create a dictionary which will have different articles.

In [26]:
'''import json


a_file = open("bbcsport_dataset.json", "w")
json.dump(bbcsport_dataset, a_file)
a_file.close()
'''
# a_file = open("data.json", "r")
# output = a_file.read()
# print(output)

# a_file.close()

In [6]:
### IMPORTANT : to store and read file as dictionary string
'''
a_file = open("bbcsport_dataset.json", "r")
output = a_file.read()
print(len(output))

a_file.close()'''

1453351


In [1]:
#### TO LOAD as DICTIONARY

import json
file = open("bbcsport_dataset.json","r")
loaded_bbcdataset = json.load(file)


In [3]:
loaded_bbcdataset['football'][3]


'Bitter Santini hits out at Spurs\n\nFormer Tottenham coach Jacques Santini said he quit partly because he felt agreements with the club were broken.\n\nSantini, 52, left in November after just 13 games in charge amid tensions with sporting director Frank Arnesen. "They promised me a big apartment on the beach and I found myself 200m from the sea with a view of my neighbours," he told France\'s Journal di Dimanche. But the ex-France coach admitted he "dug his own grave" by agreeing to join the club before the end of Euro 2004. "My only regret is having signed too early (for Tottenham). I should have waited until after Euro 2004 even if that means I might have missed my chance," he said. Santini also said he was not given enough information about Spurs\' transfer policy. "I learned on the day of our team photo that our captain (Stephen Carr) was leaving the club," he said.\n'

In [45]:
os.listdir("../test")

['WMD_MAX experimentation.ipynb',
 'data.json',
 '.ipynb_checkpoints',
 'glove.6B.100d.txt',
 'wmd.ipynb',
 'google300w2v.kv',
 'glove.6B.200d.txt',
 'model.npy',
 'glove.6B.300d.txt',
 'glove.6B.zip',
 'Miniconda3-latest-Linux-x86_64.sh',
 'google300w2v.kv.vectors.npy',
 'Untitled.ipynb',
 'glove.6B.50d.txt',
 'model.npz',
 'test_first_jupyteronada.ipynb',
 'bbcsport',
 'totestsavedmodel.ipynb']

In [56]:
newdict = {}

try:
    newdict['category']
    print['error']
except KeyError:
    print('triggered')
    newdict['category'] = []
    
    
newdict['category'].append('hey hey')


triggered


In [58]:
newdict['category'].append('hello')
newdict

{'category': ['hey hey', 'hello']}

In [30]:
## Relaxed WMD : both min and max.

a = np.arange(9).reshape(3,3)
a




array([[0, 1, 2],
       [3, 4, 5],
       [6, 7, 8]])

In [36]:
b = a.T.min(axis=1)
b

array([0, 1, 2])

In [32]:
pi = [0.5,0.25,0.25]

In [39]:
c = np.dot(b,pi)
d = np.dot(a.min(axis =1),pi)
max(c,d)

2.25

In [49]:
## Trying with the new code of min max and normal and relaxed:

sent1 = "Obama speaks to the media in Illinois."
sent2 = "The President greets the press in Chicago."
sent3 = "The band gave a concert in Japan."



In [64]:
modeltest = WMD(sent1,sent2,1,cost='min')

Obama speaks to the media in Illinois. 

The President greets the press in Chicago. 



In [65]:
diss,t,matx = modeltest.word_mover_distance()
diss,t,matx

{'illinois': 1, 'media': 1, 'obama': 1, 'speaks': 1} 

{'chicago': 1, 'greets': 1, 'president': 1, 'press': 1} 

['illinois' 'media' 'obama' 'speaks'] 

[1 1 1 1] 

['chicago' 'greets' 'president' 'press'] 

[1 1 1 1] 

embedding type being passed is : 1 

 global embedding type being passed is : 1 

embedding type received by the find emb matrix is : 1 

embd model type is : <class 'gensim.models.keyedvectors.KeyedVectors'> 

embd_model type changed to : <class 'gensim.models.keyedvectors.KeyedVectors'> 

(4, 300) sent1emb

 global embedding type being passed is : 1 

embedding type received by the find emb matrix is : 1 

embd model type is : <class 'gensim.models.keyedvectors.KeyedVectors'> 

embd_model type changed to : <class 'gensim.models.keyedvectors.KeyedVectors'> 

(4, 300) sent2emb

[0.25 0.25 0.25 0.25] self.pi

[0.25 0.25 0.25 0.25] self.pi

(4, 4) Dshape 

Dshape, len pi till here : (4, 4) 4 

A.shape (4, 4) 

[array([1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.

(3.3741232173489615,
 array([2.50000000e-01, 3.18531834e-11, 2.14184326e-11, 2.52857048e-11,
        4.70801233e-12, 1.81424091e-11, 5.88324554e-11, 2.50000000e-01,
        6.85027770e-11, 3.21133789e-11, 2.50000000e-01, 2.74871660e-11,
        5.34556353e-12, 2.50000000e-01, 4.78527543e-11, 2.89110126e-11]),
 array([[3.169699 , 4.937704 , 4.357834 , 4.119726 ],
        [4.4567947, 4.215941 , 3.366701 , 2.1343176],
        [4.0367475, 5.5453153, 4.782619 , 4.6934934],
        [4.7602735, 3.4098573, 3.7898932, 3.376263 ]], dtype=float32))

In [86]:
model2 = WMD(sent1,sent2,1,wmd_type='relaxed',cost='min')

Obama speaks to the media in Illinois. 

The President greets the press in Chicago. 



In [87]:
diss_relaxed = model2.word_mover_distance()
diss_relaxed

{'illinois': 1, 'media': 1, 'obama': 1, 'speaks': 1} 

{'chicago': 1, 'greets': 1, 'president': 1, 'press': 1} 

['illinois' 'media' 'obama' 'speaks'] 

[1 1 1 1] 

['chicago' 'greets' 'president' 'press'] 

[1 1 1 1] 

embedding type being passed is : 1 

 global embedding type being passed is : 1 

embedding type received by the find emb matrix is : 1 

embd model type is : <class 'gensim.models.keyedvectors.KeyedVectors'> 

embd_model type changed to : <class 'gensim.models.keyedvectors.KeyedVectors'> 

(4, 300) sent1emb

 global embedding type being passed is : 1 

embedding type received by the find emb matrix is : 1 

embd model type is : <class 'gensim.models.keyedvectors.KeyedVectors'> 

embd_model type changed to : <class 'gensim.models.keyedvectors.KeyedVectors'> 

(4, 300) sent2emb

[0.25 0.25 0.25 0.25] self.pi

[0.25 0.25 0.25 0.25] self.pi

(4, 4) Dshape 



3.1792567372322083

## WORKED JUST PERFECTLY

### Splitting dictionary into arrays and making training and test set.

In [5]:
bbcsport_sentences = []

bbcsport_labels= []

for key in loaded_bbcdataset.keys():
    bbcsport_sentences.extend(loaded_bbcdataset[key])
    bbcsport_labels.extend([key] * len(loaded_bbcdataset[key]))
    
    

In [7]:
len(bbcsport_sentences),len(bbcsport_labels)

(737, 737)

In [18]:
bbcsport_sentences[1],bbcsport_labels[1]

("Holmes feted with further honour\n\nDouble Olympic champion Kelly Holmes has been voted European Athletics (EAA) woman athlete of 2004 in the governing body's annual poll.\n\nThe Briton, made a dame in the New Year Honours List for taking 800m and 1,500m gold, won vital votes from the public, press and EAA member federations. She is only the second British woman to land the title after- Sally Gunnell won for her world 400m hurdles win in 1993. Swedish triple jumper Christian Olsson was voted male athlete of the year. The accolade is the latest in a long list of awards that Holmes has received since her success in Athens. In addition to becoming a dame, she was also named the BBC Sports Personality of the Year in December. Her gutsy victory in the 800m also earned her the International Association of Athletics Federations' award for the best women's performance in the world for 2004. And she scooped two awards at the British Athletics Writers' Association annual dinner in October.\n",

## Storing the arrays for later use:


In [13]:
## storing the sentences in an array

np.save('bbcsport_sentences.npy',np.array(bbcsport_sentences))

In [19]:
## storing the corresponding labels:

np.save('bbcsport_labels.npy',np.array(bbcsport_labels))


In [20]:
os.listdir("../test")

['WMD_MAX experimentation.ipynb',
 'data.json',
 'bbcsport_dataset.json',
 '.ipynb_checkpoints',
 'glove.6B.100d.txt',
 'wmd.ipynb',
 'google300w2v.kv',
 'glove.6B.200d.txt',
 'model.npy',
 'glove.6B.300d.txt',
 'bbcsport_sentences.npy',
 'glove.6B.zip',
 'Miniconda3-latest-Linux-x86_64.sh',
 'google300w2v.kv.vectors.npy',
 'Untitled.ipynb',
 'glove.6B.50d.txt',
 'model.npz',
 'test_first_jupyteronada.ipynb',
 'bbcsport',
 'totestsavedmodel.ipynb',
 'bbcsport_labels.npy']

In [21]:
loadedbbcsport_sentences = np.load('bbcsport_sentences.npy')
loadedbbcsport_labels = np.load('bbcsport_labels.npy')
type(loadedbbcsport_sentences),loadedbbcsport_sentences[1],loadedbbcsport_labels[1]

(numpy.ndarray,
 "Holmes feted with further honour\n\nDouble Olympic champion Kelly Holmes has been voted European Athletics (EAA) woman athlete of 2004 in the governing body's annual poll.\n\nThe Briton, made a dame in the New Year Honours List for taking 800m and 1,500m gold, won vital votes from the public, press and EAA member federations. She is only the second British woman to land the title after- Sally Gunnell won for her world 400m hurdles win in 1993. Swedish triple jumper Christian Olsson was voted male athlete of the year. The accolade is the latest in a long list of awards that Holmes has received since her success in Athens. In addition to becoming a dame, she was also named the BBC Sports Personality of the Year in December. Her gutsy victory in the 800m also earned her the International Association of Athletics Federations' award for the best women's performance in the world for 2004. And she scooped two awards at the British Athletics Writers' Association annual dinner

## Shuffling the sentences and corresponding labels, then dividing in test train.

In [41]:
BBCsport_sentences_shfld, BBCsport_labels_shfld = sklearn.utils.shuffle(bbcsport_sentences, bbcsport_labels)

In [42]:
len(BBCsport_labels_shfld),len(BBCsport_sentences_shfld)

(737, 737)

In [46]:
Train_BBCsport_sent,Train_BBCsport_label = BBCsport_sentences_shfld[:514], BBCsport_labels_shfld[:514]
Test_BBCsport_sent,Test_BBCsport_label = BBCsport_sentences_shfld[515:],BBCsport_labels_shfld[515:]

In [49]:
Test_BBCsport_label.count('cricket'),Test_BBCsport_label.count('cricket'),Test_BBCsport_label.count('rugby'),Test_BBCsport_label.count('athletics'),Test_BBCsport_label.count('football')

(36, 36, 52, 31, 71)

In [50]:
np.save('Train_BBCsport_sent.npy',np.array(Train_BBCsport_sent))
np.save('Train_BBCsport_label.npy',np.array(Train_BBCsport_label))
np.save('Test_BBCsport_sent.npy',np.array(Test_BBCsport_sent))
np.save('Test_BBCsport_label.npy',np.array(Test_BBCsport_label))

In [45]:
BBCsport_labels_shfld[600],BBCsport_sentences_shfld[600]

('football',
 'Legendary Dutch boss Michels dies\n\nLegendary Dutch coach Rinus Michels, the man credited with developing "total football", has died aged 77.\n\nReferred to in the Netherlands as "the General", Michels led the Dutch at the 1974 World Cup - when they reached the final only to lose 2-1 to Germany. However, he guided his side to the 1988 European Championship title with a 2-0 win over the Soviet Union in the final. Michels played for Ajax and coached the side to four national titles between 1965-71 and a European Cup in 1971. His 1970s Dutch team was built around Johan Cruyff and Johan Neeskens and introduced the concept of \'total football\' to the world. The strategy was to foster team coherence and individual imagination - with all players possessing the skills to play in any part of the pitch. Cruyff was the on-field organiser of a team whose players rotated in and out of defence at will and was encouraged to play creative attacking football. Michels had recently under

## creating an array of words having frequency less than 5 in all the documents.

In [28]:
from collections import Counter

In [51]:
allthewords = []

for sentence in bbcsport_sentences:
    allthewords.extend(sentence.lower().split())
    
countofwords = Counter(allthewords)   

listoflowfrequencywords = [word for word in countofwords.keys() if countofwords[word] < 5]
len(listoflowfrequencywords)

18370

In [36]:
len(allthewords)

251456

In [35]:
listoflowfrequencywords

['wipe',
 'hurts',
 'worlds."',
 '0.02',
 '9.87',
 'conserving',
 'energy.',
 'kansas',
 'recapturing',
 'finland.',
 'coming.',
 'ato',
 '(boldon)',
 '(eaa)',
 'dame',
 'votes',
 'eaa',
 'after-',
 'sally',
 'gunnell',
 "federations'",
 "writers'",
 'dougie',
 'musselburgh',
 'racecourse',
 'nandrolone.',
 'goon,"',
 'walker.',
 'newspaper:',
 'shape,',
 'chunk',
 'half-decent',
 'racing.',
 'circuit.',
 'cagigal',
 'memorial',
 'kilbride',
 'vivancos,',
 "haiti's",
 'dudley',
 '7.64secs',
 '0.04secs',
 'vivancos',
 'slashed',
 '7.60secs',
 '7.62secs',
 '7.63secs',
 'cooperative',
 'illston',
 'evidentiary',
 'valente,',
 'clients',
 'bonds,',
 'baseball',
 'congressional',
 'admissibility',
 'raids',
 "balco's",
 'offices',
 'obtain',
 'legality',
 'raids.',
 'confident,"',
 'claxton.',
 'translate',
 'scotland-born',
 'fifth-fastest',
 'prix,',
 're-focused',
 'attentions.',
 'etienne.',
 'cobh',
 'nationals',
 'currentily',
 'agio.',
 'mccambridge',
 'fionnualla',
 'britton,',
 'lo

## NOW KNN

In [None]:
def find_Category(sentence):
    
    for category in categories
    