In [0]:
import keras

Using TensorFlow backend.


In [0]:
# Install the latest Tensorflow version.
!pip3 install --quiet "tensorflow>=1.7"
# Install TF-Hub.
!pip3 install --quiet tensorflow-hub
!pip3 install --quiet seaborn

In [0]:
module_url = "https://tfhub.dev/google/universal-sentence-encoder/2" #@pa

In [0]:
import tensorflow as tf
import tensorflow_hub as hub
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import re
import seaborn as sns
import scipy.spatial as sp

In [0]:
embed = hub.Module(module_url)

# Compute a representation for each message, showing various lengths supported.
sentence1 = "The name I have is dweepa"
sentence2 = "my name is dweepa"
messages = [sentence1, sentence2]

# Reduce logging output.
tf.logging.set_verbosity(tf.logging.ERROR)

# Google Embedding with Similarity Measure

In [0]:
def google_embedding(messages):
    with tf.Session() as session:
        session.run([tf.global_variables_initializer(), tf.tables_initializer()])
        message_embeddings = session.run(embed(messages))
        me = np.array(message_embeddings).tolist()
        return 1 - sp.distance.cosine(me[0], me[1])

In [0]:
messages = [['What are natural numbers?', 'Which is the least natural number?'],
            ["Should I learn Java or Python first?", "If I had to choose between learning Java or Python, which should I opt to learn first?"],
            ['Which are the most popular pizzas ordered at Domino’s?', 'How many calories do the most popular Domina’s pizzas contain?'],
            ['How do you start a bakery?', 'What must I do to start a bakery business?']]


In [0]:
list(map(google_embedding, messages))

[0.7986731168459479,
 0.9439757577616475,
 0.7049273669173751,
 0.8648544846558146]

# Synset approach

In [0]:
import pandas as pd

In [0]:
def preprocess(sentence):
    from nltk.tokenize import word_tokenize
    from nltk.corpus import stopwords
    import nltk
    nltk.download('punkt')
    nltk.download('stopwords')
    nltk.download('averaged_perceptron_tagger')
    
    tokenized_word=word_tokenize(sentence)
    sentence = [word.lower() for word in tokenized_word if word.isalpha()]
    stop_words=set(stopwords.words("english"))
    filtered=[]
    for w in sentence:
        if w not in stop_words:
            filtered.append(w)
    pos = nltk.pos_tag(filtered)
    return pos

## Find Similarity Matrix for each algorithm:

In [0]:
def find_similarity_matrix(syns, method):
    sim_matrix = []
    for key in syns.keys():
        synlist = syns[key]
        similarities = []
        for key1 in syns.keys():  
            maxsim = 0
            if method == 'wp' or method == 'lch':
                for i in synlist:
                    synlist1 = syns[key1]
                    for j in synlist1:
                        if(method == 'wp'):
                            sim = i.wup_similarity(j)
                        if(method == 'lch'):
                            try:
                                sim = i.lch_similarity(j)
                            except:
                                continue
                        if(str(sim) == 'None'):
                            sim=0
                        if(sim>maxsim):
                            maxsim=sim
            else:
                from nltk.corpus import wordnet as wn
                from itertools import product 
                if key[1] == key1[1]:
                    if(method == 'res'):
                        from nltk.corpus import wordnet_ic
                        brown_ic = wordnet_ic.ic('ic-brown.dat')
                        sim = max([(wn.res_similarity(s1, s2, brown_ic) if(s1.pos() == s2.pos() and s1.pos()!='a' and s2.pos()!='a' and s1.pos()!='s' and s2.pos!='s') else 0,s1,s2) for s1, s2 in product(syns[key], syns[key1]) ])
                        maxsim=sim[0]
                                        
                    if(method == 'jcn'):
                        from nltk.corpus import wordnet_ic
                        brown_ic = wordnet_ic.ic('ic-brown.dat')
                        sim = max([(wn.jcn_similarity(s1, s2, brown_ic) if(s1.pos() == s2.pos() and s1.pos()!='a' and s2.pos()!='a' and s1.pos()!='s' and s2.pos!='s') else 0,s1,s2) for s1, s2 in product(syns[key], syns[key1]) ])
                        maxsim=sim[0]
                else:
                    maxsim = 0
            similarities.append(maxsim)
        sim_matrix.append(similarities)
    return sim_matrix

### Function to create 2D matrix in double dictionary form and function to convert matrix to data frame

In [0]:
def create_vector(sent, corpus, dicti):
    vector = []
    for word in corpus:
        if word in sent:
            vector.append(1)
        else:
            maxval = 0
            for word1 in corpus:
                if word==word1:
                    continue
                if dicti[word][word1]>maxval:
                    maxval = dicti[word][word1]
                    bestword = word1
            if(maxval<0.4):
                vector.append(0)
            else:
                vector.append(maxval)
    return vector
                
            
        

In [0]:
sent1 = "I like fish fry with lunch whenever I am by the side of sea."
sent2 = "Our professor is ferocious i.e. if you make noise in classroom, he will fry you in front of everybody."
pre_sent1 = preprocess(sent1)
pre_sent2 = preprocess(sent2)
print(pre_sent1)
print(pre_sent2)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[('like', 'IN'), ('fish', 'JJ'), ('fry', 'NN'), ('lunch', 'NN'), ('whenever', 'WRB'), ('side', 'NN'), ('sea', 'NN')]
[('professor', 'NN'), ('ferocious', 'JJ'), ('ma

In [0]:
import numpy as np
from nltk.corpus import wordnet
nltk.download('wordnet')
nltk.download('wordnet_ic')
content_words = []
content_words.extend(pre_sent1+pre_sent2)
content_words = set(content_words)
corpus = []
corpus_words=[]
syns ={}
for syn in content_words:
    if len(wordnet.synsets(syn[0])) == 0:
           continue
    syns[syn] = wordnet.synsets(syn[0])
    corpus.append(syn)
    corpus_words.append(syn[0])
sim_matrix_wp = find_similarity_matrix(syns, 'wp')
sim_matrix_res = find_similarity_matrix(syns, 'res')
sim_matrix_jcn = find_similarity_matrix(syns, 'jcn')
sim_matrix_lch = find_similarity_matrix(syns, 'lch')


wp_dict = convert_to_doubledict(sim_matrix_wp, corpus_words)
res_dict = convert_to_doubledict(sim_matrix_res, corpus_words)
jcn_dict = convert_to_doubledict(sim_matrix_jcn, corpus_words)
lch_dict = convert_to_doubledict(sim_matrix_lch, corpus_words)

wp_df = convert_to_df(sim_matrix_wp, corpus_words)
res_df = convert_to_df(sim_matrix_res, corpus_words)
jcn_df = convert_to_df(sim_matrix_jcn, corpus_words)
lch_df = convert_to_df(sim_matrix_lch, corpus_words)

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package wordnet_ic to /root/nltk_data...
[nltk_data]   Package wordnet_ic is already up-to-date!


In [0]:
print("Similarity matrix for Wu Palmer: \n")
print(wp_df)

Similarity matrix for Wu Palmer: 

              front      make      like  ...       fry  ferocious     noise
front      1.000000  0.555556  0.555556  ...  0.600000   0.333333  0.444444
make       0.555556  1.000000  0.900000  ...  0.400000   0.500000  0.400000
like       0.555556  0.900000  1.000000  ...  0.400000   0.500000  0.400000
professor  0.521739  0.105263  0.105263  ...  0.571429   0.000000  0.133333
fish       0.631579  0.400000  0.400000  ...  0.705882   0.400000  0.285714
side       0.947368  0.666667  0.666667  ...  0.500000   0.250000  0.588235
classroom  0.588235  0.105263  0.105263  ...  0.444444   0.000000  0.133333
sea        0.400000  0.250000  0.250000  ...  0.363636   0.000000  0.333333
lunch      0.333333  0.333333  0.333333  ...  0.285714   0.333333  0.250000
fry        0.600000  0.400000  0.400000  ...  1.000000   0.400000  0.285714
ferocious  0.000000  0.000000  0.000000  ...  0.000000   1.000000  0.000000
noise      0.444444  0.400000  0.400000  ...  0.28571

In [0]:
print("Similarity matrix for Resnik: ")
print(res_df)

Similarity matrix for Resnik: 
              front      make       like  ...        fry  ferocious      noise
front      9.821209  0.000000   0.000000  ...   2.333545          0   0.000000
make       0.000000  7.751429   0.000000  ...   0.000000          0   0.000000
like       0.000000  0.000000  11.469868  ...   0.000000          0   0.000000
professor  2.333545  0.000000   0.000000  ...   6.394069          0   0.000000
fish       0.000000  0.000000   0.000000  ...   0.000000          0  -0.000000
side       6.825477  0.000000   0.000000  ...   1.531834          0   0.000000
classroom  2.305849  0.000000   0.000000  ...   1.531834          0   0.000000
sea        4.088771  0.000000   0.000000  ...   0.801759          0   0.000000
lunch      0.801759  0.000000   0.000000  ...   0.801759          0   0.000000
fry        2.333545  0.000000   0.000000  ...  13.772453          0   0.000000
ferocious  0.000000  0.000000   0.000000  ...   0.000000          0   0.000000
noise      0.000000  

In [0]:
print("Similarity matrix for JCN: ")
print(jcn_df)

Similarity matrix for JCN: 
                   front           make  ...  ferocious          noise
front      1.000000e+300   0.000000e+00  ...          0   0.000000e+00
make        0.000000e+00  1.000000e+300  ...          0   0.000000e+00
like        0.000000e+00   0.000000e+00  ...          0   0.000000e+00
professor   6.718101e-02   0.000000e+00  ...          0   0.000000e+00
fish        0.000000e+00   0.000000e+00  ...          0   6.305875e-02
side        7.620869e-01   0.000000e+00  ...          0   0.000000e+00
classroom   6.951639e-02   0.000000e+00  ...          0   0.000000e+00
sea         8.660314e-02   0.000000e+00  ...          0   0.000000e+00
lunch       6.025995e-02   0.000000e+00  ...          0   0.000000e+00
fry         8.022412e-02   0.000000e+00  ...          0   0.000000e+00
ferocious   0.000000e+00   0.000000e+00  ...          0   0.000000e+00
noise       0.000000e+00   0.000000e+00  ...          0  1.000000e+300

[12 rows x 12 columns]


In [0]:
print("Similarity matrix for LCH Similarity: ")
print(lch_df)

Similarity matrix for LCH Similarity: 
              front      make      like  ...       fry  ferocious     noise
front      3.637586  1.648659  1.648659  ...  1.691676          0  1.558145
make       1.648659  3.637586  2.538974  ...  1.871802          0  1.648659
like       1.648659  2.538974  3.637586  ...  1.871802          0  1.648659
professor  1.335001  0.747214  0.747214  ...  1.691676          0  0.998529
fish       1.845827  1.871802  1.871802  ...  2.251292          0  1.466337
side       2.944439  1.558145  1.558145  ...  1.558145          0  1.558145
classroom  1.558145  0.747214  0.747214  ...  1.239691          0  0.998529
sea        1.558145  1.072637  1.072637  ...  1.558145          0  1.440362
lunch      1.312186  1.648659  1.648659  ...  1.466337          0  1.312186
fry        1.691676  1.871802  1.871802  ...  3.637586          0  1.466337
ferocious  0.000000  0.000000  0.000000  ...  0.000000          0  0.000000
noise      1.558145  1.648659  1.648659  ...  1.4

## Function to create vector for each sentence

In [0]:
def create_vector(sent, corpus, dicti):
    vector = []
    for word in corpus:
        if word in sent:
            vector.append(1)
        else:
            maxval = 0
            for word1 in corpus:
                if word == word1:
                    continue
                if dicti[word][word1]>maxval:
                    maxval = dicti[word][word1]
                    bestword = word1
            if(maxval<0.4):
                vector.append(0)
            else:
                vector.append(maxval)
    return vector
                
            
        

In [0]:
sent1_words = [i[0] for i in pre_sent1]
sent2_words = [i[0] for i in pre_sent2]
sent_vec1 = create_vector(sent1_words, corpus_words, lch_dict)
sent_vec2 = create_vector(sent2_words, corpus_words, lch_dict)
print(sent1_words)
print(sent_vec1)
print(sent2_words)
print(sent_vec2)

['like', 'fish', 'fry', 'lunch', 'whenever', 'side', 'sea']
[2.9444389791664407, 2.538973871058276, 1, 1.6916760106710724, 1, 1, 1.6916760106710724, 1, 1, 1, 0, 1.6486586255873816]
['professor', 'ferocious', 'make', 'noise', 'classroom', 'fry', 'front', 'everybody']
[1, 1, 2.538973871058276, 1, 2.2512917986064953, 2.9444389791664407, 1, 1.6916760106710724, 1.6486586255873816, 1, 1, 1]


Find cosine distance between the sentence vectors

In [0]:
from scipy import spatial

result = 1 - spatial.distance.cosine(sent_vec1, sent_vec2)
print(result)

0.7259211249754025
