# Semantic Similarity using word vectors as features 

### Path to Glove vector pretrained on tweets

In [1]:
path = "glove.twitter.27B/glove.twitter.27B.100d.txt"

### Imports

In [2]:
import numpy as np
import gensim
import math
import nltk
import string
import pandas as pd 
from nltk.stem import WordNetLemmatizer
stopword = nltk.corpus.stopwords.words('english')

### Load Glove Model 

In [3]:

def loadGloveModel(File):
    print("Loading Glove Model")
    f = open(File,'r',encoding='utf-8')
    gloveModel = {}
    for line in f:
        splitLines = line.split()
        word = splitLines[0]
        wordEmbedding = np.array([float(value) for value in splitLines[1:]])
        gloveModel[word] = wordEmbedding
    print(len(gloveModel)," words loaded!")
    return gloveModel

In [4]:
model =loadGloveModel(path)

Loading Glove Model
1193514  words loaded!


### Preprocessing of sentences

In [5]:
def remove_punct(text):
    text_nopunct = "".join([char for char in text if char not in string.punctuation])# It will discard all punctuations
    return text_nopunct

In [6]:
def remove_stopwords_low(sent):
    tokenized_list = sent.split(" ")
    text = [word for word in tokenized_list if word not in stopword]# To remove all stopwords
    sent = " ".join([word.lower() for word in text])
    return sent

In [21]:
wn = nltk.WordNetLemmatizer()

def lemmatize(sent):
    tokenized_list = sent.split(" ")
    text = " ".join([wn.lemmatize(word) for word in tokenized_list])
    return text

In [22]:
def preprocess(sent):
    sent = remove_stopwords_low(sent)
    sent = remove_punct(sent)
    sent = lemmatize(sent)
    
    return sent

###  Average Similarity score

In [7]:
def cosine_similarity(v1,v2):
    "Function to compute cosine similarity of v1 and v2: (v1 dot v2)/{||v1||*||v2||)"
    xx, xy, yy = 0, 0, 0
    for i in range(len(v1)):
        x = v1[i]; y = v2[i]
        xx += x*x
        yy += y*y
        xy += x*y
    return xy/math.sqrt(xx*yy)

In [8]:
def avg_sim(sentence1,sentence2):
    avg1 = [0] * (100)
    avg2 = [0] * (100)

    num_words_sentence1 = len(sentence1.split(" "))
    num_words_sentence2 = len(sentence2.split(" "))

    for w in sentence1.split(" "):
          avg1 += model[w.lower()]

    for w in sentence2.split(" "):
        avg2 += model[w.lower()]

    sim=cosine_similarity(avg1/num_words_sentence1,avg2/num_words_sentence1)
    return (sim)

### Example

In [23]:
sample1_1 = "I love animals"
sample1_2 = "I love pets"


sample2_1 = "Animal lover"
sample2_2 = "drive car"

## PREPROCESSING

samples1_1 = preprocess(sample1_1)
samples1_2 = preprocess(sample1_2)
samples2_1 = preprocess(sample2_1)
samples2_2 = preprocess(sample2_2)

## SIMILARITY
SIM =[avg_sim(samples1_1,sample1_2),avg_sim(samples2_1,samples2_2)]

### Creating Dataframe

In [24]:
sample1 = [sample1_1,sample2_1]
sample2= [sample1_2,sample2_2]
data = {
    "sent1":sample1,
    "sent2":sample2,
    "similarity":SIM
}
df = pd.DataFrame(data , columns=["sent1","sent2","similarity"])

df

Unnamed: 0,sent1,sent2,similarity
0,I love animals,I love pets,0.949795
1,Animal lover,drive car,0.466392
