Mounting Google Drive

In [None]:
# Mount the user's Google Drive into the Colab runtime at /content/drive.
# NOTE(review): no cell visible in this notebook reads from /content/drive —
# confirm the mount is still needed.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Importing Libraries and Loading the Dataset

In [None]:
import math
import os
import re
import string
from collections import Counter
from math import sqrt

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import scipy
import scipy.stats  # explicit: `import scipy` alone does not guarantee the stats submodule is loaded
import seaborn as sns
import tensorflow as ts
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from numpy import array
from sklearn.metrics import mean_squared_error
from sklearn.metrics.pairwise import cosine_similarity

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


In [None]:
import requests

def download_sick(f, timeout=30):
    """Download one split of the SICK dataset and return it as a DataFrame.

    Parameters
    ----------
    f : str
        URL of a tab-separated file whose first line is a header row.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).

    Returns
    -------
    pandas.DataFrame
        Columns ``idx``, ``sent_1``, ``sent_2``, ``sim`` and ``label``.
        ``sim`` is converted to a numeric dtype; the other columns keep the
        raw strings from the file.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    """
    response = requests.get(f, timeout=timeout)
    # Fail loudly instead of parsing an HTML error page, which would
    # otherwise be filtered down to a silently empty frame.
    response.raise_for_status()

    rows = []
    # [1:] skips the header line.
    for raw_line in response.text.split("\n")[1:]:
        # Drop a trailing carriage return so a CRLF payload does not leave
        # "\r" glued to the last field.
        line = raw_line.rstrip("\r")
        if not line:
            continue
        fields = line.split("\t")
        # Keep only well-formed records with exactly five fields.
        if len(fields) == 5:
            rows.append(fields)

    df = pd.DataFrame(rows, columns=["idx", "sent_1", "sent_2", "sim", "label"])
    df['sim'] = pd.to_numeric(df['sim'])
    return df
    
# Fetch the three SICK splits (train / trial-as-dev / annotated test).
sick_train = download_sick("https://raw.githubusercontent.com/alvations/stasis/master/SICK-data/SICK_train.txt")
sick_dev = download_sick("https://raw.githubusercontent.com/alvations/stasis/master/SICK-data/SICK_trial.txt")
sick_test = download_sick("https://raw.githubusercontent.com/alvations/stasis/master/SICK-data/SICK_test_annotated.txt")
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0.
# pd.concat yields the same frame: rows in train, test, dev order, each
# keeping its original index label.
sick_all = pd.concat([sick_train, sick_test, sick_dev])

In [None]:
def normalize(df, feature_names):
    """Return a copy of ``df`` with the given columns min-max scaled to [0, 1].

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    feature_names : iterable of str
        Names of the numeric columns to rescale.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` in which every listed column ``c`` is replaced by
        ``(c - min(c)) / (max(c) - min(c))``. A column whose values are all
        equal has a zero range; it is mapped to 0.0 instead of being turned
        into NaN by a 0/0 division.
    """
    result = df.copy()
    for feature_name in feature_names:
        column = df[feature_name]
        min_value = column.min()
        span = column.max() - min_value
        # Guard against a constant column: dividing by 1 leaves the already
        # zero numerator untouched and preserves any missing values.
        scale = span if span != 0 else 1
        result[feature_name] = (column - min_value) / scale
    return result

In [None]:
# Min-max scale the gold similarity column of every split.
# Each frame is scaled independently, using its own minimum and maximum.
sick_all, sick_test, sick_train, sick_dev = (
    normalize(frame, ['sim'])
    for frame in (sick_all, sick_test, sick_train, sick_dev)
)

In [None]:
# Show the training split's similarity column after min-max scaling.
display(sick_train['sim'])


0       0.875
1       0.550
2       0.925
3       0.600
4       0.675
        ...  
4495    0.025
4496    0.000
4497    0.000
4498    0.050
4499    0.000
Name: sim, Length: 4500, dtype: float64

In [None]:
# First sentence of every training pair, as a NumPy object array
# (this is the input handed to the sentence encoder further down).
df1 = sick_train['sent_1'].values
df1


array(['A group of kids is playing in a yard and an old man is standing in the background',
       'A group of children is playing in the house and there is no man standing in the background',
       'The young boys are playing outdoors and the man is smiling nearby',
       ..., 'The man is singing heartily and playing the guitar',
       'A man in blue has a yellow ball in the mitt',
       'Three dogs are resting on a sidewalk'], dtype=object)

In [None]:
# Second sentence of every training pair, as a NumPy object array
# (this is the input handed to the sentence encoder further down).
df2 = sick_train['sent_2'].values
df2

array(['A group of boys in a yard is playing and a man is standing in the background',
       'A group of kids is playing in a yard and an old man is standing in the background',
       'The kids are playing outdoors near a man with a smile', ...,
       'A bicyclist is holding a bike over his head in a group of people',
       'A man is jumping rope outside',
       'The woman with a knife is slicing a pepper'], dtype=object)

In [None]:
pip install -U sentence-transformers

Collecting sentence-transformers
[?25l  Downloading https://files.pythonhosted.org/packages/6a/e2/84d6acfcee2d83164149778a33b6bdd1a74e1bcb59b2b2cd1b861359b339/sentence-transformers-0.4.1.2.tar.gz (64kB)
[K     |████████████████████████████████| 71kB 3.6MB/s 
[?25hCollecting transformers<5.0.0,>=3.1.0
[?25l  Downloading https://files.pythonhosted.org/packages/88/b1/41130a228dd656a1a31ba281598a968320283f48d42782845f6ba567f00b/transformers-4.2.2-py3-none-any.whl (1.8MB)
[K     |████████████████████████████████| 1.8MB 7.9MB/s 
Collecting sentencepiece
[?25l  Downloading https://files.pythonhosted.org/packages/14/67/e42bd1181472c95c8cda79305df848264f2a7f62740995a46945d9797b67/sentencepiece-0.1.95-cp36-cp36m-manylinux2014_x86_64.whl (1.2MB)
[K     |████████████████████████████████| 1.2MB 40.6MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)
[K     

In [None]:
from sentence_transformers import SentenceTransformer, util
# Load the pretrained 'bert-base-nli-stsb-mean-tokens' sentence encoder.
# The recorded run downloaded about 405 MB for it (see the cell output).
bert_model = SentenceTransformer('bert-base-nli-stsb-mean-tokens')

100%|██████████| 405M/405M [00:45<00:00, 8.99MB/s]


In [None]:
# Encode both sentence columns with the SBERT model, returning tensors.
# NOTE(review): the next cell re-encodes the same inputs with
# convert_to_tensor=False and overwrites both names, so this pass looks redundant.
embedd_1, embedd_2 = (
    bert_model.encode(sentences, convert_to_tensor=True)
    for sentences in (df1, df2)
)

In [None]:
# Encode both sentence columns again, this time as NumPy arrays
# (the form consumed by the scipy cosine computation below).
embedd_1, embedd_2 = (
    bert_model.encode(sentences, convert_to_tensor=False)
    for sentences in (df1, df2)
)

In [None]:
# Inspect the embeddings of the first-sentence column (one row per sentence).
embedd_1

array([[ 0.17168102, -0.4885451 ,  0.6845204 , ...,  1.0796766 ,
         0.14897749,  0.3299666 ],
       [ 0.261238  ,  0.37181664,  0.8782345 , ...,  0.33737692,
         0.51571053,  0.6526496 ],
       [ 0.15208407, -0.40777782,  1.593268  , ...,  1.5211514 ,
        -0.0061184 ,  0.12791802],
       ...,
       [-0.07930363, -0.03636825,  0.35931805, ...,  0.9103331 ,
         0.33178034, -0.90336496],
       [-0.28406098,  0.49280742, -0.8320779 , ...,  0.25748107,
         0.20045336, -0.5763138 ],
       [ 0.26633155,  0.6992114 ,  0.24496923, ..., -0.04675953,
         0.05139665, -0.234953  ]], dtype=float32)

In [None]:
from scipy.spatial.distance import cosine

# scipy's `cosine` is a distance, so 1 - distance gives the cosine similarity
# of each row-aligned pair of sentence embeddings.
sims = [
    1 - cosine(embedd_1[row], embedd_2[row])
    for row in range(len(embedd_2))
]

In [None]:
# #Output the pairs with their score
# for i in range(len(df1)):
#     print("{} \t\t {} \t\t Score: {:.4f}".format(df1[i], df2[i], cosine_scores[i][i]))

In [None]:
# Attach the predicted similarities and the per-pair absolute error.
# `sims` is assigned positionally, so it must be in the same row order as
# `sick_train`.
# NOTE: this cell re-binds `sick_train` to a copy sorted by `sim` (below), so
# re-running it without first rebuilding `sick_train` pairs predictions with
# the wrong rows.
sick_train["predicted_sim"] = sims
sick_train["diff"] = (sick_train["predicted_sim"] - sick_train["sim"]).abs()
# sick_train = sick_train.sort_values("diff", ascending=False)

# Agreement between predicted and gold similarity on the training split.
pearson_correlation = scipy.stats.pearsonr(sims, sick_train['sim'])[0]
spearman_correlation = scipy.stats.spearmanr(sims, sick_train['sim'])[0]
rmse = sqrt(mean_squared_error(sims, sick_train['sim']))
textstr = 'RMSE=%.3f\nPearson Correlation=%.3f\nSpearman Correlation=%.3f'%(rmse, pearson_correlation, spearman_correlation)

# Order the pairs by gold similarity and number them 0..n-1 in that order.
# The numbering is written straight into the column, so the builtin `id` is
# no longer shadowed by a notebook-level list.
sick_train = sick_train.sort_values('sim')
sick_train['id'] = np.arange(len(sick_train), dtype=np.int64)

In [None]:
# Print the RMSE / Pearson / Spearman summary built in the evaluation cell.
print(textstr)

RMSE=0.160
Pearson Correlation=0.826
Spearman Correlation=0.790


In [None]:
# NOTE(review): repeats the previous cell's print of the same summary string.
print(textstr)

RMSE=0.160
Pearson Correlation=0.826
Spearman Correlation=0.790


In [None]:
# Display the training frame, now sorted by gold similarity and carrying the
# predicted_sim, diff and id columns added in the evaluation cell.
sick_train

Unnamed: 0,idx,sent_1,sent_2,sim,label,predicted_sim,diff,id
4499,10000,Three dogs are resting on a sidewalk,The woman with a knife is slicing a pepper,0.0,NEUTRAL,-0.013866,0.013866,0
2110,4763,A person is dancing in the rain,A man is singing into a microphone,0.0,NEUTRAL,-0.084116,0.084116,1
2119,4788,An elephant is being ridden by a woman,A woman is opening a soda and drinking it,0.0,NEUTRAL,0.057927,0.057927,2
4422,9825,The girl is carrying a sign and a group of peo...,A woman is cleaning a man's face,0.0,NEUTRAL,0.060484,0.060484,3
573,1317,A person is performing a card trick,A man is frying a tortilla,0.0,NEUTRAL,-0.000300,0.000300,4
...,...,...,...,...,...,...,...,...
1426,3257,A man is playing the guitar,The man is playing the guitar,1.0,ENTAILMENT,0.990731,0.009269,4495
1428,3261,A man is playing the guitar,A guitar is being played by a man,1.0,ENTAILMENT,0.925870,0.074130,4496
1435,3277,"The boy, who is young, is running through the ...",The young boy is running through the ocean waves,1.0,ENTAILMENT,0.956792,0.043208,4497
2900,6397,A sheepdog is grouping a herd of sheep,The sheepdog is grouping a herd of sheep,1.0,ENTAILMENT,0.996279,0.003721,4498


In [None]:
from sentence_transformers import SentenceTransformer, util

# Stand-alone demo: score three hand-written sentence pairs with a DistilBERT
# sentence encoder.
model = SentenceTransformer('stsb-distilbert-base')

# Two lists of sentences
sentences1 = ['The cat sits outside',
             'A man is playing guitar',
             'The new movie is awesome']

sentences2 = ['The dog plays in the garden',
              'A woman watches TV',
              'The new movie is so great']

# Compute embeddings for both lists
embeddings1 = model.encode(sentences1, convert_to_tensor=True)
embeddings2 = model.encode(sentences2, convert_to_tensor=True)

# Compute the cosine-similarity matrix; its diagonal holds the score of each
# aligned pair (sentences1[i], sentences2[i]).
cosine_scores = util.pytorch_cos_sim(embeddings1, embeddings2)

# The former `wmdistance = WMD(embeddings1)` call was removed: it aborted this
# cell with a TypeError (presumably because WMD needs more than the embeddings —
# check the wmd documentation) and its result was never used. The `wmd` import
# was dropped with it, since the package is only installed in a later cell.

# Output the pairs with their score
for i in range(len(sentences1)):
    print("{} \t\t {} \t\t Score: {:.4f}".format(sentences1[i], sentences2[i], cosine_scores[i][i]))

TypeError: ignored

In [None]:
import _embedding.model as model

ModuleNotFoundError: ignored

In [None]:
import numpy

# Toy inputs for experimenting with the wmd package: a 2x2 float32 embedding
# table and two named entries, each holding a label, a list of row indices and
# float32 weights.
# NOTE(review): presumably this is the layout wmd.WMD expects — confirm against
# its documentation.
embeddings = numpy.array([[0.1, 1], [1, 0.1]], dtype=numpy.float32)
nbow = dict(
    first=("#1", [0, 1], numpy.array([1.5, 0.5], dtype=numpy.float32)),
    second=("#2", [0, 1], numpy.array([0.75, 0.15], dtype=numpy.float32)),
)
nbow

{'first': ('#1', [0, 1], array([1.5, 0.5], dtype=float32)),
 'second': ('#2', [0, 1], array([0.75, 0.15], dtype=float32))}

In [None]:
pip install wmd

Collecting wmd
[?25l  Downloading https://files.pythonhosted.org/packages/e5/14/e1d122e56607ae49999041f372fa14166eb1e3b838122118d706f9bf1620/wmd-1.3.2.tar.gz (104kB)
[K     |███▏                            | 10kB 11.4MB/s eta 0:00:01[K     |██████▎                         | 20kB 14.3MB/s eta 0:00:01[K     |█████████▍                      | 30kB 10.6MB/s eta 0:00:01[K     |████████████▌                   | 40kB 8.3MB/s eta 0:00:01[K     |███████████████▊                | 51kB 4.4MB/s eta 0:00:01[K     |██████████████████▉             | 61kB 4.9MB/s eta 0:00:01[K     |██████████████████████          | 71kB 4.9MB/s eta 0:00:01[K     |█████████████████████████       | 81kB 5.2MB/s eta 0:00:01[K     |████████████████████████████▏   | 92kB 5.3MB/s eta 0:00:01[K     |███████████████████████████████▍| 102kB 4.3MB/s eta 0:00:01[K     |████████████████████████████████| 112kB 4.3MB/s 
Building wheels for collected packages: wmd
  Building wheel for wmd (setup.py) ... [?25l

In [None]:
# Inspect the pairwise cosine-similarity matrix from the sentence-pair demo cell.
cosine_scores

tensor([[ 0.2166,  0.1837, -0.0393],
        [-0.2488, -0.0146,  0.0590],
        [-0.1100,  0.0797,  0.9816]])

In [None]:
# Inspect the tensor embeddings of `sentences1` from the sentence-pair demo cell.
embeddings1

tensor([[ 0.5530, -0.0816,  0.2483,  ..., -0.2009, -0.5957, -0.8301],
        [-0.5679, -0.6925, -0.1982,  ...,  1.0700, -0.6926, -1.2325],
        [ 0.4037,  0.1574,  0.3715,  ...,  0.2966,  0.2992, -0.4972]])

In [None]:
from wmd import WMD

ModuleNotFoundError: ignored