In [1]:
cd /content/drive/MyDrive/data_science/text_similarity 

/content/drive/MyDrive/data_science/text_similarity


In [2]:
# List the working directory to confirm the input CSV is present.
!ls

new.csv  output.csv  Text_Similarity_Dataset.csv


## Installing Required Libraries

In [3]:
!pip install transformers



In [4]:
!pip install sentence-transformers



## Imports

In [5]:
import os
import re
import numpy as np
import pandas as pd

In [6]:
from sentence_transformers import SentenceTransformer, util

In [7]:
# Load the text pairs; the path is relative to the current working directory.
data=pd.read_csv('Text_Similarity_Dataset.csv')

In [8]:
# Preview the first rows of the freshly loaded frame.
data.head()

Unnamed: 0,Unique_ID,text1,text2
0,0,savvy searchers fail to spot ads internet sear...,newcastle 2-1 bolton kieron dyer smashed home ...
1,1,millions to miss out on the net by 2025 40% o...,nasdaq planning $100m share sale the owner of ...
2,2,young debut cut short by ginepri fifteen-year-...,ruddock backs yapp s credentials wales coach m...
3,3,diageo to buy us wine firm diageo the world s...,mci shares climb on takeover bid shares in us ...
4,4,be careful how you code a new european directi...,media gadgets get moving pocket-sized devices ...


In [9]:
# Load the pretrained 'stsb-roberta-base-v2' sentence-transformer (RoBERTa base).
# It is used below to turn each text into a fixed-size embedding vector
# (768 values per the original note -- TODO confirm against the model card).
model = SentenceTransformer('stsb-roberta-base-v2')

## Main

In [10]:
#lowercasing and removing extra symbols
# Matches every character that is NOT a lowercase letter, a digit, one of
# . ! ? % $ or whitespace. Written as a raw string so that \s reaches the regex
# engine instead of being treated as an (invalid) Python string escape.
_DISALLOWED_CHARS = re.compile(r'[^a-z0-9.!?%$\s]')


def _normalize_text(text):
    """Return `text` lowercased with every disallowed character removed."""
    return _DISALLOWED_CHARS.sub('', text.lower())


# NOTE(review): characters are removed, not replaced by a space, so hyphenated
# tokens are joined ("2-1" becomes "21") -- confirm this is intended.
for _col in ('text1', 'text2'):
    data[_col] = data[_col].apply(_normalize_text)

In [11]:
# Preview the text columns after lowercasing and symbol removal.
data.head()

Unnamed: 0,Unique_ID,text1,text2
0,0,savvy searchers fail to spot ads internet sear...,newcastle 21 bolton kieron dyer smashed home t...
1,1,millions to miss out on the net by 2025 40% o...,nasdaq planning $100m share sale the owner of ...
2,2,young debut cut short by ginepri fifteenyearol...,ruddock backs yapp s credentials wales coach m...
3,3,diageo to buy us wine firm diageo the world s...,mci shares climb on takeover bid shares in us ...
4,4,be careful how you code a new european directi...,media gadgets get moving pocketsized devices t...


In [12]:
# Encode every text into its embedding tensor, one row at a time, and store the
# result next to the text it came from (text1 -> embed1, text2 -> embed2).
# NOTE(review): model.encode reportedly also accepts a list of sentences;
# batching may be much faster -- confirm before changing.
for source_col, target_col in (('text1', 'embed1'), ('text2', 'embed2')):
    data[target_col] = data[source_col].apply(
        lambda sentence: model.encode(sentence, convert_to_tensor=True)
    )

In [13]:
# Preview the frame with the two new embedding columns.
data.head()

Unnamed: 0,Unique_ID,text1,text2,embed1,embed2
0,0,savvy searchers fail to spot ads internet sear...,newcastle 21 bolton kieron dyer smashed home t...,"[tensor(0.3584), tensor(0.0943), tensor(-0.247...","[tensor(0.4110), tensor(0.0717), tensor(-0.036..."
1,1,millions to miss out on the net by 2025 40% o...,nasdaq planning $100m share sale the owner of ...,"[tensor(0.0484), tensor(0.2109), tensor(0.1083...","[tensor(0.2454), tensor(0.5591), tensor(0.5484..."
2,2,young debut cut short by ginepri fifteenyearol...,ruddock backs yapp s credentials wales coach m...,"[tensor(0.6084), tensor(-0.3023), tensor(0.357...","[tensor(0.5243), tensor(-0.1515), tensor(0.732..."
3,3,diageo to buy us wine firm diageo the world s...,mci shares climb on takeover bid shares in us ...,"[tensor(-0.1720), tensor(0.4405), tensor(-0.27...","[tensor(0.0813), tensor(0.3919), tensor(0.3799..."
4,4,be careful how you code a new european directi...,media gadgets get moving pocketsized devices t...,"[tensor(0.2333), tensor(0.3693), tensor(0.6025...","[tensor(0.4373), tensor(1.3950), tensor(-0.320..."


In [14]:
# Row-wise cosine similarity between the two embedding columns. The helper
# returns a tensor, so .item() unwraps it to a plain Python float in the same pass.
data['similarity'] = data.apply(
    lambda row: util.pytorch_cos_sim(row.embed1, row.embed2).item(),
    axis=1,
)

In [15]:
# Cosine similarity lies between -1 and +1; shifting by 1 and halving maps it
# onto the range 0 to 1.
# Note: the column is overwritten, so re-running this cell rescales it again.
data['similarity'] = (data['similarity'] + 1) / 2

In [16]:
# Keep only the row identifier and its similarity score; the text and
# embedding columns are dropped from `data` here.
data = data[['Unique_ID', 'similarity']]

In [17]:
# Write the result to output.csv in the current working directory;
# index=False leaves the DataFrame index out of the file.
data.to_csv('output.csv', index = False)