In [None]:
import numpy as np
import pandas as pd
import re
from tqdm import tqdm
import collections
from sklearn.cluster import KMeans
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk import word_tokenize
import pickle
import sys
from gensim.models import word2vec
import gensim

In [None]:
# Load the text-pair dataset into a DataFrame and preview it
dataset_path = "Precily_Text_Similarity.csv"
text_data = pd.read_csv(dataset_path)
print("Shape of text_data: ", text_data.shape)
text_data.head(3)

In [None]:
# Count missing values per column (isna is the canonical name; isnull is its alias)
text_data.isna().sum()

In [None]:
# Ordered (pattern, replacement) pairs. Order matters: the irregular forms
# must be expanded before the generic "n't" rule can see them.
# ['’] accepts both the ASCII and the typographic apostrophe.
_CONTRACTION_RULES = tuple(
    (re.compile(pattern), replacement)
    for pattern, replacement in (
        # specific (irregular) forms; the captured first letter keeps its case,
        # so "Won't" becomes "Will not" instead of falling through to "Wo not"
        (r"([Ww])on['’]t", r"\1ill not"),
        (r"([Cc])an['’]t", r"\1an not"),
        # general suffixes
        (r"n['’]t", " not"),
        (r"['’]re", " are"),
        (r"['’]s", " is"),
        (r"['’]d", " would"),
        (r"['’]ll", " will"),
        (r"['’]t", " not"),
        (r"['’]ve", " have"),
        (r"['’]m", " am"),
    )
)


def decontracted(phrase):
    """Expand common English contractions in ``phrase``.

    The rules are applied one after another, in the order listed in
    ``_CONTRACTION_RULES``. Every ``'s`` is expanded to `` is``, so
    possessives are rewritten too.

    Parameters
    ----------
    phrase : str
        Text that may contain contractions written with ``'`` or ``’``.

    Returns
    -------
    str
        The text with the recognised contractions expanded.
    """
    for pattern, replacement in _CONTRACTION_RULES:
        phrase = pattern.sub(replacement, phrase)
    return phrase

In [None]:
# Preprocess text1: expand contractions, strip escaped sequences and
# punctuation, lowercase, then drop English stop words.

# Build the stop-word set once. Calling stopwords.words('english') for every
# token re-evaluates the whole list each time, and a set gives O(1) membership.
stop_words = set(stopwords.words('english'))

preprocessed_text1 = []
for sentence in tqdm(text_data['text1'].values):
    sent = decontracted(sentence)
    # These match the literal two-character sequences (backslash + r, etc.),
    # not real control characters.
    sent = sent.replace('\\r', ' ')
    sent = sent.replace('\\"', ' ')
    sent = sent.replace('\\n', ' ')
    sent = re.sub('[^A-Za-z0-9]+', ' ', sent)
    # Lowercase BEFORE filtering: the stop-word list is lowercase, so
    # capitalised stop words ("The", "And") would otherwise survive.
    sent = sent.lower()
    sent = ' '.join(token for token in sent.split() if token not in stop_words)
    preprocessed_text1.append(sent)
text_data['text1'] = preprocessed_text1
text_data.head(3)


In [None]:
# Preprocess text2: expand contractions, strip escaped sequences and
# punctuation, lowercase, then drop English stop words.

# Build the stop-word set once. Calling stopwords.words('english') for every
# token re-evaluates the whole list each time, and a set gives O(1) membership.
stop_words = set(stopwords.words('english'))

preprocessed_text2 = []
for sentence in tqdm(text_data['text2'].values):
    sent = decontracted(sentence)
    # These match the literal two-character sequences (backslash + r, etc.),
    # not real control characters.
    sent = sent.replace('\\r', ' ')
    sent = sent.replace('\\"', ' ')
    sent = sent.replace('\\n', ' ')
    sent = re.sub('[^A-Za-z0-9]+', ' ', sent)
    # Lowercase BEFORE filtering: the stop-word list is lowercase, so
    # capitalised stop words ("The", "And") would otherwise survive.
    sent = sent.lower()
    sent = ' '.join(token for token in sent.split() if token not in stop_words)
    preprocessed_text2.append(sent)
text_data['text2'] = preprocessed_text2
text_data.head(3)

In [None]:
def word_tokenizer(text):
    """Tokenize ``text`` with NLTK and return the lemmatized tokens as a list."""
    raw_tokens = word_tokenize(text)
    lemmatize = WordNetLemmatizer().lemmatize
    return [lemmatize(token) for token in raw_tokens]

In [None]:
# Pre-trained Google News vectors in word2vec binary format;
# the file must already be downloaded next to this notebook.
wordmodelfile = "GoogleNews-vectors-negative300.bin"
wordmodel = gensim.models.KeyedVectors.load_word2vec_format(
    fname=wordmodelfile,
    binary=True,
)


In [None]:
# Vocabulary of the pre-trained embeddings (gensim >= 4 exposes it as key_to_index).
words = list(wordmodel.key_to_index.keys())

# Dimensionality of each embedding vector, read from the loaded model.
EMBEDDING_DIM = wordmodel.vector_size

# Plain word -> vector lookup.
# NOTE(review): the original cell assigned to `self.word2vec` outside any class,
# read a non-existent `wordmodel.word2vec` attribute and applied `% EMBEDDING_DIM`
# to the vectors, so it could not run. The dict is stored as `word_vectors`
# to avoid shadowing the imported `gensim.models.word2vec` module.
# `wordmodel[word]` already gives the same lookup; drop this dict if memory
# is a concern, since no other cell in this notebook reads it.
word_vectors = {word: wordmodel[word] for word in words}

In [None]:
# Calculate similarity scores.
# The value stored is a distance: 0 means highly similar, 1 means highly dissimilar.

# Vocabulary covered by the word embeddings. gensim >= 4 removed `.vocab`;
# `key_to_index` is its replacement and supports fast `in` checks.
vocab = wordmodel.key_to_index

similarity = []
for s1, s2 in zip(text_data['text1'], text_data['text2']):
    if s1 == s2:
        similarity.append(0.0)  # identical texts: highly similar
        continue

    # Keep only the tokens the embedding model knows about.
    s1words = [word for word in word_tokenizer(s1) if word in vocab]
    s2words = [word for word in word_tokenizer(s2) if word in vocab]

    # Check for emptiness AFTER filtering: n_similarity raises when either
    # list is empty, which happens when a text has only out-of-vocabulary words.
    if not s1words or not s2words:
        similarity.append(1.0)
    else:
        similarity.append(1 - wordmodel.n_similarity(s1words, s2words))

In [None]:
  # Download the 'omw-1.4' NLTK resource (Open Multilingual WordNet).
  # NOTE(review): presumably required by WordNetLemmatizer in recent NLTK
  # releases; if so, run this cell before any cell that calls word_tokenizer().
  import nltk
  nltk.download('omw-1.4')

In [None]:
# Pair every Unique_ID with its computed score and preview the result
score_columns = {
    'Unique_ID': text_data['Unique_ID'],
    'Similarity_score': similarity,
}
final_score = pd.DataFrame(score_columns)
final_score.head(3)

In [None]:
# Write the scores to disk without the DataFrame index column
output_csv = 'final_score.csv'
final_score.to_csv(output_csv, index=False)