In [1]:
!pip install spacy==3.5.2
!pip install gensim==4.3.1
!pip install scipy==1.10.1
!pip install pyemd==1.0.0
!pip install scikit-learn==1.2.2
!pip install nltk==3.8.1
!pip install datasets==2.12.0
!pip install jupyter==1.0.0
!pip install POT==0.9.0
!pip install keras==2.12.0
!pip install torch==2.0.0
!pip install accelerate==0.18.0
!pip install transformers==4.28.1
!pip install Cython==0.29.34

!python -m spacy download en_core_web_sm

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
cd "drive/MyDrive/Colab Notebooks/nlp proj"

/content/drive/MyDrive/Colab Notebooks/nlp proj


In [3]:
import spacy
from spacy.lang.en import English
import numpy as np
import gensim
from gensim.models import KeyedVectors
import gensim.downloader as api
from scipy.spatial.distance import cosine, jensenshannon, euclidean, cityblock
from scipy.stats import spearmanr
from pyemd import emd
from sklearn.metrics.pairwise import cosine_similarity, paired_distances, euclidean_distances
import nltk
from nltk.corpus import stopwords
import string
from datasets import load_dataset
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.functional import norm
import tensorflow as tf
from nltk.stem import WordNetLemmatizer
import time
from tqdm import tqdm
from siamese_sts.data_loader import STSData
from siamese_sts.siamese_net import SiameseLSTM
from siamese_sts.trainer.train import train_model
from siamese_sts.data_loader.preprocess import Preprocess

In [None]:
# Load spacy model
nlp = spacy.load('en_core_web_sm')
disabled = [component[0] for component in nlp.components if component[0] != 'tok2vec']

In [None]:
# Load word2vec model
w2v_model = api.load('word2vec-google-news-300')

columns_mapping = {
    "sent1": "sentence_A",
    "sent2": "sentence_B",
    "label": "relatedness_score",
}

In [None]:
dataset = load_dataset("sick")
train_dataset = dataset['train'].to_pandas()
test_dataset = dataset['test'].to_pandas()
cur_dataset = train_dataset

In [None]:
preprocess_ = Preprocess('siamese_sts/data_loader/stopwords-en.txt')

def preprocess_text(text):
    text = preprocess_.remove_usernames(text)
    text = preprocess_.remove_punctuations(text)
    text = preprocess_.remove_stopwords(text)
    return text

In [None]:
# Compute distance between texts using WMD
def compute_wmd_distance(text1, text2):
    tokenized_text1 = text1.split()
    tokenized_text2 = text2.split()
    try:
        distance = w2v_model.wmdistance(tokenized_text1, tokenized_text2)
        return distance
    except ValueError:
        return 0

In [None]:
# Compute distance between texts using cosine distance
def compute_cosine_distance(text1, text2):
    vector1 = nlp(text1, disable=disabled).vector
    vector2 = nlp(text2, disable=disabled).vector
    try:
        distance = cosine(vector1, vector2)
        return distance
    except ValueError:
            return 0

In [None]:
# Compute distance between texts using Jensen-Shannon divergence
def compute_jensen_shannon_distance(text1, text2):
    vector1 = nlp(text1, disable=disabled).vector
    vector2 = nlp(text2, disable=disabled).vector
    try:
        distance = jensenshannon(vector1, vector2)
        return distance
    except ValueError:
        return 0

In [None]:
# Compute distance between texts using Euclidean distance
def compute_euclidean_distance(text1, text2):
    vector1 = nlp(text1, disable=disabled).vector
    vector2 = nlp(text2, disable=disabled).vector
    try:
        distance = euclidean(vector1, vector2)
        return distance
    except ValueError:
        return 0

In [None]:
# Compute distance between texts using Manhattan distance
def compute_manhattan_distance(text1, text2):
    vector1 = nlp(text1, disable=disabled).vector
    vector2 = nlp(text2, disable=disabled).vector
    try:
        distance = cityblock(vector1, vector2)
        return distance
    except ValueError:
        return 0

In [None]:
# Compute distance between texts using Earth Mover's Distance
def compute_emd_distance(text1, text2):
    tokenized_text1 = text1.split()
    tokenized_text2 = text2.split()
    distance = 0
    for word1 in tokenized_text1:
        for word2 in tokenized_text2:
            try:
                distance += w2v_model.distance(word1, word2)
            except KeyError:
              pass
    return distance

In [None]:
preprocessed_texts1 = cur_dataset[columns_mapping['sent1']].apply(preprocess_text)
preprocessed_texts2 = cur_dataset[columns_mapping['sent2']].apply(preprocess_text)

In [None]:
start_time = time.time()
wmd_distances = preprocessed_texts1.combine(preprocessed_texts2, compute_wmd_distance)
wmd_time = time.time() - start_time
start_time = time.time()
cosine_distances = preprocessed_texts1.combine(preprocessed_texts2, compute_cosine_distance)
cosine_time = time.time() - start_time
start_time = time.time()
jensen_distances = preprocessed_texts1.combine(preprocessed_texts2, compute_jensen_shannon_distance)
jensen_time = time.time() - start_time
start_time = time.time()
euclidean_distances_ = preprocessed_texts1.combine(preprocessed_texts2, compute_euclidean_distance)
euclidean_time = time.time() - start_time
start_time = time.time()
manhattan_distances = preprocessed_texts1.combine(preprocessed_texts2, compute_manhattan_distance)
manhattan_time = time.time() - start_time
start_time = time.time()
emd_distances = preprocessed_texts1.combine(preprocessed_texts2, compute_emd_distance)
emd_time = time.time() - start_time

In [None]:
wmd_correlation, _ = spearmanr(wmd_distances, cur_dataset[columns_mapping['label']])
cosine_correlation, _ = spearmanr(cosine_distances, cur_dataset[columns_mapping['label']])
jensen_correlation, _ = spearmanr(jensen_distances, cur_dataset[columns_mapping['label']])
euclidean_correlation, _ = spearmanr(euclidean_distances_, cur_dataset[columns_mapping['label']])
manhattan_correlation, _ = spearmanr(manhattan_distances, cur_dataset[columns_mapping['label']])
emd_correlation, _ = spearmanr(emd_distances, cur_dataset[columns_mapping['label']])



In [None]:
print("WMD correlation:", wmd_correlation, "Time:", wmd_time)
print("Cosine similarity correlation:", cosine_correlation, "Time:", cosine_time)
print("Jensen correlation:", jensen_correlation, "Time:", jensen_time)
print("Euclidean correlation:", euclidean_correlation, "Time:", euclidean_time)
print("Manhattan correlation:", manhattan_correlation, "Time:", manhattan_time)
print("EMD correlation:", emd_correlation, "Time:", emd_time)

wmd_correlation = abs(wmd_correlation)
cosine_correlation = abs(cosine_correlation)
jensen_correlation = abs(jensen_correlation)
euclidean_correlation = abs(euclidean_correlation)
manhattan_correlation = abs(manhattan_correlation)
emd_correlation = abs(emd_correlation)

# Determine the best approach based on the highest correlation coefficient
max_correlation = max(wmd_correlation, cosine_correlation, jensen_correlation, euclidean_correlation, manhattan_correlation, emd_correlation)
if max_correlation == wmd_correlation:
    print("WMD distance is the best approach.")
elif max_correlation == cosine_correlation:
    print("Cosine similarity distance is the best approach.")
elif max_correlation == jensen_correlation:
    print("Jensen distance is the best approach.")
elif max_correlation == euclidean_correlation:
    print("Euclidean distance is the best approach.")
elif max_correlation == manhattan_correlation:
    print("Manhattan distance is the best approach.")
else:
    print("EMD distance is the best approach.")

WMD correlation: -0.568819990570633 Time: 15.97286581993103
Cosine similarity correlation: -0.41558549103168596 Time: 23.239484786987305
Jensen correlation: nan Time: 24.855061292648315
Euclidean correlation: -0.39692602055181825 Time: 25.51290488243103
Manhattan correlation: -0.39800411064234925 Time: 24.135200262069702
EMD correlation: -0.01675579576607564 Time: 2.4725728034973145
WMD distance is the best approach.


In [None]:
columns_mapping_spec = {
    "sent1": "sentence_A",
    "sent2": "sentence_B",
    "label": "wmdistance",
}

def prevocab(dataset_):
  dataset_[columns_mapping_spec['label']] = dataset_['clean_sent1'].combine(dataset_['clean_sent2'], compute_wmd_distance)

  return dataset_


In [None]:
dataset_name = "sick"
sick_data = STSData(dataset_name=dataset_name, columns_mapping=columns_mapping, prevocab=prevocab)
sick_dataloaders = sick_data.get_data_loader()
batch_size = 4
output_size = 1
hidden_size = 128
vocab_size = len(sick_data.vocab)
embedding_size = 300
embedding_weights = sick_data.vocab.vectors
lstm_layers = 1
learning_rate = 1e-1
max_epochs = 20
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

## init siamese lstm
siamese_lstm = SiameseLSTM(
    batch_size=batch_size,
    output_size=output_size,
    hidden_size=hidden_size,
    vocab_size=vocab_size,
    embedding_size=embedding_size,
    embedding_weights=embedding_weights,
    lstm_layers=lstm_layers,
    device=device,
)

## define optimizer and loss function
optimizer = torch.optim.Adam(params=siamese_lstm.parameters())

del sick_data
del w2v_model

In [None]:
train_model(
    model=siamese_lstm,
    optimizer=optimizer,
    dataloader=sick_dataloaders,
    max_epochs=max_epochs,
    config_dict={"device": device, "model_name": "siamese_lstm"},
)

In [None]:
# Compute distance between texts using WMD
def compute_wmd_distance_siamese(preprocessed_text1, preprocessed_text2):
    tokenized_text1 = preprocessed_text1.split()
    tokenized_text2 = preprocessed_text2.split()
    try:
        distance = w2v_model.wmdistance(tokenized_text1, tokenized_text2)
        return distance
    except ValueError:
        return 0

In [None]:
from tqdm import tqdm
samples = []
labels = []
for row in tqdm(test_dataset.iterrows()):
    left_words_tmp = preprocess_text(row[1][0])
    left_words = [w2v_model.key_to_index[word] if word in w2v_model.index_to_key else 0 for word in left_words_tmp]
    right_words_tmp = preprocess_text(row[1][1])
    right_words = [w2v_model.key_to_index[word] if word in w2v_model.index_to_key else 0 for word in right_words_tmp]
    wmd_distance = compute_wmd_distance_siamese(left_words_tmp, right_words_tmp)
    samples.append([left_words, right_words])
    labels.append(wmd_distance)

In [None]:
max_ = 50
for i in range(len(samples)):
    len_ = len(samples[i][0])
    if len_ > max_:
        samples[i][0] = samples[i][0][: max_]
    else:
        samples[i][0].extend([''] * (max_ - len_))
        
    len_ = len(samples[i][1])
    if len_ > max_:
        samples[i][1] = samples[i][1][: max_]
    else:
        samples[i][1].extend([''] * (max_ - len_))

In [None]:
def euclidean_distance(inputs):
    """
    Calculates the Euclidean distance between two tensors.
    """
    x, y = inputs
    distance = tf.sqrt(tf.reduce_sum(tf.square(x - y), axis=1, keepdims=True))
    return distance

In [None]:
input_shape = (max_,)
input_left = tf.keras.Input(shape=input_shape, dtype='string', name='input_left')
input_right = tf.keras.Input(shape=input_shape, dtype='string', name='input_right')
embedding_layer = tf.keras.layers.Embedding(input_dim=len(w2v_model.index_to_key), output_dim=300, input_length=max_)
lstm_layer = tf.keras.layers.LSTM(150)
left = lstm_layer(embedding_layer(input_left))
right = lstm_layer(embedding_layer(input_right))
distance_layer = tf.keras.layers.Lambda(euclidean_distance)([left, right])
output_layer = tf.keras.layers.Dense(units=1, activation='sigmoid')(distance_layer)
model = tf.keras.Model(inputs=[input_left, input_right], outputs=output_layer)
model.compile(optimizer='adam', loss='mse')

In [None]:
x = [np.array([sample[0] for sample in samples], dtype=np.int32), np.array([sample[1] for sample in samples], dtype=np.int32)]
y = np.array(labels, dtype=np.float32)

In [None]:
# Train the model on the sample data
model.fit(x=x, y=y, batch_size=32, epochs=1)

In [None]:
def compute_wmd_distance_test(tokens1, tokens2):
    try:
        distance = w2v_model.wmdistance(tokens1, tokens2)
        return distance
    except ValueError:
        return 0
    
tokens1all = [text.split() for text in preprocessed_texts1]
tokens2all = [text.split() for text in preprocessed_texts2]

In [None]:
start_time = time.time()
for data in zip(tokens1all, tokens2all):
    wmd_distances = compute_wmd_distance_test(data[0], data[1])
print("--- %s seconds ---" % (time.time() - start_time))

--- 0.40190625190734863 seconds ---


In [None]:
x1 = []
x2 = []
for text1, text2 in zip(preprocessed_texts1, preprocessed_texts2):
    tokenized_text1 = text1.split()
    tokenized_text2 = text2.split()
    
    tmp = [w2v_model.key_to_index[word] if word in w2v_model.index_to_key else 0 for word in tokenized_text1]
    len_ = len(tmp)
    if len_ > max_:
        tmp = tmp[: max_]
    else:
        tmp.extend([0] * (max_ - len_))
    x1.append(tmp)
    
    tmp = [w2v_model.key_to_index[word] if word in w2v_model.index_to_key else 0 for word in tokenized_text2]
    len_ = len(tmp)
    if len_ > max_:
        tmp = tmp[: max_]
    else:
        tmp.extend([0] * (max_ - len_))
    x2.append(tmp)
    

In [None]:
start_time = time.time()
wmd_distances = model.predict([np.array(x1), np.array(x2)])
print("--- %s seconds ---" % (time.time() - start_time))

--- 0.8751041889190674 seconds ---
