In [1]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string
from transformers import BertTokenizer, BertModel
import torch
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import mean_squared_error
from tqdm import tqdm

In [2]:
# Load the dataset
data = pd.read_csv("/kaggle/input/nli-dataset-for-sentence-understanding/sst2_train.csv")

# Display the first few rows
print(data.head())

                                            sentence  label  idx
0       hide new secretions from the parental units       0    0
1               contains no wit , only labored gags       0    1
2  that loves its characters and communicates som...      1    2
3  remains utterly satisfied to remain the same t...      0    3
4  on the worst revenge-of-the-nerds clichés the ...      0    4


In [3]:
# Download NLTK data
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [5]:
# Define preprocessing function
def preprocess_text(text):
    tokens = word_tokenize(text)
    tokens = [word.lower() for word in tokens]
    tokens = [word for word in tokens if word.isalnum()]
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    return ' '.join(tokens)

# Apply preprocessing
data['sentence'] = data['sentence'].apply(preprocess_text)

In [6]:
# Load pre-trained BERT model and tokenizer
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased').to(device)

In [7]:
# Function to get sentence embeddings
def get_sentence_embedding(sentences):
    inputs = tokenizer(sentences, return_tensors='pt', truncation=True, padding=True, max_length=128).to(device)
    with torch.no_grad():
        outputs = model(**inputs)
    embeddings = outputs.last_hidden_state[:, 0, :].cpu().numpy()
    return embeddings

# Batch processing
batch_size = 32
embeddings = []
for i in tqdm(range(0, len(data), batch_size)):
    batch_sentences = data['sentence'][i:i+batch_size].tolist()
    batch_embeddings = get_sentence_embedding(batch_sentences)
    embeddings.extend(batch_embeddings)

data['embedding'] = embeddings

100%|██████████| 2105/2105 [01:34<00:00, 22.35it/s]


In [12]:
# Ensure embeddings column is not empty
data = data.dropna(subset=['embedding'])

In [13]:
# For simplicity, we'll pair each sentence with the next one in the dataset
# Create a new DataFrame to store pairs of sentences and their embeddings
paired_data = pd.DataFrame({
    'sentence1': data['sentence'][:-1],
    'sentence2': data['sentence'][1:],
    'embedding1': data['embedding'][:-1],
    'embedding2': data['embedding'][1:]
})

In [14]:
# Ensure embeddings are numpy arrays
paired_data = paired_data.dropna(subset=['embedding1', 'embedding2'])

In [15]:
# Function to compute cosine similarity
def compute_cosine_similarity(embedding1, embedding2):
    return cosine_similarity([embedding1], [embedding2])[0][0]

# Compute cosine similarity for each pair of sentences
paired_data['similarity'] = paired_data.apply(lambda row: compute_cosine_similarity(row['embedding1'], row['embedding2']), axis=1)

# Display the first few rows with similarity scores
print(paired_data.head())

                                           sentence1  \
1                          contains wit labored gags   
2  loves characters communicates something rather...   
3        remains utterly satisfied remain throughout   
4              worst clichés filmmakers could dredge   
5             far tragic merit superficial treatment   

                                           sentence2  \
1                          contains wit labored gags   
2  loves characters communicates something rather...   
3        remains utterly satisfied remain throughout   
4              worst clichés filmmakers could dredge   
5             far tragic merit superficial treatment   

                                          embedding1  \
1  [-0.40373817, -0.14012745, -0.3527661, 0.20648...   
2  [-0.33055326, 0.2092333, -0.09828734, -0.27186...   
3  [-0.50376976, 0.08698111, 0.13549073, -0.36353...   
4  [0.035091516, 0.5224591, -0.08824775, 0.159987...   
5  [-0.2025517, 0.11368114, -0.07959042, -0.04

In [16]:
# Assuming we don't have a ground truth similarity column, we'll skip the evaluation step
# If we had a true similarity score, we could use mean_squared_error or another metric for evaluation

# Function to get similarity between two new sentences
def get_similarity(sentence1, sentence2):
    embedding1 = get_sentence_embedding([sentence1])[0]
    embedding2 = get_sentence_embedding([sentence2])[0]
    similarity = compute_cosine_similarity(embedding1, embedding2)
    return similarity

# Example usage
sentence1 = "This is an example sentence."
sentence2 = "This is another example sentence."
similarity_score = get_similarity(sentence1, sentence2)
print(f'Similarity Score: {similarity_score}')

Similarity Score: 0.9913215041160583
