In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
import torch
from torch.utils.data import DataLoader
from sentence_transformers import SentenceTransformer, losses, InputExample
from transformers import BertTokenizer, BertForSequenceClassification, AdamW

  from tqdm.autonotebook import tqdm, trange


In [2]:
!pip install sentence-transformers transformers torch datasets scikit-learn


Collecting sentence-transformers
  Downloading sentence_transformers-3.1.0-py3-none-any.whl.metadata (23 kB)
Downloading sentence_transformers-3.1.0-py3-none-any.whl (249 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m249.1/249.1 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m00:01[0m0:01[0m
[?25hInstalling collected packages: sentence-transformers
Successfully installed sentence-transformers-3.1.0


In [4]:
df = pd.read_csv('/kaggle/input/quora-question-pairs/train.csv.zip')
df = df[['question1', 'question2', 'is_duplicate']]

df['question1'].fillna('', inplace=True)
df['question2'].fillna('', inplace=True)
df['question1'] = df['question1'].astype(str)
df['question2'] = df['question2'].astype(str)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['question1'].fillna('', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['question2'].fillna('', inplace=True)


In [6]:
train_df = df.sample(frac=0.2, random_state=42)
test_df = df.drop(train_df.index)

test_df.to_csv('/kaggle/working/test_set.csv', index=False)

In [13]:
from sklearn.metrics.pairwise import cosine_similarity

def compute_f1(y_true, y_pred):
    return f1_score(y_true, y_pred, average='binary')

def evaluate_bi_encoder(model, test_df):
    # Preprocessing the test data
    test_embeddings_1 = model.encode(test_df['question1'].tolist(), convert_to_tensor=True)
    test_embeddings_2 = model.encode(test_df['question2'].tolist(), convert_to_tensor=True)
    
    # Compute cosine similarities between question pairs
    cosine_scores = torch.nn.functional.cosine_similarity(test_embeddings_1, test_embeddings_2)
    
    # Convert the tensor to numpy and compare element-wise with the threshold (0.5)
    predictions = (cosine_scores > 0.5).cpu().numpy()
    
    # Convert boolean predictions to integers (0 or 1)
    predictions = predictions.astype(int)
    
    labels = test_df['is_duplicate'].values
    
    # Calculate F1-score
    return f1_score(labels, predictions)


In [8]:
bi_encoder_model = SentenceTransformer('bert-base-uncased')

train_samples = [InputExample(texts=[row['question1'], row['question2']], label=float(row['is_duplicate']))
                 for idx, row in train_df.iterrows()]

train_dataloader = DataLoader(train_samples, shuffle=True, batch_size=32)

train_loss = losses.CosineSimilarityLoss(bi_encoder_model)

bi_encoder_model.fit(train_objectives=[(train_dataloader, train_loss)], epochs=1, warmup_steps=100)

bi_encoder_model.save('/kaggle/working/bi_encoder_cosine')


config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]



tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Step,Training Loss
500,0.178
1000,0.1378


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

In [9]:
train_loss = losses.ContrastiveLoss(bi_encoder_model)

bi_encoder_model.fit(train_objectives=[(train_dataloader, train_loss)], epochs=1, warmup_steps=100)

bi_encoder_model.save('/kaggle/working/bi_encoder_contrastive')


  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Step,Training Loss
500,0.0142
1000,0.013


In [10]:
train_loss = losses.MultipleNegativesRankingLoss(bi_encoder_model)

bi_encoder_model.fit(train_objectives=[(train_dataloader, train_loss)], epochs=1, warmup_steps=100)

bi_encoder_model.save('/kaggle/working/bi_encoder_mnr')


  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Step,Training Loss
500,0.6344
1000,0.5377


In [14]:
test_df = pd.read_csv('/kaggle/working/test_set.csv')

# Use a subset of the test set (20%)
test_subset = test_df.sample(frac=0.2, random_state=42)

bi_encoder_model = SentenceTransformer('/kaggle/working/bi_encoder_cosine')
f1_cosine = evaluate_bi_encoder(bi_encoder_model, test_subset)
print(f'F1-Score for Bi-Encoder with Cosine Similarity Loss: {f1_cosine}')

# Evaluate Bi-Encoder with Contrastive Loss
bi_encoder_model = SentenceTransformer('/kaggle/working/bi_encoder_contrastive')
f1_contrastive = evaluate_bi_encoder(bi_encoder_model, test_subset)
print(f'F1-Score for Bi-Encoder with Contrastive Loss: {f1_contrastive}')

# Evaluate Bi-Encoder with Multiple Negatives Ranking Loss
bi_encoder_model = SentenceTransformer('/kaggle/working/bi_encoder_mnr')
f1_mnr = evaluate_bi_encoder(bi_encoder_model, test_subset)
print(f'F1-Score for Bi-Encoder with MNR Loss: {f1_mnr}')

Batches:   0%|          | 0/2022 [00:00<?, ?it/s]

Batches:   0%|          | 0/2022 [00:00<?, ?it/s]

F1-Score for Bi-Encoder with Cosine Similarity Loss: 0.7786084805523251


Batches:   0%|          | 0/2022 [00:00<?, ?it/s]

Batches:   0%|          | 0/2022 [00:00<?, ?it/s]

F1-Score for Bi-Encoder with Contrastive Loss: 0.7065604034400469


Batches:   0%|          | 0/2022 [00:00<?, ?it/s]

Batches:   0%|          | 0/2022 [00:00<?, ?it/s]

F1-Score for Bi-Encoder with MNR Loss: 0.600083260795519
