In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [8]:
import pandas as pd
import torch
from torch.utils.data import DataLoader, TensorDataset
from torch.nn import functional as F
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sklearn.model_selection import train_test_split
from tqdm import tqdm

# Load dataset
train_file_path = '/kaggle/input/quora-question-pairs/train.csv.zip'
test_file_path = '/kaggle/input/quora-question-pairs/test.csv'

# Load CSV files
train_df = pd.read_csv(train_file_path, compression='zip')
test_df = pd.read_csv(test_file_path)

# Split training data
df_train, df_val = train_test_split(train_df, test_size=0.1, random_state=42)

# Define CrossEncoder with Cosine Similarity
class CrossEncoderCosineSimilarity(torch.nn.Module):
    def __init__(self, model_name):  # Corrected __init__ method
        super(CrossEncoderCosineSimilarity, self).__init__()  # Corrected __init__ method
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=1)

    def forward(self, input_ids, attention_mask):
        outputs = self.model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits.squeeze(-1)  # Output as similarity score
        return logits


# Function to create DataLoader
def create_data_loader(df, tokenizer, max_len, batch_size):
    question1_list = df['question1'].astype(str).tolist()
    question2_list = df['question2'].astype(str).tolist()
    
    # Tokenize the inputs
    encodings = tokenizer(
        text=question1_list,
        text_pair=question2_list,
        max_length=max_len,
        padding=True,
        truncation=True,
        return_tensors="pt"
    )
    
    input_ids = encodings['input_ids']
    attention_mask = encodings['attention_mask']
    labels = torch.tensor(df['is_duplicate'].values)
    
    dataset = TensorDataset(input_ids, attention_mask, labels)
    return DataLoader(dataset, batch_size=batch_size, shuffle=True)

# Cosine Similarity Contrastive Loss function
def cosine_similarity_contrastive_loss(logits, labels, margin=0.5):
    # Applying sigmoid to get similarity scores
    probs = torch.sigmoid(logits)
    # Cosine similarity-based contrastive loss
    loss = torch.mean((1 - labels) * torch.pow(probs - 1, 2) +
                      labels * torch.pow(torch.clamp(probs - margin, min=0.0), 2))
    return loss



In [9]:
# Model and training setup
model_name = 'bert-base-uncased'
cross_encoder_cosine = CrossEncoderCosineSimilarity(model_name)
tokenizer = cross_encoder_cosine.tokenizer
max_len = 128
batch_size = 16

train_loader = create_data_loader(df_train, tokenizer, max_len, batch_size)
val_loader = create_data_loader(df_val, tokenizer, max_len, batch_size)

optimizer = torch.optim.Adam(cross_encoder_cosine.parameters(), lr=1e-5)

# Training loop
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
cross_encoder_cosine.to(device)

for epoch in range(1):  # Example: 2 epochs
    cross_encoder_cosine.train()
    running_loss = 0.0
    for batch in tqdm(train_loader, desc=f'Epoch {epoch+1}'):
        input_ids, attention_mask, labels = batch
        input_ids, attention_mask = input_ids.to(device), attention_mask.to(device)
        labels = labels.to(device).float()

        optimizer.zero_grad()
        logits = cross_encoder_cosine(input_ids, attention_mask)
        loss = cosine_similarity_contrastive_loss(logits, labels)
        loss.backward()
        optimizer.step()
        
        running_loss += loss.item()

    print(f"Epoch {epoch+1} Loss: {running_loss / len(train_loader)}")

    # Validation loop
    cross_encoder_cosine.eval()
    val_loss = 0.0
    with torch.no_grad():
        for batch in tqdm(val_loader, desc='Validation'):
            input_ids, attention_mask, labels = batch
            input_ids, attention_mask = input_ids.to(device), attention_mask.to(device)
            labels = labels.to(device).float()

            logits = cross_encoder_cosine(input_ids, attention_mask)
            loss = cosine_similarity_contrastive_loss(logits, labels)
            val_loss += loss.item()
    
    print(f"Validation Loss: {val_loss / len(val_loader)}")

# Save the model
torch.save(cross_encoder_cosine.state_dict(), '/kaggle/working/cross_encoder_cosine.pth')

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1: 100%|██████████| 22742/22742 [1:12:08<00:00,  5.25it/s]


Epoch 1 Loss: 0.025014936688924603


Validation: 100%|██████████| 2527/2527 [02:15<00:00, 18.59it/s]


Validation Loss: 0.019132104914508823
