In [1]:
# Cell 1: Import Libraries
import pandas as pd
import pickle
from sentence_transformers import SentenceTransformer, InputExample, losses
from torch.utils.data import DataLoader
import torch
import numpy as np
import re
import os

# Set device and environment variables
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Disable PyTorch compilation for stability
os.environ['TORCH_COMPILE_DISABLE'] = '1'
os.environ['TORCH_DYNAMO_DISABLE'] = '1'

  from .autonotebook import tqdm as notebook_tqdm


Using device: cpu


# Cell 2: Data Augmentation Function
This cell defines the clean_text function that will be used for data augmentation. It removes URLs, special characters, digits, and extra whitespace from the text.

In [2]:
# Cell 2: Data Augmentation Function
def clean_text(text):
    # Remove URLs
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    # Remove special characters and digits
    text = re.sub(r'[^\w\s]', '', text)
    # Remove extra whitespace
    text = ' '.join(text.split())
    return text

# Cell 3: Data Loading
This cell loads the training and development datasets, as well as the papers data. It also prepares the text data by combining title and abstract for papers.

In [3]:
# Cell 3: Data Loading
# Load train and dev data
train_df = pd.read_csv('../subtask4b_query_tweets_train.tsv', 
                      sep='\t', 
                      names=['post_id', 'tweet_text', 'cord_uid'])
dev_df = pd.read_csv('../subtask4b_query_tweets_dev.tsv', 
                    sep='\t', 
                    names=['post_id', 'tweet_text', 'cord_uid'])

# Load papers data
with open('../subtask4b_collection_data.pkl', 'rb') as f:
    papers_df = pickle.load(f)
papers_df['text'] = papers_df['title'] + '. ' + papers_df['abstract']
paper_dict = dict(zip(papers_df['cord_uid'], papers_df['text']))

# Cell 4: Data Preparation
This cell prepares the training and development examples, applying the clean_text augmentation to the training data.

In [4]:
# Cell 4: Data Preparation (Updated)
# Prepare training examples with clean_text augmentation
train_samples = []
for _, row in train_df.iterrows():
    if row['cord_uid'] in paper_dict:
        tweet = clean_text(row['tweet_text'])
        paper = paper_dict[row['cord_uid']]
        # Convert to string to ensure proper encoding
        tweet = str(tweet)
        paper = str(paper)
        train_samples.append(InputExample(texts=[tweet, paper]))

# Prepare dev examples
dev_samples = []
for _, row in dev_df.iterrows():
    if row['cord_uid'] in paper_dict:
        tweet = str(row['tweet_text'])  # Convert to string
        paper = str(paper_dict[row['cord_uid']])  # Convert to string
        dev_samples.append(InputExample(texts=[tweet, paper]))

In [5]:
# Debug: Check data types
print("Sample train example:")
sample = train_samples[0]
print(f"Tweet type: {type(sample.texts[0])}")
print(f"Paper type: {type(sample.texts[1])}")
print(f"Tweet content: {sample.texts[0][:100]}...")
print(f"Paper content: {sample.texts[1][:100]}...")

Sample train example:
Tweet type: <class 'str'>
Paper type: <class 'str'>
Tweet content: Oral care in rehabilitation medicine oral vulnerability oral muscle wasting and hospitalassociated o...
Paper content: Oral Management in Rehabilitation Medicine: Oral Frailty, Oral Sarcopenia, and Hospital-Associated O...


# Cell 5: Training Configuration
This cell sets up the hyperparameters for training and prints the configuration.

In [7]:
# Cell 5: Training Configuration
# Hyperparameters
learning_rate = 2e-5
batch_size = 16
epochs = 8
warmup_steps = 200
model_name = 'multi-qa-mpnet-base-cos-v1'

# Print configuration
print("Training Configuration:")
print(f"Model: {model_name}")
print(f"Learning rate: {learning_rate}")
print(f"Batch size: {batch_size}")
print(f"Epochs: {epochs}")
print(f"Warmup steps: {warmup_steps}")

Training Configuration:
Model: multi-qa-mpnet-base-cos-v1
Learning rate: 2e-05
Batch size: 16
Epochs: 8
Warmup steps: 200


# Cell 6: Model Initialization and Training
This cell initializes the model, creates the data loader, and starts the training process.

In [8]:
# Cell 6: Model Initialization and Training
# Initialize model
model = SentenceTransformer(model_name)
model.to(device)

# Create data loader with proper batch size
train_dataloader = DataLoader(
    train_samples, 
    shuffle=True, 
    batch_size=batch_size,
    drop_last=True
)

# Initialize loss function with CosineSimilarityLoss and explicit dtype
train_loss = losses.CosineSimilarityLoss(model, loss_fct=torch.nn.CosineSimilarity(dim=0))

# Train the model
model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=epochs,
    warmup_steps=warmup_steps,
    optimizer_params={'lr': learning_rate},
    show_progress_bar=True,
    max_grad_norm=1.0
)

Iteration:   0%|          | 0/803 [00:13<?, ?it/s]
Epoch:   0%|          | 0/8 [00:13<?, ?it/s]


RuntimeError: Found dtype Long but expected Float

In [9]:
# Debug: Check model configuration
print("Model configuration:")
print(f"Model device: {next(model.parameters()).device}")
print(f"Model dtype: {next(model.parameters()).dtype}")

Model configuration:
Model device: cpu
Model dtype: torch.float32


# Cell 7: Evaluation Function
This cell defines the evaluation function that calculates the Mean Reciprocal Rank (MRR) score.

In [None]:
# Cell 7: Evaluation Function (Updated)
def evaluate_mrr(model, dev_df, papers_df, top_k=5):
    # Convert all text to string type
    dev_texts = [str(text) for text in dev_df['tweet_text'].tolist()]
    paper_texts = [str(text) for text in papers_df['text'].tolist()]
    
    # Encode dev queries
    query_embeddings = model.encode(
        dev_texts, 
        show_progress_bar=True, 
        convert_to_tensor=True,
        device=device
    )
    
    # Encode papers
    paper_embeddings = model.encode(
        paper_texts, 
        show_progress_bar=True, 
        convert_to_tensor=True,
        device=device
    )
    
    # Compute similarity
    paper_norm = torch.nn.functional.normalize(paper_embeddings, p=2, dim=1)
    paper_ids = papers_df['cord_uid'].tolist()
    predictions = []
    
    for query_emb in query_embeddings:
        query_norm = torch.nn.functional.normalize(query_emb.unsqueeze(0), p=2, dim=1)
        similarity = torch.matmul(query_norm, paper_norm.T).squeeze()
        top_indices = torch.topk(similarity, k=min(top_k, len(paper_norm))).indices.tolist()
        preds = [paper_ids[i] for i in top_indices]
        predictions.append(preds)
    
    # Calculate MRR
    mrr = 0
    for i, preds in enumerate(predictions):
        if dev_df.iloc[i]['cord_uid'] in preds:
            rank = preds.index(dev_df.iloc[i]['cord_uid']) + 1
            mrr += 1.0 / rank
    mrr /= len(predictions)
    
    return mrr

# Cell 8: Model Evaluation and Saving
This cell evaluates the model on the development set and saves the fine-tuned model.

In [None]:
# Cell 8: Model Evaluation and Saving
# Evaluate model
mrr_score = evaluate_mrr(model, dev_df, papers_df)
print(f"MRR Score: {mrr_score:.4f}")

# Save model
model_save_path = f"../models/{model_name}_finetuned_clean_text"
model.save(model_save_path)
print(f"Model saved to {model_save_path}")