<h1> Setup 1c </h1>

In [1]:
#make imports

import os
import sys
import torch
import numpy as np
import pandas as pd

In [2]:
#load the sentence transformer model and cosine sim loss function

from transformers import AutoTokenizer, AutoModel
from sentence_transformers import SentenceTransformer, InputExample, losses, SentencesDataset
from torch.utils.data import DataLoader, Dataset

In [3]:
#load the tab-separated train and dev splits

def _read_split(path):
    """Read one tab-separated split into a DataFrame.

    Lines that pandas cannot parse are skipped (on_bad_lines='skip').
    """
    return pd.read_csv(path, sep='\t', on_bad_lines='skip')


train_df = _read_split('../data/train.csv')
dev_df = _read_split('../data/dev.csv')

#pull the two sentence columns out of each split as plain Python lists
train_sentence1, train_sentence2 = (
    train_df[column].tolist() for column in ('sentence1', 'sentence2')
)
val_sentence1, val_sentence2 = (
    dev_df[column].tolist() for column in ('sentence1', 'sentence2')
)

#divide every raw score by 5.0 and keep the results as Python floats
#(this lands in [0, 1] assuming raw scores are on a 0-5 scale -- TODO confirm)
train_scores = (train_df['score'] / 5.0).tolist()
val_scores = (dev_df['score'] / 5.0).tolist()

In [4]:
#pick the best available accelerator instead of assuming Apple MPS:
#CUDA first, then MPS, otherwise fall back to CPU so the notebook runs anywhere
if torch.cuda.is_available():
    device = "cuda"
elif getattr(torch.backends, "mps", None) is not None and torch.backends.mps.is_available():
    # getattr guard: older torch builds have no torch.backends.mps at all
    device = "mps"
else:
    device = "cpu"

#load model
model = SentenceTransformer("all-MiniLM-L6-v2")

#send model to device
model.to(device)

#set train_loss as cosine similarity loss
#(the loss module wraps `model`, so calling it runs the encoder on both sentences)
criterion = losses.CosineSimilarityLoss(model=model)

#set optimizer
# NOTE(review): lr=1e-3 is high for fine-tuning a pretrained transformer
# (values around 2e-5 are more common) -- confirm this is intentional.
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

In [5]:
class TextSimilarityDataset(Dataset):
    def __init__(self, sentences1, sentences2, labels, tokenizer, max_length):
        self.sentences1 = sentences1
        self.sentences2 = sentences2
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length
    
    def __len__(self):
        return len(self.labels)
    
    def __getitem__(self, idx):
        encoding = self.tokenizer(self.sentences1[idx], self.sentences2[idx], padding='max_length', max_length=self.max_length, return_tensors='pt')
        label = torch.tensor(self.labels[idx], dtype=torch.float)
        return {'input_ids': encoding['input_ids'].squeeze(0), 'attention_mask': encoding['attention_mask'].squeeze(0), 'label': label}

# Define data loaders
train_dataloader = TextSimilarityDataset(train_sentence1, train_sentence2, train_scores, model, model.max_seq_length)
val_dataloader = TextSimilarityDataset(val_sentence1, val_sentence2, val_scores, model, model.max_seq_length)

In [6]:
#import tqdm to view progress & wandb to make plots
from tqdm.auto import tqdm
import wandb

wandb.login(key="7ef2e84866a68a6cd33c90b1fa55c8cf8ab2d6e7", relogin=True)
wandb.init(project="nlp_a3", name="setup-1c")
wandb.watch(model)

#finetune the model
num_epochs = 5

for epoch in tqdm(range(num_epochs), desc="Epochs"):
    
    #set the model to train mode
    model.train()
    train_loss = 0
    val_loss = 0

    for train_data in train_dataloader:

        optimizer.zero_grad()
        
        loss = model(train_data)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()

    model.eval()
    with torch.no_grad():
        for val_data in val_dataloader:

            # inputs, labels = inputs.to(device), labels.to(device)
            loss = model(val_data)
            val_loss += loss.item()

    train_loss /= len(train_dataloader)
    val_loss /= len(val_dataloader)

    wandb.log(
            {
            "epoch": epoch + 1,
            "train_loss": train_loss,
            "val_loss": val_loss,
            }
        )
wandb.finish()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /Users/mo/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mkaif21067[0m ([33mbigmeow[0m). Use [1m`wandb login --relogin`[0m to force relogin


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.01117271759999999, max=1.0)…

Epochs:   0%|          | 0/5 [00:00<?, ?it/s]

TypeError: forward() got an unexpected keyword argument 'padding'