<h1> Setup 1c </h1>

In [1]:
#make imports

import os
import sys
import torch
import numpy as np
import pandas as pd

In [2]:
#load the sentence transformer model and cosine sim loss function

from transformers import AutoTokenizer, AutoModel
from sentence_transformers import SentenceTransformer, InputExample, losses, SentencesDataset
from torch.utils.data import DataLoader, Dataset

In [3]:
#load the csv files for train and val

train_df = pd.read_csv('../data/train.csv', sep='\t', on_bad_lines='skip')
dev_df = pd.read_csv('../data/dev.csv', sep='\t', on_bad_lines='skip')

train_sentence1 = train_df['sentence1'].tolist()
train_sentence2 = train_df['sentence2'].tolist()
train_scores = train_df['score'].tolist()


val_sentence1 = dev_df['sentence1'].tolist()
val_sentence2 = dev_df['sentence2'].tolist()
val_scores = dev_df['score'].tolist()

#scale the train scores b/w 0 and 1
train_scores = [float(train_score/5.0) for train_score in train_scores]
val_scores = [float(val_score/5.0) for val_score in val_scores]

In [4]:
#set device as gpu
device = "mps"

#load model
model = SentenceTransformer('all-MiniLM-L6-v2')

#send model to device
model.to(device)

#set train_loss as cosine similarity loss
criterion = losses.CosineSimilarityLoss(model=model)

#set optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

In [5]:
class TextSimilarityDataset(Dataset):
    def __init__(self, sentences1, sentences2, labels, tokenizer, max_length):
        self.sentences1 = sentences1
        self.sentences2 = sentences2
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length
    
    def __len__(self):
        return len(self.labels)
    
    def __getitem__(self, idx):
        encoding = self.tokenizer(self.sentences1[idx], self.sentences2[idx], truncation=True, padding='max_length', max_length=self.max_length, return_tensors='pt')
        label = torch.tensor(self.labels[idx], dtype=torch.float)
        return {'input_ids': encoding['input_ids'].squeeze(0), 'attention_mask': encoding['attention_mask'].squeeze(0), 'label': label}
    
#define tokenizer
tokenizer = AutoTokenizer.from_pretrained('all-MiniLM-L6-v2')

#define data loaders
train_dataset = TextSimilarityDataset(train_sentence1, train_sentence2, train_scores, tokenizer, 256)
train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True)

val_dataset = TextSimilarityDataset(val_sentence1, val_sentence2, val_scores, tokenizer, 256)
val_dataloader = DataLoader(val_dataset, batch_size=8, shuffle=True)

OSError: all-MiniLM-L6-v2 is not a local folder and is not a valid model identifier listed on 'https://huggingface.co/models'
If this is a private repository, make sure to pass a token having permission to this repo either by logging in with `huggingface-cli login` or by passing `token=<your_token>`

In [None]:
from tqdm.auto import tqdm
num_epochs = 5

for epoch in tqdm(range(num_epochs), desc="Epochs"):
    
    #set the model to train mode
    model.train()
    train_loss = 0
    val_loss = 0

    for sentences_A, sentences_B, gold_labels in train_dataloader:

        optimizer.zero_grad()

        sentence_embeddings_A = model.encode(sentences_A, convert_to_tensor=True)
        sentence_embeddings_B = model.encode(sentences_B, convert_to_tensor=True)
        sentence_embeddings = [[sentence_embeddingA, sentence_embeddingB] for sentence_embeddingA, sentence_embeddingB in zip(sentence_embeddings_A, sentence_embeddings_B)]

        loss = criterion(sentence_embeddings, gold_labels)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()

In [None]:
# #import tqdm to view progress & wandb to make plots
# from tqdm.auto import tqdm
# import wandb

# wandb.login(key="7ef2e84866a68a6cd33c90b1fa55c8cf8ab2d6e7", relogin=True)
# wandb.init(project="nlp_a3", name="setup-1c")
# wandb.watch(model)

# #finetune the model
# num_epochs = 5

# for epoch in tqdm(range(num_epochs), desc="Epochs"):
    
#     #set the model to train mode
#     model.train()
#     train_loss = 0
#     val_loss = 0

#     for sentences_A, sentences_B, gold_labels in train_dataloader:

#         optimizer.zero_grad()

#         sentence_embeddings_A = model.encode(sentences_A, convert_to_tensor=True)
#         sentence_embeddings_B = model.encode(sentences_B, convert_to_tensor=True)

#         loss = criterion(sentence_embeddings_A, sentence_embeddings_B, gold_labels)
#         loss.backward()
#         optimizer.step()
#         train_loss += loss.item()

#     # model.eval()
#     # with torch.no_grad():
#     #     for val_data in val_dataloader:

#     #         # inputs, labels = inputs.to(device), labels.to(device)
#     #         loss = model(val_data)
#     #         val_loss += loss.item()

#     train_loss /= len(train_dataloader)
#     # val_loss /= len(val_dataloader)

#     wandb.log(
#             {
#             "epoch": epoch + 1,
#             "train_loss": train_loss,
#             # "val_loss": val_loss,
#             }
#         )
# wandb.finish()