In [1]:
import pandas as pd

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import torch.cuda.amp as amp
from torch.utils.tensorboard import SummaryWriter

from transformers import RobertaTokenizer, RobertaModel, AdamW, get_linear_schedule_with_warmup

In [2]:
# Device
device=torch.device('cuda:3')
# Hyperparams
epochs=3
"""
Due to In-Batch Negative Samples,
batch_size 32 * accum_steps 16 = 512 is NOT SAME with
batch_size 512 * accum_steps 1 = 512 (Setting in Paper).
"""
batch_size=32
accum_steps=16
lr=5e-5

In [3]:
# Pre-Trained Tokenizer
tokenizer=RobertaTokenizer.from_pretrained("roberta-base")
# Pre-Trained LM
pretrained=RobertaModel.from_pretrained("roberta-base").to(device)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.dense.weight', 'lm_head.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [4]:
class NLIDataset(Dataset):
    """
    NLI Dataset for Supervised SimCSE
    """
    def __init__(self, path, tokenizer):
        # Triplet: Sentence, Positive (Entailment), Hard Negative (Contradiction)
        self.sent=[]
        self.pos=[]
        self.neg=[]
        
        # Load Data
        df_nli=pd.read_csv(path)
        
        for index in df_nli.index:
            data=df_nli.loc[index]
            
            self.sent.append(tokenizer.encode(data['sent0']))
            self.pos.append(tokenizer.encode(data['sent1']))
            self.neg.append(tokenizer.encode(data['hard_neg']))
            
        print(len(self.sent), 'data')
    
    def __getitem__(self, idx):
        return self.sent[idx], self.pos[idx], self.neg[idx]
    
    def __len__(self):
        return len(self.sent)

In [5]:
def collate_fn(batch):
    """
    Same Sequence Length on Same Batch
    """
    max_len_sent=0
    max_len_pos=0
    max_len_neg=0
    for sent, pos, neg in batch:
        if len(sent)>max_len_sent: max_len_sent=len(sent)
        if len(pos)>max_len_pos: max_len_pos=len(pos)
        if len(neg)>max_len_neg: max_len_neg=len(neg)
            
    batch_sent=[]
    batch_pos=[]
    batch_neg=[]
    for sent, pos, neg in batch:
        sent.extend([tokenizer.pad_token_id]*(max_len_sent-len(sent)))
        batch_sent.append(sent)
        
        pos.extend([tokenizer.pad_token_id]*(max_len_pos-len(pos)))
        batch_pos.append(pos)
        
        neg.extend([tokenizer.pad_token_id]*(max_len_neg-len(neg)))
        batch_neg.append(neg)
        
    return torch.tensor(batch_sent), torch.tensor(batch_pos), torch.tensor(batch_neg)

In [6]:
# Supervised Dataset from Official GitHub
# https://huggingface.co/datasets/princeton-nlp/datasets-for-simcse/resolve/main/nli_for_simcse.csv
dataset_train=NLIDataset(path='../dataset/nli_for_simcse.csv', tokenizer=tokenizer)
dataloader_train=DataLoader(dataset_train, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)

275601 data


In [7]:
class SimCSE(nn.Module):
    """
    Supervised SimCSE
    """
    def __init__(self, pretrained):
        super().__init__()
        
        # Pre-Trained LM
        self.pretrained=pretrained
        
        # Cosine Similarity
        self.cos_sim=nn.CosineSimilarity(dim=-1)
        # Temperature
        self.temp=0.05
        
        # Contrastive Loss
        self.loss=nn.CrossEntropyLoss()
        
    def pooler(self, x):
        # [CLS] without MLP
        return x.last_hidden_state[:,0,:]
    
    def get_embedding(self, x):
        # Return Sentence Representation
        x=self.pretrained(x)
        return self.pooler(x)
    
    def forward(self, sent, pos, neg):
        # Forward
        sent=self.pretrained(sent)
        pos=self.pretrained(pos)
        neg=self.pretrained(neg)
        
        # Pooling
        repr_sent=self.pooler(sent)
        repr_pos=self.pooler(pos)
        repr_neg=self.pooler(neg)
        
        # Cosine Similarity
        sim_pos=self.cos_sim(repr_sent.unsqueeze(1), repr_pos.unsqueeze(0))/self.temp
        sim_neg=self.cos_sim(repr_sent.unsqueeze(1), repr_neg.unsqueeze(0))/self.temp
        
        # Contrastive Loss
        sim=torch.cat([sim_pos, sim_neg], dim=1)
        label=torch.arange(sim.size(0)).long().to(sim.device)
        loss=self.loss(sim, label)
        
        return loss

In [None]:
# Model: Supervised SimCSE
model=SimCSE(pretrained=pretrained).to(device)
model.train()

# Optimizer, Scheduler
optimizer=AdamW(model.parameters(), lr=lr)
scheduler=get_linear_schedule_with_warmup(
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=int(epochs*len(dataset_train)/(accum_steps*batch_size))
)

# Mixed Precision: GradScaler
scaler=amp.GradScaler()

# Tensorboard
writer=SummaryWriter()

step_global=0
for epoch in range(epochs):
    loss_=0
    optimizer.zero_grad()
    for step, (sent, pos, neg) in enumerate(dataloader_train):
        # Load on Device
        sent=sent.to(device)
        pos=pos.to(device)
        neg=neg.to(device)
        
        # Forward
        with amp.autocast():
            loss=model(sent, pos, neg)
            loss=loss/accum_steps
        # Backward
        scaler.scale(loss).backward()
        loss_+=loss.item()
        
        # Step
        if (step+1)%accum_steps==0:
            step_global+=1
            
            # Tensorboard
            writer.add_scalar(
                f'loss_train/SimCSE_Sup_batch{int(accum_steps*batch_size)}_lr{lr}_epochs{epochs}',
                loss_,
                step_global
            )
            loss_=0
            
            # Optimizer, Scheduler
            scaler.step(optimizer)
            scaler.update()
            scheduler.step()
            optimizer.zero_grad()
            
            if (step_global+1)%250==0:
                # Eval Phase
                # Save Model
                model.to(torch.device('cpu'))
                torch.save(
                    model,
                    f'../model/SimCSE_Sup_batch{int(accum_steps*batch_size)}_lr{lr}_step{step_global+1}'
                )
                model.to(device)



### Evaluation on STS-B

In [1]:
import torch
import torch.nn as nn

from transformers import RobertaTokenizer

import numpy as np
from scipy import spatial, stats

In [2]:
class SimCSE(nn.Module):
    """
    Supervised SimCSE
    """
    def __init__(self, pretrained):
        super().__init__()
        
        # Pre-Trained LM
        self.pretrained=pretrained
        
        # Cosine Similarity
        self.cos_sim=nn.CosineSimilarity(dim=-1)
        # Temperature
        self.temp=0.05
        
        # Contrastive Loss
        self.loss=nn.CrossEntropyLoss()
        
    def pooler(self, x):
        # [CLS] without MLP
        return x.last_hidden_state[:,0,:]
    
    def get_embedding(self, x):
        x=self.pretrained(x)
        return self.pooler(x)
    
    def forward(self, sent, pos, neg):
        # Forward
        sent=self.pretrained(sent)
        pos=self.pretrained(pos)
        neg=self.pretrained(neg)
        
        # Pooling
        repr_sent=self.pooler(sent)
        repr_pos=self.pooler(pos)
        repr_neg=self.pooler(neg)
        
        # Cosine Similarity
        sim_pos=self.cos_sim(repr_sent.unsqueeze(1), repr_pos.unsqueeze(0))/self.temp
        sim_neg=self.cos_sim(repr_sent.unsqueeze(1), repr_neg.unsqueeze(0))/self.temp
        
        # Contrastive Loss
        sim=torch.cat([sim_pos, sim_neg], dim=1)
        label=torch.arange(sim.size(0)).long().to(sim.device)
        loss=self.loss(sim, label)
        
        return loss

In [3]:
# Device
device=torch.device('cuda:3')

# Pre-Trained Tokenizer
tokenizer=RobertaTokenizer.from_pretrained("roberta-base")

# Load Trained Model: Supervised SimCSE
model=torch.load('../model/SimCSE_Sup_batch512_lr5e-05_step250').to(device)

In [4]:
# STS Benchmark Dataset
# https://ixa2.si.ehu.eus/stswiki/index.php/STSbenchmark
with open('../dataset/stsbenchmark/sts-test.csv', 'r') as f:
    stsb_test=f.read()
    f.close()

In [5]:
# Eval Mode
model.eval()

preds=[]
labels=[]
for data in stsb_test.split('\n')[:-1]:
    label, sent1, sent2=data.split('\t')[4:7]
    labels.append(float(label))
    
    repr_sent1=model.get_embedding(tokenizer.encode(sent1, return_tensors='pt').to(device))
    repr_sent2=model.get_embedding(tokenizer.encode(sent2, return_tensors='pt').to(device))
    
    pred=1-spatial.distance.cosine(np.array(repr_sent1.detach().cpu()), np.array(repr_sent2.detach().cpu()))
    preds.append(pred)

In [6]:
np.corrcoef(preds, labels)

array([[1.       , 0.8337637],
       [0.8337637, 1.       ]])

In [7]:
stats.spearmanr(preds, labels)

SpearmanrResult(correlation=0.8380499314067523, pvalue=0.0)