In [1]:
import pandas as pd

import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.cuda.amp as amp
from torch.utils.tensorboard import SummaryWriter

from transformers import RobertaTokenizer, RobertaModel, AdamW, get_linear_schedule_with_warmup

In [2]:
# Device
device=torch.device("cuda:3")
# Hyperparams
epochs=1
"""
Due to In-Batch Negative Samples,
batch_size 16 * accum_steps 8 = 128 is NOT SAME with
batch_size 128 * accum_steps 1 = 128.
"""
batch_size=16
accum_steps=8
lr=5e-5

In [3]:
# Pre-Trained Tokenizer
tokenizer=RobertaTokenizer.from_pretrained("roberta-base")
# Pre-Trained LM
pretrained=RobertaModel.from_pretrained("roberta-base").to(device)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [4]:
class WikiDataset(Dataset):
    """
    Randomly Sampled Wikipedia-Sentences for Unsupervised SimCSE
    """
    def __init__(self, path, tokenizer):
        # Pair: Sentence, Positive (=Sentence)
        self.sent=[]
        self.pos=[]
        
        # Load Data
        df_wiki=pd.read_csv(path)
        
        for index in df_wiki.index:
            data=df_wiki.loc[index]
            
            # Encode
            enc0=tokenizer.encode(data["sent0"])
            # Truncate
            if len(enc0)>512:
                enc0=enc0[:511]+[tokenizer.eos_token_id]
            # Append
            self.sent.append(enc0)
            
            # Encode
#             enc1=tokenizer.encode(data["sent1"])
            # Truncate
#             if len(enc1)>512:
#                 enc1=enc1[:511]+[tokenizer.eos_token_id]
            # Append
            self.pos.append(enc0)
            
        print(len(self.sent), "data")
    
    def __getitem__(self, idx):
        return self.sent[idx], self.pos[idx]
    
    def __len__(self):
        return len(self.sent)

In [5]:
def collate_fn(batch):
    """
    Same Sequence Length on Same Batch
    """
    max_len_sent=0
    max_len_pos=0
    for sent, pos in batch:
        if len(sent)>max_len_sent: max_len_sent=len(sent)
        if len(pos)>max_len_pos: max_len_pos=len(pos)
            
    batch_sent=[]
    batch_pos=[]
    for sent, pos in batch:
        sent.extend([tokenizer.pad_token_id]*(max_len_sent-len(sent)))
        batch_sent.append(sent)
        
        pos.extend([tokenizer.pad_token_id]*(max_len_pos-len(pos)))
        batch_pos.append(pos)
        
    return torch.tensor(batch_sent), torch.tensor(batch_pos)

In [6]:
# Unsupervised Dataset from Official GitHub
# https://huggingface.co/datasets/princeton-nlp/datasets-for-simcse/resolve/main/wiki1m_for_simcse.txt
with open("../dataset/wiki1m_for_simcse.txt", "r") as f:
    data=f.read().split("\n")
    f.close()
data.remove("")
print(len(data), "data")

# Save Dataset as CSV File
pd.DataFrame({"sent0": data, "sent1": data}).to_csv("../dataset/wiki1m_for_simcse.csv")
# Load Dataset, DataLoader
dataset_train=WikiDataset(path="../dataset/wiki1m_for_simcse.csv", tokenizer=tokenizer)
dataloader_train=DataLoader(dataset_train, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)

1000000 data


Token indices sequence length is longer than the specified maximum sequence length for this model (667 > 512). Running this sequence through the model will result in indexing errors


1000000 data


In [7]:
class SimCSE(nn.Module):
    """
    Unsupervised SimCSE
    """
    def __init__(self, pretrained):
        super().__init__()
        
        # Pre-Trained LM
        self.pretrained=pretrained
        # Pooling Layer: MLP (Train Only)
        self.mlp=nn.Linear(self.pretrained.config.hidden_size, self.pretrained.config.hidden_size)
        
        # Cosine Similarity
        self.cos_sim=nn.CosineSimilarity(dim=-1)
        # Temperature
        self.temp=0.05
        
        # Contrastive Loss
        self.loss=nn.CrossEntropyLoss()
        
    def pooler(self, x):
        # [CLS] with MLP (Train Only)
        x=x.last_hidden_state[:,0,:]
        return self.mlp(x)
    
    def get_embedding(self, x):
        # Return Sentence Representation
        x=self.pretrained(x)
        return x.last_hidden_state[:,0,:]
    
    def forward(self, sent, pos):
        # Forward
        sent=self.pretrained(sent)
        pos=self.pretrained(pos)
        
        # Pooling
        repr_sent=self.pooler(sent)
        repr_pos=self.pooler(pos)
        
        # Cosine Similarity
        sim=self.cos_sim(repr_sent.unsqueeze(1), repr_pos.unsqueeze(0))/self.temp
        
        # Contrastive Loss
        label=torch.arange(sim.size(0)).long().to(sim.device)
        loss=self.loss(sim, label)
        
        return loss

In [None]:
# Model: Unsupervised SimCSE
model=SimCSE(pretrained=pretrained).to(device)
model.train()

# Optimizer, Scheduler
optimizer=AdamW(model.parameters(), lr=lr)
scheduler=get_linear_schedule_with_warmup(
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=int(epochs*len(dataset_train)/(accum_steps*batch_size))
)

# Mixed Precision: GradScaler
scaler=amp.GradScaler()

# Tensorboard
writer=SummaryWriter()

step_global=0
for epoch in range(epochs):
    _loss=0
    optimizer.zero_grad()
    for step, (sent, pos) in enumerate(dataloader_train):
        # Load on Device
        sent=sent.to(device)
        pos=pos.to(device)
        
        # Forward
        with amp.autocast():
            loss=model(sent, pos)
            loss=loss/accum_steps
        # Backward
        scaler.scale(loss).backward()
        _loss+=loss.item()
        
        # Step
        if (step+1)%accum_steps==0:
            step_global+=1
            
            # Tensorboard
            writer.add_scalar(
                f'loss_train/SimCSE_Unsup_batch{int(accum_steps*batch_size)}_lr{lr}_epochs{epochs}',
                _loss,
                step_global
            )
            _loss=0
            
            # Optimizer, Scheduler
            scaler.step(optimizer)
            scaler.update()
            scheduler.step()
            optimizer.zero_grad()
            
            if (step_global+1)%250==0:
                # Eval Phase
                # Save Model
                model.to(torch.device('cpu'))
                torch.save(
                    model,
                    f'../model/SimCSE_Unsup_batch{int(accum_steps*batch_size)}_lr{lr}_step{step_global+1}'
                )
                model.to(device)
                model.train()



### Evaluation on STS-B

In [1]:
import torch
import torch.nn as nn

from transformers import RobertaTokenizer

import numpy as np
from scipy import spatial, stats

In [2]:
class SimCSE(nn.Module):
    """
    Unsupervised SimCSE
    """
    def __init__(self, pretrained):
        super().__init__()
        
        # Pre-Trained LM
        self.pretrained=pretrained
        # Pooling Layer: MLP (Train Only)
        self.mlp=nn.Linear(self.pretrained.config.hidden_size, self.pretrained.config.hidden_size)
        
        # Cosine Similarity
        self.cos_sim=nn.CosineSimilarity(dim=-1)
        # Temperature
        self.temp=0.05
        
        # Contrastive Loss
        self.loss=nn.CrossEntropyLoss()
        
    def pooler(self, x):
        # [CLS] with MLP (Train Only)
        x=x.last_hidden_state[:,0,:]
        return self.mlp(x)
    
    def get_embedding(self, x):
        # Return Sentence Representation
        x=self.pretrained(x)
        return x.last_hidden_state[:,0,:]
    
    def forward(self, sent, pos):
        # Forward
        sent=self.pretrained(sent)
        pos=self.pretrained(pos)
        
        # Pooling
        repr_sent=self.pooler(sent)
        repr_pos=self.pooler(pos)
        
        # Cosine Similarity
        sim=self.cos_sim(repr_sent.unsqueeze(1), repr_pos.unsqueeze(0))/self.temp
        
        # Contrastive Loss
        label=torch.arange(sim.size(0)).long().to(sim.device)
        loss=self.loss(sim, label)
        
        return loss

In [3]:
# Device
device=torch.device("cuda:2")

# Pre-Trained Tokenizer
tokenizer=RobertaTokenizer.from_pretrained("roberta-base")

# Load Trained Model: Supervised SimCSE
model=torch.load("../model/SimCSE_Unsup_batch128_lr5e-05_step2500").to(device)

In [4]:
# STS Benchmark Dataset
# https://ixa2.si.ehu.eus/stswiki/index.php/STSbenchmark
with open("../dataset/stsbenchmark/sts-test.csv", "r") as f:
    stsb_test=f.read()
    f.close()

In [5]:
# Eval Mode
model.eval()

preds=[]
labels=[]
for data in stsb_test.split("\n")[:-1]:
    label, sent1, sent2=data.split("\t")[4:7]
    labels.append(float(label))
    
    repr_sent1=model.get_embedding(tokenizer.encode(sent1, return_tensors="pt").to(device))
    repr_sent2=model.get_embedding(tokenizer.encode(sent2, return_tensors="pt").to(device))
    
    pred=1-spatial.distance.cosine(np.array(repr_sent1.detach().cpu()), np.array(repr_sent2.detach().cpu()))
    preds.append(pred)

In [6]:
np.corrcoef(preds, labels)

array([[1.        , 0.80130688],
       [0.80130688, 1.        ]])

In [7]:
stats.spearmanr(preds, labels)

SpearmanrResult(correlation=0.7889213716498475, pvalue=1.6754146165373438e-293)