In [1]:
import torch
from transformers import AutoTokenizer, AutoModel

import numpy as np
from scipy import spatial, stats

from models import (
    SupervisedSimCSE,
    UnsupervisedSimCSE,
    PrefixSupervisedSimCSE,
    SupervisedCPT,
    UnsupervisedCPT,
    PrefixSupervisedCPT,
    PrefixUnsupervisedCPT
)

In [2]:
# Device: prefer the GPU index this notebook was written for, but fall back to
# CPU when that GPU does not exist so the cell still runs on other machines.
GPU_INDEX=3
use_gpu=torch.cuda.is_available() and torch.cuda.device_count()>GPU_INDEX
device=torch.device(f"cuda:{GPU_INDEX}" if use_gpu else "cpu")

# Load Pre-Trained Tokenizer, LM
tokenizer=AutoTokenizer.from_pretrained("gpt2")
pretrained=AutoModel.from_pretrained("gpt2").to(device)

# Add Pad Token: [PAD]
# Identity comparison (`is None`) is the correct test for a missing pad token.
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    # Grow the embedding matrix so the newly added token id has a row.
    pretrained.resize_token_embeddings(len(tokenizer))

# Load Trained Model: SimCSE
# model=SupervisedSimCSE(pretrained=pretrained)
# model=UnsupervisedSimCSE(pretrained=pretrained)

# Load Trained Model: SimCSE with Prefix-Tuning
# model=PrefixSupervisedSimCSE(base_config=pretrained.config, preseqlen=5, hidden_dim=512)

# Load Trained Model: CPT
# model=SupervisedCPT(pretrained=pretrained)
# model=UnsupervisedCPT(pretrained=pretrained)

# Load Trained Model: CPT with Prefix-Tuning
#model=PrefixSupervisedCPT(base_config=pretrained.config, preseqlen=5, hidden_dim=512)
model=PrefixUnsupervisedCPT(base_config=pretrained.config, preseqlen=5, hidden_dim=512)

# map_location remaps the stored tensors onto `device`, so a checkpoint saved
# on a different GPU index (or loaded on a CPU-only machine) still deserializes.
checkpoint_path="../model/cpt-unsup-prefix(gpt2)_preseqlen5_hidden512_batch192_lr0.0001_step6500.pth"
model.load_state_dict(torch.load(checkpoint_path, map_location=device))
model=model.to(device)

Using pad_token, but it is not set yet.


In [3]:
# STS Benchmark Dataset
# https://ixa2.si.ehu.eus/stswiki/index.php/STSbenchmark
# Read the whole test split into one string. The `with` block closes the file
# on exit, so no explicit close() is needed. The encoding is pinned so the
# result does not depend on the platform's default locale.
with open("./dataset/stsbenchmark/sts-test.csv", "r", encoding="utf-8") as f:
    stsb_test=f.read()

In [4]:
# Eval Mode
pretrained.eval()
model.eval()


def embed_sentence(sentence):
    """Return the model's embedding of `sentence` as a NumPy array.

    The sentence is tokenized with the notebook-level `tokenizer`, moved to
    `device`, and passed to `model.get_embedding` together with the frozen
    `pretrained` LM (the prefix-tuning call signature).
    """
    token_ids=tokenizer.encode(sentence, return_tensors='pt').to(device)

    # General Setting (without Prefix-Tuning)
    # embedding=model.get_embedding(token_ids)

    # with Prefix-Tuning
    embedding=model.get_embedding(pretrained=pretrained, x=token_ids)

    # squeeze() drops size-1 axes: scipy's cosine distance expects 1-D input.
    # NOTE(review): assumes get_embedding yields one vector per sentence,
    # possibly with a leading batch axis of size 1 — confirm against models.py.
    return embedding.detach().cpu().numpy().squeeze()


preds=[]
labels=[]
# Inference only: skip autograd bookkeeping to save memory and time.
with torch.no_grad():
    for line in stsb_test.split('\n'):
        # Skip blank lines (e.g. the empty string after a trailing newline)
        # instead of unconditionally dropping the last element, which would
        # lose a sample when the file does not end with a newline.
        if not line.strip():
            continue

        # Tab-separated fields 4, 5, 6 are read as: gold score, sentence 1, sentence 2.
        label, sent1, sent2=line.split('\t')[4:7]
        labels.append(float(label))

        # Cosine similarity = 1 - cosine distance.
        pred=1-spatial.distance.cosine(embed_sentence(sent1), embed_sentence(sent2))
        preds.append(pred)

In [5]:
# Correlation-coefficient matrix of predicted similarities vs. gold scores;
# the off-diagonal entry is the Pearson r between the two lists.
np.corrcoef(x=preds, y=labels)

array([[1.        , 0.69801416],
       [0.69801416, 1.        ]])

In [6]:
# Spearman rank correlation (and its p-value) between predictions and gold scores.
stats.spearmanr(a=preds, b=labels)

SpearmanrResult(correlation=0.6908329011074216, pvalue=3.451984920212133e-196)