In [None]:
import os
from tqdm import tqdm
import seaborn as sns
os.environ["CUDA_VISIBLE_DEVICES"] = "3"
import torch
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from transformers import DataCollatorWithPadding
from sklearn.manifold import TSNE 
from nli_dataset import load_data


In [None]:
# Backbone shared by every checkpoint that is loaded further down.
model_id = 'bert-base-uncased'
# Prefer the GPU, but fall back to the CPU so the notebook does not hard-fail
# on a machine where CUDA is not available.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


# Three output classes; the classifier weights are overwritten later when the
# fine-tuned checkpoints are loaded with `load_state_dict`.
model = AutoModelForSequenceClassification.from_pretrained(model_id, num_labels=3)

tokenizer = AutoTokenizer.from_pretrained(model_id)

# NOTE(review): despite the name, this holds the 'validation_matched' split.
# `load_data` is project-local; it is indexed by integer domain id further
# down, so it presumably returns one dataset per domain -- confirm.
train_datasets = load_data(tokenizer, 'validation_matched')
# Pads each batch with the tokenizer's padding settings.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
 

In [None]:
# Each entry pairs a short display key with the checkpoint file it refers to.
# Keeping them side by side makes it impossible for the two lists to drift
# out of alignment.
# NOTE(review): the MMD entry points at an `epoch_1_...` file while the other
# runs use `epoch_5_...` / `epoch_...` -- confirm this is intentional.
_checkpoint_table = [
    ('MMD', 'outputs/with_mmd_2_0_seed_2222/epoch_1_240.pth'),
    ('FULL', 'outputs/original_2_0_seed_2222/epoch_5_240.pth'),
    ('SUP', 'outputs/supervised_2_seed_2222/epoch_240.pth'),
    ('DC', 'outputs/with_dc_2_0_seed_2222/epoch_5_240.pth'),
]
model_keys = [entry[0] for entry in _checkpoint_table]
model_files = [entry[1] for entry in _checkpoint_table]

In [None]:
def evaluate(datasetloader, is_source):
    """Encode every batch with the BERT backbone and tag it with a domain label.

    Relies on the notebook-level globals ``model`` and ``device``.

    Args:
        datasetloader: Iterable of batches; each batch is a mapping that
            provides ``'input_ids'`` and ``'attention_mask'`` tensors.
        is_source: When truthy every example is tagged ``'0'``, otherwise
            ``'1'``.

    Returns:
        tuple: ``(X, Y)`` where ``X`` is a list with one CPU tensor of shape
        ``(bs, dim)`` per batch and ``Y`` is a flat list holding one label
        string per example.
    """
    # The label is the same for the whole loader, so compute it once.
    env_tag = '0' if is_source else '1'

    X = []
    Y = []
    with torch.no_grad():
        for samples in tqdm(datasetloader):
            input_ids = samples['input_ids'].to(device)
            attention_mask = samples['attention_mask'].to(device)

            hidden_1 = model.bert(
                input_ids=input_ids,
                attention_mask=attention_mask
            )

            # Element 1 of the encoder output (presumably the pooled
            # representation -- confirm against the transformers docs).
            hidden_state_1 = hidden_1[1]  # (bs, dim)

            # Move each batch off the GPU right away; keeping every batch on
            # the device makes GPU memory grow with the dataset size.
            X.append(hidden_state_1.cpu())
            Y.extend([env_tag] * hidden_state_1.size(0))
    return X, Y

In [None]:
# Build one feature DataFrame per checkpoint: load the weights, encode the
# source and target domains, and store the features together with their
# domain label.
res = []
feat_cols = None

# Indices into `train_datasets` for the two domains being compared.
source_id = 2
target_id = 0

for cp in model_files:
    # NOTE(review): torch.load unpickles the file -- only load checkpoints
    # from a trusted location.
    checkpoint = torch.load(cp, map_location='cpu')
    model.load_state_dict(checkpoint['model_state_dict'])

    model.eval()
    # Use the same device object that `evaluate` moves its inputs to.
    model.to(device)

    X, Y = [], []
    for i in [source_id, target_id]:
        datasetloader = torch.utils.data.DataLoader(train_datasets[i], batch_size=500, collate_fn=data_collator)
        # Flag the source domain by comparing against `source_id`; the former
        # `i == 0` test matched `target_id` and so inverted the labels.
        temp_X, temp_Y = evaluate(datasetloader, i == source_id)

        X.extend(temp_X)
        Y.extend(temp_Y)

    X = torch.cat(X, dim=0)
    if feat_cols is None:
        # One column per feature dimension; names are reused by later cells.
        feat_cols = ['pixel' + str(i) for i in range(X.shape[1])]
    # Convert explicitly to NumPy instead of relying on implicit tensor
    # coercion inside pandas.
    df = pd.DataFrame(X.cpu().numpy(), columns=feat_cols)
    df['y'] = Y
    df['label'] = df['y'].astype(str)
    res.append(df)
    


In [None]:
# Project each checkpoint's features to 2-D with t-SNE and plot them coloured
# by domain label ('0' / '1' from the `y` column).
np.random.seed(20)

# Upper bound on the number of points embedded per checkpoint.
N = 10000

for idx, df in enumerate(res):
    print(f'{model_keys[idx]}: ')
    rndperm = np.random.permutation(df.shape[0])

    # Positional selection of a random subset (at most N rows).
    df_subset = df.iloc[rndperm[:N]].copy()
    data_subset = df_subset[feat_cols].values
    # NOTE(review): newer scikit-learn releases rename `n_iter` to `max_iter`;
    # adjust the keyword if the installed version rejects it.
    tsne = TSNE(n_components=2, verbose=1, perplexity=100, n_iter=300, random_state=20)
    tsne_results = tsne.fit_transform(data_subset)
    df_subset['X'] = tsne_results[:, 0]
    df_subset['Y'] = tsne_results[:, 1]

    fig, ax = plt.subplots(figsize=(6, 6))
    sns.scatterplot(
        x="X", y="Y",
        hue="y",
        # Pin the label -> colour mapping; otherwise it follows the order in
        # which labels happen to appear in the shuffled subset and can differ
        # from one figure to the next.
        hue_order=['0', '1'],
        palette=sns.color_palette('cubehelix', 2),
        data=df_subset,
        legend="full",
        alpha=0.5,
        ax=ax
    )
    # Title each figure so it can be matched to its checkpoint.
    ax.set_title(model_keys[idx])
    # Render now so the figure appears right after its own printed output.
    plt.show()
