In [None]:
import os
import sys
sys.path.append("..")
os.environ['CUDA_VISIBLE_DEVICES'] = "5"

In [None]:
import torch
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
from models.vae import VAE, Encoder

In [None]:
# Instantialte tsne, specify cosine metric
tsne = TSNE(random_state = 0, n_iter = 1000, metric = 'cosine')

In [None]:
vae_config = {
    "encoder": {
        "model_name": 'bert-base-uncased',
        "tokenizer_name": 'bert-base-uncased',
        "hidden_size": 768,
        "latent_size": 128,
    }
}

In [None]:
encoder = Encoder(vae_config["encoder"])

In [None]:
checkpoint = torch.load(os.path.join("../checkpoints", f'encoder_latest.pt'))
encoder.load_state_dict(checkpoint, strict=False)

In [None]:
def get_embedding(text):
    tokenized_content = encoder.tokenizer("Hello world!", return_tensors="pt")
    mean, logvar, outputs = encoder(
        input_ids=tokenized_content["input_ids"], 
        attention_mask=tokenized_content["attention_mask"],
    )
    return mean.view(-1).detach().numpy()

In [None]:
gender_df = pd.read_csv("/home/mingzhe/Projects/DebiasODE/src/data/gender_dataset.csv", index_col=0)

In [None]:
embeddings = list()
labels = list()
count = 0

for index, row in gender_df.iterrows():
    embeddings += [get_embedding(row.text)]
    labels += [row.label]
    count += 1
    print(count)

In [10]:
# Fit and transform
embeddings2d = tsne.fit_transform(embeddings)



In [11]:
# Create DF
embeddingsdf = pd.DataFrame()
# Add game names
embeddingsdf['label'] = labels
# Add x coordinate
embeddingsdf['x'] = embeddings2d[:,0]
# Add y coordinate
embeddingsdf['y'] = embeddings2d[:,1]

In [14]:
embeddings_f = embeddingsdf[embeddingsdf.label=="female"]
embeddings_m = embeddingsdf[embeddingsdf.label=="male"]

# Check
# embeddings_f.head()
# embeddings_m.head()

Unnamed: 0,label,x,y
1,male,0.226863,-0.567042
4,male,0.226863,-0.567042
7,male,0.226863,-0.567042
10,male,0.226863,-0.567042
13,male,0.226863,-0.567042


In [None]:
# Set figsize
fig, ax = plt.subplots(figsize=(6,6))
# Scatter points, set alpha low to make points translucent
ax.scatter(embeddingsdf.x, embeddingsdf.y, alpha=.5, c="red")
ax.scatter(embeddingsdf.x, embeddingsdf.y, alpha=.5, c="blue")
plt.title('Scatter plot of games using t-SNE')
plt.show()