# Embeddings Experiment 03 - Implementation of Embedding Averaging (Initialization)

In [None]:
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel, GPT2Config

In [None]:
# Both the tokenizer and the language-model head come from the same
# pretrained checkpoint so their vocabularies line up.
checkpoint = 'gpt2'
tokenizer = GPT2Tokenizer.from_pretrained(checkpoint)
model = GPT2LMHeadModel.from_pretrained(checkpoint)

In [None]:
# Marker tokens that get appended to the tokenizer vocabulary below.
new_tokens = [
    '<FUNC1_STT>',
    '<FUNC2_STT>',
]

In [None]:
# Register the new tokens with the tokenizer.
tokenizer.add_tokens(new_tokens)

# Grow the model's embedding matrix to the tokenizer's new vocabulary size.
expanded_vocab_size = len(tokenizer)
model.resize_token_embeddings(expanded_vocab_size)

In [None]:
# Fit a Gaussian to the embeddings that existed before the vocabulary was
# expanded, so new-token embeddings can be sampled from it.
#
# The number of freshly added rows is derived from `new_tokens` instead of
# being hard-coded, so this cell stays correct if that list changes.
num_new_tokens = len(new_tokens)

params = model.state_dict()
embeddings = params['transformer.wte.weight']

# Index from the front so that an empty `new_tokens` keeps every row
# (a `[:-0]` slice would select nothing).
num_old_tokens = embeddings.size(0) - num_new_tokens
pre_expansion_embeddings = embeddings[:num_old_tokens, :]

# Mean and (biased) covariance of the pre-existing embedding rows.
mu = torch.mean(pre_expansion_embeddings, dim=0)
n = pre_expansion_embeddings.size()[0]
centered = pre_expansion_embeddings - mu
sigma = (centered.T @ centered) / n

# The covariance is scaled by 1e-5, so samples stay close to the mean.
dist = torch.distributions.multivariate_normal.MultivariateNormal(
        mu, covariance_matrix=1e-5*sigma)

In [None]:
# Draw one embedding per added token and write them into the rows that
# `resize_token_embeddings` appended to the embedding matrix.
#
# The row count must equal the number of added tokens: overwriting one row
# more than that would also replace the last pre-existing vocabulary entry.
num_new_tokens = len(new_tokens)
new_embeddings = dist.sample((num_new_tokens,))  # shape: (num_new_tokens, hidden_dim)

# `embeddings` (bound earlier) is this same tensor object, so a single
# in-place write updates both names.
wte_weight = params['transformer.wte.weight']
wte_weight[wte_weight.size(0) - num_new_tokens:, :] = new_embeddings
model.load_state_dict(params)

In [None]:
# Quick sanity check: sample a continuation of a fixed prompt.
sent2 = 'Dogs are great because they are '
encoded_prompt = tokenizer(sent2, return_tensors='pt')
sampled_ids = model.generate(**encoded_prompt, do_sample=True)
tokenizer.decode(sampled_ids[0])

In [None]:
# Show the last two rows of the embedding matrix (the added tokens)
# as they look before fine-tuning.
new_token_embeddings = model.transformer.wte.weight[-2:]
print("Embeddings before training:")
print(new_token_embeddings)

In [None]:
# NumPy copy of the new-token embedding rows for printing.
embeddings_np = new_token_embeddings.detach().cpu().numpy()

# Token strings for the whole vocabulary, in id order.
vocab_size = len(tokenizer)
words = tokenizer.convert_ids_to_tokens(range(vocab_size), skip_special_tokens=True)

# Pair each embedding row with its token, counting from the end of the
# vocabulary, and print the first two and the last component.
for i, embedding in enumerate(embeddings_np):
    word = words[i - 2]
    print(f"Embedding {i+1}: Word: {word}, Embedding: {embedding[0]}, {embedding[1]}, ... {embedding[-1]}")

In [None]:
from torch.nn.utils.rnn import pad_sequence

# Path of the UTF-8 text file used for fine-tuning; it is read below and
# every non-blank line is tokenized as one sentence.
dataset_path = 'usage_dataset_2.txt'

def tokenize_sentence(sentence):
    """Encode `sentence` with the module-level tokenizer.

    Returns whatever `tokenizer.encode` produces when called with
    `add_special_tokens=False`.
    """
    return tokenizer.encode(sentence, add_special_tokens=False)

def process_dataset(dataset):
    """Tokenize a newline-separated corpus.

    `dataset` is split on '\\n'; lines that are empty after stripping
    whitespace are skipped, and every remaining line is passed unchanged to
    `tokenize_sentence`. Returns the list of per-line results in file order.
    """
    return [
        tokenize_sentence(line)
        for line in dataset.split('\n')
        if line.strip()
    ]

# Read the dataset file
with open(dataset_path, 'r', encoding='utf-8') as file:
    dataset = file.read()

# Tokenize each sentence individually
tokenized_dataset = process_dataset(dataset)
if not tokenized_dataset:
    # Fail with a clear message instead of an opaque error from pad_sequence.
    raise ValueError(f"No non-empty sentences found in {dataset_path!r}")

# Right-pad all sentences to a common length (pad id 0, the pad_sequence default).
sequences = [torch.tensor(tokens, dtype=torch.long) for tokens in tokenized_dataset]
padded_dataset = pad_sequence(sequences, batch_first=True)

# The Hugging Face causal-LM head shifts the labels internally, so the
# labels must line up with the inputs token for token; pre-shifting them
# here would train the model to predict the token two steps ahead.
inputs = padded_dataset

# Padded label positions are set to -100 so the loss ignores them; otherwise
# the model would be trained to emit the token with id 0 after every sentence.
labels = pad_sequence(sequences, batch_first=True, padding_value=-100)

In [None]:
# Build model inputs and labels from the padded batch.
#
# `padded_dataset` is already a (num_sentences, max_len) tensor, so it is
# used directly: slicing its first axis would drop a whole sentence rather
# than a token, and would pair each sentence with the *next* sentence as its
# label. The Hugging Face causal-LM head shifts labels internally, so labels
# mirror the inputs position by position.
inputs = padded_dataset

# Mask the padded tail of every row with -100 so the loss ignores it.
sentence_lengths = torch.tensor([len(tokens) for tokens in tokenized_dataset]).unsqueeze(1)
positions = torch.arange(padded_dataset.size(1)).unsqueeze(0)
labels = padded_dataset.masked_fill(positions >= sentence_lengths, -100)

In [None]:
# Show the first entry and the overall shape of each training tensor.
for tensor in (inputs, labels):
    print(tensor[0])
    print(tensor.shape)

In [None]:
import numpy as np

def _decode_flat(ids):
    """Decode a tensor of token ids of any shape as one flat id sequence.

    `tokenizer.decode` takes a single sequence of ids, so multi-row tensors
    are flattened first. Negative ids (such as -100 label masks) have no
    token and are dropped. The tensor is moved to the CPU so this also works
    after the training cell has moved the data to another device.
    """
    flat_ids = ids.detach().cpu().flatten()
    return tokenizer.decode(flat_ids[flat_ids >= 0], skip_special_tokens=True)


print(type(inputs.squeeze().detach().cpu().numpy()))

input_tokens = _decode_flat(inputs)
label_tokens = _decode_flat(labels)

print("Input tokens:", input_tokens)
print("Label tokens:", label_tokens)



In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# Move the model to the selected device and switch to training mode
model.to(device)
model.train()

# Fine-tune all model parameters with Adam
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
num_epochs = 40  # You can adjust the number of epochs as needed

# The batch never changes, so it is transferred to the device once up front.
inputs = inputs.to(device)
labels = labels.to(device)

for epoch in range(num_epochs):
    # Forward pass; passing `labels` makes the model return the LM loss.
    outputs = model(inputs, labels=labels)
    loss = outputs.loss

    # Backward pass, parameter update, then clear gradients for the next step.
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()

    print(f"Epoch: {epoch+1}, Loss: {loss.item()}")

In [None]:
# Show the last two rows of the embedding matrix (the added tokens)
# again, now that fine-tuning has run.
new_token_embeddings = model.transformer.wte.weight[-2:]
print("\nEmbeddings after training:")
print(new_token_embeddings)

In [None]:
# Persist the fine-tuned weights (state dict only, not the whole model object).
save_path = 'fine_tuned_model.pth'
fine_tuned_state = model.state_dict()
torch.save(fine_tuned_state, save_path)

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity between the new-token embedding rows.
new_token_matrix = new_token_embeddings.detach().cpu().numpy()
similarity_matrix = cosine_similarity(new_token_matrix)

# Display the resulting square matrix.
print("\nCosine Similarity Matrix:")
print(similarity_matrix)

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Recompute the pairwise cosine similarity of the new-token embeddings
# from a detached NumPy copy of the rows.
rows_np = new_token_embeddings.detach().cpu().numpy()
similarity_matrix = cosine_similarity(rows_np)

# Header and matrix, each on its own line.
print("\nCosine Similarity Matrix:", similarity_matrix, sep="\n")

In [None]:
embedding_layer = model.transformer.wte

# Specify the character for which you want to retrieve the embedding
character = "!"

# Convert the character to its token ID with the tokenizer already in use.
# A fresh GPT2Tokenizer must NOT be loaded here: rebinding `tokenizer` to the
# base vocabulary would drop the tokens added earlier and break every later
# cell that looks them up.
token_id = tokenizer.encode(character, add_special_tokens=False)[0]

# Retrieve the embedding vector for the token ID
embedding = embedding_layer.weight[token_id]

# Print the embedding vector
print("Embedding for character '{}':".format(character))
print(embedding)

In [None]:
def generate_text(seed_text, max_length=50, temperature=0.8):
    """Generate a continuation of `seed_text` with the module-level model.

    Args:
        seed_text: Prompt string to continue.
        max_length: Passed to `model.generate` as `max_length`.
        temperature: Sampling temperature. A positive value enables sampling
            at that temperature; `None` or a non-positive value falls back to
            the default (non-sampling) decoding.

    Returns:
        The decoded output sequence with special tokens removed.
    """
    input_ids = tokenizer.encode(seed_text, return_tensors='pt').to(device)
    attention_mask = torch.ones_like(input_ids)

    # Temperature only has an effect when sampling is switched on, so the
    # two options are always passed together.
    sampling_kwargs = {}
    if temperature is not None and temperature > 0:
        sampling_kwargs = {"do_sample": True, "temperature": temperature}

    # Generate in eval mode so dropout is disabled, then restore whatever
    # mode the model was in (the training cell leaves it in train mode).
    was_training = model.training
    model.eval()
    try:
        output = model.generate(
            input_ids,
            attention_mask=attention_mask,
            max_length=max_length,
            num_return_sequences=1,
            # GPT-2 defines no pad token; use EOS explicitly for padding.
            pad_token_id=tokenizer.eos_token_id,
            **sampling_kwargs,
        )
    finally:
        model.train(was_training)

    # Decode and return generated text
    return tokenizer.decode(output[0], skip_special_tokens=True)

In [None]:
# Probe: synonym-style prompt.
prompt = "I looked through the thesaurus and found that the synonym for pretty is"
generated_text = generate_text(prompt)
print("Generated text:", generated_text)

In [None]:
# Probe: antonym-style prompt.
prompt = "I found that the opposite of hate is"
generated_text = generate_text(prompt)
print("Generated text:", generated_text)

In [None]:
# Probe: ordinary narrative prompt.
prompt = "Yesterday, my dog and I walked to the"
generated_text = generate_text(prompt)
print("Generated text:", generated_text)

In [None]:
# Bring the model back to the CPU and sample a continuation there.
model = model.to('cpu')
encoded_prompt = tokenizer("I found that another word for like is", return_tensors='pt')
sampled_ids = model.generate(**encoded_prompt, do_sample=True)
tokenizer.decode(sampled_ids[0])

In [None]:
# NOTE(review): the original assignment had no right-hand side (a syntax
# error). It is completed here as the "embedding averaging" initialisation
# named in the notebook title: every added token gets the mean of the
# pre-existing embedding rows. Running this cell replaces the current
# embeddings of the added tokens. Confirm this is the intended behaviour.
num_new_tokens = len(new_tokens)

# Take a fresh state dict so no stale weights are loaded back into the model.
params = model.state_dict()
embeddings = params['transformer.wte.weight']
num_old_tokens = embeddings.size(0) - num_new_tokens

# One copy of the mean pre-existing embedding per added token.
mean_embedding = embeddings[:num_old_tokens, :].mean(dim=0, keepdim=True)
new_embeddings = mean_embedding.repeat(num_new_tokens, 1)

# Only the rows of the added tokens are overwritten.
embeddings[num_old_tokens:, :] = new_embeddings
model.load_state_dict(params)

In [None]:
def find_similar_words(word, top_k=15):
    """Return the decoded tokens whose embeddings are closest to `word`.

    Only the first id produced by `tokenizer.encode(word)` is used as the
    query. Its embedding is compared by cosine similarity against every row
    of the model's token-embedding matrix, and the `top_k` best-scoring ids
    are decoded, most similar first.
    """
    weight = model.transformer.wte.weight
    query_id = tokenizer.encode(word)[0]
    word_embedding = weight[query_id].detach().cpu().numpy()

    # Score the query against the full embedding matrix.
    all_embeddings = weight.detach().cpu().numpy()
    similarities = cosine_similarity([word_embedding], all_embeddings)[0]

    # argsort is ascending: keep the tail, then flip to best-first order.
    ascending = similarities.argsort()
    top_indices = ascending[-top_k:][::-1]

    return [tokenizer.decode([index]) for index in top_indices]

In [None]:
# For every added token, list its nearest neighbours in embedding space,
# one per line under a header naming the token.
for token in new_tokens:
    similar_words = find_similar_words(token)
    print(f"\nSimilar words to {token}:")
    for similar_word in similar_words:
        print(similar_word)
