In [7]:
%load_ext autoreload
%autoreload 2
import torch
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import os
from typing import List, Tuple, Dict, Union, Any, Optional

try:
    os.chdir(os.path.join(os.getcwd(), '../../Language-Technology-Project'))
    print(os.getcwd())
except:
    print("ALready in current dir")

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
ALready in current dir


# Generative pipeline

In [8]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

tokenizer = GPT2Tokenizer.from_pretrained('gpt2-xl')
model = GPT2LMHeadModel.from_pretrained('gpt2-xl').to(device)

print(f"size_gb(model) = {model.num_parameters() * 4 / 1024**3:.2f} GB")
print("vocab size:", tokenizer.vocab_size)

size_gb(model) = 5.80 GB
vocab size: 50257


In [9]:
text = "Man is a doctor as a women is a"
encoded_input = tokenizer(text, return_tensors='pt').to(device)
output = model(**encoded_input)
print(output.logits.shape)
next_token_logits = output.logits[:, -1, :]
likelihoods = torch.softmax(next_token_logits, dim=-1)
sorted_likelihoods, sorted_indices = torch.sort(likelihoods, descending=True)
print("Top 5 tokens:")
for i in range(5):
    print(f"{i+1}: {tokenizer.decode(sorted_indices[0, i])} ({sorted_likelihoods[0, i]:.2f})")

torch.Size([1, 9, 50257])
Top 5 tokens:
1:  woman (0.11)
2:  doctor (0.08)
3:  nurse (0.05)
4:  mother (0.04)
5:  man (0.03)


In [10]:
import torch

def get_likelihoods_of_words_given_context(prompt: str, words: list, model, tokenizer):
    """
    Returns a list of likelihoods of words given a prompt.
    """
    word_probs = []

    for word in words:
        word_token = tokenizer.encode(word, add_special_tokens=False)
        tokens = tokenizer(prompt, return_tensors='pt')['input_ids'].to(device)
        tokens = torch.cat([tokens, torch.tensor(word_token).unsqueeze(0).to(device)], dim=-1)
        outputs = model(tokens)
        logits = outputs[0]
        probabilities = torch.softmax(logits, dim=-1)

        if len(word_token) == 1:
            word_prob = probabilities[0, -1, word_token[0]].item()
            word_probs.append(word_prob)
        else:
            word_prob = 1.0
            for idx in range(len(word_token)):
                word_prob *= probabilities[0, -(len(word_token) - idx), word_token[idx]].item()
            word_probs.append(word_prob)

    return list(zip(words, word_probs))
# example
prompt = "Man is a doctor as a women is a"
get_likelihoods_of_words_given_context(prompt, ["nurse", "doctor", "teacher"], model, tokenizer)


[('nurse', 1.080520402501984e-10),
 ('doctor', 6.820724138378864e-06),
 ('teacher', 1.1379043641373526e-11)]

## Category classifier

In [18]:
from settings import categories

class LikelihoodBasedCategoryClassifier:

    def __init__(self, categories):
        self.categories = categories
    
    def __construct_classification_prompt(self, sentence):
        prompt = "multiple choice question: \n"
        expectation2category = {}
        for i, category in enumerate(self.categories):
            name = category["name"]
            description = category["description"]
            examples = category["examples"]
            example = examples[np.random.randint(len(examples))]
            prompt += f"{i+1}. {name}: {description} (e.g., {example})\n"
            expectation2category[i+1] = name
        prompt += f"\nWhich category does the following sentence belong to?"
        return prompt, expectation2category

    def classify(self, sentence: str) -> List[float]:
        prompt, expectations = self.__construct_classification_prompt(sentence)
        likelihoods = get_likelihoods_of_words_given_context(prompt, expectations.values(), model, tokenizer)
        return likelihoods

In [19]:
classifier = LikelihoodBasedCategoryClassifier(categories)
# take a random category 
category = categories[np.random.randint(len(categories))]
sentence = np.random.choice(category["examples"])

print(f"sentence {sentence} belongs to category {category['name']}")
likelihoods = classifier.classify(sentence)
print(likelihoods)

sentence Independence is important for me to make my own decisions. belongs to category self-direction
{'name': 'universalism', 'description': 'Understanding, appreciation, tolerance, and protection for the welfare of all people and for nature.', 'examples': ['Equality is important to me.', 'It is good to accept and try to understand those who are different from oneself.'], 'children': []}
{'name': 'self-direction', 'description': 'Independent thought and action; choosing, creating, exploring.', 'examples': ['It is good to search for the truth and think in a rational and unbiased way.', 'Independence is important for me to make my own decisions.'], 'children': []}
{'name': 'stimulation', 'description': 'It is good to experience excitement, novelty, and change.', 'examples': ['Have an exciting life: arguments towards allowing people to experience foreign places and special activities or having perspective-changing experiences', 'Have a varied life: arguments towards allowing people to e