## Introduction to the GPT family

In [None]:
from transformers import pipeline, set_seed, AutoTokenizer, AutoModelForCausalLM, AutoModel
from torch import tensor, numel
from bertviz import model_view, head_view
import torch
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import cosine_similarity

set_seed(42)

In [None]:
# pipeline object in transformers provides easy access to transformer usage
MODEL = 'gpt2'

generator = pipeline('text-generation', model=MODEL)

# finish the sentence
generator("Hello, I'm a language model and I", max_length=30, num_return_sequences=3)

In [None]:
# load up a tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL)

'Sinan' in tokenizer.get_vocab()

In [None]:
tokenizer.encode('Sinan loves a beautiful day')

In [None]:
# encode a string and then convert the ids back into tokens. Note the Ġ character denoting a space before the token
tokenizer.convert_ids_to_tokens(tokenizer.encode('Sinan loves a beautiful day'))


In [None]:
tokenizer.encode('Sinan loves a beautiful day')  # ids

In [None]:
encoded = tokenizer.encode('Sinan loves a beautiful day', return_tensors='pt')  # as a pytorch tensor

encoded

In [None]:
# load up a tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForCausalLM.from_pretrained(MODEL)

In [None]:
model

In [None]:
encoded

In [None]:
model.transformer.wte(encoded)

In [None]:
model.transformer.wte(encoded).shape  # 1 item in batch x 6 tokens x token dimension

In [None]:
# manually create position vectors: ids 0..seq_len-1 are derived from the encoded input
# instead of being hard-coded, so the cell keeps working if the example sentence changes
model.transformer.wpe(torch.arange(encoded.shape[1]).reshape(1, -1)).shape

In [None]:
# create GPT input: token embeddings plus position embeddings
# position ids are built from the sequence length of `encoded` rather than a hard-coded 0..5,
# so any encoded sentence can be used here
position_ids = torch.arange(encoded.shape[1]).reshape(1, -1)
initial_input = model.transformer.wte(encoded) + model.transformer.wpe(position_ids)

initial_input.shape


In [None]:
initial_input = model.transformer.drop(initial_input)  # run our input through the model's initial dropout layer
initial_input

In [None]:
model.lm_head

In [None]:
# Push the hidden states through every decoder in the stack, in order.
# Element 0 of each decoder's output is taken as the updated hidden states.
for decoder_block in model.transformer.h:
    block_outputs = decoder_block(initial_input)
    initial_input = block_outputs[0]

# ...and then apply the final layer norm
initial_input = model.transformer.ln_f(initial_input)

In [None]:
initial_input

In [None]:
# same as just running through the model
# compare with a tolerance: bit-exact float equality is fragile across kernels and library versions
torch.allclose(initial_input, model(encoded, output_hidden_states=True).hidden_states[-1])

In [None]:
model(encoded).logits.shape

In [None]:
# Add up the number of elements in every parameter tensor of the model
total_params = sum(numel(weights) for weights in model.parameters())

print(f'Number of params: {total_params:,}')

## Masked multi-headed attention

In [None]:
import torch
import pandas as pd


In [None]:
phrase = 'My friend was right about this class. It is so fun!'
encoded_phrase = tokenizer(phrase, return_tensors='pt')

response = model(**encoded_phrase, output_attentions=True, output_hidden_states=True)

len(response.attentions)

In [None]:
encoded_phrase

In [None]:
response.attentions[-1].shape  # attention weights from the final decoder

In [None]:
encoded_phrase['input_ids'].shape

In [None]:
tokens = tokenizer.convert_ids_to_tokens(encoded_phrase['input_ids'][0])

tokens

In [None]:
# Layer index 9, head 0. Check out the almost 60% attention that the token "It" gives to the token "class"
arr = response.attentions[9][0][0]

n_digits = 3

# Convert to a float NumPy array and let pandas do the rounding; this avoids
# DataFrame.applymap, which is deprecated in recent pandas releases.
attention_df = pd.DataFrame(arr.detach().numpy().astype(float)).round(n_digits)

# Rows and columns are both labelled with the tokens of the phrase
attention_df.columns = tokens
attention_df.index = tokens

attention_df


In [None]:
head_view(response.attentions, tokens)

In [None]:
tokens = tokenizer.convert_ids_to_tokens(encoded_phrase['input_ids'][0]) 
model_view(response.attentions, tokens)

In [None]:
response.hidden_states[-1].shape

In [None]:
response.logits

In [None]:
response.logits.shape

In [None]:
# look at the top next token in the auto-regressive language modelling task
pd.DataFrame(
    zip(tokens, tokenizer.convert_ids_to_tokens(response.logits.argmax(2)[0])), 
    columns=['Sequence up until', 'Next token with highest probability']
)

In [None]:
# Sample five alternatives for the single next token after the prompt.
# `max_length` counts the prompt tokens as well, so a value that only matches the prompt length
# leaves no room to generate; `max_new_tokens=1` asks for exactly one new token.
generator('My friend was right', max_new_tokens=1, num_return_sequences=5)

In [None]:
generator(phrase, max_length=20, num_return_sequences=1, do_sample=False)  # greedy search

In [None]:
generator(phrase, max_length=20, num_return_sequences=1, do_sample=True)  # sample from the next-token distribution instead of greedy search

## Pre-training GPT

In [None]:
from transformers import pipeline, set_seed
from torch import tensor

generator = pipeline('text-generation', model=MODEL, tokenizer=tokenizer)
set_seed(0)

In [None]:
# Bias
generator("The holocaust was", max_length=10, num_return_sequences=10, temperature=0.8, num_beams=2)

In [None]:
generator("Jewish people are", max_length=10, num_return_sequences=10, temperature=0.8, num_beams=2)

In [None]:
generator("Christian people are", max_length=10, num_return_sequences=10, temperature=0.8, num_beams=2)

In [None]:
generator("The earth is", max_length=10, num_return_sequences=10, temperature=0.8, num_beams=2)

## Few-shot learning

In [None]:
print(generator("""Sentiment Analysis
Text: I hate it when my phone battery dies.
Sentiment: Negative
###
Text: My day has been really great!
Sentiment: Positive
###
Text: Not a fan when it is cloudy
Sentiment:""", top_k=2, temperature=0.1, max_length=55)[0]['generated_text'])

In [None]:
print(generator("""Question/Answering
C: Google was founded in 1998 by Larry Page and Sergey Brin while they were Ph.D. students at Stanford University in California. Together they own about 14 percent of its shares and control 56 percent of the stockholder voting power through supervoting stock.
Q: When was Google founded?
A: 1998
###
C: Hugging Face is a company which develops social AI-run chatbot applications. It was established in 2016 by Clement Delangue and Julien Chaumond. The company is based in Brooklyn, New York, United States.
Q: What does Hugging Face develop?
A: social AI-run chatbot applications
###
C: The New York Jets are a professional American football team based in the New York metropolitan area. The Jets compete in the National Football League (NFL) as a member club of the league's American Football Conference (AFC) East division.
Q: What division do the Jets play in?
A:""", top_k=5, num_beams=2, max_length=215, temperature=0.5)[0]['generated_text'])


In [None]:
## Zero Shot Learning

In [None]:
# Same question as before, but with no previous examples, i.e. zero-shot learning. Hit or miss
print(generator(
    '''Question/Answering
C: The New York Jets are a professional American football team based in the New York metropolitan area. The Jets compete in the National Football League (NFL) as a member club of the league's American Football Conference (AFC) East division.
Q: What division do the Jets play in?
A:''',
    top_k=5, num_beams=2, max_length=80, temperature=0.5)[0]['generated_text']
)

In [None]:
# Zero-shot doesn't work as well with the sentiment analysis example
print(generator("""Sentiment Analysis
Text: This new music video was so good
Sentiment:""", top_k=2, temperature=0.1, max_length=55)[0]['generated_text'])

In [None]:
# Zero-shot abstractive summarization

In [None]:
to_summarize = """This training will focus on how the GPT family of models are used for NLP tasks including abstractive text summarization and natural language generation. The training will begin with an introduction to necessary concepts including masked self attention, language models, and transformers and then build on those concepts to introduce the GPT architecture. We will then move into how GPT is used for multiple natural language processing tasks with hands-on examples of using pre-trained GPT-2 models as well as fine-tuning these models on custom corpora.

GPT models are some of the most relevant NLP architectures today and it is closely related to other important NLP deep learning models like BERT. Both of these models are derived from the newly invented transformer architecture and represent an inflection point in how machines process language and context.

The Natural Language Processing with Next-Generation Transformer Architectures series of online trainings provides a comprehensive overview of state-of-the-art natural language processing (NLP) models including GPT and BERT which are derived from the modern attention-driven transformer architecture and the applications these models are used to solve today. All of the trainings in the series blend theory and application through the combination of visual mathematical explanations, straightforward applicable Python examples within hands-on Jupyter notebook demos, and comprehensive case studies featuring modern problems solvable by NLP models. (Note that at any given time, only a subset of these classes will be scheduled and open for registration.)"""

In [None]:
print(generator(
    f"""Summarization Task:\n{to_summarize}\nTL;DR:""", 
    max_length=400, num_beams=5, temperature=0.7
)[0]['generated_text'].split('TL;DR:')[1])


In [None]:

# Initialize the GPT-2 model without the next token predictor (the causal part)
model = AutoModel.from_pretrained(MODEL)


In [None]:
def get_embedding(sentence, word):
    """Return the final hidden state of `word` as it appears in `sentence`.

    Uses the notebook-level `tokenizer` and `model` objects.

    Args:
        sentence: Text to run through the model.
        word: Word to locate. Only its FIRST sub-token is searched for, and the
            first matching position in the sentence is used. Pass the word with
            its leading space (e.g. ' python') when it does not start the sentence.

    Returns:
        A NumPy array with a single row (reshaped to (1, -1)) holding the hidden
        state at the matched position, or None if no token matches, in which
        case a message is printed.
    """
    inputs = tokenizer(sentence, return_tensors='pt', add_special_tokens=True)
    token_ids = inputs['input_ids'][0].tolist()

    word_ids = tokenizer.encode(word)
    try:
        # Find the first occurrence of the word's first sub-token; this gives the index.
        # IndexError covers a `word` that encodes to no ids at all (e.g. an empty string),
        # ValueError covers a word that does not occur in the sentence.
        token_idx = token_ids.index(word_ids[0])
    except (IndexError, ValueError):
        print(f"Word '{word}' not found in tokens.")
        return None

    print(f"Token index of {word} is {token_idx}")

    # Inference only: no autograd graph is needed for a forward pass we just read from.
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state[0][token_idx].numpy().reshape(1, -1)


In [None]:
generator("I love my pet python", max_length=30, num_return_sequences=3)

In [None]:
# Contextual embeddings of " python" in two sentences, plus two stand-alone reference words
python_pet_embedding = get_embedding('I love my pet python', ' python')
python_language_embedding = get_embedding('I love coding in python', ' python')
snake_alone_embedding = get_embedding('snake', 'snake')
coding_alone_embedding = get_embedding('coding', 'coding')

# Cosine similarity of each contextual embedding against each reference word
similarity_pet_snake = cosine_similarity(python_pet_embedding, snake_alone_embedding)
similarity_language_snake = cosine_similarity(python_language_embedding, snake_alone_embedding)
similarity_pet_coding = cosine_similarity(python_pet_embedding, coding_alone_embedding)
similarity_language_coding = cosine_similarity(python_language_embedding, coding_alone_embedding)


# Two side-by-side panels: pet context on the left, language context on the right
fig, axs = plt.subplots(1, 2, figsize=(15, 6))

labels_pet = ['Pet-Snake', 'Pet-Coding']
values_pet = [similarity_pet_snake[0][0], similarity_pet_coding[0][0]]

labels_language = ['Language-Snake', 'Language-Coding']
values_language = [similarity_language_snake[0][0], similarity_language_coding[0][0]]

panels = [
    (axs[0], labels_pet, values_pet, 'Cosine Similarity for "Python" in Pet Context (GPT-2)'),
    (axs[1], labels_language, values_language, 'Cosine Similarity for "Python" in Language Context (GPT-2)'),
]

# Both panels are drawn the same way: horizontal bars, each annotated with its value
for ax, labels, values, title in panels:
    ax.barh(labels, values, color='skyblue')
    ax.set_xlabel('Cosine Similarity')
    ax.set_title(title)
    for i, v in enumerate(values):
        ax.text(v, i, f" {v:.2f}", va='center', color='blue')

# Show the plots
plt.tight_layout()
plt.show()


In [None]:
# We don't see the same stark difference that we did with BERT. For one, GPT-2 isn't THAT good of a language model
#  on its own, and secondly, auto-regressive models just aren't as good at this task at the same parameter level

# GPT-2 and BERT have roughly the same number of parameters, but BERT is just WAY better at context clues than GPT-2 is
# GPT-2 would need many more parameters to be as good as BERT... i.e. GPT-3, ChatGPT, and GPT-4

In [None]:

# Embeddings: each variable is named after the sense its sentence describes
python_language_embedding = get_embedding('python, a programming language, is great.', 'python')
python_pet_embedding = get_embedding('python, and by python I mean the snake, is a super cool animal.', 'python')

# When python is the first word, the embedding is exactly the same no matter what comes next
(python_pet_embedding == python_language_embedding).all()

In [None]:

# Embeddings: each variable is named after the sense its sentence describes
python_language_embedding = get_embedding('What is a python? A programming language.', ' python')
python_pet_embedding = get_embedding('What is a python? A snake.', ' python')

# Even with the words (albeit the same for each sentence) before python, the embedding is the same
(python_pet_embedding == python_language_embedding).all()


In [None]:
python_pet_embedding[0][:10]

In [None]:
python_language_embedding[0][:10]