# **Introducción a la familia GPT**

In [1]:
!pip install transformers




In [2]:
!pip install torch --upgrade




In [3]:
!pip install bertviz


Collecting bertviz
  Downloading bertviz-1.4.0-py3-none-any.whl (157 kB)
[K     |████████████████████████████████| 157 kB 6.7 MB/s eta 0:00:01
Collecting sentencepiece
  Downloading sentencepiece-0.1.99-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[K     |████████████████████████████████| 1.3 MB 36.7 MB/s eta 0:00:01
Installing collected packages: sentencepiece, bertviz
Successfully installed bertviz-1.4.0 sentencepiece-0.1.99


In [4]:
from transformers import pipeline, set_seed, GPT2Tokenizer, GPT2LMHeadModel
from torch import tensor, numel, isclose
from bertviz import model_view

# Needed to make the random values consistent and allow comparisons between different training runs.
set_seed(42)

In [5]:
# Build a text-generation pipeline on top of the pretrained 'gpt2' checkpoint.
generator = pipeline(task='text-generation', model='gpt2')

# Ask for three alternative continuations of an English prompt.
generator(
    "Hello, I'm a language model and I",
    num_return_sequences=3,
    max_length=30,
)

Downloading (…)lve/main/config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': "Hello, I'm a language model and I want to be able to create the tools to communicate with people in real time using a Web model (or"},
 {'generated_text': "Hello, I'm a language model and I'm happy with this.\n\nWith Clojure and its Clojure equivalent, we can use an expression as the"},
 {'generated_text': 'Hello, I\'m a language model and I can solve these problems with my own code," he says. "To solve these problems I just write code'}]

In [6]:
# Re-create the text-generation pipeline from the same 'gpt2' checkpoint.
generator = pipeline(task='text-generation', model='gpt2')

# GPT-2 is not good at Spanish, so it cannot really be used for Spanish text:
# ask for three continuations of a Spanish prompt to see what comes out.
generator(
    "Hola, Soy un modelo de lenguaje grande, y yo",
    num_return_sequences=3,
    max_length=30,
)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': 'Hola, Soy un modelo de lenguaje grande, y yo que seguent lambrado da que la cami de'},
 {'generated_text': 'Hola, Soy un modelo de lenguaje grande, y yo en la vista. Nuen a vez, y un'},
 {'generated_text': 'Hola, Soy un modelo de lenguaje grande, y yo y la suo. Soy de lenguaje grande'}]

In [7]:
# Load the pretrained GPT-2 tokenizer.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

# Is the accented name present as a single entry in the tokenizer's vocabulary?
gpt2_vocab = tokenizer.get_vocab()
'Jesús' in gpt2_vocab

False

In [8]:
# Encode the word to ids, then map those ids back to their token strings.
name_ids = tokenizer.encode('Jesús')
tokenizer.convert_ids_to_tokens(name_ids)

['Jes', 'Ãº', 's']

In [9]:
# Same round trip (text -> ids -> token strings) for a full Spanish sentence.
spanish_ids = tokenizer.encode('Jesús está trabajando con Colab')
tokenizer.convert_ids_to_tokens(spanish_ids)

['Jes',
 'Ãº',
 's',
 'Ġest',
 'Ã¡',
 'Ġtr',
 'ab',
 'aj',
 'ando',
 'Ġcon',
 'ĠCol',
 'ab']

In [10]:
# 'Ġ' indicates a space, i.e. the beginning of a new word.
# 'ab' is the second token of the word "Colab", which is why it carries no 'Ġ'.
english_ids = tokenizer.encode('Jesus is working with Colab')
tokenizer.convert_ids_to_tokens(english_ids)

['Jesus', 'Ġis', 'Ġworking', 'Ġwith', 'ĠCol', 'ab']

In [11]:
# Show the integer token ids the tokenizer assigns to the English sentence.
tokenizer.encode('Jesus is working with Colab')

[28219, 318, 1762, 351, 1623, 397]

In [12]:
# Encode the same sentence again, this time asking for a PyTorch tensor ('pt').
sentence = 'Jesus is working with Colab'
encoded = tokenizer.encode(sentence, return_tensors='pt')

# Display the resulting tensor of token ids.
encoded

tensor([[28219,   318,  1762,   351,  1623,   397]])

In [13]:
# Load the tokenizer and the language-model-head network from one shared checkpoint name.
checkpoint = 'gpt2'
tokenizer = GPT2Tokenizer.from_pretrained(checkpoint)
model = GPT2LMHeadModel.from_pretrained(checkpoint)

In [14]:
# Display the module tree of the loaded GPT-2 model.
model

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [None]:
# (wte): Embedding(50257, 768)  Token embedding: 50257 possible tokens in the vocabulary, each represented by a 768-dimensional vector.
# (wpe): Embedding(1024, 768)   Position embedding: GPT uses it to understand the order of the tokens in a sequence; sequences of up to 1024 tokens.
# Unlike BERT, GPT has no token-type (segment) embedding.
# Dropout: to prevent overfitting during training, part of the neurons is randomly switched off on every iteration; in this case 10% of the features.
# ModuleList: 12 layers. GPT only uses the decoder part of the transformer and is unidirectional.


In [15]:
# Look up the token embedding for every id in the encoded sentence and inspect its shape.
token_embeddings = model.transformer.wte(encoded)
token_embeddings.shape

torch.Size([1, 6, 768])

In [16]:
# Position ids 0..seq_len-1 are derived from the encoded sentence instead of being
# hard-coded, so this cell keeps working if the sentence (and its token count) changes.
position_ids = tensor(list(range(encoded.shape[-1]))).reshape(1, -1)

# Look up the position embedding for every position and inspect its shape.
model.transformer.wpe(position_ids).shape

torch.Size([1, 6, 768])

In [17]:
# Position ids 0..seq_len-1 are derived from the encoded sentence instead of being
# hard-coded, so the sum below cannot go out of sync with the number of tokens.
position_ids = tensor(list(range(encoded.shape[-1]))).reshape(1, -1)

# Input to the transformer blocks: token embeddings plus position embeddings.
initial_input = model.transformer.wte(encoded) + model.transformer.wpe(position_ids)

initial_input.shape

torch.Size([1, 6, 768])

In [18]:
# Pass the summed embeddings through the model's embedding dropout layer (`drop`, p=0.1 in the printout above).
# NOTE(review): the later equality check against the model's own output suggests dropout is
# inactive here (model presumably in eval mode) — confirm.
initial_input = model.transformer.drop(initial_input)
# Display the resulting tensor.
initial_input

tensor([[[-0.1652, -0.3763,  0.1320,  ..., -0.1793,  0.0511,  0.0114],
         [ 0.0142, -0.0437, -0.0393,  ...,  0.1487, -0.0278, -0.0255],
         [-0.0494, -0.0113,  0.1260,  ..., -0.0617,  0.1436, -0.1091],
         [ 0.0641, -0.0634,  0.1348,  ...,  0.0502,  0.1263,  0.0279],
         [ 0.1671,  0.0801,  0.3194,  ..., -0.0437,  0.0439, -0.1914],
         [-0.1401, -0.0309,  0.0758,  ...,  0.1693, -0.1506, -0.0306]]],
       grad_fn=<AddBackward0>)

In [19]:
# Display the language-model head: a bias-free linear layer from 768 features to 50257 outputs.
model.lm_head

Linear(in_features=768, out_features=50257, bias=False)

In [20]:
# Run the embeddings through every transformer block in order; the first element
# of each block's output is carried forward as the next hidden state.
hidden = initial_input
for module in model.transformer.h:
    block_outputs = module(hidden)
    hidden = block_outputs[0]

# Finish with the final layer normalisation (`ln_f`).
initial_input = model.transformer.ln_f(hidden)

In [21]:
# Check that the manual forward pass reproduces the model's own last hidden state.
# The comparison uses a floating-point tolerance (torch.isclose defaults) rather than
# exact equality, which can fail on tiny numerical differences between execution paths.
reference_hidden = model(encoded, output_hidden_states=True).hidden_states[-1]
isclose(initial_input, reference_hidden).all()

tensor(True)

In [22]:
# Add up the element count of every parameter tensor in the model.
total_params = sum(numel(weights) for weights in model.parameters())

# Report the total with thousands separators.
print(f'Number of params: {total_params:,}')

Number of params: 124,439,808
