In [None]:
!pip3 install git+https://github.com/huggingface/transformers

In [None]:
!pip3 install jupyterlab ipywidgets bertviz xformers evaluate matplotlib

# Tokenizador

In [None]:
from transformers import BertModel, BertTokenizer

# Name of the pretrained checkpoint; the bertviz cell later in the notebook reuses it.
modelName = "bert-base-uncased"

# Load the tokenizer and the model published under that checkpoint name.
tokenizer = BertTokenizer.from_pretrained(modelName)
model = BertModel.from_pretrained(modelName)

In [None]:
# Run the tokenizer on a sample sentence and print everything it returns.
tokenized = tokenizer("Leí una buena novela.")
print(tokenized)

In [None]:
# Map the numeric input ids of the encoded sentence back to their token strings.
tokens = tokenizer.convert_ids_to_tokens(tokenized["input_ids"])
print(tokens)

# Codificación posicional

In [None]:
import numpy as np
import matplotlib.pyplot as plt

def encodePositions(num_tokens, depth, n=10000):
    """Build a sinusoidal positional-encoding matrix.

    Entry ``[row, 2*i]`` is ``sin(row / n**(2*i/depth))`` and entry
    ``[row, 2*i + 1]`` is ``cos(row / n**(2*i/depth))``.

    Args:
        num_tokens: Number of positions (rows) to encode.
        depth: Size of each encoding vector (columns). When ``depth`` is odd,
            the final column holds the sine term of the last frequency
            instead of being left at zero.
        n: Base of the geometric progression of wavelengths.

    Returns:
        A float ``numpy.ndarray`` of shape ``(num_tokens, depth)``.
    """
    positionalMatrix = np.zeros((num_tokens, depth))
    # Column vector of positions so that dividing by the row of denominators
    # broadcasts to one angle per (position, frequency) pair.
    positions = np.arange(num_tokens)[:, np.newaxis]
    # One frequency per (sin, cos) pair; rounding up keeps the trailing sine
    # column when depth is odd.
    pairIndex = np.arange((depth + 1) // 2)
    denominators = np.power(n, 2 * pairIndex / depth)
    angles = positions / denominators
    positionalMatrix[:, 0::2] = np.sin(angles)
    # There are only depth // 2 cosine columns, so drop the unpaired frequency.
    positionalMatrix[:, 1::2] = np.cos(angles[:, : depth // 2])
    return positionalMatrix

In [None]:
# Encode 50 positions with 256-dimensional vectors and draw the matrix as a heat map.
positionalMatrix = encodePositions(50, 256)
# plt.matshow returns the AxesImage (not a Figure); the colorbar needs it as its mappable.
heatmap = plt.matshow(positionalMatrix)
plt.xlabel("Dimensión de la codificación")
plt.ylabel("Posición del token")
# The trailing semicolon keeps the Colorbar repr out of the cell output.
plt.gcf().colorbar(heatmap);

# Auto-atención

In [None]:
# bertviz ships its own BERT classes for the neuron view. Import them under
# aliases so they do not overwrite the BertModel / BertTokenizer names that were
# imported from transformers earlier in the notebook.
from bertviz.transformers_neuron_view import BertModel as VizBertModel
from bertviz.transformers_neuron_view import BertTokenizer as VizBertTokenizer
from bertviz.neuron_view import show

tokenizer_viz = VizBertTokenizer.from_pretrained(modelName)
model_viz = VizBertModel.from_pretrained(modelName)
# Render the neuron view for the sample sentence.
show(model_viz, "bert", tokenizer_viz, "Leí una buena novela.", display_mode="light", head=11)

In [None]:
# Same neuron view, this time for a second, longer sentence.
show(model_viz, "bert", tokenizer_viz, "La atención es una idea novedosa.", display_mode="light", head=11)

[Para seguir practicando](https://huggingface.co/spaces/exbert-project/exbert)

# Modelo GPT-2

In [None]:
from transformers import pipeline, set_seed

# Text generation samples its continuations; seeding the random number
# generators makes the outputs the same on every Restart & Run All.
set_seed(42)
# Build a text-generation pipeline on the base GPT-2 checkpoint and request
# five continuations of the sample sentence.
generator = pipeline('text-generation', model='gpt2')
generator("Leí una buena novela.", max_length=30, num_return_sequences=5, truncation=True)

In [None]:
# Second prompt, requesting five continuations with max_length raised to 300.
generator("Esta película me pareció muy larga.", max_length=300, num_return_sequences=5, truncation=True)

In [None]:
# Third prompt: a short two-word seed with max_length=100.
generator("Star Trek" , max_length=100, num_return_sequences=5, truncation=True)

# Modelo Large GPT-2

In [None]:
# Rebind `generator` to a pipeline built on the gpt2-large checkpoint and
# repeat the first prompt with the same generation arguments.
generator = pipeline('text-generation', model='gpt2-large')
generator("Leí una buena novela.", max_length=30, num_return_sequences=5, truncation=True)

# GPT2 Fine-Tuning

In [None]:
!wget https://raw.githubusercontent.com/huggingface/transformers/main/examples/pytorch/language-modeling/run_clm.py

In [None]:
!pip install transformers[torch]

In [None]:
%%bash

# Run the downloaded run_clm.py script starting from the gpt2 checkpoint on the
# imdb dataset, with training and evaluation both enabled, a per-device batch
# size of 8 for each, and /tmp/test-clm as the output directory.
python run_clm.py \
    --model_name_or_path gpt2 \
    --dataset_name imdb \
    --per_device_train_batch_size 8 \
    --per_device_eval_batch_size 8 \
    --do_train \
    --do_eval \
    --output_dir /tmp/test-clm

In [None]:
from transformers import pipeline
from transformers import GPT2Tokenizer, GPT2LMHeadModel

# Directory that the fine-tuning run used as its --output_dir. It is named
# fineTunedDir rather than `dir` so the builtin dir() is not shadowed for the
# rest of the session.
fineTunedDir = "/tmp/test-clm"
# Rebind `generator` to a pipeline that loads both model and tokenizer from
# the fine-tuned directory.
generator = pipeline(
    'text-generation',
    model=GPT2LMHeadModel.from_pretrained(fineTunedDir),
    tokenizer=GPT2Tokenizer.from_pretrained(fineTunedDir),
)
generator("Leí una buena novela.", max_length=30, num_return_sequences=5, truncation=True)

In [None]:
# Movie-review prompt sent to whatever pipeline `generator` is currently bound
# to (the fine-tuned one when the cells are run in order).
generator("Esta película me pareció muy larga.", max_length=300, num_return_sequences=5, truncation=True)


In [None]:
# Short two-word prompt sent to the pipeline currently bound to `generator`.
generator("Star Trek", max_length=100, num_return_sequences=5, truncation=True)