<a href="https://colab.research.google.com/github/Alanawd/AIDS/blob/main/transformers.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Tokenize

In [None]:
!pip install transformers

In [None]:
!pip install datasets

In [None]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

In [None]:
# Encode three sentences as a single batch. padding=True asks the tokenizer
# to pad the shorter sentences so every row has the same length, and
# return_tensors='pt' requests PyTorch tensors instead of Python lists.
encoded_input = tokenizer(
    [
        "Hello, I'm a single sentence!",
        "I'm a second sentence",
        "I loved the show!",
    ],
    padding=True,
    return_tensors='pt',
)
encoded_input

In [None]:
# Turn the token ids of the third sentence (batch index 2) back into text.
tokenizer.decode(encoded_input['input_ids'][2])

In [None]:
# Token ids of the second sentence (batch index 1), keeping only the
# positions whose attention-mask entry is 1 — presumably this drops the
# padding added by padding=True; confirm against the encoded output.
encoded_input['input_ids'][1][encoded_input['attention_mask'][1]==1]

In [None]:
# Show the first sentence (batch index 0) as token strings rather than ids.
tokenizer.convert_ids_to_tokens(encoded_input["input_ids"][0])

# Load dataset

In [None]:
import transformers
from datasets import list_datasets


In [None]:
# Collect the dataset identifiers returned by `list_datasets()`
# (the catalogue is browsable at the URL below).
datasets_list = list_datasets()
# Print the count explicitly: a bare expression in the middle of a cell is
# evaluated but never displayed, so a plain `len(...)` line shows nothing.
print(len(datasets_list))
# The list can be joined directly; no wrapping generator is needed.
print(', '.join(datasets_list))
# https://huggingface.co/datasets

In [None]:
from datasets import load_dataset
dataset = load_dataset('trec')

In [None]:
dataset

In [None]:
dataset['train'].shuffle()[:5]

In [None]:
dataset['train'].features

In [None]:
dataset['train'].features['label-coarse'].names

In [None]:
# Keep only the coarse label: discard the fine-grained column, then expose
# the coarse one under the name `labels`, which later cells read
# (e.g. `features['labels']`).
dataset = (
    dataset
    .remove_columns("label-fine")
    .rename_column('label-coarse', 'labels')
)

In [None]:
dataset['train'].features

In [None]:
def tokenize_function(examples):
    """Tokenize the ``text`` field of a batch of examples.

    Args:
        examples: A mapping with a ``"text"`` entry holding the raw strings
            (this function is passed to ``dataset.map(..., batched=True)``).

    Returns:
        The tokenizer's output for those strings, padded to a fixed length.
    """
    # padding="max_length" gives every row the same length; truncation=True
    # additionally cuts any text longer than that limit, so an over-long
    # example can never produce a sequence the model cannot accept.
    return tokenizer(examples["text"], padding="max_length", truncation=True)

tokenized_datasets = dataset.map(tokenize_function, batched=True)
tokenized_datasets

In [None]:
# Held-out split, used for evaluation.
eval_dataset = tokenized_datasets["test"]

# Full training split, plus a randomly shuffled 500-example subset of it
# that the Trainer cells below use as their training set.
train_dataset = tokenized_datasets['train']
small_train_dataset = train_dataset.shuffle().select(range(500))

In [None]:
from transformers import AutoModelForSequenceClassification

# num_labels=6 sizes the classification head — presumably one output per
# coarse TREC label; confirm against the label names listed earlier.
# n_layers=3 overrides the checkpoint's config value (it is read back via
# `model.config.n_layers` in a later cell).
# NOTE(review): this looks like it keeps only three transformer layers to
# get a smaller, faster model — confirm.
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=6, n_layers=3)

In [None]:
import torch

# Prefer the GPU, but fall back to the CPU so this cell does not raise on a
# runtime without CUDA.
model.to('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
model.config.n_layers

In [None]:
from transformers import TrainingArguments

# Training configuration: output directory, evaluation once per epoch,
# ten epochs, and a per-device training batch size of 32.
# NOTE(review): recent transformers releases rename `evaluation_strategy`
# to `eval_strategy` — confirm against the installed version.
training_args = TrainingArguments(
    output_dir="uni_twitter/bert_trainer/",
    evaluation_strategy="epoch",
    num_train_epochs=10,
    per_device_train_batch_size=32,
)

In [None]:
training_args

In [None]:
from transformers import Trainer

# First Trainer: wires the model to the 500-example training subset and the
# test split. No metric function is attached here; the notebook builds a
# second Trainer with `compute_metrics` before training starts.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=small_train_dataset,
    eval_dataset=eval_dataset,
)

In [None]:
import numpy as np
from datasets import load_metric

metric = load_metric("accuracy")

def compute_metrics(eval_pred):
    """Score model outputs with the notebook-level ``metric`` object.

    Args:
        eval_pred: A pair that unpacks into ``(logits, labels)``.

    Returns:
        Whatever ``metric.compute`` returns when given the predicted classes
        and the reference labels.
    """
    scores, references = eval_pred
    # The predicted class is the position of the largest score on the last axis.
    predicted_classes = np.argmax(scores, axis=-1)
    return metric.compute(predictions=predicted_classes, references=references)

In [None]:
model.device

In [None]:
import torch

sentence = 'Where are my keys?'
tokens = tokenizer(sentence, return_tensors='pt')

# Inference only: put the model in eval mode, skip gradient tracking, and
# pass the whole encoding (not just the input ids) on the model's device.
model.eval()
with torch.no_grad():
    preds = model(**{key: value.to(model.device) for key, value in tokens.items()})[0]
idx = preds.argmax(-1)

# Print the winning label explicitly — a bare expression in the middle of a
# cell is evaluated but never displayed.
print('predicted:', small_train_dataset.features['labels'].names[idx.item()])

# Probability the model assigns to each label for this sentence.
for name, prob in zip(small_train_dataset.features['labels'].names, preds.softmax(-1).cpu().numpy()[0]):
    print(name, prob)
    

In [None]:
# Rebuild the Trainer, this time with `compute_metrics` attached, and then
# run fine-tuning on the 500-example training subset.
trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=small_train_dataset,
    eval_dataset=eval_dataset,
)
trainer.train()

In [None]:
import torch

sentence = 'Where are my keys?'
tokens = tokenizer(sentence, return_tensors='pt')

# Inference only: put the model in eval mode (training may have left it in
# train mode), skip gradient tracking, and pass the whole encoding (not just
# the input ids) on the model's device.
model.eval()
with torch.no_grad():
    preds = model(**{key: value.to(model.device) for key, value in tokens.items()})[0]
idx = preds.argmax(-1)

# Print the winning label explicitly — a bare expression in the middle of a
# cell is evaluated but never displayed.
print('predicted:', small_train_dataset.features['labels'].names[idx.item()])

# Probability the model assigns to each label for this sentence.
for name, prob in zip(small_train_dataset.features['labels'].names, preds.softmax(-1).cpu().numpy()[0]):
    print(name, prob)

In [None]:
trainer.evaluate()

In [None]:
def get_prediction(text):
    """Classify ``text`` with the notebook's current ``model`` and ``tokenizer``.

    Args:
        text: A single string, or a list of strings. When several strings
            are given, the single most confident prediction across all of
            them is returned.

    Returns:
        A ``(label_name, probability)`` tuple: the name of the predicted
        label and its softmax probability as a Python float.
    """
    import torch  # local import keeps this cell self-contained

    # Pad so a list of unequal-length texts can be stacked into one tensor,
    # and truncate anything longer than the model accepts.
    inputs = tokenizer(text, padding=True, truncation=True, return_tensors='pt')
    inputs = {name: tensor.to(model.device) for name, tensor in inputs.items()}

    # Inference only: eval mode and no gradient bookkeeping. Passing the
    # full encoding hands the attention mask to the model along with the ids.
    model.eval()
    with torch.no_grad():
        logits = model(**inputs)[0]
    # Softmax over the label axis turns the scores into probabilities.
    probs = logits.softmax(dim=-1)

    # argmax() with no axis returns a position in the flattened
    # (batch, labels) matrix; reduce it modulo the number of labels so it is
    # a valid label index even when more than one text was passed.
    flat_position = probs.argmax().item()
    label_index = flat_position % probs.shape[-1]
    label_names = small_train_dataset.features['labels'].names
    return label_names[label_index], probs.max().item()

In [None]:
get_prediction(['How are you?'])

# GPT-2 Text Generation

In [None]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")

In [None]:
tokenized = tokenizer("what are you going to do?", return_tensors='pt')
tokenized

In [None]:
from transformers import AutoModelForCausalLM

import torch

# Reuse the end-of-sequence id as the padding id — presumably because the
# GPT-2 tokenizer defines no dedicated pad token; confirm.
model = AutoModelForCausalLM.from_pretrained("gpt2", pad_token_id = tokenizer.eos_token_id)
# Prefer the GPU, but fall back to the CPU so this cell does not raise on a
# runtime without CUDA.
model.to('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
# Tokenize a short phrase, then decode every id on its own to see how the
# text was split into pieces.
ids = tokenizer("The shawshank")['input_ids']
[tokenizer.decode(token_id) for token_id in ids]

In [None]:
# Prompt that the generation cell below passes to the model.
text = "I lost my keys"
# encode() yields the token ids; return_tensors='pt' requests them as a
# PyTorch tensor.
input_ids = tokenizer.encode(text, return_tensors='pt')
# tokenizer.convert_ids_to_tokens(input_ids[0])
input_ids

In [None]:
# Beam-search generation from the prompt: 10 beams, and no 2-gram may be
# repeated in the output.
output = model.generate(
    input_ids.to(model.device),
    # A sequence cannot be longer than the model's position-embedding table
    # (1024 positions for GPT-2), so cap the requested length there; asking
    # for more risks an out-of-range position index if no end-of-sequence
    # token is produced first.
    max_length=min(10000, model.config.max_position_embeddings),
    num_beams=10,
    no_repeat_ngram_size=2,
)

In [None]:
tokenizer.decode(output.squeeze(0))