<a href="https://colab.research.google.com/github/CinthiaS/distill-roberta/blob/main/DistillRoBERTa.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install transformers
!pip install sentencepiece
!pip install datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.29.0-py3-none-any.whl (7.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.1/7.1 MB[0m [31m34.4 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0 (from transformers)
  Downloading huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m19.8 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m47.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.14.1 tokenizers-0.13.3 transformers-4.29.0
Looking in in

In [2]:
import torch
from transformers import RobertaConfig, RobertaForMaskedLM, RobertaTokenizer, BertConfig, BertForMaskedLM, BertTokenizer

# Set up the two models used for distillation.
# Teacher: pretrained RoBERTa masked-LM together with its tokenizer; the tokenizer's
# vocabulary size is kept so the student can be resized to match it.
teacher_tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
teacher_model = RobertaForMaskedLM.from_pretrained('roberta-base')
roberta_vocab_size = teacher_tokenizer.vocab_size

# Student: pretrained BERT masked-LM whose token embeddings are resized to the teacher's
# vocabulary size (presumably so both models produce logits over the same number of tokens).
# NOTE(review): student_tokenizer is not used by any cell visible here; the inputs are
# tokenized with teacher_tokenizer only — confirm whether it is still needed.
# NOTE(review): the student's pretrained embedding rows were presumably learned for BERT's
# own vocabulary ids, while it will receive RoBERTa ids — confirm this is intended.
student_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
student_model = BertForMaskedLM.from_pretrained('bert-base-uncased')
student_model.resize_token_embeddings(roberta_vocab_size)

# Read the vocabulary size back from the student's config to check the resize and print it.
new_bert_vocab_size = student_model.config.vocab_size
print("Novo tamanho do vocabulário do BERT:", new_bert_vocab_size)

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/501M [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Novo tamanho do vocabulário do BERT: 50265


In [3]:
# Tiny hand-written corpus of Portuguese sentences used as training data.
texts = [
    "Eu amo esse filme!",
    "Que dia maravilhoso!",
    "Estou muito feliz com essa notícia.",
    "Esse livro é incrível!",
    "Não gostei desse restaurante.",
    "Que péssima experiência!",
    "Estou muito chateado com essa situação."
]

# Tokenize the whole corpus in a single batched call to the teacher (RoBERTa) tokenizer.
# Every sentence is padded/truncated to exactly 512 ids, so both tensors are (len(texts), 512).
# NOTE(review): padding=True would pad only to the longest sentence — confirm whether the
# fixed 512-token length is actually required.
encoded_batch = teacher_tokenizer(
    texts,
    add_special_tokens=True,
    max_length=512,
    padding='max_length',
    truncation=True,
    return_attention_mask=True,
    return_tensors='pt',
)
input_ids = encoded_batch['input_ids']
attention_masks = encoded_batch['attention_mask']

# One integer label per sentence, in the same order as `texts`
# (presumably 1 = positive, 0 = negative sentiment).
labels = torch.tensor([1, 1, 1, 1, 0, 0, 0])

# Bundle the tensors into a dataset and serve it in shuffled batches of up to 32 rows.
train_dataset = torch.utils.data.TensorDataset(input_ids, attention_masks, labels)
train_dataloader = torch.utils.data.DataLoader(
    train_dataset,
    batch_size=32,
    shuffle=True,
)

In [None]:
# Define the optimizer and the loss function.
# Adam updates only the student's parameters; the teacher is never optimized.
optimizer = torch.optim.Adam(student_model.parameters(), lr=1e-4)
# KLDivLoss takes log-probabilities as input and probabilities as target.
# reduction='batchmean' sums the divergence and divides by the size of the first dimension.
# The default 'mean' averages over every element (including the vocabulary axis), which the
# PyTorch documentation states is not the true KL value and which makes gradients vanishingly small.
criterion = torch.nn.KLDivLoss(reduction='batchmean')

# Define the training function
def _drop_padding(logits, attention_mask):
    """Return only the logit rows of non-padding positions.

    When the leading dimensions of ``logits`` do not match ``attention_mask``
    (e.g. one logit vector per sequence), the logits are returned unchanged.
    """
    if logits.shape[:-1] != attention_mask.shape:
        return logits
    # Boolean indexing flattens (batch, seq, vocab) into (n_real_tokens, vocab).
    return logits[attention_mask.bool()]


def train(teacher_model, student_model, dataloader, optimizer, criterion, temperature):
    """Run one pass over ``dataloader``, distilling the teacher's logits into the student.

    Args:
        teacher_model: Model called as ``model(input_ids=..., attention_mask=...)`` whose
            result exposes ``.logits``; put in eval mode and run without gradients.
        student_model: Model with the same calling convention; put in train mode and updated.
        dataloader: Iterable yielding ``(input_ids, attention_mask, labels)`` batches.
            ``labels`` is not used by the distillation loss.
        optimizer: Optimizer holding the student's parameters.
        criterion: Loss called as ``criterion(student_log_probs, teacher_probs)``.
        temperature: Divisor applied to both sets of logits before the softmax.

    Returns:
        The ``student_model`` object that was passed in, after the updates.
    """
    teacher_model.eval()
    student_model.train()
    for input_ids, attention_mask, labels in dataloader:
        optimizer.zero_grad()
        with torch.no_grad():
            teacher_logits = teacher_model(input_ids=input_ids, attention_mask=attention_mask).logits
        student_logits = student_model(input_ids=input_ids, attention_mask=attention_mask).logits

        # Padding positions carry no information; leaving them in lets them dominate the
        # loss whenever sequences are padded far beyond their real length.
        teacher_logits = _drop_padding(teacher_logits, attention_mask) / temperature
        student_logits = _drop_padding(student_logits, attention_mask) / temperature
        if student_logits.numel() == 0:
            # A batch with no real token would produce a NaN loss; skip it.
            continue

        loss = criterion(
            torch.nn.functional.log_softmax(student_logits, dim=-1),
            torch.nn.functional.softmax(teacher_logits, dim=-1),
        )
        loss.backward()
        optimizer.step()

    return student_model

def evaluate(model, dataloader, criterion, teacher_model=None, temperature=1.0):
    """Compute the average loss and an accuracy figure for ``model`` over ``dataloader``.

    Two modes are supported:

    * ``teacher_model is None`` (classification mode): ``criterion(logits, labels)`` is the
      loss and accuracy is the share of rows whose arg-max over dimension 1 equals ``labels``.
    * ``teacher_model`` given (distillation mode): padding positions are dropped using
      ``attention_mask``; the loss is ``criterion(log_softmax(student / T), softmax(teacher / T))``
      and accuracy is the share of real tokens where student and teacher pick the same
      top token. ``labels`` is ignored in this mode.

    Args:
        model: Model called as ``model(input_ids=..., attention_mask=...)`` exposing ``.logits``.
        dataloader: Iterable yielding ``(input_ids, attention_mask, labels)`` batches.
        criterion: Loss callable, used as described above.
        teacher_model: Optional reference model that enables distillation mode.
        temperature: Divisor applied to both sets of logits in distillation mode.

    Returns:
        ``(average_loss, accuracy)``; both are ``nan`` when no batch could be scored.
    """
    model.eval()
    if teacher_model is not None:
        teacher_model.eval()

    total_loss = 0.0
    correct_predictions = 0
    total_predictions = 0
    scored_batches = 0

    with torch.no_grad():
        for input_ids, attention_mask, labels in dataloader:
            logits = model(input_ids=input_ids, attention_mask=attention_mask).logits

            if teacher_model is None:
                _, predictions = torch.max(logits, dim=1)
                targets = labels
                batch_size = len(labels)
                loss = criterion(logits, labels)
            else:
                token_mask = attention_mask.bool()
                if not token_mask.any():
                    # Nothing but padding in this batch: there is nothing to score.
                    continue
                teacher_logits = teacher_model(input_ids=input_ids, attention_mask=attention_mask).logits
                # Boolean indexing keeps real tokens only: (n_real_tokens, vocab).
                student_tokens = logits[token_mask]
                teacher_tokens = teacher_logits[token_mask]
                predictions = student_tokens.argmax(dim=-1)
                targets = teacher_tokens.argmax(dim=-1)
                batch_size = targets.numel()
                loss = criterion(
                    torch.nn.functional.log_softmax(student_tokens / temperature, dim=-1),
                    torch.nn.functional.softmax(teacher_tokens / temperature, dim=-1),
                )

            total_loss += loss.item()
            correct_predictions += torch.sum(predictions == targets).item()
            total_predictions += batch_size
            scored_batches += 1

    # Avoid a ZeroDivisionError on an empty loader; nan marks the metrics as undefined.
    average_loss = total_loss / scored_batches if scored_batches else float('nan')
    accuracy = correct_predictions / total_predictions if total_predictions else float('nan')

    return average_loss, accuracy


# Train the model
num_epochs = 1
temperature = 2.0
for epoch in range(num_epochs):
    train(teacher_model, student_model, train_dataloader, optimizer, criterion, temperature)
    # The student is a masked-LM with token-level logits, so it is scored against the
    # teacher's outputs rather than against the per-sentence labels.
    average_loss, accuracy = evaluate(
        student_model,
        train_dataloader,
        criterion,
        teacher_model=teacher_model,
        temperature=temperature,
    )




In [None]:
# Define the optimizer and the loss function.
# Adam updates only the student's parameters; the teacher is never optimized.
optimizer = torch.optim.Adam(student_model.parameters(), lr=1e-4)
# KLDivLoss takes log-probabilities as input and probabilities as target.
# reduction='batchmean' sums the divergence and divides by the size of the first dimension.
# The default 'mean' averages over every element (including the vocabulary axis), which the
# PyTorch documentation states is not the true KL value and which makes gradients vanishingly small.
criterion = torch.nn.KLDivLoss(reduction='batchmean')

# Define the training function
def _drop_padding(logits, attention_mask):
    """Return only the logit rows of non-padding positions.

    When the leading dimensions of ``logits`` do not match ``attention_mask``
    (e.g. one logit vector per sequence), the logits are returned unchanged.
    """
    if logits.shape[:-1] != attention_mask.shape:
        return logits
    # Boolean indexing flattens (batch, seq, vocab) into (n_real_tokens, vocab).
    return logits[attention_mask.bool()]


def train(teacher_model, student_model, dataloader, optimizer, criterion, temperature):
    """Run one pass over ``dataloader``, distilling the teacher's logits into the student.

    Args:
        teacher_model: Model called as ``model(input_ids=..., attention_mask=...)`` whose
            result exposes ``.logits``; put in eval mode and run without gradients.
        student_model: Model with the same calling convention; put in train mode and updated.
        dataloader: Iterable yielding ``(input_ids, attention_mask, labels)`` batches.
            ``labels`` is not used by the distillation loss.
        optimizer: Optimizer holding the student's parameters.
        criterion: Loss called as ``criterion(student_log_probs, teacher_probs)``.
        temperature: Divisor applied to both sets of logits before the softmax.

    Returns:
        The ``student_model`` object that was passed in, after the updates.
    """
    teacher_model.eval()
    student_model.train()
    for input_ids, attention_mask, labels in dataloader:
        optimizer.zero_grad()
        with torch.no_grad():
            teacher_logits = teacher_model(input_ids=input_ids, attention_mask=attention_mask).logits
        student_logits = student_model(input_ids=input_ids, attention_mask=attention_mask).logits

        # Padding positions carry no information; leaving them in lets them dominate the
        # loss whenever sequences are padded far beyond their real length.
        teacher_logits = _drop_padding(teacher_logits, attention_mask) / temperature
        student_logits = _drop_padding(student_logits, attention_mask) / temperature
        if student_logits.numel() == 0:
            # A batch with no real token would produce a NaN loss; skip it.
            continue

        loss = criterion(
            torch.nn.functional.log_softmax(student_logits, dim=-1),
            torch.nn.functional.softmax(teacher_logits, dim=-1),
        )
        loss.backward()
        optimizer.step()

    return student_model

def evaluate(model, dataloader, criterion, teacher_model=None, temperature=1.0):
    """Compute the average loss and an accuracy figure for ``model`` over ``dataloader``.

    Two modes are supported:

    * ``teacher_model is None`` (classification mode): ``criterion(logits, labels)`` is the
      loss and accuracy is the share of rows whose arg-max over dimension 1 equals ``labels``.
    * ``teacher_model`` given (distillation mode): padding positions are dropped using
      ``attention_mask``; the loss is ``criterion(log_softmax(student / T), softmax(teacher / T))``
      and accuracy is the share of real tokens where student and teacher pick the same
      top token. ``labels`` is ignored in this mode.

    Args:
        model: Model called as ``model(input_ids=..., attention_mask=...)`` exposing ``.logits``.
        dataloader: Iterable yielding ``(input_ids, attention_mask, labels)`` batches.
        criterion: Loss callable, used as described above.
        teacher_model: Optional reference model that enables distillation mode.
        temperature: Divisor applied to both sets of logits in distillation mode.

    Returns:
        ``(average_loss, accuracy)``; both are ``nan`` when no batch could be scored.
    """
    model.eval()
    if teacher_model is not None:
        teacher_model.eval()

    total_loss = 0.0
    correct_predictions = 0
    total_predictions = 0
    scored_batches = 0

    with torch.no_grad():
        for input_ids, attention_mask, labels in dataloader:
            logits = model(input_ids=input_ids, attention_mask=attention_mask).logits

            if teacher_model is None:
                _, predictions = torch.max(logits, dim=1)
                targets = labels
                batch_size = len(labels)
                loss = criterion(logits, labels)
            else:
                token_mask = attention_mask.bool()
                if not token_mask.any():
                    # Nothing but padding in this batch: there is nothing to score.
                    continue
                teacher_logits = teacher_model(input_ids=input_ids, attention_mask=attention_mask).logits
                # Boolean indexing keeps real tokens only: (n_real_tokens, vocab).
                student_tokens = logits[token_mask]
                teacher_tokens = teacher_logits[token_mask]
                predictions = student_tokens.argmax(dim=-1)
                targets = teacher_tokens.argmax(dim=-1)
                batch_size = targets.numel()
                loss = criterion(
                    torch.nn.functional.log_softmax(student_tokens / temperature, dim=-1),
                    torch.nn.functional.softmax(teacher_tokens / temperature, dim=-1),
                )

            total_loss += loss.item()
            correct_predictions += torch.sum(predictions == targets).item()
            total_predictions += batch_size
            scored_batches += 1

    # Avoid a ZeroDivisionError on an empty loader; nan marks the metrics as undefined.
    average_loss = total_loss / scored_batches if scored_batches else float('nan')
    accuracy = correct_predictions / total_predictions if total_predictions else float('nan')

    return average_loss, accuracy


# Train the model
num_epochs = 1
temperature = 2.0
for epoch in range(num_epochs):
    # train() hands back the student it was given; keep it under the name `model`.
    model = train(teacher_model, student_model, train_dataloader, optimizer, criterion, temperature)
    # The student is a masked-LM with token-level logits, so it is scored against the
    # teacher's outputs rather than against the per-sentence labels.
    average_loss, accuracy = evaluate(
        model,
        train_dataloader,
        criterion,
        teacher_model=teacher_model,
        temperature=temperature,
    )


