In [1]:
!pip install datasets
!pip install torch
!pip install transformers
!pip install tqdm
!pip install pandas
!pip install scikit-learn
!pip install wandb

Collecting datasets
  Downloading datasets-2.17.1-py3-none-any.whl (536 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m536.7/536.7 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m15.4 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m19.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: dill, multiprocess, datasets
Successfully installed datasets-2.17.1 dill-0.3.8 multiprocess-0.70.16
Collecting wandb
  Downloading wandb-0.16.3-py3-none-any.whl (2.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m9.0 MB/s[0m eta [36m0:00:00[0m
Collecting GitPython!=3.1.29,>=1.0.0 

In [34]:
from transformers import RobertaTokenizer, RobertaForSequenceClassification, AdamW
from torch.utils.data import DataLoader, TensorDataset, random_split
import torch
from datasets import load_dataset
from tqdm import tqdm
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import wandb
wandb.init(project="roberta_imdb_")

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
Accuracy,▁
F1 Score,▁
Precision,▁
Recall,▁
train_loss,█▇▃▃▃▂▁▃▃▁▃▃▃▂▂
val_loss,▁

0,1
Accuracy,50.44
F1 Score,0.16116
Precision,100.0
Recall,0.08065
train_loss,0.23323
val_loss,0.69322


In [33]:
dataset = load_dataset("imdb")
model_name = 'roberta-base'
tokenizer = RobertaTokenizer.from_pretrained(model_name)
model = RobertaForSequenceClassification.from_pretrained(model_name, num_labels=2).to('cuda')

max_length = 512
tokenized_texts = [tokenizer(review, max_length=max_length, padding='max_length', truncation=True, return_tensors='pt') for review in dataset['train']['text']]
labels = torch.tensor(dataset['train']['label']).to('cuda')

dataset = TensorDataset(torch.cat([t['input_ids'] for t in tokenized_texts]).to('cuda'),
                        torch.cat([t['attention_mask'] for t in tokenized_texts]).to('cuda'),
                        labels)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [35]:
val_size = int(0.1 * len(dataset))
test_size = int(0.1 * len(dataset))
train_size = len(dataset) - val_size - test_size

train_dataset, val_dataset, test_dataset = random_split(dataset, [train_size, val_size, test_size])

In [36]:
batch_size = 8
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

In [37]:
import torch.nn.functional as F
def evaluate_model(model, dataloader):
    model.eval()
    predictions = []
    true_labels = []
    total_loss = 0.0
    with torch.no_grad():
        for batch in dataloader:
            input_ids, attention_mask, labels = batch
            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            batch_predictions = torch.argmax(logits, dim=1)
            predictions.extend(batch_predictions.cpu().numpy())
            true_labels.extend(labels.cpu().numpy())
            loss = F.cross_entropy(logits, labels)
            total_loss += loss.item()

    accuracy = accuracy_score(true_labels, predictions)
    precision = precision_score(true_labels, predictions)
    recall = recall_score(true_labels, predictions)
    f1 = f1_score(true_labels, predictions)
    confusion_mat = confusion_matrix(true_labels, predictions)

    avg_loss = total_loss / len(dataloader)
    metrics = {
        'Accuracy': accuracy * 100,
        'Precision': precision * 100,
        'Recall': recall * 100,
        'F1 Score': f1 * 100,
    }

    return avg_loss, metrics

In [38]:
#желательно глянуть качество валидации до обучения

val_loss, val_metrics = evaluate_model(model, val_dataloader)
wandb.log({"val_loss": val_loss, **val_metrics})
print(f"Validation Loss: {val_loss}")
print(f"Validation Metrics: {val_metrics}")

Validation Loss: 0.7028587778536276
Validation Metrics: {'Accuracy': 49.72, 'Precision': 49.72, 'Recall': 100.0, 'F1 Score': 66.4173123163238}


In [39]:
def train_model(model, train_dataloader, epochs=3):
    optimizer = AdamW(model.parameters(), lr=1e-5)
    for epoch in range(epochs):
        model.train()
        total_loss = 0.0
        batch_loss = 0.0
        progress_bar = tqdm(train_dataloader, desc=f'Epoch {epoch + 1}/{epochs}', leave=False)
        for batch in progress_bar:
            optimizer.zero_grad()
            input_ids, attention_mask, labels = batch
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            total_loss += loss.item()
            batch_loss += loss.item()
            loss.backward()
            optimizer.step()
            progress_bar.set_postfix({'Loss': total_loss / (progress_bar.n + 1e-12)})

            if (progress_bar.n + 1) % 100 == 0:
                # при if progress_bar.n % 100 == 0 лучше логировать total_loss
                #только за последние 100 итераций для повышения чувствительности;
                #сюда же докинуть валидацию
                wandb.log({"train_loss": batch_loss / 100})
                batch_loss = 0.0
                # wandb.log({"train_loss": total_loss / (progress_bar.n + 1e-12)})

        val_loss, val_metrics = evaluate_model(model, val_dataloader)
        wandb.log({"val_loss": val_loss, **val_metrics})
        print(f"Validation Loss: {val_loss}")
        print(f"Validation Metrics: {val_metrics}")

        print(f"Epoch {epoch + 1}, Loss: {total_loss / len(train_dataloader)}")

        #accuracy, precision, recall, f1 на трейне тоже не помешает (раз в эпоху)
        train_loss, train_metrics = evaluate_model(model, train_dataloader)
        wandb.log({"train_loss": train_loss, **train_metrics})
        print(f"Train Metrics: {train_metrics}")


train_model(model, train_dataloader)



Validation Loss: 0.15942182057736495
Validation Metrics: {'Accuracy': 94.16, 'Precision': 93.22301024428684, 'Recall': 95.17296862429606, 'F1 Score': 94.18789808917197}
Epoch 1, Loss: 0.20193188140373677
Train Metrics: {'Accuracy': 97.03500000000001, 'Precision': 96.41442155309034, 'Recall': 97.6818866031109, 'F1 Score': 97.04401575195654}




Validation Loss: 0.19075024056920822
Validation Metrics: {'Accuracy': 93.4, 'Precision': 91.01978691019787, 'Recall': 96.21882542236526, 'F1 Score': 93.54712553773953}
Epoch 2, Loss: 0.11729157439675182
Train Metrics: {'Accuracy': 97.7, 'Precision': 96.33421078287998, 'Recall': 99.1570496738585, 'F1 Score': 97.72524972801898}




Validation Loss: 0.1785076699523142
Validation Metrics: {'Accuracy': 94.28, 'Precision': 92.24270353302612, 'Recall': 96.62107803700724, 'F1 Score': 94.38113948919448}
Epoch 3, Loss: 0.07599148281707895
Train Metrics: {'Accuracy': 98.86, 'Precision': 98.02702969320312, 'Recall': 99.71901655795283, 'F1 Score': 98.86578449905483}


In [40]:
def evaluate_model_(model, test_dataloader, model_name):
    model.eval()
    predictions = []
    true_labels = []
    with torch.no_grad():
        for batch in test_dataloader:
            input_ids, attention_mask, labels = batch
            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            batch_predictions = torch.argmax(logits, dim=1)
            predictions.extend(batch_predictions.cpu().numpy())
            true_labels.extend(labels.cpu().numpy())
    accuracy = accuracy_score(true_labels, predictions)
    precision = precision_score(true_labels, predictions)
    recall = recall_score(true_labels, predictions)
    f1 = f1_score(true_labels, predictions)
    confusion_mat = confusion_matrix(true_labels, predictions)
    results = {
        'Accuracy': accuracy * 100,
        'Precision': precision * 100,
        'Recall': recall * 100,
        'F1 Score': f1 * 100,
    }
    df = pd.DataFrame(results, index=[model_name])
    print(f"Results for {model_name}:")
    print(df)
    print("Confusion Matrix:")
    print(confusion_mat)
    print("\n" + "=" * 50 + "\n")

evaluate_model_(model, test_dataloader, 'RoBERTa')

Results for RoBERTa:
         Accuracy  Precision     Recall  F1 Score
RoBERTa     95.12  93.918919  96.826625  95.35061
Confusion Matrix:
[[1127   81]
 [  41 1251]]




In [41]:
model.save_pretrained("/content/model_directory")
tokenizer.save_pretrained("/content/tokenizer_directory")

('/content/tokenizer_directory/tokenizer_config.json',
 '/content/tokenizer_directory/special_tokens_map.json',
 '/content/tokenizer_directory/vocab.json',
 '/content/tokenizer_directory/merges.txt',
 '/content/tokenizer_directory/added_tokens.json')

In [42]:
imdb_dataset = load_dataset("imdb")

random_indices = torch.randperm(len(imdb_dataset['train']))[:int(0.2 * len(imdb_dataset['train']))]
imdb_reviews = imdb_dataset['train'].select(random_indices)

tokenized_imdb_texts = [tokenizer(review['text'], max_length=max_length, padding='max_length', truncation=True, return_tensors='pt') for review in imdb_reviews]
labels_imdb = torch.tensor([example['label'] for example in imdb_reviews]).to('cuda')

batch_size_imdb = 2
imdb_dataloader = DataLoader(TensorDataset(torch.cat([t['input_ids'] for t in tokenized_imdb_texts]).to('cuda'),
                                            torch.cat([t['attention_mask'] for t in tokenized_imdb_texts]).to('cuda'),
                                            labels_imdb),
                             batch_size=batch_size_imdb,
                             shuffle=False)

evaluate_model_(model, imdb_dataloader, 'RoBERTa on IMDb (pre-trained on IMDb)')


Results for RoBERTa on IMDb (pre-trained on IMDb):
                                       Accuracy  Precision    Recall  \
RoBERTa on IMDb (pre-trained on IMDb)     98.12  97.296238  99.00319   

                                        F1 Score  
RoBERTa on IMDb (pre-trained on IMDb)  98.142292  
Confusion Matrix:
[[2423   69]
 [  25 2483]]




In [43]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer

model_name_hf = "aychang/roberta-base-imdb"
model_hf = AutoModelForSequenceClassification.from_pretrained(model_name_hf).to('cuda')
tokenizer_hf = AutoTokenizer.from_pretrained(model_name_hf)

config.json:   0%|          | 0.00/762 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/499M [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()


tokenizer_config.json:   0%|          | 0.00/255 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

In [44]:
evaluate_model_(model_hf, imdb_dataloader, 'RoBERTa on IMDb (pre-trained on IMDb)')

Results for RoBERTa on IMDb (pre-trained on IMDb):
                                       Accuracy  Precision     Recall  \
RoBERTa on IMDb (pre-trained on IMDb)      98.5  98.601678  98.405104   

                                        F1 Score  
RoBERTa on IMDb (pre-trained on IMDb)  98.503293  
Confusion Matrix:
[[2457   35]
 [  40 2468]]




# Сохраню модель на гугл диск

In [45]:
import os

output_dir = 'robertik'
os.makedirs(output_dir, exist_ok=True)

In [46]:
tokenizer.save_pretrained(output_dir)
model.save_pretrained(output_dir)

In [None]:
import shutil

shutil.make_archive(output_dir, 'zip', output_dir)

In [None]:
from google.colab import drive

drive.mount('/content/gdrive')

In [None]:
shutil.move(output_dir + '.zip', '/content/gdrive/MyDrive/')