In [5]:
!pip install datasets
!pip install torch
!pip install transformers
!pip install tqdm
!pip install pandas
!pip install scikit-learn
!pip install wandb

Collecting wandb
  Downloading wandb-0.16.3-py3-none-any.whl (2.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m9.9 MB/s[0m eta [36m0:00:00[0m
Collecting GitPython!=3.1.29,>=1.0.0 (from wandb)
  Downloading GitPython-3.1.42-py3-none-any.whl (195 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m195.4/195.4 kB[0m [31m13.4 MB/s[0m eta [36m0:00:00[0m
Collecting sentry-sdk>=1.0.0 (from wandb)
  Downloading sentry_sdk-1.40.4-py2.py3-none-any.whl (257 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m257.9/257.9 kB[0m [31m16.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting docker-pycreds>=0.4.0 (from wandb)
  Downloading docker_pycreds-0.4.0-py2.py3-none-any.whl (9.0 kB)
Collecting setproctitle (from wandb)
  Downloading setproctitle-1.3.3-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (30 kB)
Collecting gitdb<5,>=4.0.1 (from GitPython!=3.1.29,>=1.0.0->wa

In [6]:
from transformers import RobertaTokenizer, RobertaForSequenceClassification, AdamW
from torch.utils.data import DataLoader, TensorDataset, random_split
import torch
from datasets import load_dataset
from tqdm import tqdm
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import wandb
wandb.init(project="roberta_amazonovna")

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


In [12]:
dataset = load_dataset("mteb/amazon_reviews_multi")
en_reviews = dataset['train'].filter(lambda example: example['id'].startswith('en_')).shuffle(seed=42).select([i for i in range(10000)])

# бин классификация
en_reviews = en_reviews.filter(lambda example: example['label'] != 2)
en_reviews = en_reviews.map(lambda example: {'label': 0 if example['label'] in [0, 1] else 1, 'text': example['text']})


In [9]:
model_name = 'roberta-base'
tokenizer = RobertaTokenizer.from_pretrained(model_name)
model = RobertaForSequenceClassification.from_pretrained(model_name, num_labels=2).to('cuda')

max_length = 512
tokenized_texts = [tokenizer(review['text'], max_length=max_length, padding='max_length', truncation=True, return_tensors='pt') for review in en_reviews]
labels = torch.tensor([example['label'] for example in en_reviews]).to('cuda')

dataset = TensorDataset(torch.cat([t['input_ids'] for t in tokenized_texts]).to('cuda'),
                        torch.cat([t['attention_mask'] for t in tokenized_texts]).to('cuda'),
                        labels)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.weight', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
val_size = int(0.1 * len(dataset))
test_size = int(0.1 * len(dataset))
train_size = len(dataset) - val_size - test_size

train_dataset, val_dataset, test_dataset = random_split(dataset, [train_size, val_size, test_size])

In [11]:
batch_size = 2
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

In [24]:
import torch.nn.functional as F
def evaluate_model(model, dataloader):
    model.eval()
    predictions = []
    true_labels = []
    total_loss = 0.0
    with torch.no_grad():
        for batch in dataloader:
            input_ids, attention_mask, labels = batch
            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            batch_predictions = torch.argmax(logits, dim=1)
            predictions.extend(batch_predictions.cpu().numpy())
            true_labels.extend(labels.cpu().numpy())
            loss = F.cross_entropy(logits, labels)
            total_loss += loss.item()

    accuracy = accuracy_score(true_labels, predictions)
    precision = precision_score(true_labels, predictions)
    recall = recall_score(true_labels, predictions)
    f1 = f1_score(true_labels, predictions)
    confusion_mat = confusion_matrix(true_labels, predictions)

    avg_loss = total_loss / len(dataloader)
    metrics = {
        'Accuracy': accuracy * 100,
        'Precision': precision * 100,
        'Recall': recall * 100,
        'F1 Score': f1 * 100,
    }

    return avg_loss, metrics

In [25]:
def train_model(model, train_dataloader, epochs=3):
    optimizer = AdamW(model.parameters(), lr=1e-5)
    for epoch in range(epochs):
        model.train()
        total_loss = 0.0
        progress_bar = tqdm(train_dataloader, desc=f'Epoch {epoch + 1}/{epochs}', leave=False)
        for batch in progress_bar:
            optimizer.zero_grad()
            input_ids, attention_mask, labels = batch
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            total_loss += loss.item()
            loss.backward()
            optimizer.step()
            progress_bar.set_postfix({'Loss': total_loss / (progress_bar.n + 1e-12)})

            if progress_bar.n % 100 == 0:
                wandb.log({"train_loss": total_loss / (progress_bar.n + 1e-12)})

        wandb.log({"train_loss_epoch": total_loss / len(train_dataloader)})
        val_loss, val_metrics = evaluate_model(model, val_dataloader)
        wandb.log({"val_loss": val_loss, **val_metrics})

        print(f"Epoch {epoch + 1}, Loss: {total_loss / len(train_dataloader)}")


train_model(model, train_dataloader)



Epoch 1, Loss: 0.13574830663447163




Epoch 2, Loss: 0.08636044746493281




Epoch 3, Loss: 0.057344216059076705


In [27]:
def evaluate_model_(model, test_dataloader, model_name):
    model.eval()
    predictions = []
    true_labels = []
    with torch.no_grad():
        for batch in test_dataloader:
            input_ids, attention_mask, labels = batch
            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            batch_predictions = torch.argmax(logits, dim=1)
            predictions.extend(batch_predictions.cpu().numpy())
            true_labels.extend(labels.cpu().numpy())
    accuracy = accuracy_score(true_labels, predictions)
    precision = precision_score(true_labels, predictions)
    recall = recall_score(true_labels, predictions)
    f1 = f1_score(true_labels, predictions)
    confusion_mat = confusion_matrix(true_labels, predictions)
    results = {
        'Accuracy': accuracy * 100,
        'Precision': precision * 100,
        'Recall': recall * 100,
        'F1 Score': f1 * 100,
    }
    df = pd.DataFrame(results, index=[model_name])
    print(f"Results for {model_name}:")
    print(df)
    print("Confusion Matrix:")
    print(confusion_mat)
    print("\n" + "=" * 50 + "\n")

evaluate_model_(model, test_dataloader, 'RoBERTa')

Results for RoBERTa:
          Accuracy  Precision     Recall   F1 Score
RoBERTa  94.199243  93.842365  94.776119  94.306931
Confusion Matrix:
[[366  25]
 [ 21 381]]




In [28]:
model.save_pretrained("/content/model_directory")
tokenizer.save_pretrained("/content/tokenizer_directory")

('/content/tokenizer_directory/tokenizer_config.json',
 '/content/tokenizer_directory/special_tokens_map.json',
 '/content/tokenizer_directory/vocab.json',
 '/content/tokenizer_directory/merges.txt',
 '/content/tokenizer_directory/added_tokens.json')

In [39]:
imdb_dataset = load_dataset("imdb")

random_indices = torch.randperm(len(imdb_dataset['train']))[:int(0.2 * len(imdb_dataset['train']))]
imdb_reviews = imdb_dataset['train'].select(random_indices)

tokenized_imdb_texts = [tokenizer(review['text'], max_length=max_length, padding='max_length', truncation=True, return_tensors='pt') for review in imdb_reviews]
labels_imdb = torch.tensor([example['label'] for example in imdb_reviews]).to('cuda')

batch_size_imdb = 2
imdb_dataloader = DataLoader(TensorDataset(torch.cat([t['input_ids'] for t in tokenized_imdb_texts]).to('cuda'),
                                            torch.cat([t['attention_mask'] for t in tokenized_imdb_texts]).to('cuda'),
                                            labels_imdb),
                             batch_size=batch_size_imdb,
                             shuffle=False)

evaluate_model_(model, imdb_dataloader, 'RoBERTa on IMDb (pre-trained on Amazon)')


Results for RoBERTa on IMDb (pre-trained on Amazon):
                                         Accuracy  Precision    Recall  \
RoBERTa on IMDb (pre-trained on Amazon)     89.98  87.909126  92.42915   

                                          F1 Score  
RoBERTa on IMDb (pre-trained on Amazon)  90.112493  
Confusion Matrix:
[[2216  314]
 [ 187 2283]]




In [47]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer

model_name_hf = "aychang/roberta-base-imdb"
model_hf = AutoModelForSequenceClassification.from_pretrained(model_name_hf).to('cuda')
tokenizer_hf = AutoTokenizer.from_pretrained(model_name_hf)

In [48]:
evaluate_model_(model_hf, imdb_dataloader, 'RoBERTa on IMDb (pre-trained on IMDb)')

Results for RoBERTa on IMDb (pre-trained on IMDb):
                                       Accuracy  Precision     Recall  \
RoBERTa on IMDb (pre-trained on IMDb)     98.26  97.832196  98.663968   

                                        F1 Score  
RoBERTa on IMDb (pre-trained on IMDb)  98.246321  
Confusion Matrix:
[[2476   54]
 [  33 2437]]




# Сохраню модель на гугл диск

In [31]:
import os

output_dir = 'robertik'
os.makedirs(output_dir, exist_ok=True)

In [32]:
tokenizer.save_pretrained(output_dir)
model.save_pretrained(output_dir)

In [33]:
import shutil

shutil.make_archive(output_dir, 'zip', output_dir)

'/content/robertik.zip'

In [34]:
from google.colab import drive

drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [35]:
shutil.move(output_dir + '.zip', '/content/gdrive/MyDrive/')

'/content/gdrive/MyDrive/robertik.zip'