In [1]:
!pip install transformers
!pip install wandb
!pip install torchmetrics

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
!gdown 1rZWTVWVFUIqEpTGUEkgPo9fG7Env_lWg

Downloading...
From: https://drive.google.com/uc?id=1rZWTVWVFUIqEpTGUEkgPo9fG7Env_lWg
To: /content/split_data.csv
  0% 0.00/2.55M [00:00<?, ?B/s]100% 2.55M/2.55M [00:00<00:00, 210MB/s]


In [3]:
from tqdm.auto import tqdm
import pandas as pd
import numpy as np

import torch
import torchmetrics
from torch.optim import AdamW, lr_scheduler, Adam
from torch.utils.data import Dataset, DataLoader, random_split

import transformers
from transformers import AutoTokenizer, AutoModelForSequenceClassification

from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
!wandb login

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


In [5]:
import wandb


wandb.init(
    project="UFO",

    config={
        "architecture": "transformer",
        "dataset": "mydata",
    }
)

[34m[1mwandb[0m: Currently logged in as: [33mnotdiff[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [6]:
class textDataset(Dataset):
    def __init__(self, texts, targets, tokenizer, max_len=512):
        self.texts = texts
        self.targets = targets
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.targets)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        target = self.targets[idx]

        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            'text': text,
            'input_ids': torch.tensor(encoding['input_ids']).flatten(),
            'attention_mask': torch.tensor(encoding['attention_mask']).flatten(),
            'targets': torch.tensor(target, dtype=torch.long)
        }

In [7]:
class CustomTextClassifier:
    def __init__(self, model_path, tokenizer_path, n_classes=2, models_save_path='/content/best.pt'):
        self.model = AutoModelForSequenceClassification.from_pretrained(model_path)
        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)

        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.models_save_path = models_save_path
        self.max_len = 512

        self.out_features = self.model.bert.encoder.layer[1].output.dense.out_features
        self.model.classifier = torch.nn.Linear(self.out_features, n_classes)

        self.model.to(self.device)

    def init_helpers(self, texts, targets, lr, report_step=250, train_val_test=[0.90, 0.10, 0], batch_size=64, dataset_class=textDataset):
        dataset = dataset_class(texts, targets, self.tokenizer)


        self.train_data, self.val_data, self.test_data = random_split(dataset, train_val_test)

        self.val_loader = DataLoader(self.val_data, batch_size=batch_size, shuffle=True)
        self.train_loader = DataLoader(self.train_data, batch_size=batch_size, shuffle=True)

        self.report_step = report_step

        self.loss_func = torch.nn.CrossEntropyLoss()
        self.optimizer = Adam(self.model.parameters(), lr)
        self.lr_scheduler = lr_scheduler.LinearLR(self.optimizer)

    def eval(self):
        self.model = self.model.train()
        losses = []
        correct_predicts = 0
        f1_scores = []
        f1 = torchmetrics.F1Score(task="multiclass", num_classes=3)

        with torch.no_grad():
            for batch in self.val_loader:
                input_ids = batch["input_ids"].to(self.device)
                attention_mask = batch["attention_mask"].to(self.device)
                targets = batch["targets"].to(self.device)

                outputs = self.model(
                    input_ids=input_ids,
                    attention_mask=attention_mask
                )

                preds = torch.argmax(outputs.logits, dim=1)
                loss = self.loss_func(outputs.logits, targets)

                f1_scores.append(f1(preds.cpu(), targets.cpu()))

                correct_predicts += torch.sum(preds == targets)
                losses.append(loss.item())

        val_acc = correct_predicts / len(self.val_data)
        val_loss = np.mean(losses)
        val_f1 = np.mean(f1_scores)
        return val_acc, val_loss, val_f1

    def train_one_epoch(self):
        self.model = self.model.train()
        losses = []
        correct_predicts = 0


        report_counter = 0

        for batch in self.train_loader:
            input_ids = batch["input_ids"].to(self.device)
            attention_mask = batch["attention_mask"].to(self.device)
            targets = batch["targets"].to(self.device)

            outputs = self.model(
                input_ids=input_ids,
                attention_mask=attention_mask
            )

            preds = torch.argmax(outputs.logits, dim=1)
            loss = self.loss_func(outputs.logits, targets)


            correct_predicts += torch.sum(preds == targets)
            losses.append(loss.item())

            loss.backward()
            self.optimizer.step()
            self.optimizer.zero_grad()
            self.progress_bar.update(1)

            if report_counter % self.report_step == 0:
                val_acc, val_loss, val_f1 = self.eval()
                if self.best_f1 < val_f1:
                    torch.save(self.model, self.models_save_path)
                    self.best_f1 = val_f1

                wandb.log({"ruBert-base_F1": val_f1, "ruBert-base_Acc": val_acc, 'ruBert-base_loss': val_loss})

            report_counter += 1


        self.lr_scheduler.step()



    def train(self, epochs):
        self.progress_bar = tqdm(range(len(self.train_loader)*epochs))
        self.best_f1 = 0
        for epoch in range(epochs):
            self.train_one_epoch()

            # val_acc, val_loss = self.eval()

            # print(f'Epoch: {epoch + 1}/{epochs} completed')
            # print(f'Val loss {val_loss} accuracy {val_acc}')
            # if val_acc > best_accuracy:
            #     torch.save(self.model, self.model_save_path)
            #     best_accuracy = val_acc

        # print('Training completed best accuracy is', best_accuracy.item())
        # self.model = torch.load(self.model_save_path)

    def predict(self, text, ind_to_labels):
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            truncation=True,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
        )

        out = {
            'text': text,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten()
        }

        input_ids = out["input_ids"].to(self.device)
        attention_mask = out["attention_mask"].to(self.device)

        outputs = self.model(
            input_ids=input_ids.unsqueeze(0),
            attention_mask=attention_mask.unsqueeze(0)
        )

        prediction = torch.argmax(outputs.logits, dim=1).cpu().numpy()[0]

        return ind_to_labels[prediction]

In [8]:
data = pd.read_csv('split_data.csv')
texts, targets = data['text'], data['class']

In [9]:
num_epochs = 15
lr = 3e-4

In [10]:
classifier = CustomTextClassifier('cointegrated/rubert-tiny', 'cointegrated/rubert-tiny', n_classes=3, models_save_path='/content/drive/MyDrive/models/best.pt')
classifier.init_helpers(texts, targets, lr, batch_size=64, train_val_test=[0.98, 0.02, 0], report_step=100)

Some weights of the model checkpoint at cointegrated/rubert-tiny were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at cointegrated/rubert-tiny a

In [11]:
# classifier.model.classifier = torch.nn.Sequential(
#     torch.nn.Linear(312, 312),
#     torch.nn.Linear(312, 3)
# )
# classifier.model.to(classifier.device);

In [12]:
classifier.train(num_epochs)

  0%|          | 0/4230 [00:00<?, ?it/s]

  'input_ids': torch.tensor(encoding['input_ids']).flatten(),
  'attention_mask': torch.tensor(encoding['attention_mask']).flatten(),


KeyboardInterrupt: ignored

In [None]:
classifier.model = torch.load('/content/drive/MyDrive/models/best.pt')

In [None]:
from IPython.display import clear_output
from time import sleep

while True:
    text = input()
    print(classifier.predict(text, {1: 'Другое', 0: 'Требования', 2: 'Условия'}))
    sleep(3)
    clear_output()