In [1]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

In [2]:
device = "cuda" if torch.cuda.is_available() else "cpu"
device

'cuda'

In [3]:
torch.manual_seed(22)

<torch._C.Generator at 0x77fcddbf2b10>

Constants

In [4]:
CONTEXT = 768
BATCH = 8

VALID_PROP = 0.3

Model

In [5]:
model_id = "ai-forever/rugpt3small_based_on_gpt2"

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForSequenceClassification.from_pretrained(model_id)
model.score = torch.nn.Linear(768, 3)

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at ai-forever/rugpt3small_based_on_gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
model

GPT2ForSequenceClassification(
  (transformer): GPT2Model(
    (wte): Embedding(50264, 768)
    (wpe): Embedding(2048, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (score): Linear(in_features=768, out_features=3, bias=True)
)

Tuning

In [7]:
for param in model.parameters():
    param.requires_grad = False

In [8]:
for param in model.score.parameters():
    param.requires_grad = True

for param in model.transformer.ln_f.parameters():
    param.requires_grad = True

for param in model.transformer.h[-1].parameters():
    param.requires_grad = True

for param in model.transformer.h[-2].parameters():
    param.requires_grad = True

for param in model.transformer.h[-3].parameters():
    param.requires_grad = True

In [9]:
print("Параметры с градиентами:")
total_params = 0
trainable_params = 0
for name, param in model.named_parameters():
    total_params += param.numel()
    if param.requires_grad:
        trainable_params += param.numel()
        print(f"✓ {name}")

print(f"\nОбучаемые параметры: {trainable_params}/{total_params} ({trainable_params/total_params*100:.1f}%)")

Параметры с градиентами:
✓ transformer.h.9.ln_1.weight
✓ transformer.h.9.ln_1.bias
✓ transformer.h.9.attn.c_attn.weight
✓ transformer.h.9.attn.c_attn.bias
✓ transformer.h.9.attn.c_proj.weight
✓ transformer.h.9.attn.c_proj.bias
✓ transformer.h.9.ln_2.weight
✓ transformer.h.9.ln_2.bias
✓ transformer.h.9.mlp.c_fc.weight
✓ transformer.h.9.mlp.c_fc.bias
✓ transformer.h.9.mlp.c_proj.weight
✓ transformer.h.9.mlp.c_proj.bias
✓ transformer.h.10.ln_1.weight
✓ transformer.h.10.ln_1.bias
✓ transformer.h.10.attn.c_attn.weight
✓ transformer.h.10.attn.c_attn.bias
✓ transformer.h.10.attn.c_proj.weight
✓ transformer.h.10.attn.c_proj.bias
✓ transformer.h.10.ln_2.weight
✓ transformer.h.10.ln_2.bias
✓ transformer.h.10.mlp.c_fc.weight
✓ transformer.h.10.mlp.c_fc.bias
✓ transformer.h.10.mlp.c_proj.weight
✓ transformer.h.10.mlp.c_proj.bias
✓ transformer.h.11.ln_1.weight
✓ transformer.h.11.ln_1.bias
✓ transformer.h.11.attn.c_attn.weight
✓ transformer.h.11.attn.c_attn.bias
✓ transformer.h.11.attn.c_proj.weight

Dataset

In [10]:
import pandas as pd
import numpy as np

from analysis.precomposed.TorchDataset import TorchDataset

In [11]:
data = pd.read_csv("./analysis/data/dataset_onehot.csv")

In [12]:
data_val = data.sample(n=int(data.shape[0]*VALID_PROP))
data_train = data.drop(index=data_val.index)

data_train.reset_index(inplace=True, drop=True)
data_val.reset_index(inplace=True, drop=True)

In [13]:
ds_train = TorchDataset(data= data_train, tokenizer = tokenizer, context_l = CONTEXT)
ds_valid = TorchDataset(data= data_val, tokenizer = tokenizer, context_l = CONTEXT)

Let's go!
Let's go!


In [14]:
from torch.utils.data import DataLoader

dl_train = DataLoader(ds_train, batch_size= BATCH, shuffle= True)
dl_valid = DataLoader(ds_valid, batch_size= BATCH, shuffle= True)

Training

In [15]:
import torch.nn as nn
import torch.optim as opt
from tqdm import tqdm

In [16]:
model = model.to(device)

In [17]:
EPOCHS = 20
TOTAL_BATCHES = len(dl_train)

In [18]:
criterion = nn.CrossEntropyLoss()
optimizer = opt.AdamW(model.parameters(), lr=0.0001, weight_decay=1e-5)
scheduler = opt.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.1)

best_val_loss = float('inf')

In [19]:
pbar = tqdm(range(EPOCHS), desc="Epoch")
for epoch in pbar:
    model.train()
    train_losses = []
    for batch_idx, (data, targets) in enumerate(dl_train):
        data, targets = data.to(device), targets.to(device)
        optimizer.zero_grad()
        outputs = model(data)
        if hasattr(outputs, 'logits'):
            loss = criterion(outputs.logits, targets)
        else:
            loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()
        train_losses.append(loss.item())

    model.eval()
    valid_losses = []
    with torch.no_grad():
        for batch_idx, (data, targets) in enumerate(dl_valid):
            data, targets = data.to(device), targets.to(device)
            outputs = model(data)
            if hasattr(outputs, 'logits'):
                v_loss = criterion(outputs.logits, targets)
            else:
                v_loss = criterion(outputs, targets)
            valid_losses.append(v_loss.item())

    avg_train_loss = sum(train_losses) / len(train_losses)
    avg_val_loss = sum(valid_losses) / len(valid_losses)
    scheduler.step()
    pbar.set_postfix({
        'train_loss': f'{avg_train_loss:.4f}',
        'val_loss': f'{avg_val_loss:.4f}',
        'lr': f'{scheduler.get_last_lr()[0]:.2e}'
    })

    if (avg_val_loss < best_val_loss and avg_val_loss < 0.25):
        best_val_loss = avg_val_loss
        torch.save(model, f'./analysis/neural_training/rugpt3s_{int(best_val_loss*10**6)}.pth')
        print("Saved")

Epoch:   0%|                                                                                                               | 0/20 [00:00<?, ?it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (2192 > 2048). Running this sequence through the model will result in indexing errors
Epoch:  10%|████▋                                          | 2/20 [1:23:49<12:34:35, 2515.30s/it, train_loss=0.2371, val_loss=0.2024, lr=1.00e-04]

Saved


Epoch:  25%|███████████▊                                   | 5/20 [3:28:37<10:25:37, 2502.51s/it, train_loss=0.1655, val_loss=0.2023, lr=1.00e-05]

Saved


Epoch:  65%|██████████████████████████████▌                | 13/20 [9:40:02<5:12:19, 2677.11s/it, train_loss=0.0705, val_loss=0.2767, lr=1.00e-06]


KeyboardInterrupt: 

Validation

In [20]:

model = torch.load("./analysis/neural_training/rugpt3s_202349.pth", weights_only=False)

In [22]:
from sklearn.metrics import accuracy_score
import numpy as np

def calculate_class_accuracy(model, dataloader, device='cuda' if torch.cuda.is_available() else 'cpu'):
    model.eval()
    model.to(device)
    
    all_predictions = []
    all_targets = []
    _batch = 0
    
    with torch.no_grad():
        for batch in dataloader:
            _batch+=1
            print(f"[PREDICTIONS] BATCH: {_batch}/{len(dataloader)}")
            if isinstance(batch, (list, tuple)):
                inputs = batch[0]
                targets = batch[1]
            else:
                inputs = batch
                targets = batch['labels']
            
            inputs = inputs.to(device)
            targets = targets.to(device)
            
            outputs = model(inputs)
            
            if isinstance(outputs, (tuple, list)):
                logits = outputs[0]
            else:
                logits = outputs
            
            predictions = torch.argmax(logits.logits, dim=1)
            all_predictions.extend(predictions.cpu().numpy())
            all_targets.extend(torch.argmax(targets, dim=1).cpu().numpy())
    
    predictions = np.array(all_predictions)
    targets = np.array(all_targets)
    
    class_accuracies = {}
    
    for class_idx, class_name in enumerate(['SPAM', 'TOXIC', 'OK']):
        class_mask = targets == class_idx
        if np.sum(class_mask) > 0:
            class_accuracy = accuracy_score(
                targets[class_mask], 
                predictions[class_mask]
            )
            class_accuracies[class_name] = class_accuracy
        else:
            class_accuracies[class_name] = 0.0
    
    total_accuracy = accuracy_score(targets, predictions)
    class_accuracies['TOTAL'] = total_accuracy
    
    return class_accuracies



In [23]:
accuracy_results = calculate_class_accuracy(model, dl_valid)
for class_name, acc in accuracy_results.items():
    print(f"{class_name}: {acc:.4f}")

[PREDICTIONS] BATCH: 1/710
[PREDICTIONS] BATCH: 2/710
[PREDICTIONS] BATCH: 3/710
[PREDICTIONS] BATCH: 4/710
[PREDICTIONS] BATCH: 5/710
[PREDICTIONS] BATCH: 6/710
[PREDICTIONS] BATCH: 7/710
[PREDICTIONS] BATCH: 8/710
[PREDICTIONS] BATCH: 9/710
[PREDICTIONS] BATCH: 10/710
[PREDICTIONS] BATCH: 11/710
[PREDICTIONS] BATCH: 12/710
[PREDICTIONS] BATCH: 13/710
[PREDICTIONS] BATCH: 14/710
[PREDICTIONS] BATCH: 15/710
[PREDICTIONS] BATCH: 16/710
[PREDICTIONS] BATCH: 17/710
[PREDICTIONS] BATCH: 18/710
[PREDICTIONS] BATCH: 19/710
[PREDICTIONS] BATCH: 20/710
[PREDICTIONS] BATCH: 21/710
[PREDICTIONS] BATCH: 22/710
[PREDICTIONS] BATCH: 23/710
[PREDICTIONS] BATCH: 24/710
[PREDICTIONS] BATCH: 25/710
[PREDICTIONS] BATCH: 26/710
[PREDICTIONS] BATCH: 27/710
[PREDICTIONS] BATCH: 28/710
[PREDICTIONS] BATCH: 29/710
[PREDICTIONS] BATCH: 30/710
[PREDICTIONS] BATCH: 31/710
[PREDICTIONS] BATCH: 32/710
[PREDICTIONS] BATCH: 33/710
[PREDICTIONS] BATCH: 34/710
[PREDICTIONS] BATCH: 35/710
[PREDICTIONS] BATCH: 36/710
[

In [38]:
from huggingface_hub import login
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [39]:
tokenizer.save_pretrained("./analysis/neural_training/rugpt3_spam_toxic_classifier/")
model.save_pretrained("./analysis/neural_training/rugpt3_spam_toxic_classifier/")

In [44]:
repo_id = "arseso/gpt2_spam_toxic_classifier"

In [45]:
model.push_to_hub(
    repo_id,
    commit_message="0.20235"
)

# Загружаем токенизатор
tokenizer.push_to_hub(
    repo_id,
    commit_message="Upload tokenizer for ruGPT-3 small"
)

Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

CommitInfo(commit_url='https://huggingface.co/arseso/gpt2_spam_toxic_classifier/commit/c90a571c2a7495796bf8ca46876e5a408e812e13', commit_message='Upload tokenizer for ruGPT-3 small', commit_description='', oid='c90a571c2a7495796bf8ca46876e5a408e812e13', pr_url=None, repo_url=RepoUrl('https://huggingface.co/arseso/gpt2_spam_toxic_classifier', endpoint='https://huggingface.co', repo_type='model', repo_id='arseso/gpt2_spam_toxic_classifier'), pr_revision=None, pr_num=None)