In [1]:
import transformers
from transformers import BertModel, BertTokenizer, AdamW, LongformerModel, BertForSequenceClassification, get_linear_schedule_with_warmup, AutoTokenizer, LongformerTokenizer
import torch
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, f1_score
from collections import defaultdict
from textwrap import wrap
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader
from tqdm.auto import tqdm
import torch.nn.functional as F
import warnings
import os
import random
from argparse import ArgumentParser
import torch.nn.functional as F


warnings.filterwarnings("ignore")
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

LR = 1e-05
DROPOUT = 0.4
WARMUP = 2


df_0 = pd.read_csv('../input/roberta-aug/augmented_splited_0_5.csv')
df_1 = pd.read_csv('../input/roberta-aug/augmented_splited_1_5_roberta.csv')
df_5 = pd.read_csv('../input/roberta-aug/augmented_splited_5_5_roberta.csv')

df = pd.DataFrame(columns=df_0.columns)
labels = [0,1,5]

for i, row in df_0.iterrows():
    if int(row.label) not in labels:
        df = df.append(row)

df = df.append(df_0[df_0.label == 0])
df = df.append(df_1[df_1.label == 1])
df = df.append(df_5[df_5.label == 5])



def seed_everything(seed_value=42):
    os.environ['PYTHONHASHSEED'] = str(seed_value)
    random.seed(seed_value)
    np.random.seed(seed_value)
    torch.manual_seed(seed_value)
    torch.cuda.manual_seed_all(seed_value)

    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    
seed_everything()

PRE_TRAINED_MODEL_NAME = 'ltgoslo/norbert2'
MAX_LEN = 512   
BATCH_SIZE = 4
EPOCHS = 10

class_names = df.label.unique()
tokenizer = BertTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)

class Dataset(Dataset):
  def __init__(self, texts, targets, tokenizer, max_len):
    self.texts = texts
    self.targets = targets
    self.tokenizer = tokenizer
    self.max_len = max_len

  def __len__(self):
    return len(self.texts)

  def __getitem__(self, item):
    text = str(self.texts[item])
    target = self.targets[item]
    encoding = self.tokenizer.encode_plus(
      text,
      add_special_tokens=True,
      max_length=self.max_len,
      return_token_type_ids=False,
      pad_to_max_length=True,
      return_attention_mask=True,
      truncation=True,
      return_tensors='pt',
    )

    return {
      'text': text,
      'input_ids': encoding['input_ids'].flatten(),
      'attention_mask': encoding['attention_mask'].flatten(),
      'targets': torch.tensor(target, dtype=torch.long)
    }

def create_data_loader(df, tokenizer, max_len, batch_size):
  ds = Dataset(
    texts=df.text.to_numpy(),
    targets=df.label.to_numpy(),
    tokenizer=tokenizer,
    max_len=max_len
  )
  return DataLoader(
    ds,
    batch_size=batch_size
  )


def train_epoch(
  model,
  data_loader,
  loss_fn,
  optimizer,
  device,
  scheduler,
  n_examples
):

  y_true, y_pred = [], []
  model = model.train()
  losses = []
  correct_predictions = 0
  
  for d in data_loader:
    input_ids = d["input_ids"].to(device)
    attention_mask = d["attention_mask"].to(device)
    targets = d["targets"].to(device)
    y_true += targets.tolist()
    outputs = model(
      input_ids=input_ids,
      attention_mask=attention_mask
    )
    _, preds = torch.max(outputs, dim=1)
    y_pred += preds.tolist()

    loss = loss_fn(outputs, targets)
    correct_predictions += torch.sum(preds == targets)
    losses.append(loss.item())
    loss.backward()
    nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
    optimizer.step()
    scheduler.step()
    optimizer.zero_grad()
  f1 = f1_score(y_true, y_pred, average='macro')

  return correct_predictions.double() / n_examples, np.mean(losses), f1

def eval_model(model, data_loader, loss_fn, device, n_examples):
  model = model.eval()
  losses = []
  correct_predictions = 0
  y_true, y_pred = [], []
  with torch.no_grad():
    for d in data_loader:
      input_ids = d["input_ids"].to(device)
      attention_mask = d["attention_mask"].to(device)
      targets = d["targets"].to(device)
      y_true += targets.tolist()
      outputs = model(
        input_ids=input_ids,
        attention_mask=attention_mask
      )
      _, preds = torch.max(outputs, dim=1)
      y_pred += preds.tolist()
      loss = loss_fn(outputs, targets)
      correct_predictions += torch.sum(preds == targets)
      losses.append(loss.item())
  f1 = f1_score(y_true, y_pred, average='macro')
  report = classification_report(y_true, y_pred)
  return correct_predictions.double() / n_examples, np.mean(losses), f1, report


def test_model(model, data_loader, loss_fn, device, n_examples):
  model = model.eval()
  losses = []
  correct_predictions = 0
  y_true, y_pred = [], []
  with torch.no_grad():
    for d in tqdm(data_loader):
      input_ids = d["input_ids"].to(device)
      attention_mask = d["attention_mask"].to(device)
      targets = d["targets"].to(device)
      y_true += targets.tolist()
      outputs = model(
        input_ids=input_ids,
        attention_mask=attention_mask
      )
      _, preds = torch.max(outputs, dim=1)
      y_pred += preds.tolist()
      loss = loss_fn(outputs, targets)
      correct_predictions += torch.sum(preds == targets)
      losses.append(loss.item())
  f1 = f1_score(y_true, y_pred, average='macro')
  report = classification_report(y_true, y_pred)
  return correct_predictions.double() / n_examples, f1, report, y_true, y_pred


Downloading:   0%|          | 0.00/374k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/383 [00:00<?, ?B/s]

In [2]:
df_train = df[df['split'] == 'train']
df_val = df[df['split'] == 'dev']
df_test = df[df['split'] == 'test']

print(f'Train samples: {len(df_train)}')
print(f'Validation samples: {len(df_val)}')
print(f'Test samples: {len(df_test)}')

train_data_loader = create_data_loader(df_train, tokenizer, MAX_LEN, BATCH_SIZE)
val_data_loader = create_data_loader(df_val, tokenizer, MAX_LEN, BATCH_SIZE)
test_data_loader = create_data_loader(df_test, tokenizer, MAX_LEN, BATCH_SIZE)

Train samples: 40318
Validation samples: 4360
Test samples: 4351


In [3]:
class SentimentClassifier(nn.Module):

  def __init__(self, n_classes):
    super(SentimentClassifier, self).__init__()
    self.bert = BertModel.from_pretrained(PRE_TRAINED_MODEL_NAME)
    for param in self.bert.parameters():
      param.requires_grad = False

    inner_features = self.bert.config.hidden_size

    self.hidden_layers = nn.ModuleList([
            nn.Linear(inner_features, inner_features)
            for _ in range(4)
        ])
    self.drop = nn.Dropout(0.2) 
    self.batch_norm = nn.BatchNorm1d(num_features=inner_features)

    self.out = nn.Linear(inner_features, n_classes)

  def forward(self, input_ids, attention_mask):
    
    bert_output = self.bert(
      input_ids=input_ids,
      attention_mask=attention_mask,
      return_dict=False
    )
    last_hidden_state, pooled_output = bert_output

    x = pooled_output
    for layer in self.hidden_layers:
            x = layer(x) 
            x = self.batch_norm(x)
            x = x.relu() 
            x = self.drop(x)

    return self.out(x)

In [4]:
# class SentimentClassifier_lstm(nn.Module):

#   def __init__(self, n_classes):
#     super(SentimentClassifier_lstm, self).__init__()
#     self.bert = BertModel.from_pretrained(PRE_TRAINED_MODEL_NAME)
#     for param in self.bert.parameters():
#       param.requires_grad = False

#     inner_features = self.bert.config.hidden_size
    
#     self.lstm = nn.LSTM(input_size=inner_features,
#                              hidden_size=inner_features,
#                              num_layers=3,
#                              batch_first=True,
#                              bidirectional=True)

#     self.out = nn.Linear(inner_features*2, n_classes)

#   def forward(self, input_ids, attention_mask):
    
#     bert_output = self.bert(
#       input_ids=input_ids,
#       attention_mask=attention_mask,
#       return_dict=False
#     )
#     last_hidden_state, pooled_output = bert_output
#     states, _ = self.lstm(pooled_output)
    
#     return self.out(states)

In [None]:
#model = SentimentClassifier(len(class_names))
model = SentimentClassifier_lstm(len(class_names))
model = model.to(device)

optimizer = AdamW(model.parameters(), lr=LR, correct_bias=True)  #попробовать correct_bias=True
#optimizer = optim.NAdam(model.parameters(), lr=LR)

total_steps = len(train_data_loader) * EPOCHS
scheduler = get_linear_schedule_with_warmup(
  optimizer,
  num_warmup_steps=WARMUP,
  num_training_steps=total_steps
)
loss_fn = nn.CrossEntropyLoss().to(device)

  
for epoch in range(EPOCHS):
    print(f'Epoch {epoch + 1}/{EPOCHS}')
    print('-' * 10)
    train_acc, train_loss, train_f1 = train_epoch(
        model,
        train_data_loader,
        loss_fn,
        optimizer,
        device,
        scheduler,
        len(df_train)
    )
    print()
    print(f'Train loss {train_loss} accuracy {train_acc} f1 {train_f1}')

    val_acc, val_loss, val_f1, report = eval_model(
        model,
        val_data_loader,
        loss_fn,
        device,
        len(df_val)
    )
    print()
    print(f'Val   loss {val_loss} accuracy {val_acc} f1 {val_f1}')
    print(report)


test_acc, test_f1, test_report, y_true, y_pred = test_model(
    model,
    test_data_loader,
    loss_fn,
    device,
    len(df_val)
  )

print()
print('-------------TESTINGS-----------------')
print()
print(f'Test accuracy {test_acc} f1 {test_f1}')
print(test_report)

print()
print()
print('Y TRUE: ', y_true)
print('Y PREDICTED', y_pred)


Downloading:   0%|          | 0.00/478M [00:00<?, ?B/s]

Some weights of the model checkpoint at ltgoslo/norbert2 were not used when initializing BertModel: ['cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch 1/10
----------

Train loss 0.8469788270965688 accuracy 0.5768391289250459 f1 0.6248458984407435

Val   loss 10.86970188700516 accuracy 0.0639908256880734 f1 0.020047424013796073
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        51
           1       0.00      0.00      0.00       225
           2       0.00      0.00      0.00       708
           3       0.00      0.00      0.00      1413
           4       0.00      0.00      0.00      1684
           5       0.06      1.00      0.12       279

    accuracy                           0.06      4360
   macro avg       0.01      0.17      0.02      4360
weighted avg       0.00      0.06      0.01      4360

Epoch 2/10
----------

Train loss 0.7959356843331988 accuracy 0.6276601021876085 f1 0.6811043863292575

Val   loss 11.095301511813199 accuracy 0.0639908256880734 f1 0.020047424013796073
              precision    recall  f1-score   support

           0       0.00      0.