In [None]:
%%capture
# Install the Hugging Face `transformers` library and `sentencepiece`; the cell's output is captured and not shown.
# NOTE(review): versions are unpinned, so a fresh install may differ from the release this notebook was written against — consider pinning.
!pip install transformers sentencepiece

In [None]:
%%capture
!wget https://github.com/danilamilo/hh_compete/raw/main/data_split.zip
!unzip /content/data_split.zip

In [None]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from transformers.file_utils import is_tf_available, is_torch_available, is_torch_tpu_available
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from transformers import TrainingArguments, Trainer

In [None]:
# Read the train / validation / test CSV splits in one pass; every missing value
# (in any column) is replaced with an empty string.
data_train, data_eval, data_test = (
    pd.read_csv(csv_path).fillna('')
    for csv_path in (
        '/content/data_train.csv',
        '/content/data_valid.csv',
        '/content/data_test.csv',
    )
)

In [None]:
# Names of the label columns; each one becomes one output of the multi-label classifier.
target_columns = [
    'tag_1',
    'tag_2',
    'tag_3',
    'tag_4',
    'tag_5',
    'tag_6',
    'tag_7',
]

In [None]:
# Pull the tag columns out of each split as a NumPy array (one row per example, one column per tag).
labels_train, labels_eval, labels_test = (
    split[target_columns].to_numpy()
    for split in (data_train, data_eval, data_test)
)

In [None]:
def get_texts(data):
    """Build one model input string per row from the 'positive' and 'negative' columns.

    Args:
        data: Table-like object (e.g. a DataFrame) indexable by the keys
            'positive' and 'negative', each yielding an iterable of values.

    Returns:
        A list of strings of the form 'Плюсы: <positive> Минусы: <negative>',
        in row order. Values are converted with ``str``; the result is as long
        as the shorter of the two columns.
    """
    texts = []
    for positive, negative in zip(data['positive'], data['negative']):
        pieces = ('Плюсы: ', str(positive), ' Минусы: ', str(negative))
        texts.append(''.join(pieces))
    return texts

In [None]:
# Turn every split into its list of "Плюсы/Минусы" input strings.
texts_train, texts_eval, texts_test = map(get_texts, (data_train, data_eval, data_test))

In [None]:
# Hugging Face Hub identifier of the pretrained checkpoint; both the tokenizer and the classifier are loaded from it.
path = 'cointegrated/LaBSE-en-ru'

In [None]:
class HHDataset(torch.utils.data.Dataset):
    """Map-style dataset that pairs tokenizer encodings with multi-label targets."""

    def __init__(self, encodings, labels):
        """Keep references to the encodings mapping and the label rows.

        Args:
            encodings: Mapping from field name to a per-example sequence
                (anything exposing ``.items()`` whose values are indexable).
            labels: Indexable, sized container with one label row per example.
        """
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        """Return the number of examples, taken from the label container."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors.

        Every encoding field is converted with ``torch.tensor``; the label row
        is stored under "labels" as a float tensor.
        """
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        target = torch.tensor(self.labels[idx])
        sample["labels"] = target.float()
        return sample

In [None]:
# Load the tokenizer that belongs to the pretrained checkpoint named by `path`.
tokenizer = AutoTokenizer.from_pretrained(path)

Downloading:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/806 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/509k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [None]:
# Tokenize the training and validation texts up front. padding=True pads each split to its
# longest sequence. max_length=512 makes the truncation limit explicit and identical to the
# one used at inference time in get_prediction, instead of relying on the tokenizer's
# configured default. The texts are already lists, so no extra list() copy is needed.
train_encodings = tokenizer(texts_train, truncation=True, padding=True, max_length=512)
valid_encodings = tokenizer(texts_eval, truncation=True, padding=True, max_length=512)

In [None]:
# Wrap the tokenized splits and their label matrices as datasets for the Trainer.
dataset_train = HHDataset(encodings=train_encodings, labels=labels_train)
dataset_eval = HHDataset(encodings=valid_encodings, labels=labels_eval)

In [None]:
# Load the pretrained checkpoint with a sequence-classification head that has one
# output per label column (labels_train.shape[1]) and is configured for the
# multi-label problem type.
# NOTE(review): the loading log suggests the classification head is not part of the
# checkpoint and is presumably freshly initialized — it must be fine-tuned before use.
model = AutoModelForSequenceClassification.from_pretrained(
    path,
    num_labels=labels_train.shape[1],
    problem_type="multi_label_classification"
    )

Downloading:   0%|          | 0.00/492M [00:00<?, ?B/s]

Some weights of the model checkpoint at cointegrated/LaBSE-en-ru were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not i

In [None]:
# Per-device batch size; used for both training and evaluation in TrainingArguments.
BATCH_SIZE = 8
# Number of training epochs passed to TrainingArguments.
EPOCHS = 4

In [None]:
# Ask PyTorch to release cached GPU memory it is not currently using before training is set up.
torch.cuda.empty_cache()

In [None]:
# Fine-tuning configuration handed to the Trainer.
training_args = TrainingArguments(
    output_dir = '/content/',
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    warmup_steps=500,
    weight_decay=0.005,
    # NOTE(review): 50000 exceeds the 18812 total optimization steps reported by the
    # training run, so presumably no intermediate training loss is ever logged —
    # confirm this is intentional.
    logging_steps=50000,
    learning_rate=2e-5,
    # No checkpoints are written during training; the model is saved explicitly at the end of the notebook.
    save_strategy='no'
)

In [None]:
class CustomTrainer(Trainer):
    """Trainer whose loss is binary cross-entropy on the raw logits (multi-label setup)."""

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute the multi-label loss for one batch.

        Args:
            model: The model being trained or evaluated.
            inputs: Batch dict; it is forwarded to the model unchanged and must
                contain a "labels" entry with float targets shaped like the logits.
            return_outputs: When true, return ``(loss, outputs)`` instead of
                only the loss.
            num_items_in_batch: Accepted for compatibility with newer
                ``transformers`` releases, whose Trainer passes this keyword to
                ``compute_loss``; it is not used here.

        Returns:
            The scalar loss tensor, or a ``(loss, outputs)`` tuple when
            ``return_outputs`` is true.
        """
        labels = inputs.get("labels")
        outputs = model(**inputs)
        # BCEWithLogitsLoss applies the sigmoid itself, so it receives raw logits.
        loss_fct = torch.nn.BCEWithLogitsLoss()
        loss = loss_fct(outputs.logits, labels)
        return (loss, outputs) if return_outputs else loss

In [None]:
# Assemble the trainer: the custom BCE-with-logits loss, the training arguments above,
# and the train / validation datasets.
trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=dataset_train,
    eval_dataset=dataset_eval
)

In [None]:
# Release cached, unused GPU memory once more right before the training run starts.
torch.cuda.empty_cache()

In [None]:
# Run the fine-tuning loop configured by `training_args` on `dataset_train`.
trainer.train()

***** Running training *****
  Num examples = 37623
  Num Epochs = 4
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 18812


Step,Training Loss


In [None]:
# Evaluate on the validation dataset given to the trainer. No compute_metrics function was
# supplied, so presumably only the loss and runtime statistics are reported.
trainer.evaluate()

In [None]:
def get_prediction(text):
    """Return the per-label probabilities the model assigns to ``text``.

    Uses the notebook-level ``tokenizer`` and ``model``.

    Args:
        text: A single string (or a list of strings) to classify. Inputs are
            truncated to 512 tokens.

    Returns:
        A tensor of sigmoid probabilities computed from the model's logits,
        located on the model's device; callers index ``[0]`` to get the row
        for a single input string.
    """
    # Make sure dropout is disabled; this does not rely on an earlier cell
    # having already switched the model to evaluation mode.
    model.eval()
    # Place the inputs on whatever device the model lives on instead of assuming CUDA.
    inputs = tokenizer(text, padding=True, truncation=True, max_length=512, return_tensors="pt").to(model.device)
    with torch.no_grad():
        outputs = model(**inputs)
        probs = torch.sigmoid(outputs.logits)
    return probs

In [None]:
# Score the test texts one at a time; each call yields a single-row batch, so [0] extracts
# that row of per-label probabilities as a NumPy array.
predictions = [get_prediction(text).cpu().numpy()[0] for text in texts_test]

In [None]:
# Binarize the probabilities: a label is predicted when its probability is strictly above 0.5.
predictions_labeled = [
    [1 if p > 0.5 else 0 for p in prediction]
    for prediction in predictions
]

In [None]:
from sklearn.metrics import f1_score
# Per-label F1 on the test split (average=None returns one score per label column).
# scikit-learn expects the ground truth first and the predictions second; passing them by
# keyword keeps the roles explicit. F1 itself is symmetric in the two, but precision or
# recall would silently be swapped if this call were ever adapted with the old order.
print('f1', f1_score(y_true=labels_test, y_pred=predictions_labeled, average=None, zero_division=0))

In [None]:
# Save the tokenizer and the fine-tuned model into the same directory so both can be
# reloaded together with from_pretrained.
# NOTE(review): the path assumes Google Drive is mounted at /drive; no mount call is
# visible in this notebook — confirm the mount point before running.
tokenizer.save_pretrained('/drive/MyDrive/ml/hh/models/labse_1-7_v1/')
model.save_pretrained('/drive/MyDrive/ml/hh/models/labse_1-7_v1/')