In [None]:
!pip install datasets stop-words transformers evaluate
!pip install accelerate -U

In [2]:
import pandas as pd
import numpy as np
import evaluate
import torch

In [3]:
!unzip /content/train.csv.zip -d /content/train
!unzip /content/test.csv.zip -d /content/test

Archive:  /content/train.csv.zip
  inflating: /content/train/train.csv  
Archive:  /content/test.csv.zip
  inflating: /content/test/test.csv  


In [4]:
train_df = pd.read_csv('/content/train/train.csv')

In [5]:
train_df

Unnamed: 0,text,source
0,Российская сборная лидирует по итогам командно...,mchsgov
1,#СоветМЧС #МЧС #МЧСРОССИИ,mchsgov
2,Инспекторы ГИБДД Москвы приняли участие во Все...,mospolice
3,В Главную Военно-Морскую Базу БФ в г.Балтийск ...,mil
4,Обвиняемые в хищении денежных средств у 32 пож...,mospolice
...,...,...
9876,Расчеты комплексов С-300 выполнили боевые пуск...,mil
9877,#Repost @nataliapetrasheva<br>・・・<br>#щитилира...,mospolice
9878,В столичной полиции наградили победителей реги...,mospolice
9879,ВДВ России получили все бронеавтомобили «Тигр»...,mil


In [6]:
train_df['source'].unique()

array(['mchsgov', 'mospolice', 'mil', 'russianpost'], dtype=object)

In [7]:
target_to_idx = {'mchsgov': 0, 'mospolice': 1, 'mil': 2, 'russianpost': 3}

train_df['source'] = train_df['source'].map(target_to_idx)

In [8]:
train_df['text'].isna().sum()

87

In [9]:
train_df = train_df.dropna()

## Text preprocessing

In [10]:
from stop_words import get_stop_words
import string
import re

stop_words = get_stop_words('ru')

def clean_txt(text):
    text = str(text)
    text = text.lower()
    cleaned_text = re.sub(r'[*_]', '', text)
    cleaned_text = re.sub(r'\n', '', cleaned_text)
    cleaned_text = ' '.join([i for i in cleaned_text.split() if i not in stop_words]) # drop russian stop words
    cleaned_text = cleaned_text.translate(str.maketrans('', '', string.punctuation)) # drop punctuation
    cleaned_text = re.sub(r'\d+', '', cleaned_text) # drop digits (optionally)
    #cleaned_text = ' '.join([word for word in cleaned_text.split() if len(word) > 2]) # drop short words (optionally)
    return cleaned_text

In [11]:
train_df['text'] = train_df['text'].apply(lambda x: clean_txt(x))
train_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df['text'] = train_df['text'].apply(lambda x: clean_txt(x))


Unnamed: 0,text,source
0,российская сборная лидирует итогам командного ...,0
1,советмчс мчс мчсроссии,0
2,инспекторы гибдд москвы приняли участие всерос...,1
3,главную военноморскую базу бф гбалтийск прибыл...,2
4,обвиняемые хищении денежных средств пожилых г...,1
...,...,...
9876,расчеты комплексов с выполнили боевые пуски ра...,2
9877,repost nataliapetrashevabr・・・brщитилира сопран...,1
9878,столичной полиции наградили победителей регион...,1
9879,вдв россии получили бронеавтомобили «тигр» нов...,2


In [12]:
train_df = train_df.rename(columns={'source': 'target'})

## Do the same for test data

In [60]:
test_df = pd.read_csv('/content/test/test.csv')

In [61]:
test_df['text'] = test_df['text'].apply(lambda x: clean_txt(x))
test_df

## Initializing a model and creating a dataset and a dataloader

In [16]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import DataCollatorWithPadding

In [17]:
from datasets import Dataset, DatasetDict

In [18]:
train_dataset = Dataset.from_pandas(train_df.reset_index(drop=True))
test_dataset = Dataset.from_pandas(test_df.reset_index(drop=True))

print(train_dataset)
print(test_dataset)

Dataset({
    features: ['text', 'target'],
    num_rows: 9794
})
Dataset({
    features: ['text'],
    num_rows: 1732
})


In [19]:
train_dataset = train_dataset.train_test_split(test_size=0.2)

In [20]:
train_dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'target'],
        num_rows: 7835
    })
    test: Dataset({
        features: ['text', 'target'],
        num_rows: 1959
    })
})

In [21]:
model_name = 'FacebookAI/roberta-base'
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=4)
tokenizer = AutoTokenizer.from_pretrained(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at FacebookAI/roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [22]:
def preprocess_function(examples):
    # we don't need padding here, because of DataCollatorWithPadding
    # we don't need return_tensors='pt', because Dataset
    # contain only lists, it changes only in the moment of using
    inputs = tokenizer(examples["text"], max_length=512, truncation=True)
    inputs["labels"] = examples["target"]

    return inputs

In [23]:
tokenized_dataset = train_dataset.map(preprocess_function, batched=True,
                                      remove_columns=['text', 'target'])

Map:   0%|          | 0/7835 [00:00<?, ? examples/s]

Map:   0%|          | 0/1959 [00:00<?, ? examples/s]

In [24]:
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 7835
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 1959
    })
})

In [25]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer,
                                        max_length=512,
                                        padding='max_length',
                                        return_tensors='pt')

In [26]:
model

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
             

## Fine tune bert for multi label classification

In [29]:
from transformers import Trainer, TrainingArguments
from transformers import logging
logging.set_verbosity_debug()
#from transformers import get_linear_schedule_with_warmup

#def compute_metrics(eval_pred):
#    """
#    if you want some extraordinary metric istead of cross_entropy by default
#    """
#    predictions, labels = eval_pred
#    predictions = np.argmax(predictions, axis=1)
#    return {'accuracy': accuracy_score(labels, predictions)}

num_epochs = 5
batch_size = 5
num_training_examples = len(tokenized_dataset['train'])
num_training_steps = (num_training_examples // batch_size) * num_epochs  # num_train_epochs=5 and per_device_train_batch_size=4

training_args = TrainingArguments(
    output_dir='./model_output',
    num_train_epochs=num_epochs,
    evaluation_strategy='epoch',
    save_strategy='epoch',
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    learning_rate=2e-5,
    weight_decay=0.01,
    save_total_limit=3,
    load_best_model_at_end=True,
    #metric_for_best_model='accuracy', # needs to be capable with compute metrics return name (key)
    gradient_accumulation_steps=2, # gradient accumulation before updating the parameters of the model
    logging_steps=10,
)

#this is used by default in trainer, but if you want dirty hand work it's here:
#optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
#scheduler = get_linear_schedule_with_warmup(
#    optimizer=optimizer,
#    num_warmup_steps=300,
#    num_training_steps=num_training_steps
#)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['test'],
    tokenizer=tokenizer,
    data_collator=data_collator,
    #compute_metrics=compute_metrics,
    #optimizers=(optimizer, scheduler) # and here
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [30]:
trainer.train()

Currently training with a batch size of: 5
***** Running training *****
  Num examples = 7,835
  Num Epochs = 5
  Instantaneous batch size per device = 5
  Total train batch size (w. parallel, distributed & accumulation) = 10
  Gradient Accumulation steps = 2
  Total optimization steps = 3,915
  Number of trainable parameters = 124,648,708


Epoch,Training Loss,Validation Loss
0,0.4524,0.360596
2,0.3001,0.244735


***** Running Evaluation *****
  Num examples = 1959
  Batch size = 5
Saving model checkpoint to ./model_output/checkpoint-783
Configuration saved in ./model_output/checkpoint-783/config.json
Model weights saved in ./model_output/checkpoint-783/model.safetensors
tokenizer config file saved in ./model_output/checkpoint-783/tokenizer_config.json
Special tokens file saved in ./model_output/checkpoint-783/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 1959
  Batch size = 5
Saving model checkpoint to ./model_output/checkpoint-1567
Configuration saved in ./model_output/checkpoint-1567/config.json
Model weights saved in ./model_output/checkpoint-1567/model.safetensors
tokenizer config file saved in ./model_output/checkpoint-1567/tokenizer_config.json
Special tokens file saved in ./model_output/checkpoint-1567/special_tokens_map.json


Epoch,Training Loss,Validation Loss
0,0.4524,0.360596
2,0.212,0.30402
4,0.0285,0.21143


***** Running Evaluation *****
  Num examples = 1959
  Batch size = 5
Saving model checkpoint to ./model_output/checkpoint-2350
Configuration saved in ./model_output/checkpoint-2350/config.json
Model weights saved in ./model_output/checkpoint-2350/model.safetensors
tokenizer config file saved in ./model_output/checkpoint-2350/tokenizer_config.json
Special tokens file saved in ./model_output/checkpoint-2350/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 1959
  Batch size = 5
Saving model checkpoint to ./model_output/checkpoint-3134
Configuration saved in ./model_output/checkpoint-3134/config.json
Model weights saved in ./model_output/checkpoint-3134/model.safetensors
tokenizer config file saved in ./model_output/checkpoint-3134/tokenizer_config.json
Special tokens file saved in ./model_output/checkpoint-3134/special_tokens_map.json
Deleting older checkpoint [model_output/checkpoint-783] due to args.save_total_limit
***** Running Evaluation *****
  Num examples =

TrainOutput(global_step=3915, training_loss=0.20764124791778976, metrics={'train_runtime': 4086.7303, 'train_samples_per_second': 9.586, 'train_steps_per_second': 0.958, 'total_flos': 1.03009827907584e+16, 'train_loss': 0.20764124791778976, 'epoch': 4.996809189534142})

Loading best model from ./model_output/checkpoint-3134

### Testing best checkpoint

In [46]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer

model_path = './model_output/checkpoint-3134'
tokenizer_path = './model_output/checkpoint-3134'

model = AutoModelForSequenceClassification.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)

loading configuration file ./model_output/checkpoint-3134/config.json
Model config RobertaConfig {
  "_name_or_path": "./model_output/checkpoint-3134",
  "architectures": [
    "RobertaForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3
  },
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "problem_type": "single_label_classification",
  "torch_dtype": "float32",
  "transformers_version": "4.40.0",
  "type_vocab_size": 1,
  "

In [47]:
text_example = test_dataset["text"][0]
print(text_example)

полигоне «погоново» воронежской области прошло батальонное тактическое учение подразделениями общевойсковой армии зво br brмотострелки танкисты артиллеристы военные инженеры медики поддержке оперативнотактической авиации отработали новые приемы совместных действий обороне населенного пункта развитии контрнаступления br brв ходе обучения командиры батальонов дивизионов увидели уникальный опыт ведения танковых стрельб укрытия стрельб большие дальности имитируя артподготовку большом удалении  километров


In [48]:
input_ids = tokenizer.encode(
    text_example,
    return_tensors="pt",
    max_length=512,
    truncation=True,
    ).to('cuda')

In [49]:
input_ids.shape

torch.Size([1, 512])

In [51]:
model = model.to('cuda')
clf = model(input_ids=input_ids)

In [54]:
clf

SequenceClassifierOutput(loss=None, logits=tensor([[-1.7151, -2.2082,  6.2217, -2.4457]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

In [56]:
probabilities = torch.nn.functional.softmax(clf.logits, dim=-1)
probabilities

tensor([[3.5706e-04, 2.1807e-04, 9.9925e-01, 1.7197e-04]], device='cuda:0',
       grad_fn=<SoftmaxBackward0>)

In [58]:
pred_ids = torch.argmax(probabilities, dim=-1)
pred_ids # 'mil': 2

tensor([2], device='cuda:0')

In [None]:
# {'mchsgov': 0, 'mospolice': 1, 'mil': 2, 'russianpost': 3}

In [63]:
test_dataset

Dataset({
    features: ['text'],
    num_rows: 1744
})

In [64]:
res = pd.read_csv('/content/sample_submission (1).csv')

In [66]:
res.head(1)

Unnamed: 0,Id,Category
0,0,mil


In [74]:
from torch.utils.data import DataLoader
from tqdm import tqdm

test_encodings = tokenizer(test_dataset['text'], truncation=True, padding=True, return_tensors="pt")
test_loader = DataLoader(test_encodings['input_ids'], batch_size=8)

model.eval()

predictions = []
with torch.no_grad():
    for batch in tqdm(test_loader, desc="Predicting"):
        batch = batch.to(model.device)
        outputs = model(batch)

        probabilities = torch.nn.functional.softmax(outputs.logits, dim=-1)
        pred_ids = torch.argmax(probabilities, dim=-1)
        predictions.extend(pred_ids.cpu().numpy())

Predicting:   0%|          | 0/218 [00:00<?, ?it/s]We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.
Predicting: 100%|██████████| 218/218 [00:54<00:00,  3.97it/s]


In [77]:
res['Category'] = predictions

In [79]:
idx_to_target = {0: 'mchsgov', 1: 'mospolice', 2: 'mil', 3: 'russianpost'}

res['Category'] = res['Category'].map(idx_to_target)

In [80]:
res.head(3)

Unnamed: 0,Id,Category
0,0,mil
1,1,russianpost
2,2,russianpost


In [84]:
res['Category'].unique()

array(['mil', 'russianpost', 'mchsgov', 'mospolice'], dtype=object)

In [81]:
res.to_csv('result_kaggle_vk_clf_4_labels.csv')

## The same roberta fine-tuning, but without trainer