# Принцип работы

## Решение основано на BERT. 
1. На вход для обучения подаётся файл по пути из **train_path**
2. Текст чистится
3. Создаётся двумерный массив из 0 и 1, где 1 - наличие метки
4. Создаётся кастомный датасет, в котором происходит обработка данных для BERT модели
5. Данные для обчения делятся на две части в соотношении 4 к 1, где 4 - данные для обучения
6. Подгружается модель с дополнительными слоями, и настраиваются параметры обучения
7. Происходит обучение модели (требуется мощная видеокарта и значительное время на обучение)
8. Загружается тестовый датасет по пути из **test_path**, в котором также чистится текст, а затем также через другой кастомный датасет готовится для BERT модели
9. Данные подаются модели и полученный ответ обрабатывается и сохраняется в новый столбец датафрейма
10. Столбец с текстом удаляется и полученный датафрейм экспортируется по пути из **ans_path**

# Преимущества и недостатки использования BERT

---
## Преимущества
* Высокая точность работы
* Учитывание контекста
* Понимание большого количества слов, в том числе, которых нет в данных для обучения
---
## Недостатки
* Ресурсозатратность
* Низкая скорость работы по сравнению с ML моделями

In [3]:
!pip install transformers



**Установка библиотек** 

Обучение происходило на kaggle. Если на устройстве нет ниже указанных библиотек, то запустите их установку.

In [None]:
!pip install -U scikit-learn
!pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
!pip install pandas
!pip install numpy

In [54]:
import time
import datetime
import re

import transformers
from transformers import AutoTokenizer, AutoModel
from transformers import BertForSequenceClassification, AdamW, BertConfig
from transformers import get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from tqdm import tqdm
import torch
from torch.utils.data import Dataset, random_split, DataLoader, RandomSampler, SequentialSampler
import pandas as pd
import numpy as np

In [76]:
train_path = "/kaggle/input/sirius-ai-task-2/train.csv"
test_path = "/kaggle/input/sirius-ai-task-2/test.csv"
ans_path = '/kaggle/working/solution.csv'
model_path = "cointegrated/rubert-tiny2"

In [12]:
df = pd.read_csv(train_path)
df=df.fillna('empty')
print(df.head(10))

Unnamed: 0,text,labels
0,"And this year, the number will be over 150,000...","{'science', 'family', 'mobility'}"
1,MR. SPICER: I think the campaign will make dec...,"{'economy', 'languages', 'style'}"
2,You dont have to test every person in the stat...,{'family'}
3,And Dr. Fauci is going to emphasize this about...,"{'science', 'family', 'history'}"
4,SANDERS: Certainly in a number of the conversa...,"{'science', 'news'}"
5,PRESIDENT MACRON: But Im very honored and very...,{'politics'}
6,But we will be meeting at the G20.,"{'science', 'economy'}"
7,And are you worried if thats encouraged?,"{'family', 'style', 'affair'}"
8,"Here, at this memorial, the names of those 40 ...",{'mobility'}
9,We hope we still have hope that the dispute wi...,"{'economy', 'news', 'style'}"


In [13]:
data=df[:3000000].astype(str)

In [15]:
data['text'] = data['text'].replace(r"[0-9!#()$\,\'\-\.*+/:;<=>?@[\]^_`{|}\"]+", ' ', regex=True)
data['text'] = data['text'].replace(r'\W+', ' ', regex=True)
data['text'] = data['text'].str.lower()
print(data[:10])

Unnamed: 0,text,labels
0,and this year the number will be over ventilat...,"{'science', 'family', 'mobility'}"
1,mr spicer i think the campaign will make decis...,"{'economy', 'languages', 'style'}"
2,you dont have to test every person in the stat...,{'family'}
3,and dr fauci is going to emphasize this about ...,"{'science', 'family', 'history'}"
4,sanders certainly in a number of the conversat...,"{'science', 'news'}"
5,president macron but im very honored and very ...,{'politics'}
6,but we will be meeting at the g,"{'science', 'economy'}"
7,and are you worried if thats encouraged,"{'family', 'style', 'affair'}"
8,here at this memorial the names of those men a...,{'mobility'}
9,we hope we still have hope that the dispute wi...,"{'economy', 'news', 'style'}"


In [16]:
type_class = ['work', 'news', 'sports', 'music', 'movies',
'politics', 'phones', 'self-driving_cars',
'family', 'cars', 'climate_change', 'languages',
'business', 'health', 'science', 'style', 'opinion',
'economy', 'history', 'technology', 'affair', 'development', 'mobility']

In [17]:
n_class = np.zeros(shape=(len(data), len(type_class)))

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [20]:
patterns = r"['{},]+"

for i, label in enumerate(data['labels']):
    label=label.replace('self-driving cars','self-driving_cars')
    label=label.replace('climate change','climate_change')
    for cur_babel in re.sub(patterns, '', label).split():
        n_class[i][type_class.index(cur_babel)]=1
print(n_class0

array([[0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [78]:
MAX_LEN = 200
TRAIN_BATCH_SIZE = 8
VALID_BATCH_SIZE = 4
EPOCHS = 2
LEARNING_RATE = 1e-05
tokenizer = AutoTokenizer.from_pretrained(model_path)

tokenizer_config.json:   0%|          | 0.00/401 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/1.08M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.74M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [25]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [42]:
class CustomDataset(Dataset):

    def __init__(self, text, n_class, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.comment_text = text
        self.targets = n_class
        self.max_len = max_len

    def __len__(self):
        return len(self.comment_text)

    def __getitem__(self, index):
        comment_text = str(self.comment_text[index])
        comment_text = " ".join(comment_text.split())

        inputs = self.tokenizer.encode_plus(
            comment_text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            pad_to_max_length=True,
            return_token_type_ids=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]


        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'targets': torch.tensor(self.targets[index], dtype=torch.float)
        }

In [38]:
text = data['text'].values

In [39]:
print(len(text), len(n_class))

3000000 3000000


In [45]:
print(len(type_class))

23


In [40]:
train_text, test_text, train_labels, test_labels = train_test_split(
    text,
    n_class,
    test_size=0.2,
    random_state=42
)

print(train_text, test_text, train_labels)

['parthak hey i just yyyyhhgfgghhhghggghggddf '
 'mrstsitsipas a real life angel '
 ' ynkmin only real ot stans can interact with this tweet ' ...
 ' between two diverse sets take a quick break and walk about to get refreshed and to change gears for the next lot'
 ' wall street is so hungry for the billion they can taste it'
 ' teamdjpoppa everything you want youre going to get claim it '] ['shouldve put bread on that hoe u d '
 'people will talk and i ll say oh that reminds me of '
 'hope a condom was used ' ...
 'cant believe everyone is still talking about this bounty from '
 'thanks for the mention maxx bhivechat '
 'ask the people who see drones in the air '] [[0. 0. 0. ... 1. 0. 0.]
 [0. 0. 0. ... 1. 0. 0.]
 [0. 0. 0. ... 1. 1. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 1. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [43]:
training_set = CustomDataset(train_text, train_labels, tokenizer, MAX_LEN)
testing_set = CustomDataset(test_text, test_labels, tokenizer, MAX_LEN)

In [44]:
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

test_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

In [79]:
class BERTClass(torch.nn.Module):
    def __init__(self):
        super(BERTClass, self).__init__()
        self.l1 = transformers.BertModel.from_pretrained(model_path, problem_type="multi_label_classification")
        self.l2 = torch.nn.Dropout(0.3)
        self.l3 = torch.nn.Linear(312, 23)
    
    def forward(self, ids, mask, token_type_ids):
        _, output_1= self.l1(ids, attention_mask = mask, token_type_ids = token_type_ids, return_dict=False)
        output_2 = self.l2(output_1)
        output = self.l3(output_2)
        return output

model = BERTClass()
model.to(device)

config.json:   0%|          | 0.00/693 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/118M [00:00<?, ?B/s]

BERTClass(
  (l1): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(83828, 312, padding_idx=0)
      (position_embeddings): Embedding(2048, 312)
      (token_type_embeddings): Embedding(2, 312)
      (LayerNorm): LayerNorm((312,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-2): 3 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=312, out_features=312, bias=True)
              (key): Linear(in_features=312, out_features=312, bias=True)
              (value): Linear(in_features=312, out_features=312, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=312, out_features=312, bias=True)
              (LayerNorm): LayerNorm((312,), eps=1e-12, elementwise_affine=Tru

In [56]:
def loss_fn(outputs, targets):
    return torch.nn.BCEWithLogitsLoss()(outputs, targets)

In [81]:
optimizer = torch.optim.Adam(params =  model.parameters(), lr=LEARNING_RATE)

In [82]:
def train(epoch):
    model.train()
    for ep,data in enumerate(training_loader, 0):
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
        targets = data['targets'].to(device, dtype = torch.float)

        outputs = model(ids, mask, token_type_ids)

        optimizer.zero_grad()
        loss = loss_fn(outputs, targets)
        if ep%10000==0:
            print(f'Epoch: {epoch}, Loss:  {loss.item()}')
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

In [84]:
train_iterator = tqdm(range(EPOCHS), desc=f'Эпоха {epoch + 1}/{EPOCHS}', dynamic_ncols=True)
for epoch in train_iterator:
    train(epoch)

Эпоха 1/2:   0%|          | 0/2 [00:00<?, ?it/s]

Epoch: 0, Loss:  0.7127525806427002
Epoch: 0, Loss:  0.2617749869823456
Epoch: 0, Loss:  0.2791476845741272
Epoch: 0, Loss:  0.23417741060256958
Epoch: 0, Loss:  0.22045785188674927
Epoch: 0, Loss:  0.2616928517818451
Epoch: 0, Loss:  0.18694284558296204
Epoch: 0, Loss:  0.2663562595844269
Epoch: 0, Loss:  0.19533266127109528
Epoch: 0, Loss:  0.2477191537618637
Epoch: 0, Loss:  0.20592565834522247
Epoch: 0, Loss:  0.20182037353515625
Epoch: 0, Loss:  0.17509141564369202
Epoch: 0, Loss:  0.1616492122411728
Epoch: 0, Loss:  0.2657707929611206
Epoch: 0, Loss:  0.2490372508764267
Epoch: 0, Loss:  0.2027098387479782
Epoch: 0, Loss:  0.23795954883098602
Epoch: 0, Loss:  0.2242349237203598
Epoch: 0, Loss:  0.18644185364246368
Epoch: 0, Loss:  0.21211247146129608
Epoch: 0, Loss:  0.20297101140022278
Epoch: 0, Loss:  0.17800959944725037
Epoch: 0, Loss:  0.17146632075309753
Epoch: 0, Loss:  0.25955912470817566
Epoch: 0, Loss:  0.19328466057777405
Epoch: 0, Loss:  0.22279319167137146
Epoch: 0, Lo

Эпоха 1/2:   0%|          | 0/2 [42:18<?, ?it/s]


KeyboardInterrupt: 

In [63]:
def validation(epoch):
    model.eval()
    fin_targets=[]
    fin_outputs=[]
    with torch.no_grad():
        for ep, data in enumerate(testing_loader, 0):
            ids = data['ids'].to(device, dtype = torch.long)
            mask = data['mask'].to(device, dtype = torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
            targets = data['targets'].to(device, dtype = torch.float)
            outputs = model(ids, mask, token_type_ids)
            fin_targets.extend(targets.cpu().detach().numpy().tolist())
            fin_outputs.extend(torch.sigmoid(outputs).cpu().detach().numpy().tolist())
    return fin_outputs, fin_targets

In [65]:
test_iterator = tqdm(range(EPOCHS), desc=f'Эпоха {epoch + 1}/{EPOCHS}', dynamic_ncols=True)
for epoch in test_iterator:
    outputs, targets = validation(epoch)
    outputs = np.array(outputs) >= 0.4
    f1_score_weighted = metrics.f1_score(targets, outputs, average='weighted')
    print(f"F1 Score Weighted = {weighted}")

Эпоха 1/2:   0%|          | 0/2 [06:21<?, ?it/s]


KeyboardInterrupt: 

In [85]:
class InputDataset(Dataset):

    def __init__(self, text, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.comment_text = text
        self.max_len = max_len

    def __len__(self):
        return len(self.comment_text)

    def __getitem__(self, index):
        comment_text = str(self.comment_text[index])
        comment_text = " ".join(comment_text.split())

        inputs = self.tokenizer.encode_plus(
            comment_text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            pad_to_max_length=True,
            return_token_type_ids=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]


        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long)
        }

In [74]:
data_ans = pd.read_csv(test_path)
data_ans['text'] = data_ans['text'].replace("[0-9!#()$\,\'\-\.*+/:;<=>?@[\]^_`{|}\"]+", ' ', regex=True)
data_ans['text'] = data_ans['text'].replace(r'\s+', ' ', regex=True)
data_ans['text'] = data_ans['text'].str.lower()

text_pred = data_ans['text'].values


prediction_data = InputDataset(text_pred, tokenizer, MAX_LEN)
pred_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }
prediction_loader = DataLoader(prediction_data, **pred_params)

In [88]:
print(len(prediction_loader))

932498


In [89]:
t0 = time.time()
model.eval()
fin_outputs=[]
with torch.no_grad():
    for ep, data in enumerate(prediction_loader, 0):
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
        outputs = model(ids, mask, token_type_ids)
        if ep%10000 == 0: print(f"Выполнено {ep/len(prediction_loader)}, потраченное время: {datetime.timedelta(seconds=time.time()-t0)}")
        fin_outputs.extend(torch.sigmoid(outputs).cpu().detach().numpy().tolist())
print("Всё готово")

Выполнено 0.0, потраченное время: 0:00:00.638991
Выполнено 0.0010723883590098853, потраченное время: 0:00:05.863877
Выполнено 0.0021447767180197705, потраченное время: 0:00:10.849075
Выполнено 0.003217165077029656, потраченное время: 0:00:15.929173
Выполнено 0.004289553436039541, потраченное время: 0:00:20.949594
Выполнено 0.005361941795049427, потраченное время: 0:00:26.068085
Выполнено 0.006434330154059312, потраченное время: 0:00:31.127195
Выполнено 0.007506718513069197, потраченное время: 0:00:35.853376
Выполнено 0.008579106872079082, потраченное время: 0:00:40.375626
Выполнено 0.009651495231088967, потраченное время: 0:00:44.913679
Выполнено 0.010723883590098853, потраченное время: 0:00:49.412289
Выполнено 0.011796271949108738, потраченное время: 0:00:53.904140
Выполнено 0.012868660308118623, потраченное время: 0:00:58.353693
Выполнено 0.013941048667128508, потраченное время: 0:01:02.838052
Выполнено 0.015013437026138395, потраченное время: 0:01:07.352042
Выполнено 0.0160858253851

In [95]:
print(len(fin_outputs)0

3729990

In [92]:
type_rez_class = ['work', 'news', 'sports', 'music', 'movies',
'politics', 'phones', 'self-driving cars',
'family', 'cars', 'climate change', 'languages',
'business', 'health', 'science', 'style', 'opinion',
'economy', 'history', 'technology', 'affair', 'development', 'mobility']

In [97]:
rez=[]
for index, val in enumerate(fin_outputs):
    temp_list = [(el, i)for i, el in enumerate(val)]
    temp_list.sort(reverse=True)
    ind = 0
    p=[]
    while len(p)< 3 and ind<len(temp_list):
        if temp_list[ind][0] > 0.4:
            p.append(f"'{type_rez_class[temp_list[ind][1]]}'")
            ind+=1
        else:
            break
    if len(p) == 0:
        p.append(f"'{type_rez_class[np.argmax(val)]}'")
    if len(p) == 1:
        rez.append('{'+str(p)[1:-1]+'}')
    else:
        rez.append('"{'+", ".join(p)+'}"')

In [109]:
data_ans['labels'] = rez
data_ans.drop('text', axis=1, inplace=True)
data_ans.to_csv(ans_path, index=True)

Unnamed: 0,labels
10,"{""'mobility'""}"
11,"{""'sports'""}"
12,"{""'science'""}"
13,"{""'health'""}"
14,"{""'sports'""}"
...,...
3729985,"{""'affair'""}"
3729986,"{""'development'""}"
3729987,"{""'languages'""}"
3729988,"""{'science', 'technology'}"""
