**Загружаем библиотеки**

In [None]:
from google.colab import drive
drive.mount('/content/drive')
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, f1_score
!pip install transformers
import random
import torch
import transformers
import torch.nn as nn
from transformers import AutoModel, BertTokenizer, BertForSequenceClassification
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

tqdm.pandas()

device = torch.device('cuda')

**Загружаем и обрабатываем данные**

In [2]:
DS_PATH = '/content/drive/MyDrive/Datasets for ORG-4(SentimentAnalysisWithObjectDetection)/'
DF = pd.read_csv(DS_PATH + 'women_with_predict.csv',encoding='utf-8',lineterminator='\n')


#DF['label'] = DF['label'].apply(lambda x: x+1)
data = DF[['text','label']]

data_l0 = data.loc[DF['label'] == 0]
data_l1 = data.loc[DF['label'] == 1]
data_l2 = data.loc[DF['label'] == 2]
data_l = [[data_l0, 0], [data_l1, 0], [data_l2, 0]]
data = pd.DataFrame(columns=['text', 'label'] )

min_count_of_entries_in_labels = min(len(data_l0), len(data_l1), len(data_l2))
for i in range(min_count_of_entries_in_labels * 3):
  j = random.randint(0,2)
  data = data.append(data_l[j][0].iloc[data_l[j][1]])
  if data_l[j][1] < min_count_of_entries_in_labels - 1:
    data_l[j][1] += 1

tmp = round(len(data)* 0.6)
tmp_1 = round(len(data) * 0.8)

train_data = data.iloc[0:tmp]
val_data = data.iloc[tmp:tmp_1]
test_data = data.iloc[tmp_1:len(data)]

train_text = train_data['text'].astype('str')
train_labels = train_data['label'].astype('int')
val_text = val_data['text'].astype('str')
val_labels = val_data['label'].astype('int')
test_text = test_data['text'].astype('str')
test_labels = test_data['label'].astype('int')

**Загружаем модель DeepPavlov/rubert-base-cased-sentence**

In [3]:
bert = AutoModel.from_pretrained('DeepPavlov/rubert-base-cased-sentence')

tokenizer = BertTokenizer.from_pretrained('DeepPavlov/rubert-base-cased-sentence')

Downloading:   0%|          | 0.00/642 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/711M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.65M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/24.0 [00:00<?, ?B/s]

**Загружаем данные в токенизатор**

In [4]:
tokens_train = tokenizer.batch_encode_plus(
    train_text.values,
    max_length = 65,
    padding = 'max_length',
    truncation = True
)
tokens_val = tokenizer.batch_encode_plus(
    val_text.values,
    max_length = 65,
    padding = 'max_length',
    truncation = True
)
tokens_test = tokenizer.batch_encode_plus(
    test_text.values,
    max_length = 65,
    padding = 'max_length',
    truncation = True
)

train_seq = torch.tensor(tokens_train['input_ids'])
train_mask = torch.tensor(tokens_train['attention_mask'])
train_y = torch.tensor(train_labels.values)

val_seq = torch.tensor(tokens_val['input_ids'])
val_mask = torch.tensor(tokens_val['attention_mask'])
val_y = torch.tensor(val_labels.values)

test_seq = torch.tensor(tokens_test['input_ids'])
test_mask = torch.tensor(tokens_test['attention_mask'])
test_y = torch.tensor(test_labels.values)
batch_size = 32


train_data = TensorDataset(train_seq, train_mask, train_y)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler = train_sampler, batch_size = batch_size)

val_data =  TensorDataset(val_seq, val_mask, val_y)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(val_data, sampler = val_sampler, batch_size = batch_size)

test_data = TensorDataset(test_seq, test_mask)
test_sampler = SequentialSampler(test_data)
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=batch_size)

**Добавляем полносвязный слой к нашей модели**

In [5]:
for param in bert.parameters():
    param.requires_grad = False

class BERT_Arch(nn.Module):
    
    def __init__(self, bert):
        super(BERT_Arch, self).__init__()
        self.bert = bert
        self.dropout = nn.Dropout(0.1)
        self.relu = nn.ReLU()
        self.fc1 = nn.Linear(768,512)
        self.fc2 = nn.Linear(512,3)
        self.softmax = nn.Softmax(dim = -1)
    
    def forward(self, sent_id, mask):
        _, cls_hs = self.bert(sent_id, attention_mask = mask, return_dict = False)
        x = self.fc1(cls_hs)
        x = self.relu(x)
        x = self.dropout(x)
        x = self.fc2(x)
        x = self.softmax(x)
        return x

In [6]:
model = BERT_Arch(bert)

model = model.to(device)
from transformers import AdamW

optimizer = AdamW(model.parameters(), lr = 2e-5, eps = 1e-8)



**Балансировка весов**

In [7]:
from sklearn.utils.class_weight import compute_class_weight

class_weights = compute_class_weight(class_weight='balanced', classes=np.unique(train_labels), y=train_labels)

print(class_weights)

weights = torch.tensor(class_weights, dtype = torch.float)
weights = weights.to(device)
cross_entropy = nn.CrossEntropyLoss()
epochs = 3

[0.99472763 1.01421612 0.99135878]


**Функция обучения**

In [8]:
def train():
    model.train()
    total_loss, total_accuracy = 0, 0
    total_preds = []
  
    for step, batch in tqdm(enumerate(train_dataloader), total = len(train_dataloader)):
        
        batch = [r.to(device) for r in batch]
        sent_id,mask,labels = batch
        model.zero_grad()
        preds = model(sent_id, mask)
        loss = cross_entropy(preds, labels)
        total_loss += loss.item()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        preds = preds.detach().cpu().numpy()
        total_preds.append(preds)
        
    avg_loss = total_loss / len(train_dataloader)
    total_preds = np.concatenate(total_preds, axis = 0)
    
    return avg_loss, total_preds

**Функция валидации**

In [9]:
def evaluate():
    model.eval()
    total_loss, total_accuracy = 0,0
    total_preds = []

    for step, batch in tqdm(enumerate(val_dataloader), total = len(val_dataloader)):
        batch = [t.to(device) for t in batch]
        sent_id, mask, labels = batch
        
        with torch.no_grad():
            preds = model(sent_id, mask)
            loss = cross_entropy(preds, labels)
            total_loss += loss.item()
            preds = preds.detach().cpu().argmax(axis=1).numpy()
            total_preds.append(preds)

    avg_loss = total_loss / len(val_dataloader)
    total_preds = np.concatenate(total_preds, axis = 0)
    return avg_loss, total_preds

**Обучение**

In [10]:
best_valid_loss = float('inf')

train_losses = []
valid_losses = []

for epoch in range(epochs):
    print('\n Epoch{:} / {:}'.format(epoch+1, epochs))
    
    train_loss, _ = train()
    valid_loss, _ = evaluate()
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'saved_weights.pt')
    
    train_losses.append(train_loss)
    valid_losses.append(valid_loss)
    print(f'\nTraining loss: {train_loss:.3f}')
    print(f'Validation loss: {valid_loss:.3f}')


 Epoch1 / 3


100%|██████████| 1510/1510 [03:07<00:00,  8.05it/s]
100%|██████████| 504/504 [01:01<00:00,  8.24it/s]



Training loss: 0.936
Validation loss: 0.881

 Epoch2 / 3


100%|██████████| 1510/1510 [03:04<00:00,  8.18it/s]
100%|██████████| 504/504 [01:01<00:00,  8.24it/s]



Training loss: 0.883
Validation loss: 0.869

 Epoch3 / 3


100%|██████████| 1510/1510 [03:04<00:00,  8.18it/s]
100%|██████████| 504/504 [01:01<00:00,  8.24it/s]



Training loss: 0.874
Validation loss: 0.864


**Сохраняем веса при лучших результатах**

In [11]:
path = 'saved_weights.pt'
model.load_state_dict(torch.load(path))

<All keys matched successfully>

In [12]:
import gc
from datetime import timedelta
import time
import sklearn.metrics

gc.collect()
torch.cuda.empty_cache()

list_seq = np.array_split(test_seq, 50)
list_mask = np.array_split(test_mask, 50)
start = time.time()

predictions = []
for num, elem in enumerate(list_seq):
    with torch.no_grad():
        preds = model(elem.to(device), list_mask[num].to(device))
        predictions.append(preds.detach().cpu().argmax(axis=1).numpy())

**Тестирование модели**

In [13]:
flat_preds = []
for i in predictions:
  for j in i:
    flat_preds.append(j)

data_label = test_labels.to_list()

In [14]:
print("Accuracy {:4.2f}".format(sklearn.metrics.accuracy_score(data_label, flat_preds)))
print("Precision {:4.2f}, recall {:4.2f}, F1 {:4.2f}".format(*[a(data_label, flat_preds, average="macro") for a in (sklearn.metrics.precision_score, sklearn.metrics.recall_score, f1_score)]))
print("Validation took: {:}".format(timedelta(seconds = int(time.time() - start))))

Accuracy 0.68
Precision 0.67, recall 0.68, F1 0.67
Validation took: 0:00:53
