In [7]:
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from pytorch_transformers import BertTokenizer, BertConfig
from pytorch_transformers import AdamW, BertForSequenceClassification
from tqdm import tqdm, trange
import pandas as pd
import io
import numpy as np
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
import math
from pytorch_transformers import BertTokenizer, BertConfig

In [8]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

In [36]:
texts = pd.read_csv('D:/Project/Work_Project/mafia/data/processed/Dataset_text_1.csv', encoding='utf8')
texts = texts.dropna(axis='index', how='any', subset=['CLASS'])

mackup = pd.read_csv('D:/Project/Work_Project/mafia/data/processed/MACKUP/train.csv').ID
texts = texts.loc[texts.ID.isin(mackup)]
texts.index = np.arange(len(texts))

In [39]:
mir_texts = texts[texts['CLASS'] == 0]
maf_texts = texts[texts['CLASS'] == 1]

sentences = np.concatenate([maf_texts['TEXT'].values, mir_texts['TEXT'].values])

sentences = ['[CLS] ' + str(sentence) + ' [SEP]' for sentence in sentences]
labels = [[1] for _ in range(maf_texts.shape[0])] + [[0] for _ in range(mir_texts.shape[0])]



In [41]:
from pytorch_transformers import BertTokenizer, BertConfig


tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

tokenized_texts = [tokenizer.tokenize(sent) for sent in sentences]
print (tokenized_texts[0])

['[CLS]', 'г', '##о', '##р', '##о', '##д', 'з', '##д', '##е', '##с', '##ь', 'м', '##ы', 'с', '##е', '##и', '##ч', '##а', '##с', 'в', '##с', '##е', 'з', '##на', '##к', '##о', '##м', '##и', '##м', '##с', '##я', 'в', '##с', '##е', 'р', '##а', '##з', '##г', '##ов', '##а', '##р', '##и', '##в', '##а', '##л', '##и', 'д', '##о', '##б', '##р', '##ы', '##и', 'в', '##е', '##ч', '##е', '##р', 'с', '##п', '##а', '##с', '##и', '##б', '##о', 'с', '##п', '##а', '##с', '##и', '##б', '##о', 'и', '##г', '##р', '##о', '##к', 'н', '##о', '##м', '##е', '##р', 'о', '##д', '##и', '##н', '[SEP]']


In [42]:
input_ids = [tokenizer.convert_tokens_to_ids(x) for x in tokenized_texts]
input_ids = pad_sequences(
    input_ids,
    maxlen=100,
    dtype="long",
    truncating="post",
    padding="post"
)
attention_masks = [[float(i>0) for i in seq] for seq in input_ids]

Token indices sequence length is longer than the specified maximum sequence length for this model (773 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1011 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (570 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (703 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (602 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for th

In [65]:
train_inputs = torch.tensor(input_ids)
train_labels = torch.tensor(labels)
train_masks = torch.tensor(attention_masks)
train_masks = train_masks.long()

In [66]:
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_dataloader = DataLoader(
    train_data,
    sampler=RandomSampler(train_data),
    batch_size=32
)

In [61]:
from pytorch_transformers import AdamW, BertForSequenceClassification
from pytorch_transformers import BertForQuestionAnswering, BertForTokenClassification

In [57]:
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)
model.cuda()

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [58]:
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'gamma', 'beta']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
     'weight_decay_rate': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
     'weight_decay_rate': 0.0}
]

optimizer = AdamW(optimizer_grouped_parameters, lr=2e-5)



In [67]:
from IPython.display import clear_output

# Будем сохранять loss во время обучения
# и рисовать график в режиме реального времени
train_loss_set = []
train_loss = 0


# Обучение
# Переводим модель в training mode
model.train()

for _ in range(5):
  for step, batch in enumerate(train_dataloader):
      # добавляем батч для вычисления на GPU
      batch = tuple(t.to(device) for t in batch)
      # Распаковываем данные из dataloader
      b_input_ids, b_input_mask, b_labels = batch
      
      # если не сделать .zero_grad(), градиенты будут накапливаться
      optimizer.zero_grad()
      
      # Forward pass
      loss = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)

      train_loss_set.append(loss[0].item())  
      
      # Backward pass
      loss[0].backward()
      
      # Обновляем параметры и делаем шаг используя посчитанные градиенты
      optimizer.step()

      # Обновляем loss
      train_loss += loss[0].item()
      
      # Рисуем график
      clear_output(True)
      plt.plot(train_loss_set)
      plt.title("Training loss")
      plt.xlabel("Batch")
      plt.ylabel("Loss")
      plt.show()
      
print("Loss на обучающей выборке: {0:.5f}".format(train_loss / len(train_dataloader)))


# Валидация
# Переводим модель в evaluation mode
model.eval()

valid_preds, valid_labels = [], []

for batch in validation_dataloader:   
    # добавляем батч для вычисления на GPU
    batch = tuple(t.to(device) for t in batch)
    
    # Распаковываем данные из dataloader
    b_input_ids, b_input_mask, b_labels = batch
    
    # При использовании .no_grad() модель не будет считать и хранить градиенты.
    # Это ускорит процесс предсказания меток для валидационных данных.
    with torch.no_grad():
        logits = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)

    # Перемещаем logits и метки классов на CPU для дальнейшей работы
    logits = logits[0].detach().cpu().numpy()
    label_ids = b_labels.to('cpu').numpy()
    
    batch_preds = np.argmax(logits, axis=1)
    batch_labels = np.concatenate(label_ids)     
    valid_preds.extend(batch_preds)
    valid_labels.extend(batch_labels)

print("Процент правильных предсказаний на валидационной выборке: {0:.2f}%".format(
    accuracy_score(valid_labels, valid_preds) * 100
))

RuntimeError: Expected tensor for argument #1 'indices' to have scalar type Long; but got torch.cuda.IntTensor instead (while checking arguments for embedding)