In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, roc_auc_score
import torch
from transformers import TrainingArguments, Trainer
from transformers import BertTokenizer, BertForSequenceClassification
import accelerate 

Загружаем данные.

In [6]:
text_df = pd.read_csv('polarity-dataset.csv')
text_df.head()

Unnamed: 0,sentence,label
0,"simplistic , silly and tedious .",-1
1,"it's so laddish and juvenile , only teenage bo...",-1
2,exploitative and largely devoid of the depth o...,-1
3,[garbus] discards the potential for pathologic...,-1
4,a visually flashy but narratively opaque and e...,-1


In [7]:
text_df.loc[text_df['label'] == -1, 'label'] = 0
text_df.head()

Unnamed: 0,sentence,label
0,"simplistic , silly and tedious .",0
1,"it's so laddish and juvenile , only teenage bo...",0
2,exploitative and largely devoid of the depth o...,0
3,[garbus] discards the potential for pathologic...,0
4,a visually flashy but narratively opaque and e...,0


In [8]:
text_df['sentence'].apply(lambda x: len(x)).max()

268

In [9]:
text_df.groupby('label').size()

label
0    5331
1    5331
dtype: int64

Загружаем предобученные модели. Указываем, для чего - for sequence classification, for masked LM etc. Токенайзер должен быть из той же модели.

In [27]:
from transformers import BertTokenizer, BertForSequenceClassification
tokenizer1 = BertTokenizer.from_pretrained("prajjwal1/bert-tiny")
model1 = BertForSequenceClassification.from_pretrained("prajjwal1/bert-tiny", num_labels=2)

from transformers import AutoTokenizer, AutoModelForMaskedLM
from transformers import AutoModelForSequenceClassification

# tokenizer1 = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")
# model1 = AutoModelForSequenceClassification.from_pretrained("distilbert/distilbert-base-uncased", num_labels=2)

tokenizer2 = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")
model2 = AutoModelForMaskedLM.from_pretrained("distilbert/distilbert-base-uncased")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at prajjwal1/bert-tiny and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
model2

DistilBertForMaskedLM(
  (activation): GELUActivation()
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0

Токенизируем данные. Обрезаем слишком длинный текст и добавляем padding для короткого.

In [12]:
sample_data = ["I am eating","I am playing "]
tokenizer2(sample_data, padding=True, truncation=True, max_length=128)

{'input_ids': [[101, 1045, 2572, 5983, 102], [101, 1045, 2572, 2652, 102]], 'attention_mask': [[1, 1, 1, 1, 1], [1, 1, 1, 1, 1]]}

Задача masked LM. Можно замаскировать слово с помощью [MASK] и посмотреть, какое слово предложит вставить модель.

In [55]:
text = "I saw a [MASK] yesterday. It was amazing"
inputs = tokenizer2(text, padding = True, truncation = True, max_length=512, return_tensors='pt')
outputs = model2(**inputs)
print(outputs, '\n', outputs[0].shape)

logits = outputs.logits # находим индекс замаскированного токена
mask_token_index = torch.where(inputs["input_ids"] == tokenizer2.mask_token_id)[1]
# print(mask_token_index)

mask_token_logits = logits[0, mask_token_index, :] # смотрим на логиты для этого токена, выбираем 5 наиболее вероятных
top_5_tokens = torch.topk(mask_token_logits, 5, dim=1).indices[0].tolist()
for token in top_5_tokens:
    print(tokenizer2.decode([token]))
    
inputs["input_ids"][0, mask_token_index] = top_5_tokens[0] 
predicted_sentence = tokenizer2.decode(inputs["input_ids"][0], skip_special_tokens=True)
print(predicted_sentence)


MaskedLMOutput(loss=None, logits=tensor([[[ -5.6731,  -5.6568,  -5.6591,  ...,  -4.9875,  -4.9015,  -3.1113],
         [-11.0382, -10.8346, -10.9389,  ...,  -9.8182,  -9.8506,  -8.2592],
         [-10.6046, -10.4936, -10.3048,  ...,  -7.7668,  -8.8406,  -6.7888],
         ...,
         [-12.0322, -11.9245, -11.9511,  ..., -10.2797,  -9.7362,  -6.4143],
         [ -9.2199,  -9.3021,  -9.3776,  ...,  -8.1030,  -9.5854,  -3.4862],
         [-12.6807, -12.7495, -12.6446,  ..., -10.4797, -10.7781,  -8.0877]]],
       grad_fn=<ViewBackward0>), hidden_states=None, attentions=None) 
 torch.Size([1, 11, 30522])
movie
ufo
ghost
truck
picture
i saw a movie yesterday. it was amazing


Задача классификации. Можно посмотреть на результаты предобученной модели.

In [56]:
text = "I saw a film yesterday. It was amazing"
inputs = tokenizer1(text, padding = True, truncation = True, max_length=512, return_tensors='pt')
outputs = model1(**inputs)
print(outputs)

predictions = torch.nn.functional.softmax(outputs.logits, dim=1)
predictions.detach().numpy()

SequenceClassifierOutput(loss=None, logits=tensor([[-1.1176,  1.2692]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)


array([[0.08418366, 0.9158163 ]], dtype=float32)

Преобразовываем данные.

In [29]:
X = list(text_df["sentence"])
y = list(text_df["label"])
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, stratify=y)
X_train_tokenized = tokenizer1(X_train, padding=True, truncation=True, max_length=128)
X_val_tokenized = tokenizer1(X_val, padding=True, truncation=True, max_length=128)

In [30]:
X_train_tokenized.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])

In [31]:
print(X_train_tokenized['attention_mask'][0])

[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


In [32]:
len(X_train), len(X_val)

(8529, 2133)

In [33]:
# Create torch dataset
class Dataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        if self.labels:
            item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.encodings["input_ids"])

In [34]:
train_dataset = Dataset(X_train_tokenized, y_train)
val_dataset = Dataset(X_val_tokenized, y_val)

In [35]:
train_dataset[5]

{'input_ids': tensor([  101,  1996,  2069,  2402,  2111,  2040,  4298,  2097,  5959,  2009,
          2024, 16725,  1012,  1012,  1012,  2040,  2453,  2022, 11116,  2011,
          1996,  3185,  1005,  1055,  4248,  5750,  1998,  4165,  1012,   102,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0]),
 'token_type_ids': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0]),
 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 0

In [36]:
def compute_metrics(p):
    print(type(p))
    pred, labels = p
    roc_auc = roc_auc_score(labels, pred[:, 1])
    pred = np.argmax(pred, axis=1)

    accuracy = accuracy_score(y_true=labels, y_pred=pred)
    recall = recall_score(y_true=labels, y_pred=pred)
    precision = precision_score(y_true=labels, y_pred=pred)
    f1 = f1_score(y_true=labels, y_pred=pred)

    return {"accuracy": accuracy, "precision": precision, "recall": recall, "f1": f1, "roc_auc:" : roc_auc}

Обучим модель для анализа тональности текста.

In [40]:
# Define Trainer
args = TrainingArguments(
    output_dir="output",
    num_train_epochs=5,
    per_device_train_batch_size=8

)
trainer = Trainer(
    model=model1,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)

In [41]:
trainer.train()

Step,Training Loss
500,0.5264
1000,0.5376
1500,0.4777
2000,0.4589
2500,0.4019
3000,0.3891
3500,0.3698
4000,0.3628
4500,0.3246
5000,0.3317


TrainOutput(global_step=5335, training_loss=0.4119704113346232, metrics={'train_runtime': 589.2625, 'train_samples_per_second': 72.37, 'train_steps_per_second': 9.054, 'total_flos': 7724871380700.0, 'train_loss': 0.4119704113346232, 'epoch': 5.0})

In [42]:
trainer.evaluate()

<class 'transformers.trainer_utils.EvalPrediction'>


{'eval_loss': 0.6071069836616516,
 'eval_accuracy': 0.7759024847632443,
 'eval_precision': 0.7934131736526946,
 'eval_recall': 0.7457786116322702,
 'eval_f1': 0.7688588007736943,
 'eval_roc_auc:': 0.8540858186319591,
 'eval_runtime': 5.2584,
 'eval_samples_per_second': 405.637,
 'eval_steps_per_second': 50.776,
 'epoch': 5.0}

In [44]:
trainer.save_model('CustomModel')

In [None]:
# trainer.save_model('/content/drive/MyDrive/Youtube Tutorials/toxic')
# model_2 = BertForSequenceClassification.from_pretrained("/content/drive/MyDrive/Youtube Tutorials/toxic")
# model_2.to('cuda')

In [46]:
model3 = BertForSequenceClassification.from_pretrained("CustomModel")

In [62]:
text = "This film is okay"
inputs = tokenizer1(text, padding = True, truncation = True, return_tensors='pt')
outputs = model3(**inputs)
predictions = torch.nn.functional.softmax(outputs.logits, dim=1)
predictions = predictions.detach().numpy()
predictions

array([[0.5889006 , 0.41109937]], dtype=float32)