In [1]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
import pandas as pd

from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments
import torch

import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [3]:
torch.cuda.is_available()

True

In [4]:
LR = 2e-5
EPOCHS = 4
BATCH_SIZE = 32
MODEL = "cardiffnlp/twitter-xlm-roberta-base"
MAX_TRAINING_EXAMPLES = -1 # set this to -1 if you want to use the whole training set

In [5]:
df = pd.read_csv("three_source.csv").dropna()
X = df['text']
y = df['label']
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=1, stratify=y)
x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size=0.1, random_state=1, stratify=y_train)

In [6]:
tokenizer = AutoTokenizer.from_pretrained(MODEL, use_fast=True)

Downloading:   0%|          | 0.00/652 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

In [7]:
x_train

9544     filmi izlemedim, izlemeyi de düsünmüyorum. sim...
4819     film 10 numara, ona diyecek bisey yok. ama yön...
8228     memento'dan tanidigimiz nolan kardeslerden bir...
18967    Ürün çabuk geldi çok şık bir görünüme sahip am...
6000     konusu ve oyunculuk iyi özelllikle audrey taut...
                               ...                        
11655    film gercekten guzel izlemenisi tavsiye ederim...
457          Mayısı bekleme erken seçim yap ve defol git !
16146    ürün çok çabuk elime ulaştı herhangi bir hasar...
16067    Siyah olandan aldım.İşcilik güzel fakat malzem...
14765            Siradan - Siradan kebaplari olan mekan...
Name: text, Length: 15764, dtype: object

In [8]:
x_train = x_train.apply(lambda x: tokenizer(x, truncation=True, padding='max_length', max_length=256)).reset_index(drop=True)
x_val = x_val.apply(lambda x: tokenizer(x, truncation=True, padding='max_length', max_length=256)).reset_index(drop=True)
x_test = x_test.apply(lambda x: tokenizer(x, truncation=True, padding="max_length", max_length=256)).reset_index(drop=True)

In [9]:
y_train.reset_index(drop=True, inplace=True)
y_test.reset_index(drop=True, inplace=True)
y_val.reset_index(drop=True, inplace=True)

In [13]:
class MyDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {"input_ids": torch.tensor(self.encodings[idx]["input_ids"])}
        item['attention_mask'] = torch.tensor(self.encodings[idx]["attention_mask"])
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = MyDataset(x_train, y_train)
val_dataset = MyDataset(x_val, y_val)
test_dataset = MyDataset(x_test, y_test)

In [11]:
training_args = TrainingArguments(
    output_dir='./results',                   # output directory
    num_train_epochs=EPOCHS,                  # total number of training epochs
    per_device_train_batch_size=BATCH_SIZE,   # batch size per device during training
    per_device_eval_batch_size=BATCH_SIZE,    # batch size for evaluation
    warmup_steps=100,                         # number of warmup steps for learning rate scheduler
    weight_decay=0.01,                        # strength of weight decay
    logging_dir='./logs',                     # directory for storing logs
    logging_steps=10,                         # when to print log
    save_strategy='no'
)

model = AutoModelForSequenceClassification.from_pretrained(MODEL, num_labels=2)

Downloading:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

Some weights of the model checkpoint at cardiffnlp/twitter-xlm-roberta-base were not used when initializing XLMRobertaForSequenceClassification: ['lm_head.decoder.bias', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.dense.weight']
- This IS expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at cardiffnlp/twitter-xlm-roberta-base and are newly initialized: ['classifier.out

In [14]:
trainer = Trainer(
    model=model,                              # the instantiated 🤗 Transformers model to be trained
    args=training_args,                       # training arguments, defined above
    train_dataset=train_dataset,              # training dataset
    eval_dataset=val_dataset                  # evaluation dataset
)

trainer.train()

***** Running training *****
  Num examples = 15764
  Num Epochs = 4
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 1972


Step,Training Loss
10,0.6974
20,0.6881
30,0.6889
40,0.6853
50,0.6286
60,0.5613
70,0.4781
80,0.4434
90,0.4468
100,0.5903




Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=1972, training_loss=0.1844900896192791, metrics={'train_runtime': 2751.3016, 'train_samples_per_second': 22.919, 'train_steps_per_second': 0.717, 'total_flos': 8295365353390080.0, 'train_loss': 0.1844900896192791, 'epoch': 4.0})

In [15]:
test_preds_raw, test_labels , _ = trainer.predict(test_dataset)
test_preds = np.argmax(test_preds_raw, axis=-1)
print(classification_report(test_labels, test_preds, digits=3))

***** Running Prediction *****
  Num examples = 1947
  Batch size = 32


              precision    recall  f1-score   support

           0      0.929     0.930     0.930      1004
           1      0.926     0.925     0.925       943

    accuracy                          0.928      1947
   macro avg      0.928     0.927     0.928      1947
weighted avg      0.928     0.928     0.928      1947



In [16]:
val_preds_raw, val_labels , _ = trainer.predict(val_dataset)
val_preds = np.argmax(val_preds_raw, axis=-1)
print(classification_report(val_labels, val_preds, digits=3))

***** Running Prediction *****
  Num examples = 1752
  Batch size = 32


              precision    recall  f1-score   support

           0      0.920     0.926     0.923       904
           1      0.920     0.914     0.917       848

    accuracy                          0.920      1752
   macro avg      0.920     0.920     0.920      1752
weighted avg      0.920     0.920     0.920      1752



In [17]:
trainer.save_model("./results/best_model") # save best model

Saving model checkpoint to ./results/best_model
Configuration saved in ./results/best_model/config.json
Model weights saved in ./results/best_model/pytorch_model.bin
