In [1]:
from transformers import (
    AutoTokenizer,
    XLMRobertaForSequenceClassification,
    Trainer,
    TrainingArguments,
    pipeline
)
import torch
from datasets import Dataset
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.model_selection import train_test_split

import pandas as pd
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")
model = XLMRobertaForSequenceClassification.from_pretrained("FacebookAI/xlm-roberta-base", num_labels=4)

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at FacebookAI/xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [3]:
df = pd.read_csv('dataset.csv')

In [4]:
df.sample(5)

Unnamed: 0.1,Unnamed: 0,text,label
24079,24079,well here i go off to work,1
9261,9261,"This app has been great for me, especially to ...",2
10952,10952,Download it and then immediately uninstalled. ...,0
9920,9920,Best tool for tracking time in my opinion,2
18272,18272,doing Accounting homework Just nicely got a 5...,2


In [5]:
df.describe()

Unnamed: 0.1,Unnamed: 0,label
count,31232.0,31232.0
mean,15615.5,1.043961
std,9016.04614,0.790636
min,0.0,0.0
25%,7807.75,0.0
50%,15615.5,1.0
75%,23423.25,2.0
max,31231.0,2.0


In [6]:
df["label"].value_counts()

label
1    11649
2    10478
0     9105
Name: count, dtype: int64

In [7]:
device = torch.cuda.set_device('cuda:0')

In [8]:
X = df["text"]
Y = df["label"]


In [9]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, random_state=42, shuffle=True)
df_train, df_test = train_test_split(df, random_state=42, shuffle=True)

In [10]:
print_X_train = X_train.describe()
print_X_test = X_test.describe()
print_Y_train = Y_train.describe()
print_Y_test = Y_test.describe()

print("X_train : " + str(print_X_train)+ "\n" + "=============================================" + "\n")
print("X_test : " + str(print_X_test)+ "\n" + "=============================================" + "\n")
print("Y_train : " + str(print_Y_train)+ "\n" + "=============================================" + "\n")
print("Y_test : " + str(print_Y_test)+ "\n" + "=============================================" + "\n")

X_train : count                23424
unique               23424
top       salt and vinegar
freq                     1
Name: text, dtype: object

X_test : count                              7808
unique                             7808
top       nice night, should be golfing
freq                                  1
Name: text, dtype: object

Y_train : count    23424.000000
mean         1.045125
std          0.790243
min          0.000000
25%          0.000000
50%          1.000000
75%          2.000000
max          2.000000
Name: label, dtype: float64

Y_test : count    7808.000000
mean        1.040471
std         0.791851
min         0.000000
25%         0.000000
50%         1.000000
75%         2.000000
max         2.000000
Name: label, dtype: float64



In [11]:
ds_train = Dataset.from_pandas(df_train)
ds_test = Dataset.from_pandas(df_test)

In [12]:





def tokenize(batch):
    tokenized_batch = tokenizer(batch["text"], padding='max_length', truncation=True, max_length=256)

    tokenized_batch["labels"] = batch["label"]
    return tokenized_batch


bs = 16
tokenize_ds_train = ds_train.map(tokenize, batched=True, batch_size=bs)
tokenize_ds_test = ds_test.map(tokenize, batched=True, batch_size=bs)

Map: 100%|██████████| 23424/23424 [00:29<00:00, 790.96 examples/s] 
Map: 100%|██████████| 7808/7808 [00:09<00:00, 842.26 examples/s] 


In [13]:
def compute_metrics(p):
    preds = np.argmax(p.predictions, axis=1)
    precision, recall, f1, _ = precision_recall_fscore_support(
        p.label_ids, preds, average='weighted'
    )
    acc = accuracy_score(p.label_ids, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

In [18]:
output_dir = "./results"

training_args = TrainingArguments(
    output_dir=output_dir,
    overwrite_output_dir=True,
    eval_strategy="epoch",
    per_device_train_batch_size=bs,
    per_device_eval_batch_size=bs,
    learning_rate=5e-05,
    num_train_epochs=1,
    resume_from_checkpoint = True,
    
)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenize_ds_train,
    eval_dataset=tokenize_ds_test, 
    compute_metrics=compute_metrics,

)


In [19]:

trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,1.102,1.095178,0.371414,0.201177,0.137948,0.371414


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


TrainOutput(global_step=1464, training_loss=1.105270010526063, metrics={'train_runtime': 1258.4302, 'train_samples_per_second': 18.614, 'train_steps_per_second': 1.163, 'total_flos': 3081612016484352.0, 'train_loss': 1.105270010526063, 'epoch': 1.0})

In [None]:
trainer.evaluate()

{'eval_loss': 0.5997633337974548,
 'eval_accuracy': 0.758452868852459,
 'eval_f1': 0.7577840635164018,
 'eval_precision': 0.7573753890725914,
 'eval_recall': 0.758452868852459,
 'eval_runtime': 84.2612,
 'eval_samples_per_second': 92.664,
 'eval_steps_per_second': 5.792,
 'epoch': 1.0}