In [1]:
from transformers import (
    AutoTokenizer,
    XLMRobertaForSequenceClassification,
    Trainer,
    TrainingArguments
)
import torch
from datasets import Dataset
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
import pandas as pd
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")
model = XLMRobertaForSequenceClassification.from_pretrained("FacebookAI/xlm-roberta-base", problem_type="multi_label_classification")

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at FacebookAI/xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [3]:
df = pd.read_csv('dataset.csv')

In [4]:
df.sample(5)

Unnamed: 0.1,Unnamed: 0,text,label,text_length
269,269,@user I said that once and my pals ripped me a...,0,61
452,452,@user @user Sort it out @user,2,29
201,201,@user huhu kak help me through all of this,3,42
749,749,Chicks I don't even talk to look up to me. #fl...,1,53
173,173,@user Ganguly chose? It is Kohli's provocation...,0,98


In [5]:
df.describe()

Unnamed: 0.1,Unnamed: 0,label,text_length
count,1000.0,1000.0,1000.0
mean,499.5,1.232,90.05
std,288.819436,1.22543,36.856939
min,0.0,0.0,10.0
25%,249.75,0.0,59.0
50%,499.5,1.0,94.0
75%,749.25,3.0,124.0
max,999.0,3.0,147.0


In [6]:
torch.cuda.set_device('cuda:0')

In [7]:
X = df["text"]
Y = df["label"]


In [8]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, random_state=42, shuffle=True)
df_train, df_test = train_test_split(df, random_state=42, shuffle=True)

In [9]:
print_X_train = X_train.describe()
print_X_test = X_test.describe()
print_Y_train = Y_train.describe()
print_Y_test = Y_test.describe()

print("X_train : " + str(print_X_train)+ "\n" + "=============================================" + "\n")
print("X_test : " + str(print_X_test)+ "\n" + "=============================================" + "\n")
print("Y_train : " + str(print_Y_train)+ "\n" + "=============================================" + "\n")
print("Y_test : " + str(print_Y_test)+ "\n" + "=============================================" + "\n")

X_train : count                                                   750
unique                                                  750
top       @user Network bandwidth died after 9gb offer r...
freq                                                      1
Name: text, dtype: object

X_test : count                                                   250
unique                                                  250
top       that moment when you feel meaningless and just...
freq                                                      1
Name: text, dtype: object

Y_train : count    750.000000
mean       1.212000
std        1.216976
min        0.000000
25%        0.000000
50%        1.000000
75%        3.000000
max        3.000000
Name: label, dtype: float64

Y_test : count    250.000000
mean       1.292000
std        1.250998
min        0.000000
25%        0.000000
50%        1.000000
75%        3.000000
max        3.000000
Name: label, dtype: float64



In [10]:
ds_train = Dataset.from_pandas(df_train)
ds_test = Dataset.from_pandas(df_test)

In [12]:
from sklearn.preprocessing import MultiLabelBinarizer


mlb = MultiLabelBinarizer(classes=[0, 1, 2, 3])
mlb.fit([[0], [1], [2], [3]])

def tokenize(batch):
    tokenized_batch = tokenizer(batch["text"], padding='max_length', truncation=True, max_length=256)
    labels = mlb.transform(pd.Series(batch["label"]).apply(lambda x: [x]))
    tokenized_batch["labels"] = labels
    return tokenized_batch

bs = 16
tokenize_ds_train = ds_train.map(tokenize, batched=True, batch_size=bs)
tokenize_ds_test = ds_test.map(tokenize, batched=True, batch_size=bs)

Map:   0%|          | 0/750 [00:00<?, ? examples/s]

Map: 100%|██████████| 750/750 [00:00<00:00, 1549.67 examples/s]
Map: 100%|██████████| 250/250 [00:00<00:00, 1840.85 examples/s]


In [13]:
def compute_metrics(p):
    preds = np.argmax(p.predictions, axis=1)
    precision, recall, f1, _ = precision_recall_fscore_support(p.label_ids, preds, average='weighted')
    acc = accuracy_score(p.label_ids, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

In [14]:
output_dir = "./results"

training_args = TrainingArguments(
    output_dir=output_dir,
    overwrite_output_dir=True,
    eval_strategy="epoch",
    per_device_train_batch_size=bs,
    per_device_eval_batch_size=bs,
    learning_rate=5e-05,
    num_train_epochs=5,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenize_ds_train,
    eval_dataset=tokenize_ds_test,
    compute_metrics=compute_metrics
)


In [15]:
trainer.train()

ValueError: Target size (torch.Size([16, 4])) must be the same as input size (torch.Size([16, 2]))