In [3]:
import os
import sys

sys.path.append("../")

import torch
import pandas as pd
import numpy as np
from tqdm import tqdm
from matplotlib import pyplot as plt
from collections import Counter
import langdetect
import random
import re
from sklearn.model_selection import train_test_split

from utils.preprocessing import clean_dataset

In [5]:
# Folder that holds the *_cleaned.csv splits, relative to this notebook.
DATA_DIR = "../data"

# na_filter=False switches off pandas' missing-value detection, so empty
# fields are read back as "" instead of NaN.
train_csv_path = os.path.join(DATA_DIR, "train_cleaned.csv")
val_csv_path = os.path.join(DATA_DIR, "val_cleaned.csv")
data_train = pd.read_csv(train_csv_path, na_filter=False)
data_val = pd.read_csv(val_csv_path, na_filter=False)

In [6]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import InputExample, InputFeatures
from transformers import TrainingArguments
from transformers import Trainer
import torch
from datasets import load_metric

from utils.classes import SentimentDataset
from utils.preprocessing import make_labels, tokenize

In [7]:
# Checkpoint identifier handed to both from_pretrained() calls (model and tokenizer).
MODEL = "xlm-roberta-base"

In [8]:
# Tokenizer and classifier are both loaded from the same MODEL checkpoint.
tokenizer = AutoTokenizer.from_pretrained(MODEL)

# num_labels=3 sizes the classification head for three classes
# (presumably the sentiment classes produced by make_labels — TODO confirm).
# The head is newly initialized rather than loaded from the checkpoint, which
# is what the "newly initialized" warning in the cell output refers to.
model = AutoModelForSequenceClassification.from_pretrained(MODEL, num_labels=3)

Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaForSequenceClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.out_p

In [9]:
# Run the project's tokenize() helper over the training split's "content" column.
X_train = tokenize(tokenizer, data_train["content"])

In [10]:
# Run the project's tokenize() helper over the validation split's "content" column.
X_val = tokenize(tokenizer, data_val["content"])

In [11]:
# Raw "sentiment" column of each split; make_labels() is applied to these afterwards.
y_train = data_train["sentiment"]
y_val = data_val["sentiment"]

In [12]:
# Convert the raw sentiment columns with the project's make_labels() helper.
# NOTE(review): the resulting encoding is not visible here; it presumably has to
# agree with num_labels=3 used when the model is created — confirm.
y_train_labels = make_labels(y_train)
y_val_labels = make_labels(y_val)

In [13]:
# Pair the tokenized inputs with their labels in the project's SentimentDataset
# wrapper; these two objects are later passed to Trainer as the train and eval datasets.
train_dataset_torch = SentimentDataset(X_train, y_train_labels)
val_dataset_torch = SentimentDataset(X_val, y_val_labels)

In [14]:
# NOTE(review): `metric` was referenced by compute_metrics but never defined in
# the visible notebook, so a fresh-kernel run would raise NameError at the first
# evaluation. Accuracy is assumed here — swap the name if another metric was
# intended.
metric = load_metric("accuracy")


def compute_metrics(eval_pred):
    """Score one evaluation pass for the Trainer.

    Args:
        eval_pred: A ``(logits, labels)`` pair, as unpacked on the first line.

    Returns:
        Whatever ``metric.compute`` returns for the predicted class ids
        compared against the reference labels.
    """
    logits, labels = eval_pred
    # The predicted class is the index of the largest logit along the last axis.
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [19]:
training_args = TrainingArguments(
    # Checkpoints and other run artifacts go under DATA_DIR/models/xlm_roberta_classif.
    output_dir=os.path.join(DATA_DIR, "models", "xlm_roberta_classif"),
    num_train_epochs=4,
    # 2 samples per step x 16 accumulated steps = 32 samples per optimizer
    # update on each device.
    per_device_train_batch_size=2,
    gradient_accumulation_steps=16,
    per_device_eval_batch_size=32,
    # Mixed-precision training.
    # NOTE(review): fp16_opt_level is an Apex AMP setting; the Trainer log in
    # this notebook reports the native "amp" backend, so it presumably has no
    # effect here — confirm.
    fp16=True,
    fp16_opt_level="O1",
    # Evaluate and save a checkpoint once per epoch.
    evaluation_strategy="epoch",
    save_strategy="epoch",
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [16]:
# Bundle the model, the training arguments, both dataset splits and the metric
# callback into a single Trainer instance.
trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset_torch,
    eval_dataset=val_dataset_torch,
)

Using amp fp16 backend


In [None]:
# Start fine-tuning. With the TrainingArguments used in this notebook, evaluation
# and checkpoint saving are both configured to happen once per epoch.
trainer.train()