In [1]:
import os
import sys
sys.path.append("../")

import torch
import pandas as pd
import numpy as np
from tqdm import tqdm
from matplotlib import pyplot as plt
from collections import Counter
import langdetect
import random
import re
from sklearn.model_selection import train_test_split

from utils.preprocessing import clean_dataset

In [2]:
# Root folder holding the cleaned CSV splits; model checkpoints are written
# underneath it as well.
DATA_DIR = "../data"

train_csv_path = os.path.join(DATA_DIR, "train_cleaned.csv")
val_csv_path = os.path.join(DATA_DIR, "val_cleaned.csv")

# na_filter=False switches off pandas' missing-value detection, so empty text
# fields stay empty strings instead of becoming NaN.
data_train = pd.read_csv(train_csv_path, na_filter=False)
data_val = pd.read_csv(val_csv_path, na_filter=False)

In [3]:
# Display the name of CUDA device 0 as the cell result (quick check of which GPU is in use).
torch.cuda.get_device_name(0)

'NVIDIA GeForce RTX 2060'

In [4]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import InputExample, InputFeatures
from transformers import TrainingArguments
from transformers import Trainer
import torch
from datasets import load_metric

from utils.classes import SentimentDataset
from utils.preprocessing import make_labels, tokenize

In [5]:
# Pretrained checkpoint identifier used for both the model and the tokenizer.
MODEL = "xlm-roberta-base"

In [6]:
# num_labels=1 gives the sequence-classification head a single output unit;
# per the transformers documentation this configures a regression (MSE) loss.
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL,
    num_labels=1,
)

# Matching tokenizer, loaded from the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained(MODEL)

Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense

In [7]:
# Load the "accuracy" metric object from the `datasets` library.
# NOTE(review): accuracy is a classification metric, while the model in this
# notebook is created with num_labels=1 - confirm it is still wanted here.
metric = load_metric("accuracy")

In [8]:
# Tokenize the training texts (the "content" column) with the project helper.
X_train = tokenize(tokenizer, data_train["content"])

In [9]:
# Tokenize the validation texts (the "content" column) with the project helper.
X_val = tokenize(tokenizer, data_val["content"])

In [10]:
# Raw sentiment targets, taken directly from the cleaned frames.
y_train = data_train["sentiment"]
y_val = data_val["sentiment"]

In [11]:
# Convert the raw sentiment columns into training targets.
# NOTE(review): regress=True presumably makes make_labels emit continuous
# targets for the single-output head - confirm in utils.preprocessing.
y_train_labels = make_labels(y_train, regress=True)
y_val_labels = make_labels(y_val, regress=True)

In [12]:
# Pair each split's tokenizer output with its targets in the project's
# SentimentDataset wrapper; these objects are what the Trainer is given.
train_dataset_torch = SentimentDataset(X_train, y_train_labels)
val_dataset_torch = SentimentDataset(X_val, y_val_labels)

In [13]:
def compute_metrics(eval_pred):
    """Compute regression error metrics for one evaluation pass.

    The model in this notebook is created with ``num_labels=1``, so every
    prediction is a single score rather than a vector of class logits.
    ``argmax`` over that size-1 axis is always 0, which is why accuracy is
    not meaningful here and error-based metrics are returned instead.

    The signature matches what ``transformers.Trainer`` expects for its
    ``compute_metrics`` argument.

    Parameters
    ----------
    eval_pred : tuple
        ``(predictions, labels)`` pair. ``predictions`` may have shape
        ``(n,)`` or ``(n, 1)``; ``labels`` must contain the same number of
        values.

    Returns
    -------
    dict
        ``{"mse": float, "rmse": float, "mae": float}``.

    Raises
    ------
    ValueError
        If predictions and labels do not contain the same number of values.
    """
    logits, labels = eval_pred

    # Flatten both sides so (n, 1) model outputs line up with (n,) targets
    # instead of silently broadcasting to an (n, n) error matrix.
    predictions = np.asarray(logits, dtype=np.float64).reshape(-1)
    references = np.asarray(labels, dtype=np.float64).reshape(-1)
    if predictions.shape != references.shape:
        raise ValueError(
            "predictions and labels must have the same number of elements, "
            f"got {predictions.size} and {references.size}"
        )

    errors = predictions - references
    mse = float(np.mean(np.square(errors)))
    return {
        "mse": mse,
        "rmse": float(np.sqrt(mse)),
        "mae": float(np.mean(np.abs(errors))),
    }

In [14]:
# Fine-tuning configuration for the regression run.
# NOTE(review): the training log recorded further down reports a per-device
# batch size of 32, no gradient accumulation and checkpoints under
# "roberta_regress2", so it appears to come from an earlier configuration -
# re-run the notebook to refresh it.
training_args = TrainingArguments(
    os.path.join(DATA_DIR, "models", "xlm_roberta_regress"),
    # 2 samples x 16 accumulation steps = effective batch of 32 per device.
    per_device_train_batch_size=2,
    per_device_eval_batch_size=32,
    gradient_accumulation_steps=16,
    fp16=True,
    # NOTE(review): per the transformers docs this level is only consulted by
    # the Apex backend; the recorded log shows the native amp backend in use.
    fp16_opt_level="O1",
    # Evaluate and checkpoint on the same schedule, which is required for
    # load_best_model_at_end below.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    num_train_epochs=7,
    # The recorded run reaches its lowest validation loss at epoch 3 and gets
    # worse afterwards, so restore the best checkpoint when training finishes
    # instead of keeping the last-epoch weights.
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
)

In [15]:
# Bind the model, the training configuration and both dataset splits.
# NOTE(review): no `compute_metrics` callback is passed, so the per-epoch
# evaluation reports only the loss - confirm that this is intended.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset_torch,
    eval_dataset=val_dataset_torch,
)

Using amp fp16 backend


In [27]:
# Run the fine-tuning loop; the returned TrainOutput is shown as the cell result.
trainer.train()

***** Running training *****
  Num examples = 18750
  Num Epochs = 7
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 4102


Epoch,Training Loss,Validation Loss
1,0.1175,0.082018
2,0.0841,0.076514
3,0.0702,0.075858
4,0.0566,0.077351
5,0.045,0.086077
6,0.0306,0.08608
7,0.0243,0.086248


***** Running Evaluation *****
  Num examples = 6250
  Batch size = 32
Saving model checkpoint to roberta_regress2/checkpoint-586
Configuration saved in roberta_regress2/checkpoint-586/config.json
Model weights saved in roberta_regress2/checkpoint-586/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 6250
  Batch size = 32
Saving model checkpoint to roberta_regress2/checkpoint-1172
Configuration saved in roberta_regress2/checkpoint-1172/config.json
Model weights saved in roberta_regress2/checkpoint-1172/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 6250
  Batch size = 32
Saving model checkpoint to roberta_regress2/checkpoint-1758
Configuration saved in roberta_regress2/checkpoint-1758/config.json
Model weights saved in roberta_regress2/checkpoint-1758/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 6250
  Batch size = 32
Saving model checkpoint to roberta_regress2/checkpoint-2344
Configuration saved in roberta_regress2/checkpoint-23

TrainOutput(global_step=4102, training_loss=0.05741294449797379, metrics={'train_runtime': 2155.735, 'train_samples_per_second': 60.884, 'train_steps_per_second': 1.903, 'total_flos': 8633253988800000.0, 'train_loss': 0.05741294449797379, 'epoch': 7.0})