In [1]:
import sys
import os

sys.path.append(os.path.abspath(".."))

In [2]:
from torch.utils.data import DataLoader
from anm.utils import Config, LOGGER
import numpy as np
import pandas as pd
import argparse
import torch
import csv

from datasets import Dataset
from transformers import (
    AutoConfig,
    AutoModelForSequenceClassification,
    AutoTokenizer,
    EvalPrediction,
    Trainer,
    TrainingArguments,
    set_seed,
)

In [3]:
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [4]:
def create_dataset_from_faulty_csv(src_path, label):
    dataset_dict = {'text': [], 'label': []}
    with open(src_path) as src_file:
        csv_reader = csv.reader(src_file, delimiter=',', quotechar='"')
        print('')
        for row in csv_reader:
            if row[0] == 'idtwitter':
                continue
            if len(row) != 9:
                cut_row = row[:9]
                cut_row[8] += ',' + ', '.join(row[9:])
                row = cut_row
            dataset_dict['text'].append(row[8])
            if label == 'pos':
                dataset_dict['label'].append(int(row[2]))
            else:
                dataset_dict['label'].append(int(row[3]))
    return Dataset.from_dict(dataset_dict)

In [5]:
def prepare_datasets(dataset_dir, tokenizer, label):
    sentipolc_files = {
        'train': [os.path.join(dataset_dir, file_name) for file_name in os.listdir(dataset_dir) if
                  'training_set' in file_name][0],
        'test':
            [os.path.join(dataset_dir, file_name) for file_name in os.listdir(dataset_dir) if 'test_set' in file_name][
                0]
    }
    train_dataset = create_dataset_from_faulty_csv(sentipolc_files['train'], label)
    test_dataset = create_dataset_from_faulty_csv(sentipolc_files['test'], label)

    def preprocess_function(examples):
        result = tokenizer(examples["text"], truncation=True)
        return result

    tokenized_train_dataset = train_dataset.map(preprocess_function, batched=True, remove_columns=['text'])
    tokenized_test_dataset = test_dataset.map(preprocess_function, batched=True, remove_columns=['text'])

    return tokenized_train_dataset, tokenized_test_dataset

In [7]:
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision = precision_score(labels, preds, average='weighted')
    recall = recall_score(labels, preds, average='weighted')
    f1 = f1_score(labels, preds, average='weighted')
    accuracy = accuracy_score(labels, preds)
    return {"precision": precision, "recall": recall, "f1": f1, "accuracy": accuracy}

In [8]:
def load_model_from_hf(model_name, pretrained):
    # Model
    LOGGER.info("Initiating model ...")
    if not pretrained:
        # initiate model with random weights
        LOGGER.info("Take randomized model")
        
        config = AutoConfig.from_pretrained(model_name)
        model = AutoModelForSequenceClassification.from_config(config)
    else:
        LOGGER.info("Take pretrained model")
    
        model = AutoModelForSequenceClassification.from_pretrained(model_name)

    return model

In [9]:
class TrainingConfig:

    def __init__(self):
        self.weight_decay = 1e-2
        self.lr = 2e-5 #5e-6
        self.train_bs = 8
        self.eval_bs = 8
        self.n_epochs = 8
        self.seed = 1234
        self.num_warmup_steps = 0
        # self.language_mode = args.language_mode
        # self.pretrained = args.pretrained
        # self.finetuned = args.finetuned
        # self.user_id = args.user_id

In [10]:
cf = TrainingConfig()

In [11]:
# model_name = "idb-ita/gilberto-uncased-from-camembert"
model_name = 'xlm-roberta-base'
dataset_dir = '../augmenting_nlms_meco_data/sentiment/it_sentipolc'
output_dir = 'xlm_p_np'
finetuned = False

In [12]:
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [13]:
train_dataset, test_dataset = prepare_datasets(dataset_dir, tokenizer, 'pos')





Map:   0%|          | 0/7410 [00:00<?, ? examples/s]

Map:   0%|          | 0/1998 [00:00<?, ? examples/s]

In [14]:
train_dataset

Dataset({
    features: ['label', 'input_ids', 'attention_mask'],
    num_rows: 7410
})

In [29]:
training_args = TrainingArguments(
        output_dir=output_dir,          # output directory
        num_train_epochs=cf.n_epochs,              # total number of training epochs
        per_device_train_batch_size=cf.train_bs,  # batch size per device during training
#        per_device_eval_batch_size=cf.eval_bs,   # batch size for evaluation
        warmup_steps=0,#500,                # number of warmup steps for learning rate scheduler
        weight_decay=cf.weight_decay,               # strength of weight decay
        save_strategy="no",
        learning_rate=cf.lr,
        logging_strategy='steps',
        logging_steps=100
    )

In [30]:
if not finetuned: # downaload from huggingface
    LOGGER.info("Model retrieving, not finetuned, from hf...")
    model = load_model_from_hf(model_name, True)
else: # the finetuned model has to be loaded from disk
    LOGGER.info("Model retrieving, finetuned, load from disk...")
    model = AutoModelForSequenceClassification.from_pretrained(args.model_dir, 
                                                                   ignore_mismatched_sizes=True,
                                                                   output_attentions=False, output_hidden_states=False,
                                                                   num_labels=2) # number of the classes


[2023-10-12 12:10:24,410 - processing - INFO] Model retrieving, not finetuned, from hf...
[2023-10-12 12:10:24,411 - processing - INFO] Initiating model ...
[2023-10-12 12:10:24,411 - processing - INFO] Take pretrained model


In [31]:
trainer = Trainer(
        model=model,                         # the instantiated 🤗 Transformers model to be trained
        args=training_args,                  # training arguments, defined above
        train_dataset=train_dataset,         # training dataset
#         eval_dataset=tokenized_datasets_sst2["validation"],            # evaluation dataset
        tokenizer=tokenizer,
        compute_metrics=compute_metrics
    )

In [32]:
train_result = trainer.train()



Step,Training Loss


KeyboardInterrupt: 

In [None]:
trainer.save_model(output_dir)

In [25]:
train_metrics = trainer.evaluate(eval_dataset=train_dataset, metric_key_prefix="train")



In [26]:
trainer.log_metrics("train", train_metrics)
trainer.save_metrics("train", train_metrics)

***** train metrics *****
  epoch                    =        8.0
  train_accuracy           =     0.9866
  train_f1                 =     0.9867
  train_loss               =     0.0626
  train_precision          =     0.9867
  train_recall             =     0.9866
  train_runtime            = 0:00:44.49
  train_samples_per_second =    166.542
  train_steps_per_second   =     10.429


In [27]:
test_metrics = trainer.evaluate(eval_dataset=test_dataset, metric_key_prefix="test")



In [28]:
trainer.log_metrics("test", test_metrics)
trainer.save_metrics("test", test_metrics)

***** test metrics *****
  epoch                   =        8.0
  test_accuracy           =     0.7713
  test_f1                 =     0.7897
  test_loss               =      1.092
  test_precision          =     0.8243
  test_recall             =     0.7713
  test_runtime            = 0:00:11.99
  test_samples_per_second =    166.535
  test_steps_per_second   =     10.419
