In [1]:
import sys
import os
sys.path.append(os.path.abspath(".."))

from anm.modeling.multitask_xlm_roberta import XLMRobertaForMultiTaskSequenceClassification
from anm.modeling.multitask_camembert import CamembertForMultiTaskSequenceClassification
from transformers import TrainingArguments, AutoTokenizer, AutoConfig, get_scheduler
from anm.gaze_dataloader.datacollator import MultiLabelDataCollatorWithPadding
from anm.gaze_training.utils import  create_finetuning_optimizer
from torch.utils.tensorboard import SummaryWriter
from torch.utils.data import DataLoader
from datasets import Dataset
import pandas as pd
import torch
import math
import csv
import re

In [2]:
# Fine-tuning hyperparameters; wrapped in a Config object further down this
# cell so they can be read as attributes.
cf = dict(
    weight_decay=1e-2,
    lr=2e-5,  # alternative value kept from the original comment: 5e-6
    train_bs=8,
    eval_bs=8,
    n_epochs=1,  # alternative value kept from the original comment: 8
    seed=1234,
    num_warmup_steps=0,
)

class Config:
    """Minimal attribute bag: exposes the entries of a mapping as attributes."""

    def __init__(self, d=None):
        """Copy every ``key: value`` pair of ``d`` onto the instance.

        Args:
            d: mapping providing ``.items()``; ``None`` yields an empty object.
        """
        if d is None:
            return
        for attr_name, attr_value in d.items():
            setattr(self, attr_name, attr_value)

# Wrap the hyperparameter dict so values are read as attributes (cf.lr, cf.seed, ...).
cf = Config(cf)
# Apply the configured seed: without this the `seed` entry is never used, so the
# freshly initialised classifier heads and the shuffled training order differ
# between runs.
torch.manual_seed(cf.seed)

In [3]:
def get_finetuned_model_path(model_cf, finetuned_models_dir, user_id):
    """Locate the checkpoint directory of a gaze-finetuned model for one user.

    The run directory is expected at
    ``{finetuned_models_dir}/gaze_finetuning_it_{user_id}_{p|np}_{xlm|camem}``.
    Every sub-directory other than ``tf_logs`` is a candidate: if it contains
    ``config.json`` it is the checkpoint itself, otherwise its first entry
    (sorted order) is taken as the checkpoint.

    Args:
        model_cf: object exposing ``language_mode`` and ``pretrained``.
        finetuned_models_dir: root directory holding the finetuning runs.
        user_id: identifier interpolated into the run directory name.

    Returns:
        Path of the checkpoint directory. If several candidates exist, the
        last one in sorted order wins.

    Raises:
        FileNotFoundError: if the run directory does not exist or contains no
            usable candidate.
    """
    # Compare the language_mode attribute; comparing the config object itself
    # with a string is always False and silently selected 'camem'.
    model_str = 'xlm' if model_cf.language_mode == 'cross_lingual' else 'camem'
    # The p/np tag encodes whether the base model was pretrained (same
    # convention as get_out_path). Keying it on `finetuned` made 'np' runs
    # unreachable, since this function is only called for finetuned models.
    pretrained_str = 'p' if model_cf.pretrained else 'np'
    model_dir = f'{finetuned_models_dir}/gaze_finetuning_it_{user_id}_{pretrained_str}_{model_str}'

    model_path = None
    # Sorted so the result does not depend on the arbitrary os.listdir order.
    for file_name in sorted(os.listdir(model_dir)):
        file_path = os.path.join(model_dir, file_name)
        if file_name == 'tf_logs' or not os.path.isdir(file_path):
            continue
        entries = sorted(os.listdir(file_path))
        if 'config.json' in entries:
            model_path = file_path
        elif entries:
            # Checkpoint is nested one level deeper.
            model_path = os.path.join(file_path, entries[0])
        # An empty sub-directory is not a usable candidate and is skipped.

    if model_path is None:
        raise FileNotFoundError(f'No finetuned checkpoint found under {model_dir}')
    return model_path

In [4]:
def get_config_with_tasks(model_name):
    """Load the config for ``model_name`` and attach the two task names.

    Args:
        model_name: identifier or path accepted by ``AutoConfig.from_pretrained``.

    Returns:
        The loaded config with ``tasks`` set to ``['pos', 'neg']``.
    """
    task_config = AutoConfig.from_pretrained(model_name)
    setattr(task_config, 'tasks', ['pos', 'neg'])
    return task_config

In [5]:
def load_model(model_cf, finetuned_models_dir=None, user_id=None):
    """Instantiate the multi-task classifier described by ``model_cf``.

    Args:
        model_cf: object exposing ``finetuned``, ``language_mode`` and
            ``pretrained``.
        finetuned_models_dir: root of the gaze-finetuned checkpoints; only read
            when ``model_cf.finetuned`` is truthy.
        user_id: user whose finetuned checkpoint is loaded; only read when
            ``model_cf.finetuned`` is truthy.

    Returns:
        ``(model, model_name)`` where ``model_name`` is the checkpoint path for
        finetuned models, or the base model identifier otherwise.
    """
    if model_cf.finetuned:
        model_name = get_finetuned_model_path(model_cf, finetuned_models_dir, user_id)
        config = get_config_with_tasks(model_name)
        if model_cf.language_mode == 'cross_lingual':
            model_cls = XLMRobertaForMultiTaskSequenceClassification
        else:
            model_cls = CamembertForMultiTaskSequenceClassification
        # Finetuned checkpoints are loaded tolerating mismatched layer sizes.
        model = model_cls.from_pretrained(model_name, config=config, ignore_mismatched_sizes=True)
        return model, model_name

    if model_cf.language_mode == 'cross_lingual':
        model_cls = XLMRobertaForMultiTaskSequenceClassification
        model_name = 'xlm-roberta-base'
    else:
        model_cls = CamembertForMultiTaskSequenceClassification
        model_name = 'idb-ita/gilberto-uncased-from-camembert'

    config = get_config_with_tasks(model_name)
    if model_cf.pretrained:
        model = model_cls.from_pretrained(model_name, config=config)
    else:
        # Architecture only: the model is built from the config without
        # loading checkpoint weights.
        model = model_cls(config=config)
    return model, model_name

In [6]:
# Flags selecting which model variant load_model builds.
model_cf = Config(dict(
    language_mode='cross_lingual',
    pretrained=True,
    finetuned=False,
))

In [7]:
# Root directory of the gaze-finetuned checkpoints. It is only passed to
# load_model in the commented-out loop below, so it is unused by the active call.
# NOTE(review): absolute, user-specific path — adjust before running elsewhere.
finetuned_models_dir = '/home/lmoroni/__workdir/augmenting_nlms_meco/output'
# Disabled variant: load one gaze-finetuned model per user id.
# for user_id  in [1, 26, 38, 43, 44]:
    # model = load_model(model_cf, finetuned_models_dir=finetuned_models_dir, user_id=user_id)
# Active path: build the model described by model_cf; model_name is reused
# later to load the tokenizer.
model, model_name = load_model(model_cf)

Some weights of XLMRobertaForMultiTaskSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifiers.neg.dense.bias', 'classifiers.pos.dense.weight', 'classifiers.pos.out_proj.bias', 'classifiers.pos.dense.bias', 'classifiers.pos.out_proj.weight', 'classifiers.neg.out_proj.bias', 'classifiers.neg.out_proj.weight', 'classifiers.neg.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
# Directory containing the SENTIPOLC CSV files, relative to the notebook's working directory.
dataset_dir = '../augmenting_nlms_meco_data/sentiment/it_sentipolc'

In [9]:
# Resolve the train/test CSV paths by the marker contained in their file names.
# The first matching directory entry is used for each split; a missing file
# surfaces as an IndexError.
sentipolc_files = {
    split: [
        os.path.join(dataset_dir, file_name)
        for file_name in os.listdir(dataset_dir)
        if marker in file_name
    ][0]
    for split, marker in (('train', 'training_set'), ('test', 'test_set'))
}

In [10]:
# Display the resolved train/test file paths.
sentipolc_files

{'train': '../augmenting_nlms_meco_data/sentiment/it_sentipolc/training_set_sentipolc16.csv',
 'test': '../augmenting_nlms_meco_data/sentiment/it_sentipolc/test_set_sentipolc16_gold2000.csv'}

In [11]:
def create_dataset_from_faulty_csv(src_path):
    """Build a ``Dataset`` from a SENTIPOLC CSV whose text may contain unquoted commas.

    A well-formed row has 9 columns: column 2 is the positive label, column 3
    the negative label and column 8 the tweet text. When the text itself
    contains unquoted commas the reader yields more than 9 columns; the
    overflow columns are re-joined with ``','`` so the text is restored exactly
    as it appears in the file.

    Args:
        src_path: path of the CSV file to read.

    Returns:
        A ``Dataset`` with the columns ``text``, ``label_pos`` and ``label_neg``.

    Raises:
        ValueError: if a non-empty data row has fewer than 9 columns, or a
            label cell is not an integer.
    """
    n_columns = 9
    text_col = n_columns - 1
    dataset_dict = {'text': [], 'label_pos': [], 'label_neg': []}
    # newline='' is what the csv module documents for files handed to
    # csv.reader; the explicit encoding removes the dependency on the locale.
    with open(src_path, newline='', encoding='utf-8') as src_file:
        csv_reader = csv.reader(src_file, delimiter=',', quotechar='"')
        for row in csv_reader:
            if not row:
                # Blank line in the file.
                continue
            if row[0] == 'idtwitter':
                # Header row.
                continue
            if len(row) < n_columns:
                raise ValueError(
                    f'{src_path}, line {csv_reader.line_num}: expected at least '
                    f'{n_columns} columns, got {len(row)}'
                )
            # Joining with a bare comma is the inverse of the split performed
            # by the reader. The previous mixed ',' / ', ' join inserted spaces
            # that were never in the tweet.
            text = ','.join(row[text_col:])
            dataset_dict['text'].append(text)
            dataset_dict['label_pos'].append(int(row[2]))
            dataset_dict['label_neg'].append(int(row[3]))
    return Dataset.from_dict(dataset_dict)

In [12]:
# Parse both splits with the same loader, train first and then test.
train_dataset, test_dataset = (
    create_dataset_from_faulty_csv(sentipolc_files[split])
    for split in ('train', 'test')
)





In [13]:
# Display both datasets (features and row counts).
train_dataset, test_dataset

(Dataset({
     features: ['text', 'label_pos', 'label_neg'],
     num_rows: 7410
 }),
 Dataset({
     features: ['text', 'label_pos', 'label_neg'],
     num_rows: 1998
 }))

In [14]:
# Earlier variant that picked the tokenizer from the language mode; superseded
# by reusing the model_name returned by load_model.
# tokenizer_name = 'xlm-roberta-base' if model_cf.language_mode == 'cross_lingual' else 'idb-ita/gilberto-uncased-from-camembert'
# NOTE(review): for finetuned models model_name is a local checkpoint path —
# confirm that directory also contains the tokenizer files.
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [15]:
def preprocess_function(examples):
    """Tokenize the ``text`` field of ``examples`` with truncation enabled.

    Uses the notebook-level ``tokenizer``; no padding is applied here.
    """
    return tokenizer(examples["text"], truncation=True)

In [16]:
# Tokenize both splits in batches and drop the raw text column afterwards.
tokenized_train_ds, tokenized_test_ds = [
    split_ds.map(preprocess_function, batched=True, remove_columns=['text'])
    for split_ds in (train_dataset, test_dataset)
]

Map:   0%|          | 0/7410 [00:00<?, ? examples/s]

Map:   0%|          | 0/1998 [00:00<?, ? examples/s]

In [17]:
# Display the tokenized training set (features and row count).
tokenized_train_ds

Dataset({
    features: ['label_pos', 'label_neg', 'input_ids', 'attention_mask'],
    num_rows: 7410
})

In [18]:
# Collator used as collate_fn by both DataLoaders.
# NOTE(review): presumably pads each batch and groups label_pos / label_neg
# under batch["labels"] (the training loop reads that key) — confirm in
# anm.gaze_dataloader.datacollator.
data_collator = MultiLabelDataCollatorWithPadding(tokenizer=tokenizer)

In [19]:
# Training batches are reshuffled on every pass over the loader.
train_dataloader = DataLoader(tokenized_train_ds, shuffle=True, collate_fn=data_collator, batch_size=cf.train_bs)
# Evaluation batches keep dataset order (no shuffle argument is passed).
eval_dataloader = DataLoader(tokenized_test_ds, collate_fn=data_collator, batch_size=cf.eval_bs)

In [20]:
# Parameters whose name contains one of these markers are exempt from weight decay.
no_decay = ["bias", "LayerNorm.weight"]


def _is_decay_exempt(param_name):
    """Return True when ``param_name`` contains any of the ``no_decay`` markers."""
    return any(marker in param_name for marker in no_decay)


_decayed_group = {
    "params": [p for n, p in model.named_parameters() if not _is_decay_exempt(n)],
    "weight_decay": cf.weight_decay,
}
_exempt_group = {
    "params": [p for n, p in model.named_parameters() if _is_decay_exempt(n)],
    "weight_decay": 0.0,
}
optimizer_grouped_parameters = [_decayed_group, _exempt_group]
optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=cf.lr)

In [21]:
# One optimizer step per batch, so the total step count is epochs * batches.
_steps_per_epoch = len(train_dataloader)
num_training_steps = cf.n_epochs * _steps_per_epoch
# Linear schedule over the whole run, with the configured warmup.
_scheduler_kwargs = dict(
    name='linear',
    optimizer=optimizer,
    num_warmup_steps=cf.num_warmup_steps,
    num_training_steps=num_training_steps,
)
lr_scheduler = get_scheduler(**_scheduler_kwargs)

In [22]:
# Prefer the second GPU (the original hardcoded choice) when it exists, and
# fall back to the first GPU or the CPU so the notebook also runs on hosts
# with fewer devices.
if torch.cuda.device_count() > 1:
    device = 'cuda:1'
elif torch.cuda.is_available():
    device = 'cuda:0'
else:
    device = 'cpu'

In [23]:
# Display the num_labels value stored on the model config.
model.config.num_labels

2

In [24]:
# Imported here rather than in the notebook's first import cell.
import evaluate
# F1 metric accumulated by the evaluation pass inside the training loop.
f1 = evaluate.load('f1')
# NOTE(review): clf_metrics is not referenced by any other visible cell;
# get_eval_metrics builds the combined metrics that are actually used.
clf_metrics = evaluate.combine(["accuracy", "f1", "precision", "recall"])

In [25]:
from tqdm.notebook import tqdm

# One tick per optimizer step across all epochs.
progress_bar = tqdm(range(num_training_steps))

model.to(device)

for epoch in range(1, cf.n_epochs + 1):
    # ---- training pass ----
    model.train()
    for step, batch in enumerate(train_dataloader):
        # Move inputs and the per-task label tensors to the training device.
        batch = {
            "input_ids": batch["input_ids"].to(device),
            "attention_mask": batch["attention_mask"].to(device),
            "labels": {k: v.to(device) for k, v in batch["labels"].items()}
        }

        model_output = model(**batch)
        loss = model_output.loss

        loss.backward()
        # NOTE: gradient clipping is disabled. To enable it, call
        # torch.nn.utils.clip_grad_norm_ here (between backward() and
        # optimizer.step()) and add a max_grad_norm entry to cf, which
        # currently has none.
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()

        progress_bar.update(1)

    # Loss of the last training batch of the epoch, not an epoch average.
    print("Loss", loss.item())

    # ---- evaluation pass ----
    model.eval()

    with torch.no_grad():
        for batch in eval_dataloader:
            batch = {
                "input_ids": batch["input_ids"].to(device),
                "attention_mask": batch["attention_mask"].to(device),
                "labels": {k: v.to(device) for k, v in batch["labels"].items()}
            }

            model_output = model(**batch)

            # Predictions of every task are pooled into one F1 accumulator.
            for task in batch['labels']:
                predictions = model_output.logits[task].argmax(dim=-1)
                references = batch['labels'][task]
                f1.add_batch(predictions=predictions, references=references)

    # The accumulated metric is F1; it used to be stored and printed as
    # "accuracy", which mislabeled the reported number.
    eval_f1 = f1.compute()
    eval_accuracy = eval_f1  # legacy name, kept for any cell that still reads it
    print(f'Eval F1 {eval_f1}')

  0%|          | 0/927 [00:00<?, ?it/s]

You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Loss 0.4195045530796051
Eval accuracy {'f1': 0.5749506903353058}


In [27]:
def evaluate_model(model, dataloader, split, all_metrics, pos_metrics, neg_metrics):
    """Run ``model`` over ``dataloader`` and collect pooled and per-task metrics.

    Args:
        model: multi-task model returning per-task logits in ``.logits``.
        dataloader: yields batches with ``input_ids``, ``attention_mask`` and a
            ``labels`` dict keyed by task name.
        split: prefix for the result keys (e.g. ``'train'``).
        all_metrics: metric accumulator fed with the predictions of every task.
        pos_metrics: metric accumulator fed with the ``'pos'`` task only.
        neg_metrics: metric accumulator fed with the ``'neg'`` task only.

    Returns:
        Dict with keys ``{split}_{metric}``, ``{split}_pos_{metric}`` and
        ``{split}_neg_{metric}``, in that order.

    Batches are moved to the notebook-level ``device``.
    """
    per_task_metrics = {'pos': pos_metrics, 'neg': neg_metrics}

    model.eval()
    with torch.no_grad():
        for raw_batch in dataloader:
            inputs = {
                "input_ids": raw_batch["input_ids"].to(device),
                "attention_mask": raw_batch["attention_mask"].to(device),
                "labels": {task: target.to(device) for task, target in raw_batch["labels"].items()},
            }
            output = model(**inputs)

            for task, references in inputs['labels'].items():
                predictions = output.logits[task].argmax(dim=-1)
                # Pooled accumulator sees every task ...
                all_metrics.add_batch(predictions=predictions, references=references)
                # ... while the per-task accumulators only see their own task.
                if task in per_task_metrics:
                    per_task_metrics[task].add_batch(predictions=predictions, references=references)

    computed = [
        (f'{split}', all_metrics.compute()),
        (f'{split}_pos', pos_metrics.compute()),
        (f'{split}_neg', neg_metrics.compute()),
    ]
    return {
        f'{prefix}_{metric}': value
        for prefix, scores in computed
        for metric, value in scores.items()
    }

In [26]:
def get_eval_metrics():
    """Return a fresh combined accuracy / F1 / precision / recall metric.

    Every call loads new metric modules, so the returned accumulator does not
    share state with earlier ones.
    """
    # NOTE(review): `average` is passed to evaluate.load for precision only;
    # confirm it has an effect there, since averaging is normally selected
    # when the metric is computed.
    metric_modules = [
        evaluate.load("accuracy"),
        evaluate.load("f1"),
        evaluate.load("precision", average='binary'),
        evaluate.load("recall"),
    ]
    return evaluate.combine(metric_modules)

In [None]:
def get_out_path(out_dir, model_cf, user_id=None):
    """Compose the output directory for a model configuration.

    The directory name is ``{model}_{p|np}_{f_it<user_id>|nf}``, where model is
    ``xlm`` for the cross-lingual mode and ``camem`` otherwise. A cross-lingual
    model that is not finetuned gets ``_it`` appended to the model part.

    Args:
        out_dir: parent directory.
        model_cf: object exposing ``language_mode``, ``pretrained``, ``finetuned``.
        user_id: interpolated into the name for finetuned models.

    Returns:
        ``out_dir`` joined with the composed directory name.
    """
    cross_lingual = model_cf.language_mode == 'cross_lingual'
    name_parts = [
        'xlm' if cross_lingual else 'camem',
        'p' if model_cf.pretrained else 'np',
    ]
    if model_cf.finetuned:
        name_parts.append(f'f_it{user_id}')
    else:
        name_parts.append('nf')
        if cross_lingual:
            name_parts[0] += '_it'
    return os.path.join(out_dir, '_'.join(name_parts))
    

In [None]:
# Parent directory for the SENTIPOLC fine-tuning outputs.
model_save_dir = '../output/sentipolc'
# Run-specific sub-directory derived from the model flags.
out_path = get_out_path(out_dir=model_save_dir, model_cf=model_cf)

In [None]:
# Display the resolved output directory.
out_path

In [28]:
# Score the model on both splits, train first. Each call receives three fresh
# metric accumulators (pooled, pos, neg).
train_res, test_res = (
    evaluate_model(model, loader, split, *(get_eval_metrics() for _ in range(3)))
    for split, loader in (('train', train_dataloader), ('test', eval_dataloader))
)

In [29]:
# Display the training-split metrics.
train_res

{'train_accuracy': 0.7898110661268556,
 'train_f1': 0.688219397457712,
 'train_precision': 0.6935646560419608,
 'train_recall': 0.6829558998808105,
 'train_pos_accuracy': 0.8005398110661268,
 'train_pos_f1': 0.63160518444666,
 'train_pos_precision': 0.6460989291177971,
 'train_pos_recall': 0.6177474402730375,
 'train_neg_accuracy': 0.7790823211875844,
 'train_neg_f1': 0.7262083960528516,
 'train_neg_precision': 0.7246328437917223,
 'train_neg_recall': 0.7277908146161582}

In [None]:
# Merge train and test metrics into one dict (dict union, Python 3.9+).
# NOTE(review): `d` is not read by any visible cell; the JSON dump cell
# recomputes the same merge.
d = train_res|test_res

In [None]:
# Persist the fine-tuned model under the run-specific output directory.
model.save_pretrained(out_path)

In [None]:
import json

# Create the output directory if needed, so this cell does not depend on the
# model-saving cell having run first.
os.makedirs(out_path, exist_ok=True)
metrics_out_path = os.path.join(out_path, 'all_results.json')
# Write-only access is enough: the file is never read back here.
with open(metrics_out_path, 'w', encoding='utf-8') as out_file:
    # Train and test metrics merged into a single flat JSON object.
    json.dump(train_res | test_res, out_file)