In [None]:
# NOTE FOR LOCAL EXECUTION: use pytorch kernel and execute the code below to specify cuda device
import os
os.environ["CUDA_VISIBLE_DEVICES"]="0"

In [None]:
# install packages
!pip install torch transformers memory_profiler datasets accelerate nltk tweet-preprocessor
import time
import datetime
tic = time.time()

In [None]:
# import modules
import torch
import random
from transformers.file_utils import is_tf_available, is_torch_available, is_torch_tpu_available
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, AdamW
from datasets import load_metric
%load_ext memory_profiler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import pandas as pd
import numpy as np

In [None]:
# Function to create partition from split
def create_partition_from_split(train, test, x_colname, y_colname):
    """Pick the input and target columns out of a train/test pair.

    Args:
        train: Indexable container (e.g. a DataFrame) with the training rows.
        test: Indexable container with the held-out rows.
        x_colname: Key of the input (text) column.
        y_colname: Key of the target (label) column.

    Returns:
        The 4-tuple ``(X_train, X_test, y_train, y_test)``.
    """
    splits = (train, test)
    # Inputs first, then targets, each in (train, test) order.
    inputs = tuple(split[x_colname] for split in splits)
    targets = tuple(split[y_colname] for split in splits)
    return (*inputs, *targets)

In [None]:
# Function to define datasets
def create_datasets(tokenizer, X_train, X_test, y_train, y_test):
    """Tokenize two text/label splits and wrap each as a torch ``Dataset``.

    Args:
        tokenizer: Callable invoked as
            ``tokenizer(texts, truncation=True, padding=True, max_length=512)``.
            Its result must expose ``.items()`` yielding, per field, a sequence
            indexable by example position.
        X_train, X_test: Text columns; anything with a ``.tolist()`` method.
        y_train, y_test: Label columns; anything with a ``.tolist()`` method.

    Returns:
        ``(train_dataset, valid_dataset)``; indexing either yields a dict with
        one tensor per tokenizer field plus a one-element ``"labels"`` tensor.
    """

    class MakeTorchData(torch.utils.data.Dataset):
        """Map-style dataset pairing tokenizer output with labels."""

        def __init__(self, tokens, labels):
            self.tokens = tokens
            self.labels = labels

        def __len__(self):
            return len(self.labels)

        def __getitem__(self, idx):
            item = {}
            for field, values in self.tokens.items():
                item[field] = torch.tensor(values[idx])
            # The label is wrapped in a list, so it becomes a 1-element tensor.
            item["labels"] = torch.tensor([self.labels[idx]])
            return item

    def encode(texts):
        # Same tokenizer settings for both splits.
        return tokenizer(texts, truncation=True, padding=True, max_length=512)

    # Plain Python lists are what the tokenizer and the dataset consume.
    train_texts, valid_texts = X_train.tolist(), X_test.tolist()
    train_labels, valid_labels = y_train.tolist(), y_test.tolist()

    train_encoded = encode(train_texts)
    valid_encoded = encode(valid_texts)

    return (MakeTorchData(train_encoded, train_labels),
            MakeTorchData(valid_encoded, valid_labels))

In [None]:
from sklearn.metrics import accuracy_score
auc_metric = load_metric("roc_auc", trust_remote_code=True)

def compute_metrics(eval_pred):
    """Compute accuracy and ROC AUC for a binary classification evaluation.

    Args:
        eval_pred: Pair ``(predictions, labels)``. ``predictions`` holds one row
            of class scores per example and is assumed to have two columns
            (class 0, class 1). ``labels`` holds the gold class ids; it may be
            flat or carry a trailing singleton axis (the datasets built in this
            notebook store each label as a one-element tensor).

    Returns:
        dict with keys ``"accuracy"`` and ``"roc_auc"``.
    """
    scores, labels = eval_pred
    scores = np.asarray(scores)
    # Flatten so both metrics receive one label per example.
    labels = np.asarray(labels).reshape(-1)

    # Accuracy is computed on the hard decisions.
    predicted_classes = np.argmax(scores, axis=1)
    accuracy = accuracy_score(y_true=labels, y_pred=predicted_classes)

    # ROC AUC needs a continuous ranking score. Feeding it the argmax'd class
    # ids (as before) reduces the curve to a single operating point. The
    # class-1 minus class-0 margin ranks examples exactly like the softmax
    # probability of class 1.
    positive_margin = scores[:, 1] - scores[:, 0]
    roc_auc = auc_metric.compute(prediction_scores=positive_margin, references=labels)["roc_auc"]

    return {"accuracy": accuracy, "roc_auc": roc_auc}

In [None]:
from transformers import TrainerCallback

class CustomCallback(TrainerCallback):
    """Trainer callback that prints train and eval metrics after every epoch."""

    def __init__(self, trainer) -> None:
        """Keep a handle on ``trainer`` so its datasets can be evaluated later."""
        super().__init__()
        self._trainer = trainer

    def on_epoch_end(self, args, state, control, **kwargs):
        """Evaluate on the training set, then on the eval set, printing each result."""
        for split in ("train", "eval"):
            dataset = getattr(self._trainer, f"{split}_dataset")
            metrics = self._trainer.evaluate(eval_dataset=dataset, metric_key_prefix=split)
            print(metrics)
        # Blank line separating one epoch's report from the next.
        print()

In [None]:
# Create trainer
def create_trainer(suffix, model, train_dataset, valid_dataset, num_epochs=5,
                   load_best_model_at_end=True, optimizer=None):
    """Build a ``Trainer`` configured for per-epoch evaluation and checkpointing.

    Args:
        suffix: String appended to ``./results_`` and ``./logs_`` to name the
            output and logging directories.
        model: The instantiated Transformers model to train.
        train_dataset: Dataset used for training.
        valid_dataset: Dataset used for evaluation during training.
        num_epochs: Total number of training epochs.
        load_best_model_at_end: Forwarded to ``TrainingArguments``; the best
            checkpoint is chosen by the highest ``eval_roc_auc``.
        optimizer: Optional optimizer; it is passed to the ``Trainer`` with no
            scheduler (``None``).

    Returns:
        The configured ``Trainer`` (training is not started here).
    """
    results_dir = './results_' + suffix
    logs_dir = './logs_' + suffix

    training_args = TrainingArguments(
        # Output locations
        output_dir=results_dir,
        logging_dir=logs_dir,
        # Optimisation schedule
        num_train_epochs=num_epochs,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        warmup_steps=500,
        weight_decay=0.01,
        # Model selection: keep the checkpoint with the highest eval ROC AUC
        load_best_model_at_end=load_best_model_at_end,
        metric_for_best_model="eval_roc_auc",
        greater_is_better=True,
        # Cadence: log every 100 steps, save and evaluate once per epoch,
        # keeping at most one checkpoint
        logging_steps=100,
        save_strategy="epoch",
        save_total_limit=1,
        evaluation_strategy="epoch",
    )

    # NOTE: CustomCallback is deliberately not registered here; the metric for
    # the best model was not correctly applied when the callback was added.
    return Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=valid_dataset,
        compute_metrics=compute_metrics,
        optimizers=(optimizer, None),
    )

In [None]:
import torch.nn as nn
from sklearn.model_selection import KFold

def write_float_pair_to_file(file_path, float1, float2):
    """Overwrite ``file_path`` with a single line: the two values separated by a space."""
    with open(file_path, 'w') as handle:
        line = "{} {}\n".format(float1, float2)
        handle.write(line)

def get_mlp_classifier(hidden_size, num_labels, hsize_factor=1):
    """Build a two-layer MLP head: Linear -> ReLU -> Linear.

    Args:
        hidden_size: Width of the input features.
        num_labels: Number of output units.
        hsize_factor: Multiplier applied to ``hidden_size`` (then truncated with
            ``int``) to obtain the width of the intermediate layer.

    Returns:
        An ``nn.Sequential`` with the three layers in that order.
    """
    inner_width = int(hidden_size * hsize_factor)
    layers = [
        nn.Linear(hidden_size, inner_width),
        nn.ReLU(),
        nn.Linear(inner_width, num_labels),
    ]
    return nn.Sequential(*layers)

def instantiate_model(model_name, num_labels, hidden_dropout_prob, attention_probs_dropout_prob,
                      mlp_classif=False, mlp_hsize_factor=1, freeze_bert=False):
    """Load a pretrained sequence-classification model with custom dropout.

    Args:
        model_name: Checkpoint name or path given to ``from_pretrained``.
        num_labels: Number of output classes for the classification head.
        hidden_dropout_prob: Dropout probability for hidden layers.
        attention_probs_dropout_prob: Dropout probability for attention weights.
        mlp_classif: If True, replace ``model.classifier`` with the MLP built by
            ``get_mlp_classifier``.
        mlp_hsize_factor: Hidden-width multiplier forwarded to
            ``get_mlp_classifier``.
        freeze_bert: If True, disable gradients for the pretrained encoder so
            only the classification head is trained.

    Returns:
        The configured model.
    """
    # Checkpoints whose name contains "distilbert-" take different config
    # names for the two dropout probabilities.
    if "distilbert-" in model_name:
        dropout_kwargs = {"dropout": hidden_dropout_prob,
                          "attention_dropout": attention_probs_dropout_prob}
    else:
        dropout_kwargs = {"hidden_dropout_prob": hidden_dropout_prob,
                          "attention_probs_dropout_prob": attention_probs_dropout_prob}

    # Honour the caller's num_labels (it used to be hard-coded to 2).
    model = AutoModelForSequenceClassification.from_pretrained(model_name,
                                                               num_labels=num_labels,
                                                               **dropout_kwargs)

    if mlp_classif:
        # NOTE(review): assumes the head lives in `model.classifier` and is fed
        # one hidden vector per example (as in BERT) - confirm before using
        # this option with other architectures.
        model.classifier = get_mlp_classifier(model.config.hidden_size,
                                              model.config.num_labels,
                                              mlp_hsize_factor)

    # Freeze the pretrained encoder if requested. `base_model` resolves to the
    # architecture-specific encoder attribute, so this also works for
    # checkpoints that have no `.bert` attribute (RoBERTa, DistilBERT).
    if freeze_bert:
        for param in model.base_model.parameters():
            param.requires_grad = False

    return model

def single_train_exper(suffix, hidden_dropout_prob, attention_probs_dropout_prob, lr, num_epochs,
                       model_name, train_dataset, valid_dataset, test_dataset=None, load_best_model_at_end=True,
                       mlp_classif=False, mlp_hsize_factor=1, freeze_bert=False):
    """Train and evaluate one model for a single hyperparameter configuration.

    Args:
        suffix: Names the output/log directories (see ``create_trainer``).
        hidden_dropout_prob: Hidden-layer dropout probability.
        attention_probs_dropout_prob: Attention dropout probability.
        lr: Learning rate for the AdamW optimizer.
        num_epochs: Number of training epochs.
        model_name: Pretrained checkpoint to fine-tune.
        train_dataset: Training dataset.
        valid_dataset: Validation dataset (evaluated during and after training).
        test_dataset: Optional test dataset, evaluated after training.
        load_best_model_at_end: Forwarded to ``create_trainer``.
        mlp_classif, mlp_hsize_factor, freeze_bert: Forwarded to
            ``instantiate_model``.

    Returns:
        ``(trainer, training_results, valid_results, test_results)``;
        ``test_results`` is ``None`` when no test dataset is given.
    """
    # A fresh two-class model for this configuration.
    model = instantiate_model(
        model_name,
        num_labels=2,
        hidden_dropout_prob=hidden_dropout_prob,
        attention_probs_dropout_prob=attention_probs_dropout_prob,
        mlp_classif=mlp_classif,
        mlp_hsize_factor=mlp_hsize_factor,
        freeze_bert=freeze_bert,
    )
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=1e-2)

    trainer = create_trainer(
        suffix, model, train_dataset, valid_dataset,
        num_epochs=num_epochs,
        load_best_model_at_end=load_best_model_at_end,
        optimizer=optimizer,
    )

    # Fit, then score on the validation split the trainer was built with.
    training_results = trainer.train()
    valid_results = trainer.evaluate()

    # The test split is scored only when the caller supplies one.
    test_results = trainer.evaluate(eval_dataset=test_dataset) if test_dataset is not None else None

    return trainer, training_results, valid_results, test_results


def training_experiment(suffix, hidden_dropout_probs, attn_dropout_probs, learn_rates, num_epochs,
                        model_name, train_dataset, valid_dataset, test_dataset=None, mlp_classif=False,
                        mlp_hsize_factor = 1, freeze_bert=False):
    """Grid-search dropout probabilities and learning rates, keeping the best model.

    One model is trained per combination of the three grids. The model with the
    highest validation ``eval_roc_auc`` so far is saved with
    ``trainer.save_model()``, and the best validation/test AUC pair is written
    to ``./results_<suffix>/aucs.txt``.

    Args:
        suffix: Names the output directory ``./results_<suffix>``.
        hidden_dropout_probs: Hidden-layer dropout probabilities to try.
        attn_dropout_probs: Attention dropout probabilities to try.
        learn_rates: Learning rates to try.
        num_epochs: Training epochs per configuration.
        model_name: Pretrained checkpoint to fine-tune.
        train_dataset: Training dataset.
        valid_dataset: Validation dataset used for model selection.
        test_dataset: Optional test dataset, scored for every configuration.
        mlp_classif, mlp_hsize_factor, freeze_bert: Forwarded to
            ``single_train_exper``.

    Returns:
        ``(best_eval_measure, best_test_measure)``. The test measure is the one
        obtained by the configuration with the best validation measure, or
        ``None`` when no test dataset is given. Both are ``None`` if a grid is
        empty.
    """
    import os  # local import so this cell does not depend on the optional CUDA setup cell

    best_eval_measure = None
    best_test_measure = None
    for hidden_dropout_prob in hidden_dropout_probs:
        for attention_probs_dropout_prob in attn_dropout_probs:
            # Iterate the `learn_rates` argument; the loop used to read a
            # notebook-level global named `lrs` instead.
            for lr in learn_rates:
                # Print start message
                print(f"* Model with hidden_dropout_prob={hidden_dropout_prob} , attention_probs_dropout_prob={attention_probs_dropout_prob} and lr={lr}:")

                trainer, training_results, valid_results, test_results = single_train_exper(
                    suffix,
                    hidden_dropout_prob,
                    attention_probs_dropout_prob,
                    lr,
                    num_epochs,
                    model_name,
                    train_dataset,
                    valid_dataset,
                    test_dataset,
                    load_best_model_at_end=True,
                    mlp_classif=mlp_classif,
                    mlp_hsize_factor=mlp_hsize_factor,
                    freeze_bert=freeze_bert)

                # Show results
                print(f"Training Results: {training_results}")
                print(f"Validation Results: {valid_results}")
                if test_dataset is not None:
                    print(f"Test Results: {test_results}")
                print("")

                # Save model if it is the best so far (strictly better validation AUC)
                eval_measure = valid_results['eval_roc_auc']
                if best_eval_measure is None or best_eval_measure < eval_measure:
                    best_eval_measure = eval_measure
                    if test_dataset is not None:
                        best_test_measure = test_results['eval_roc_auc']
                    trainer.save_model()

    # Write validation and test AUCs to file. The directory is created if no
    # training run has produced it yet (e.g. when a grid is empty).
    output_dir = './results_' + suffix
    os.makedirs(output_dir, exist_ok=True)
    write_float_pair_to_file(output_dir+"/aucs.txt", best_eval_measure, best_test_measure)

    return best_eval_measure, best_test_measure

from sklearn.model_selection import KFold

def kfold_exper(df, x_colname, y_colname, n_splits, suffix, hidden_dropout_prob,
                attention_probs_dropout_prob, lr, num_epochs, model_name,
                mlp_classif=False, mlp_hsize_factor = 1, freeze_bert=False,
                tokenizer=None):
    """Run a k-fold cross-validation of one hyperparameter configuration.

    Args:
        df: DataFrame holding the text and label columns.
        x_colname: Name of the text column.
        y_colname: Name of the label column.
        n_splits: Number of folds.
        suffix: Names the output/log directories (shared by all folds).
        hidden_dropout_prob: Hidden-layer dropout probability.
        attention_probs_dropout_prob: Attention dropout probability.
        lr: Learning rate.
        num_epochs: Training epochs per fold.
        model_name: Pretrained checkpoint to fine-tune.
        mlp_classif, mlp_hsize_factor, freeze_bert: Forwarded to
            ``single_train_exper``.
        tokenizer: Tokenizer used to build the fold datasets. When omitted, the
            tokenizer belonging to ``model_name`` is loaded.

    Returns:
        The mean ``eval_roc_auc`` over the ``n_splits`` folds.
    """
    # create_datasets requires a tokenizer; it used to be called without one,
    # which raised a TypeError on the first fold.
    if tokenizer is None:
        tokenizer = AutoTokenizer.from_pretrained(model_name)

    sum_eval_measure = 0

    # Shuffled folds with a fixed seed so the partition is reproducible
    kf = KFold(n_splits=n_splits, random_state=42, shuffle=True)

    # Iterate over folds
    for i, (train_indices, test_indices) in enumerate(kf.split(df)):
        print(f"Fold {i}:")

        # Obtain dataframes
        df_train = df.iloc[train_indices]
        df_test = df.iloc[test_indices]

        # Obtain datasets (the held-out fold plays the role of validation set)
        X_train, X_test, y_train, y_test = create_partition_from_split(df_train, df_test, x_colname, y_colname)
        train_dataset, valid_dataset = create_datasets(tokenizer, X_train, X_test, y_train, y_test)

        # Execute train experiment; the final-epoch model is evaluated because
        # load_best_model_at_end is disabled here
        _, training_results, valid_results, _ = single_train_exper(suffix,
                                                                   hidden_dropout_prob,
                                                                   attention_probs_dropout_prob,
                                                                   lr,
                                                                   num_epochs,
                                                                   model_name,
                                                                   train_dataset,
                                                                   valid_dataset,
                                                                   load_best_model_at_end=False,
                                                                   mlp_classif=mlp_classif,
                                                                   mlp_hsize_factor=mlp_hsize_factor,
                                                                   freeze_bert=freeze_bert)

        # Show results
        print(f"Training Results: {training_results}")
        print(f"Validation Results: {valid_results}")
        print("")

        # Retrieve evaluation measure
        eval_measure = valid_results['eval_roc_auc']
        sum_eval_measure += eval_measure
        print("- Evaluation measure:", eval_measure)
        print("")

    result = sum_eval_measure / n_splits
    print("Average of Evaluation Measures:", result)

    return result

## Data Preparation

### Load Original Data

In [None]:
# Load the essays dataset into the notebook-level DataFrame `df`.

# Colab alternative (disabled): mount Google Drive first.
#from google.colab import drive
#drive.mount('/content/drive')

# Load the data into a pandas DataFrame. When running on Colab, keep a similar folder structure in your Drive so that the file can be found.
#df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/NLP & Personality/essays.csv', encoding = "latin-1")
# NOTE(review): absolute, machine-specific path - adjust it before running elsewhere.
df = pd.read_csv('/home/dortiz/Dropbox/work/ub/research/projects/psico/data/essays_utf8.csv', encoding = "utf-8")
# Map 'y'/'n' to 1/0. The replacement is applied to every column, so any cell
# whose whole value is exactly 'y' or 'n' is converted.
df = df.replace({'y': 1, 'n': 0})

# Print a sample of the data to verify it's loaded correctly
print(df.head())

### Preprocess Data

In [None]:
# `preprocessor` is presumably provided by the `tweet-preprocessor` package
# installed at the top of the notebook - TODO confirm.
import preprocessor as p

# Clean every essay with p.clean. This overwrites df['TEXT'] in place, so
# re-running the cell applies the cleaning to already-cleaned text.
df['TEXT'] = df['TEXT'].apply(p.clean)

### Data Fragmentation

In [None]:
def fragment_essays_df(df, max_words, min_fragm_len):
    """Split every essay into fragments of at most ``max_words`` words.

    Each row of ``df`` is expanded into one row per kept fragment. The ``TEXT``
    column receives the fragment and every other column is copied unchanged
    from the source row, whatever columns the frame has.

    Args:
        df: DataFrame with a ``TEXT`` column of strings.
        max_words: Maximum number of whitespace-separated words per fragment;
            must be at least 1.
        min_fragm_len: Minimum length, in characters of the joined fragment,
            for a fragment to be kept. A text that yields a single fragment
            keeps it regardless of its length.

    Returns:
        A new DataFrame with the same columns as ``df``, one row per fragment.
        Texts without any word produce no rows.

    Raises:
        ValueError: If ``max_words`` is smaller than 1.
    """
    if max_words < 1:
        raise ValueError("max_words must be a positive integer")

    def fragment_string(text, max_words):
        # Consecutive chunks of `max_words` words; the last one may be shorter.
        words = text.split()
        return [' '.join(words[start:start + max_words])
                for start in range(0, len(words), max_words)]

    # Convert DataFrame to a dictionary of lists (column name -> values)
    dict_of_lists = df.to_dict(orient='list')

    # One output list per input column, so extra columns stay aligned
    fragm_dict_of_lists = {colname: [] for colname in dict_of_lists}

    for i, text in enumerate(dict_of_lists["TEXT"]):
        fragments = fragment_string(text, max_words)
        is_single = len(fragments) == 1

        for fragment in fragments:
            # Short fragments are dropped unless they are the whole text.
            if is_single or len(fragment) >= min_fragm_len:
                for colname, values in dict_of_lists.items():
                    value = fragment if colname == "TEXT" else values[i]
                    fragm_dict_of_lists[colname].append(value)

    # Create fragmented dataframe
    return pd.DataFrame(fragm_dict_of_lists)

In [None]:
# Fragment the essays and cache the result as CSV.
# max_words caps the number of words per fragment. min_fragm_len is compared
# with len() of the joined fragment string, i.e. it is a length in characters.
max_words = 256
min_fragm_len = 256
df_fragm = fragment_essays_df(df, max_words, min_fragm_len)
# The file name encodes the word limit so different settings do not overwrite each other.
fragmented_data_file = "./df_essays_fragm_"+str(max_words)+".csv"
df_fragm.to_csv(fragmented_data_file, index=False)

In [None]:
# Load fragmented data back from the CSV written by the previous cell
# (`max_words` must already be defined).
fragmented_data_file = "./df_essays_fragm_"+str(max_words)+".csv"
df_fragm = pd.read_csv(fragmented_data_file, encoding = "utf-8")
print(df_fragm.head())
# Number of original essays vs. number of fragments
print(len(df), len(df_fragm))

### Data Split

In [None]:
# Fixed seed shared by every split below so the partitions are reproducible
random_state = 42

# Split the original essays: 10% test, then 10% of the remainder as validation
df_all_train, df_test = train_test_split(df, test_size=0.1, random_state=random_state)
df_train, df_valid = train_test_split(df_all_train, test_size=0.1, random_state=random_state)
print("Data with validation + test:", len(df_train), len(df_valid), len(df_test))

# Same two-step split applied to the fragmented data.
# NOTE(review): the split is row-wise, so fragments that share an #AUTHID can
# end up in different splits (train vs. valid/test) - confirm this is intended.
df_fragm_all_train, df_fragm_test = train_test_split(df_fragm, test_size=0.1, random_state=random_state, shuffle=True)
df_fragm_train, df_fragm_valid = train_test_split(df_fragm_all_train, test_size=0.1, random_state=random_state, shuffle=True)
print("Fragmented Data:", len(df_fragm_train), len(df_fragm_valid), len(df_fragm_test))
# Persist the three fragmented splits as CSV files
df_fragm_train.to_csv("./df_essays_fragm_train.csv", index=False)
df_fragm_valid.to_csv("./df_essays_fragm_valid.csv", index=False)
df_fragm_test.to_csv("./df_essays_fragm_test.csv", index=False)

## Extraversion

In [None]:
# FRAGMENTED DATA (256)
# Extraversion (label column cEXT), BERT, trained on the fragmented splits.

# Define model name
model_name = "bert-base-uncased"

# Define tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name, do_lower_case=True)

# Create datasets: text/label columns for train+valid, then for test
X_train, X_valid, y_train, y_valid = create_partition_from_split(df_fragm_train, df_fragm_valid, 'TEXT', 'cEXT')
_, X_test, _, y_test = create_partition_from_split(df_fragm_train, df_fragm_test, 'TEXT', 'cEXT')

train_dataset, valid_dataset = create_datasets(tokenizer, X_train, X_valid, y_train, y_valid)
# NOTE(review): only the test dataset is kept from this call, yet X_train is tokenized again.
_, test_dataset = create_datasets(tokenizer, X_train, X_test, y_train, y_test)

# Launch training experiment: 3 x 3 dropout values x 4 learning rates, 4 epochs each
suffix = "extraversion_fragm"
hidden_dropout_probs = [0.1, 0.2, 0.3]
attn_dropout_probs = [0.1, 0.2, 0.3]
lrs = [5e-5, 4e-5, 3e-5, 2e-5]
num_epochs = 4
training_experiment(suffix, hidden_dropout_probs, attn_dropout_probs, lrs, num_epochs,
                    model_name, train_dataset, valid_dataset, test_dataset)

In [None]:
# FRAGMENTED DATA (256) + ROBERTA
# Extraversion (label column cEXT), RoBERTa, trained on the fragmented splits.

# Define model name
model_name = "FacebookAI/roberta-base"

# Define tokenizer
# NOTE(review): do_lower_case=True is copied from the BERT cells - confirm it is meaningful for this tokenizer.
tokenizer = AutoTokenizer.from_pretrained(model_name, do_lower_case=True)

# Create datasets: text/label columns for train+valid, then for test
X_train, X_valid, y_train, y_valid = create_partition_from_split(df_fragm_train, df_fragm_valid, 'TEXT', 'cEXT')
_, X_test, _, y_test = create_partition_from_split(df_fragm_train, df_fragm_test, 'TEXT', 'cEXT')

train_dataset, valid_dataset = create_datasets(tokenizer, X_train, X_valid, y_train, y_valid)
# NOTE(review): only the test dataset is kept from this call, yet X_train is tokenized again.
_, test_dataset = create_datasets(tokenizer, X_train, X_test, y_train, y_test)

# Launch training experiment: 3 x 3 dropout values x 3 learning rates, 4 epochs each
suffix = "extraversion_fragm_roberta"
hidden_dropout_probs = [0.1, 0.2, 0.3]
attn_dropout_probs = [0.1, 0.2, 0.3]
lrs = [4e-5, 3e-5, 2e-5]
num_epochs = 4
training_experiment(suffix, hidden_dropout_probs, attn_dropout_probs, lrs, num_epochs,
                    model_name, train_dataset, valid_dataset, test_dataset,
                    mlp_classif=False, mlp_hsize_factor=1)

## Neuroticism

In [None]:
# FRAGMENTED DATA (256)
# Neuroticism (label column cNEU), BERT, trained on the fragmented splits.

# Define model name
model_name = "bert-base-uncased"

# Define tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name, do_lower_case=True)

# Create datasets: text/label columns for train+valid, then for test
X_train, X_valid, y_train, y_valid = create_partition_from_split(df_fragm_train, df_fragm_valid, 'TEXT', 'cNEU')
_, X_test, _, y_test = create_partition_from_split(df_fragm_train, df_fragm_test, 'TEXT', 'cNEU')

train_dataset, valid_dataset = create_datasets(tokenizer, X_train, X_valid, y_train, y_valid)
# NOTE(review): only the test dataset is kept from this call, yet X_train is tokenized again.
_, test_dataset = create_datasets(tokenizer, X_train, X_test, y_train, y_test)

# Launch training experiment: 3 x 3 dropout values x 4 learning rates, 3 epochs each
suffix = "neuroticism_fragm"
hidden_dropout_probs = [0.1, 0.2, 0.3]
attn_dropout_probs = [0.1, 0.2, 0.3]
lrs = [5e-5, 4e-5, 3e-5, 2e-5]
num_epochs = 3
training_experiment(suffix, hidden_dropout_probs, attn_dropout_probs, lrs, num_epochs,
                    model_name, train_dataset, valid_dataset, test_dataset)

In [None]:
# FRAGMENTED DATA (256) + ROBERTA
# Neuroticism (label column cNEU), RoBERTa, trained on the fragmented splits.

# Define model name
model_name = "FacebookAI/roberta-base"

# Define tokenizer
# NOTE(review): do_lower_case=True is copied from the BERT cells - confirm it is meaningful for this tokenizer.
tokenizer = AutoTokenizer.from_pretrained(model_name, do_lower_case=True)

# Create datasets: text/label columns for train+valid, then for test
X_train, X_valid, y_train, y_valid = create_partition_from_split(df_fragm_train, df_fragm_valid, 'TEXT', 'cNEU')
_, X_test, _, y_test = create_partition_from_split(df_fragm_train, df_fragm_test, 'TEXT', 'cNEU')

train_dataset, valid_dataset = create_datasets(tokenizer, X_train, X_valid, y_train, y_valid)
# NOTE(review): only the test dataset is kept from this call, yet X_train is tokenized again.
_, test_dataset = create_datasets(tokenizer, X_train, X_test, y_train, y_test)

# Launch training experiment: 3 x 3 dropout values x 3 learning rates, 4 epochs each
suffix = "neuroticism_fragm_roberta"
hidden_dropout_probs = [0.1, 0.2, 0.3]
attn_dropout_probs = [0.1, 0.2, 0.3]
lrs = [4e-5, 3e-5, 2e-5]
num_epochs = 4
training_experiment(suffix, hidden_dropout_probs, attn_dropout_probs, lrs, num_epochs,
                    model_name, train_dataset, valid_dataset, test_dataset,
                    mlp_classif=False, mlp_hsize_factor=1)

## Agreeableness

In [None]:
# FRAGMENTED DATA (256)
# Agreeableness (label column cAGR), BERT, trained on the fragmented splits.

# Define model name
model_name = "bert-base-uncased"

# Define tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name, do_lower_case=True)

# Create datasets: text/label columns for train+valid, then for test
X_train, X_valid, y_train, y_valid = create_partition_from_split(df_fragm_train, df_fragm_valid, 'TEXT', 'cAGR')
_, X_test, _, y_test = create_partition_from_split(df_fragm_train, df_fragm_test, 'TEXT', 'cAGR')

train_dataset, valid_dataset = create_datasets(tokenizer, X_train, X_valid, y_train, y_valid)
# NOTE(review): only the test dataset is kept from this call, yet X_train is tokenized again.
_, test_dataset = create_datasets(tokenizer, X_train, X_test, y_train, y_test)

# Launch training experiment: 3 x 3 dropout values x 2 learning rates, 4 epochs each
suffix = "agreeableness_fragm"
hidden_dropout_probs = [0.1, 0.2, 0.3]
attn_dropout_probs = [0.1, 0.2, 0.3]
lrs = [3e-5, 2e-5]
num_epochs = 4
training_experiment(suffix, hidden_dropout_probs, attn_dropout_probs, lrs, num_epochs,
                    model_name, train_dataset, valid_dataset, test_dataset)

In [None]:
# FRAGMENTED DATA (256) + ROBERTA
# Agreeableness (label column cAGR), RoBERTa, trained on the fragmented splits.

# Define model name
model_name = "FacebookAI/roberta-base"

# Define tokenizer
# NOTE(review): do_lower_case=True is copied from the BERT cells - confirm it is meaningful for this tokenizer.
tokenizer = AutoTokenizer.from_pretrained(model_name, do_lower_case=True)

# Create datasets: text/label columns for train+valid, then for test
X_train, X_valid, y_train, y_valid = create_partition_from_split(df_fragm_train, df_fragm_valid, 'TEXT', 'cAGR')
_, X_test, _, y_test = create_partition_from_split(df_fragm_train, df_fragm_test, 'TEXT', 'cAGR')

train_dataset, valid_dataset = create_datasets(tokenizer, X_train, X_valid, y_train, y_valid)
# NOTE(review): only the test dataset is kept from this call, yet X_train is tokenized again.
_, test_dataset = create_datasets(tokenizer, X_train, X_test, y_train, y_test)

# Launch training experiment: 3 x 3 dropout values x 2 learning rates, 3 epochs each
suffix = "agreeableness_fragm_roberta"
hidden_dropout_probs = [0.1, 0.2, 0.3]
attn_dropout_probs = [0.1, 0.2, 0.3]
lrs = [3e-5, 2e-5]
num_epochs = 3
training_experiment(suffix, hidden_dropout_probs, attn_dropout_probs, lrs, num_epochs,
                    model_name, train_dataset, valid_dataset, test_dataset,
                    mlp_classif=False, mlp_hsize_factor=1)

## Conscientiousness

In [None]:
# FRAGMENTED DATA (256)
# Conscientiousness (label column cCON), BERT, trained on the fragmented splits.

# Define model name
model_name = "bert-base-uncased"

# Define tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name, do_lower_case=True)

# Create datasets: text/label columns for train+valid, then for test
X_train, X_valid, y_train, y_valid = create_partition_from_split(df_fragm_train, df_fragm_valid, 'TEXT', 'cCON')
_, X_test, _, y_test = create_partition_from_split(df_fragm_train, df_fragm_test, 'TEXT', 'cCON')

train_dataset, valid_dataset = create_datasets(tokenizer, X_train, X_valid, y_train, y_valid)
# NOTE(review): only the test dataset is kept from this call, yet X_train is tokenized again.
_, test_dataset = create_datasets(tokenizer, X_train, X_test, y_train, y_test)

# Launch training experiment
suffix = "conscientiousness_fragm"
# Disabled alternative grid (4 learning rates, 3 epochs):
#hidden_dropout_probs = [0.1, 0.2, 0.3]
#attn_dropout_probs = [0.1, 0.2, 0.3]
#lrs = [5e-5, 4e-5, 3e-5, 2e-5]
#num_epochs = 3
# Active grid: 3 x 3 dropout values x 2 learning rates, 4 epochs each
hidden_dropout_probs = [0.1, 0.2, 0.3]
attn_dropout_probs = [0.1, 0.2, 0.3]
lrs = [3e-5, 2e-5]
num_epochs = 4
training_experiment(suffix, hidden_dropout_probs, attn_dropout_probs, lrs, num_epochs,
                    model_name, train_dataset, valid_dataset, test_dataset,
                    mlp_classif=False, mlp_hsize_factor=1)

In [None]:
# FRAGMENTED DATA (256) + ROBERTA
# Conscientiousness (label column cCON), RoBERTa, trained on the fragmented splits.

# Define model name
model_name = "FacebookAI/roberta-base"

# Define tokenizer
# NOTE(review): do_lower_case=True is copied from the BERT cells - confirm it is meaningful for this tokenizer.
tokenizer = AutoTokenizer.from_pretrained(model_name, do_lower_case=True)

# Create datasets: text/label columns for train+valid, then for test
X_train, X_valid, y_train, y_valid = create_partition_from_split(df_fragm_train, df_fragm_valid, 'TEXT', 'cCON')
_, X_test, _, y_test = create_partition_from_split(df_fragm_train, df_fragm_test, 'TEXT', 'cCON')

train_dataset, valid_dataset = create_datasets(tokenizer, X_train, X_valid, y_train, y_valid)
# NOTE(review): only the test dataset is kept from this call, yet X_train is tokenized again.
_, test_dataset = create_datasets(tokenizer, X_train, X_test, y_train, y_test)

# Launch training experiment: 3 x 3 dropout values x 2 learning rates, 4 epochs each
suffix = "conscientiousness_fragm_roberta"
hidden_dropout_probs = [0.1, 0.2, 0.3]
attn_dropout_probs = [0.1, 0.2, 0.3]
lrs = [3e-5, 2e-5]
num_epochs = 4
training_experiment(suffix, hidden_dropout_probs, attn_dropout_probs, lrs, num_epochs,
                    model_name, train_dataset, valid_dataset, test_dataset,
                    mlp_classif=False, mlp_hsize_factor=1)

## Openness

In [None]:
# FRAGMENTED DATA (256)
# Openness (label column cOPN), BERT, trained on the fragmented splits.

# Define model name
model_name = "bert-base-uncased"

# Define tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name, do_lower_case=True)

# Create datasets: text/label columns for train+valid, then for test
X_train, X_valid, y_train, y_valid = create_partition_from_split(df_fragm_train, df_fragm_valid, 'TEXT', 'cOPN')
_, X_test, _, y_test = create_partition_from_split(df_fragm_train, df_fragm_test, 'TEXT', 'cOPN')

train_dataset, valid_dataset = create_datasets(tokenizer, X_train, X_valid, y_train, y_valid)
# NOTE(review): only the test dataset is kept from this call, yet X_train is tokenized again.
_, test_dataset = create_datasets(tokenizer, X_train, X_test, y_train, y_test)

# Launch training experiment: 3 x 3 dropout values x 4 learning rates, 4 epochs each
suffix = "openness_fragm"
hidden_dropout_probs = [0.1, 0.2, 0.3]
attn_dropout_probs = [0.1, 0.2, 0.3]
lrs = [5e-5, 4e-5, 3e-5, 2e-5]
num_epochs = 4
training_experiment(suffix, hidden_dropout_probs, attn_dropout_probs, lrs, num_epochs,
                    model_name, train_dataset, valid_dataset, test_dataset)

In [None]:
# FRAGMENTED DATA (256) + ROBERTA
# Openness (label column cOPN), RoBERTa, trained on the fragmented splits.

# Define model name
model_name = "FacebookAI/roberta-base"

# Define tokenizer
# NOTE(review): do_lower_case=True is copied from the BERT cells - confirm it is meaningful for this tokenizer.
tokenizer = AutoTokenizer.from_pretrained(model_name, do_lower_case=True)

# Create datasets: text/label columns for train+valid, then for test
X_train, X_valid, y_train, y_valid = create_partition_from_split(df_fragm_train, df_fragm_valid, 'TEXT', 'cOPN')
_, X_test, _, y_test = create_partition_from_split(df_fragm_train, df_fragm_test, 'TEXT', 'cOPN')

train_dataset, valid_dataset = create_datasets(tokenizer, X_train, X_valid, y_train, y_valid)
# NOTE(review): only the test dataset is kept from this call, yet X_train is tokenized again.
_, test_dataset = create_datasets(tokenizer, X_train, X_test, y_train, y_test)

# Launch training experiment: 3 x 3 dropout values x 2 learning rates, 4 epochs each
suffix = "openness_fragm_roberta"
hidden_dropout_probs = [0.1, 0.2, 0.3]
attn_dropout_probs = [0.1, 0.2, 0.3]
lrs = [3e-5, 2e-5]
num_epochs = 4
training_experiment(suffix, hidden_dropout_probs, attn_dropout_probs, lrs, num_epochs,
                    model_name, train_dataset, valid_dataset, test_dataset,
                    mlp_classif=False, mlp_hsize_factor=1)