In [None]:
# NOTE FOR LOCAL EXECUTION: use pytorch kernel and execute the code below to specify cuda device
import os
# Expose only GPU index 0 to this process; run this cell before the cells that import torch.
os.environ["CUDA_VISIBLE_DEVICES"]="0"

In [None]:
# install packages
!pip install torch transformers memory_profiler datasets accelerate nltk tweet-preprocessor
import time
import datetime
tic = time.time()

In [None]:
# import modules
import torch
import random
from transformers.file_utils import is_tf_available, is_torch_available, is_torch_tpu_available
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, AdamW
from datasets import load_metric
%load_ext memory_profiler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import pandas as pd
import numpy as np

In [None]:
def create_partition_from_split(train, test, x_colname, y_colname):
    """Pick the feature and label columns out of a train/test pair.

    Both ``train`` and ``test`` are indexed with ``x_colname`` and then with
    ``y_colname``.

    Returns:
        A 4-tuple ``(X_train, X_test, y_train, y_test)``.
    """
    splits = (train, test)
    features = tuple(part[x_colname] for part in splits)
    targets = tuple(part[y_colname] for part in splits)
    return (*features, *targets)

In [None]:
def create_datasets(tokenizer, X_train, X_test, y_train, y_test):
    """Tokenize two text collections and wrap each one as a torch ``Dataset``.

    Args:
        tokenizer: callable invoked as
            ``tokenizer(texts, truncation=True, padding=True, max_length=512)``;
            its result must expose ``.items()`` yielding per-example sequences.
        X_train, X_test: objects with ``.tolist()`` returning the texts.
        y_train, y_test: objects with ``.tolist()`` returning the labels.

    Returns:
        ``(train_dataset, valid_dataset)``. Each item is a dict of tensors built
        from the tokenizer output plus a ``"labels"`` tensor of shape ``(1,)``.
    """

    class MakeTorchData(torch.utils.data.Dataset):
        """Map-style dataset pairing tokenizer output with labels."""

        def __init__(self, tokens, labels):
            self.tokens = tokens
            self.labels = labels

        def __len__(self):
            return len(self.labels)

        def __getitem__(self, idx):
            item = {}
            for key, values in self.tokens.items():
                item[key] = torch.tensor(values[idx])
            # The label is wrapped in a list, so it becomes a 1-element tensor.
            item["labels"] = torch.tensor([self.labels[idx]])
            return item

    def encode(texts):
        return tokenizer(texts, truncation=True, padding=True, max_length=512)

    # Plain Python lists are what gets handed to the tokenizer and the datasets.
    train_texts, test_texts = X_train.tolist(), X_test.tolist()
    train_labels, test_labels = y_train.tolist(), y_test.tolist()

    train_tokens = encode(train_texts)
    valid_tokens = encode(test_texts)

    return MakeTorchData(train_tokens, train_labels), MakeTorchData(valid_tokens, test_labels)

In [None]:
from sklearn.metrics import accuracy_score
auc_metric = load_metric("roc_auc", trust_remote_code=True)

def compute_metrics(eval_pred):
    """Compute accuracy and ROC AUC from a ``(predictions, labels)`` pair.

    Args:
        eval_pred: pair ``(predictions, labels)``; ``predictions`` holds one row
            of per-class scores per example, ``labels`` the reference classes.

    Returns:
        ``{"accuracy": ..., "roc_auc": ...}``.
    """
    scores, labels = eval_pred
    scores = np.asarray(scores)
    # The datasets built in this notebook wrap each label in a 1-element
    # tensor, so the labels can arrive with a trailing axis; flatten them.
    labels = np.asarray(labels).reshape(-1)

    # Accuracy is computed on hard decisions.
    predicted_classes = np.argmax(scores, axis=1)
    accuracy = accuracy_score(y_true=labels, y_pred=predicted_classes)

    # ROC AUC is a ranking metric and needs a continuous score per example.
    # Feeding it argmax class ids collapses the curve to a single operating
    # point. For two classes, the score margin (class 1 minus class 0) orders
    # the examples exactly like the softmax probability of class 1.
    if scores.ndim == 2 and scores.shape[1] == 2:
        positive_scores = scores[:, 1] - scores[:, 0]
    else:
        # Not a two-column score matrix: keep the previous behaviour.
        positive_scores = predicted_classes
    roc_auc = auc_metric.compute(prediction_scores=positive_scores, references=labels)["roc_auc"]

    return {"accuracy": accuracy, "roc_auc": roc_auc}

In [None]:
from transformers import TrainerCallback

class CustomCallback(TrainerCallback):
    """Callback that prints train-set and eval-set metrics at the end of each epoch."""

    def __init__(self, trainer) -> None:
        super().__init__()
        # Handle on the Trainer whose datasets are evaluated in on_epoch_end.
        self._trainer = trainer

    def on_epoch_end(self, args, state, control, **kwargs):
        """Evaluate on the training set, then on the eval set, printing each result."""
        for prefix in ("train", "eval"):
            dataset = getattr(self._trainer, prefix + "_dataset")
            print(self._trainer.evaluate(eval_dataset=dataset, metric_key_prefix=prefix))
        print()

In [None]:
def create_trainer(suffix, model, train_dataset, valid_dataset, num_epochs=5,
                   load_best_model_at_end=True, optimizer=None):
    """Build a ``Trainer`` that saves and evaluates once per epoch.

    Checkpoints are written to ``./results_<suffix>`` and logs to
    ``./logs_<suffix>``. The best checkpoint is chosen by ``eval_roc_auc``
    (higher is better). ``optimizer`` is passed as the first element of the
    Trainer's ``optimizers`` pair; the scheduler slot is left as ``None``.

    NOTE(review): recent transformers releases rename ``evaluation_strategy``
    to ``eval_strategy`` - confirm against the installed version.
    """
    settings = {
        "output_dir": './results_' + suffix,          # output directory
        "num_train_epochs": num_epochs,               # total number of training epochs
        "per_device_train_batch_size": 16,            # batch size per device during training
        "per_device_eval_batch_size": 16,             # batch size for evaluation
        "warmup_steps": 500,                          # warmup steps for the learning rate scheduler
        "weight_decay": 0.01,                         # strength of weight decay
        "logging_dir": './logs_' + suffix,            # directory for storing logs
        "load_best_model_at_end": load_best_model_at_end,
        "metric_for_best_model": "eval_roc_auc",
        "greater_is_better": True,
        "logging_steps": 100,
        "save_strategy": "epoch",
        "save_total_limit": 1,
        "evaluation_strategy": "epoch",               # evaluate each epoch
    }
    training_args = TrainingArguments(**settings)

    # NOTE: metric for best model not correctly applied when adding callback,
    # so CustomCallback is deliberately not registered on the trainer.
    return Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=valid_dataset,
        compute_metrics=compute_metrics,
        optimizers=(optimizer, None),
    )

In [None]:
import torch.nn as nn
from sklearn.model_selection import KFold

def write_float_pair_to_file(file_path, float1, float2):
    """Overwrite ``file_path`` with a single line: the two values separated by a space."""
    with open(file_path, mode='w') as out:
        out.write("{} {}\n".format(float1, float2))

def get_mlp_classifier(hidden_size, num_labels, hsize_factor=1):
    """Return a Linear -> ReLU -> Linear head.

    The inner width is ``int(hidden_size * hsize_factor)``; the head maps
    ``hidden_size`` inputs to ``num_labels`` outputs.
    """
    inner_width = int(hidden_size * hsize_factor)
    layers = [
        nn.Linear(hidden_size, inner_width),
        nn.ReLU(),
        nn.Linear(inner_width, num_labels),
    ]
    return nn.Sequential(*layers)

def instantiate_model(model_name, num_labels, hidden_dropout_prob, attention_probs_dropout_prob,
                      mlp_classif=False, mlp_hsize_factor=1, freeze_bert=False):
    """Load a pretrained sequence-classification model with custom dropout.

    Args:
        model_name: checkpoint name passed to ``from_pretrained``.
        num_labels: number of output classes (previously ignored and fixed to 2).
        hidden_dropout_prob: dropout probability for hidden layers.
        attention_probs_dropout_prob: dropout probability for attention weights.
        mlp_classif: if True, replace ``model.classifier`` with the head from
            ``get_mlp_classifier``.
        mlp_hsize_factor: width factor forwarded to ``get_mlp_classifier``.
        freeze_bert: if True, disable gradients for the encoder parameters.

    Returns:
        The instantiated model.
    """
    # DistilBERT configs use different names for the two dropout settings.
    if "distilbert-" in model_name:
        dropout_kwargs = {"dropout": hidden_dropout_prob,
                          "attention_dropout": attention_probs_dropout_prob}
    else:
        dropout_kwargs = {"hidden_dropout_prob": hidden_dropout_prob,
                          "attention_probs_dropout_prob": attention_probs_dropout_prob}

    model = AutoModelForSequenceClassification.from_pretrained(model_name,
                                                               num_labels=num_labels,
                                                               **dropout_kwargs)

    if mlp_classif:
        # NOTE(review): some architectures (e.g. RoBERTa) ship a classifier head
        # that does its own pooling; confirm a plain MLP is a valid drop-in
        # replacement for every model_name used with this flag.
        model.classifier = get_mlp_classifier(model.config.hidden_size,
                                              model.config.num_labels,
                                              mlp_hsize_factor)

    # Freeze encoder parameters if requested. `base_model` resolves to the
    # architecture's own encoder attribute (`model.bert` for BERT), so this
    # also works for checkpoints that have no `.bert` attribute.
    if freeze_bert:
        for param in model.base_model.parameters():
            param.requires_grad = False

    return model

def single_train_exper(suffix, hidden_dropout_prob, attention_probs_dropout_prob, lr, num_epochs,
                       model_name, train_dataset, valid_dataset, test_dataset=None, load_best_model_at_end=True,
                       mlp_classif=False, mlp_hsize_factor=1, freeze_bert=False):
    """Train one model configuration and evaluate it.

    A fresh two-label model is instantiated, optimised with AdamW
    (``lr`` as given, weight decay 1e-2) through the trainer built by
    ``create_trainer``, then evaluated on ``valid_dataset`` and, when provided,
    on ``test_dataset``.

    Returns:
        ``(trainer, training_results, valid_results, test_results)``;
        ``test_results`` is ``None`` when no test dataset is given.
    """
    model_options = dict(
        num_labels=2,
        hidden_dropout_prob=hidden_dropout_prob,
        attention_probs_dropout_prob=attention_probs_dropout_prob,
        mlp_classif=mlp_classif,
        mlp_hsize_factor=mlp_hsize_factor,
        freeze_bert=freeze_bert,
    )
    model = instantiate_model(model_name, **model_options)

    optimizer = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=1e-2)

    trainer = create_trainer(suffix, model, train_dataset, valid_dataset,
                             num_epochs=num_epochs,
                             load_best_model_at_end=load_best_model_at_end,
                             optimizer=optimizer)

    # Fit, then score on the validation data the trainer was built with.
    training_results = trainer.train()
    valid_results = trainer.evaluate()

    # The test set is optional.
    test_results = None if test_dataset is None else trainer.evaluate(eval_dataset=test_dataset)

    return trainer, training_results, valid_results, test_results


def training_experiment(suffix, hidden_dropout_probs, attn_dropout_probs, learn_rates, num_epochs,
                        model_name, train_dataset, valid_dataset, test_dataset=None, mlp_classif=False,
                        mlp_hsize_factor = 1, freeze_bert=False):
    """Grid-search dropout and learning-rate settings, keeping the best model.

    Every combination of ``hidden_dropout_probs`` x ``attn_dropout_probs`` x
    ``learn_rates`` is trained with ``single_train_exper``. Whenever a
    configuration beats the best validation ``eval_roc_auc`` so far, its model
    is saved via ``trainer.save_model()``. At the end the best validation AUC
    and the test AUC of that same configuration are written to
    ``./results_<suffix>/aucs.txt``.

    Returns:
        ``(best_eval_measure, best_test_measure)``; the test measure is
        ``None`` when ``test_dataset`` is not given.
    """
    best_eval_measure = None
    best_test_measure = None
    for hidden_dropout_prob in hidden_dropout_probs:
        for attention_probs_dropout_prob in attn_dropout_probs:
            # Iterate the `learn_rates` argument; the previous code read a
            # module-level `lrs` variable and ignored what the caller passed.
            for lr in learn_rates:
                # Print start message
                print(f"* Model with hidden_dropout_prob={hidden_dropout_prob} , attention_probs_dropout_prob={attention_probs_dropout_prob} and lr={lr}:")

                trainer, training_results, valid_results, test_results = single_train_exper(
                    suffix,
                    hidden_dropout_prob,
                    attention_probs_dropout_prob,
                    lr,
                    num_epochs,
                    model_name,
                    train_dataset,
                    valid_dataset,
                    test_dataset,
                    load_best_model_at_end=True,
                    mlp_classif=mlp_classif,
                    mlp_hsize_factor=mlp_hsize_factor,
                    freeze_bert=freeze_bert)

                # Show results
                print(f"Training Results: {training_results}")
                print(f"Validation Results: {valid_results}")
                if test_dataset is not None:
                    print(f"Test Results: {test_results}")
                print("")

                # Save model if it is the best so far (strict improvement only,
                # so ties keep the earlier configuration).
                eval_measure = valid_results['eval_roc_auc']
                if best_eval_measure is None or best_eval_measure < eval_measure:
                    best_eval_measure = eval_measure
                    if test_dataset is not None:
                        best_test_measure = test_results['eval_roc_auc']
                    trainer.save_model()

    # Write validation and test AUCs to file
    output_dir='./results_' + suffix
    write_float_pair_to_file(output_dir+"/aucs.txt", best_eval_measure, best_test_measure)

    return best_eval_measure, best_test_measure

from sklearn.model_selection import KFold

def kfold_exper(df, x_colname, y_colname, n_splits, suffix, hidden_dropout_prob,
                attention_probs_dropout_prob, lr, num_epochs, model_name,
                mlp_classif=False, mlp_hsize_factor = 1, freeze_bert=False,
                tokenizer=None):
    """Run a shuffled K-fold cross-validation of one model configuration.

    For each fold a model is trained on the training rows with
    ``single_train_exper`` and scored on the held-out rows.

    Args:
        df: DataFrame holding the text column ``x_colname`` and the label
            column ``y_colname``.
        n_splits: number of folds (shuffled with ``random_state=42``).
        tokenizer: tokenizer handed to ``create_datasets``. When omitted, one
            is loaded for ``model_name`` the same way the experiment cells of
            this notebook do it.
        (remaining arguments are forwarded to ``single_train_exper``)

    Returns:
        The mean ``eval_roc_auc`` over the folds.
    """
    # `create_datasets` needs a tokenizer as its first argument; the previous
    # call omitted it and raised a TypeError on the first fold.
    if tokenizer is None:
        tokenizer = AutoTokenizer.from_pretrained(model_name, do_lower_case=True)

    sum_eval_measure = 0

    # Initialize KFold object
    kf = KFold(n_splits=n_splits, random_state=42, shuffle=True)

    # Iterate over folds
    for i, (train_indices, test_indices) in enumerate(kf.split(df)):
        print(f"Fold {i}:")

        # Obtain dataframes
        df_train = df.iloc[train_indices]
        df_test = df.iloc[test_indices]

        # Obtain datasets
        X_train, X_test, y_train, y_test = create_partition_from_split(df_train, df_test, x_colname, y_colname)
        train_dataset, valid_dataset = create_datasets(tokenizer, X_train, X_test, y_train, y_test)

        # Execute train experiment (the held-out fold is used as validation data)
        _, training_results, valid_results, _ = single_train_exper(suffix,
                                                                   hidden_dropout_prob,
                                                                   attention_probs_dropout_prob,
                                                                   lr,
                                                                   num_epochs,
                                                                   model_name,
                                                                   train_dataset,
                                                                   valid_dataset,
                                                                   load_best_model_at_end=False,
                                                                   mlp_classif=mlp_classif,
                                                                   mlp_hsize_factor=mlp_hsize_factor,
                                                                   freeze_bert=freeze_bert)

        # Show results
        print(f"Training Results: {training_results}")
        print(f"Validation Results: {valid_results}")
        print("")

        # Retrieve evaluation measure
        eval_measure = valid_results['eval_roc_auc']
        sum_eval_measure += eval_measure
        print("- Evaluation measure:", eval_measure)
        print("")

    result = sum_eval_measure / n_splits
    print("Average of Evaluation Measures:", result)

    return result

## Data Preparation

### Load Original Data

In [None]:
# Set the directory and create the data

#from google.colab import drive
#drive.mount('/content/drive')

# Load the data into a pandas DataFrame | remember to have a similar structure in your Drive so that the data can be read properly.
#df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/NLP & Personality/Fine tuning/MBTI/mbti_2.csv', encoding = "latin-1")
# NOTE(review): absolute, machine-specific path - the notebook only runs where this file exists.
df = pd.read_csv('/home/dortiz/nlp/tasks/psico/mbti_2.csv',encoding = "latin-1")

# Print a sample of the data to verify it's loaded correctly
print(df.head())

### Preprocess Data

In [None]:
import preprocessor as p

# Replace every entry of the 'text' column with the output of p.clean.
# The column is overwritten, so the uncleaned text is no longer available in df afterwards.
df['text'] = df['text'].apply(p.clean)

### Data Fragmentation

In [None]:
def fragment_mbti_df(df, max_words, min_fragm_len):
    """Split each row's text into fragments of at most ``max_words`` words.

    Every kept fragment becomes one output row that copies all other column
    values of its source row. Previously only six hard-coded columns were
    copied, so a DataFrame with any extra column produced ragged lists and
    failed on construction.

    Args:
        df: DataFrame with a ``"text"`` column of strings.
        max_words: maximum number of whitespace-separated words per fragment.
        min_fragm_len: minimum length, in characters of the joined fragment
            string, for a fragment to be kept. A text that yields a single
            fragment is always kept.

    Returns:
        A new DataFrame with the same columns as ``df``, one row per kept
        fragment, in source order.
    """

    def fragment_string(text, max_words):
        """Chunk ``text`` into space-joined groups of ``max_words`` words."""
        words = text.split()
        # A non-positive max_words never closed a fragment in the original
        # loop, so all words end up in one fragment; keep that behaviour.
        step = max_words if max_words > 0 else max(len(words), 1)
        return [' '.join(words[start:start + step]) for start in range(0, len(words), step)]

    columns = list(df.columns)

    # Column-wise lists are cheaper to index row by row than the DataFrame.
    dict_of_lists = df.to_dict(orient='list')
    fragm_dict_of_lists = {colname: [] for colname in columns}

    for i in range(len(df)):
        fragments = fragment_string(dict_of_lists["text"][i], max_words)
        is_single = len(fragments) == 1

        for fragment in fragments:
            # len(fragment) counts characters of the joined string.
            if is_single or len(fragment) >= min_fragm_len:
                for colname in columns:
                    value = fragment if colname == "text" else dict_of_lists[colname][i]
                    fragm_dict_of_lists[colname].append(value)

    return pd.DataFrame(fragm_dict_of_lists)

In [None]:
# Fragment data
max_words = 256
# NOTE(review): fragment_mbti_df compares this threshold with len() of the joined
# fragment string, i.e. a character count, not a word count - confirm that is intended.
min_fragm_len = 256
df_fragm = fragment_mbti_df(df, max_words, min_fragm_len)
# Persist the fragments; the file name encodes max_words.
fragmented_data_file = "./df_mbti_fragm_"+str(max_words)+".csv"
df_fragm.to_csv(fragmented_data_file, index=False)

In [None]:
# Load fragmented data
# Re-reads the CSV written by the fragmentation cell; relies on `max_words` still being defined.
fragmented_data_file = "./df_mbti_fragm_"+str(max_words)+".csv"
df_fragm = pd.read_csv(fragmented_data_file, encoding = "utf-8")
print(df_fragm.head())
# Number of original rows vs. number of fragment rows.
print(len(df), len(df_fragm))

### Mask Words

In [None]:
import re

# Words that are masked out of the text by mask_words_bert / mask_words_roberta.
# NOTE(review): those functions match with re.IGNORECASE, so short entries such as
# 'E', 'ST' or 'INT' also match ordinary lowercase words - confirm this is intended.
# 'TJs' is listed twice, which is harmless in a set literal.
mbti_words = {'ENFJ', 'ENFP', 'ENTJ', 'ENTP', 'ENFJs', 'ENFPs', 'ENTJs', 'ENTPs',
              'ESFJ', 'ESFP', 'ESTJ', 'ESTP', 'ESFJs', 'ESFPs', 'ESTJs', 'ESTPs',
              'INFJ', 'INFP', 'INTJ', 'INTP', 'INFJs', 'INFPs', 'INTJs', 'INTPs',
              'ISFJ', 'ISFP', 'ISTJ', 'ISTP', 'ISFJs', 'ISFPs', 'ISTJs', 'ISTPs',
              'INFX', 'INFx', 'INFJness', 'ISFX', 'ISFx', 'ESxx', 'Ixxx', 'ISTJish', 'EXFJ',
              'E', 'N', 'F', 'J', 'P',
              'Es', 'Ns', 'Ss', 'Fs', 'Ts', 'Js', 'Ps',
              'EN', 'FJ', 'FP', 'TJ', 'TP',
              'ENs', 'FJs', 'FPs', 'TJs', 'TPs',
              'NE', 'NI', 'SI', 'TE',
              'NEs', 'NIs', 'SIs', 'TEs',
              'TI', 'FI',
              'TIs', 'FIs',
              'NF', 'NT', 'SF', 'ST',
              "SE", "FE", "PE", "JE",
              "SEs", "FEs", "PEs", "JEs",
              'NFs', 'NTs', 'TJs', 'SFs', 'STs',
              'ENF', 'ENT', 'ESF', 'EST',
              'ENFs', 'ENTs', 'ESFs', 'ESTs',
              'NFJ', 'NFP', 'NTJ', 'NTP',
              'NFJs', 'NFPs', 'NTJs', 'NTPs',
              'SFJ', 'SFP', 'STJ', 'STP',
              'SFJs', 'SFPs', 'STJs', 'STPs',
              'INF', 'INT', 'ISF', 'IST',
              'INFs', 'INTs', 'ISFs', 'ISTs'}

def mask_words_bert(text, words_to_replace):
    """Replace whole-word, case-insensitive matches of the given words with ``[UNK]``.

    Args:
        text: string to mask.
        words_to_replace: iterable of literal words (regex-escaped internally).

    Returns:
        ``(masked_text, num_replacements)``.
    """
    words = list(words_to_replace)
    # With no words the alternation would be empty and match the empty string
    # at every word boundary, sprinkling the token through the text.
    if not words:
        return text, 0

    # \b...\b restricts matches to whole words.
    pattern = r'\b(?:' + '|'.join(re.escape(word) for word in words) + r')\b'

    # Matches are replaced with BERT's unknown token.
    masked_text, num_replacements = re.subn(pattern, '[UNK]', text, flags=re.IGNORECASE)

    return masked_text, num_replacements

def mask_words_roberta(text, words_to_replace):
    """Replace whole-word, case-insensitive matches of the given words with ``<unk>``.

    Args:
        text: string to mask.
        words_to_replace: iterable of literal words (regex-escaped internally).

    Returns:
        ``(masked_text, num_replacements)``.
    """
    words = list(words_to_replace)
    # With no words the alternation would be empty and match the empty string
    # at every word boundary, sprinkling the token through the text.
    if not words:
        return text, 0

    # \b...\b restricts matches to whole words.
    pattern = r'\b(?:' + '|'.join(re.escape(word) for word in words) + r')\b'

    # Matches are replaced with RoBERTa's unknown token.
    masked_text, num_replacements = re.subn(pattern, '<unk>', text, flags=re.IGNORECASE)

    return masked_text, num_replacements

# Apply the masking function to the 'text' column and create a new column
# Each call returns (masked_text, count); zip(*...) splits the pairs into two columns.
df['masked_text_bert'], df['num_replacements'] = zip(*df['text'].apply(lambda x: mask_words_bert(x, mbti_words)))
# The replacement count is the same for both token variants, so it is discarded here.
df['masked_text_roberta'], _ = zip(*df['text'].apply(lambda x: mask_words_roberta(x, mbti_words)))

# Calculate the sum of replacements
# `total_replacements` is reused by the random-masking cell further down.
total_replacements = df['num_replacements'].sum()
print("total replacements: ", total_replacements)

# Repeat for fragmented data
df_fragm['masked_text_bert'], df_fragm['num_replacements'] = zip(*df_fragm['text'].apply(lambda x: mask_words_bert(x, mbti_words)))
df_fragm['masked_text_roberta'], _ = zip(*df_fragm['text'].apply(lambda x: mask_words_roberta(x, mbti_words)))

# Print the first five rows to check the result
print(df.head())

### Mask Words Randomly

In [None]:
import random

def mask_words_randomly_bert(text, fraction):
    """Replace each whitespace-separated word with ``[UNK]`` with probability ``fraction``.

    Draws one ``random.random()`` value per word, in reading order, and
    returns the words re-joined with single spaces.
    """
    masked = []
    for word in text.split():
        masked.append('[UNK]' if random.random() < fraction else word)
    return ' '.join(masked)

def mask_words_randomly_roberta(text, fraction):
    """Replace each whitespace-separated word with ``<unk>`` with probability ``fraction``.

    Draws one ``random.random()`` value per word, in reading order, and
    returns the words re-joined with single spaces.
    """
    masked = []
    for word in text.split():
        masked.append('<unk>' if random.random() < fraction else word)
    return ' '.join(masked)

def count_unk_bert(df):
    """Return the total number of ``[UNK]`` occurrences in ``df['random_masked_text_bert']``."""
    # Raw string: `\[` and `\]` are regex escapes, not Python string escapes.
    # In a normal string literal they are invalid escape sequences and trigger
    # a warning on current Python versions.
    return df['random_masked_text_bert'].str.count(r'\[UNK\]').sum()

def count_unk_roberta(df):
    """Return the total number of ``<unk>`` occurrences in ``df['random_masked_text_roberta']``."""
    per_row_counts = df['random_masked_text_roberta'].str.count('<unk>')
    return per_row_counts.sum()

# Calculate total word count and fraction of masked words
# The fraction is the share of words replaced by the MBTI-word masking above, so the
# random masking removes roughly the same proportion of words.
total_word_count = sum(df['text'].apply(lambda x: len(x.split())))
masked_words_fraction = total_replacements/total_word_count
print(total_word_count, total_replacements, masked_words_fraction)

# Set random seed
# Seeded once: the four columns below consume the random stream in this exact order.
random.seed(42)

# Apply the function to each row of the DataFrame using a lambda function and store results in a new column
df['random_masked_text_bert'] = df['text'].apply(lambda x: mask_words_randomly_bert(x, masked_words_fraction))
df_fragm['random_masked_text_bert'] = df_fragm['text'].apply(lambda x: mask_words_randomly_bert(x, masked_words_fraction))
df['random_masked_text_roberta'] = df['text'].apply(lambda x: mask_words_randomly_roberta(x, masked_words_fraction))
df_fragm['random_masked_text_roberta'] = df_fragm['text'].apply(lambda x: mask_words_randomly_roberta(x, masked_words_fraction))

# Print the first five rows to check the result
print(df['random_masked_text_bert'].head())

# Count the number of [UNK] occurrences in the new column
unk_count = count_unk_bert(df)
print(unk_count)

### Data Split

In [None]:
random_state = 42

# Split with validation and test sets
# 10% of the rows go to test; 10% of the remainder goes to validation.
df_all_train, df_test = train_test_split(df, test_size=0.1, random_state=random_state)
df_train, df_valid = train_test_split(df_all_train, test_size=0.1, random_state=random_state)
print("Data with validation + test:", len(df_train), len(df_valid), len(df_test))

# Fragmented data split with validation and test sets
# NOTE(review): df_fragm is split fragment by fragment, independently of the split of df
# above, so fragments cut from the same original row can land in different partitions -
# confirm this is acceptable for the reported test scores.
df_fragm_all_train, df_fragm_test = train_test_split(df_fragm, test_size=0.1, random_state=random_state, shuffle=True)
df_fragm_train, df_fragm_valid = train_test_split(df_fragm_all_train, test_size=0.1, random_state=random_state, shuffle=True)
print("Fragmented Data:", len(df_fragm_train), len(df_fragm_valid), len(df_fragm_test))
df_fragm_train.to_csv("./df_mbti_fragm_train.csv", index=False)
df_fragm_valid.to_csv("./df_mbti_fragm_valid.csv", index=False)
df_fragm_test.to_csv("./df_mbti_fragm_test.csv", index=False)

## I/E

In [None]:
# VALID + TEST EXPERIMENT
# Grid search for the I/E label on the unfragmented text with bert-base-uncased.

# Define model name
model_name = "bert-base-uncased"

# Define tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name, do_lower_case=True)

# Create datasets
X_train, X_valid, y_train, y_valid = create_partition_from_split(df_train, df_valid, 'text', 'I/E')
_, X_test, _, y_test = create_partition_from_split(df_train, df_test, 'text', 'I/E')

train_dataset, valid_dataset = create_datasets(tokenizer, X_train, X_valid, y_train, y_valid)
# Only the second return value is used; the training texts get tokenized again as a side effect.
_, test_dataset = create_datasets(tokenizer, X_train, X_test, y_train, y_test)

# Launch training experiment
# The suffix names the ./results_<suffix> and ./logs_<suffix> directories.
suffix = "mbti_ie_valtest"
hidden_dropout_probs = [0.1, 0.2]
attn_dropout_probs = [0.1, 0.2]
lrs = [4e-5, 3e-5, 2e-5]
num_epochs = 4
training_experiment(suffix, hidden_dropout_probs, attn_dropout_probs, lrs, num_epochs,
                    model_name, train_dataset, valid_dataset, test_dataset)

In [None]:
# FRAGMENTED DATA (256)
# Grid search for the I/E label on the fragmented text with bert-base-uncased.

# Define model name
model_name = "bert-base-uncased"

# Define tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name, do_lower_case=True)

# Create datasets
X_train, X_valid, y_train, y_valid = create_partition_from_split(df_fragm_train, df_fragm_valid, 'text', 'I/E')
_, X_test, _, y_test = create_partition_from_split(df_fragm_train, df_fragm_test, 'text', 'I/E')

train_dataset, valid_dataset = create_datasets(tokenizer, X_train, X_valid, y_train, y_valid)
# Only the second return value is used; the training texts get tokenized again as a side effect.
_, test_dataset = create_datasets(tokenizer, X_train, X_test, y_train, y_test)

# Launch training experiment
# The suffix names the ./results_<suffix> and ./logs_<suffix> directories.
suffix = "mbti_ie_fragm_256"
hidden_dropout_probs = [0.1, 0.2]
attn_dropout_probs = [0.1, 0.2]
lrs = [4e-5, 3e-5, 2e-5]
num_epochs = 3
training_experiment(suffix, hidden_dropout_probs, attn_dropout_probs, lrs, num_epochs,
                    model_name, train_dataset, valid_dataset, test_dataset)

In [None]:
# FRAGMENTED DATA (256) + ROBERTA
# Grid search for the I/E label on the fragmented text with roberta-base.

# Define model name
model_name = "FacebookAI/roberta-base"

# Define tokenizer
# NOTE(review): do_lower_case is a BERT-style tokenizer option - confirm it has any effect for RoBERTa.
tokenizer = AutoTokenizer.from_pretrained(model_name, do_lower_case=True)

# Create datasets
X_train, X_valid, y_train, y_valid = create_partition_from_split(df_fragm_train, df_fragm_valid, 'text', 'I/E')
_, X_test, _, y_test = create_partition_from_split(df_fragm_train, df_fragm_test, 'text', 'I/E')

train_dataset, valid_dataset = create_datasets(tokenizer, X_train, X_valid, y_train, y_valid)
# Only the second return value is used; the training texts get tokenized again as a side effect.
_, test_dataset = create_datasets(tokenizer, X_train, X_test, y_train, y_test)

# Launch training experiment
# The suffix names the ./results_<suffix> and ./logs_<suffix> directories.
suffix = "mbti_ie_fragm_256_roberta"
hidden_dropout_probs = [0.1, 0.2]
attn_dropout_probs = [0.1, 0.2]
lrs = [3e-5, 2e-5]
num_epochs = 3
training_experiment(suffix, hidden_dropout_probs, attn_dropout_probs, lrs, num_epochs,
                    model_name, train_dataset, valid_dataset, test_dataset)

In [None]:
# MASKED + FRAGMENTED DATA (256) + ROBERTA
# Grid search for the I/E label on the MBTI-word-masked fragmented text with roberta-base.

# Define model name
model_name = "FacebookAI/roberta-base"

# Define tokenizer
# NOTE(review): do_lower_case is a BERT-style tokenizer option - confirm it has any effect for RoBERTa.
tokenizer = AutoTokenizer.from_pretrained(model_name, do_lower_case=True)

# Create datasets
X_train, X_valid, y_train, y_valid = create_partition_from_split(df_fragm_train, df_fragm_valid,
                                                                 'masked_text_roberta',
                                                                 'I/E')
_, X_test, _, y_test = create_partition_from_split(df_fragm_train, df_fragm_test, 'masked_text_roberta', 'I/E')

train_dataset, valid_dataset = create_datasets(tokenizer, X_train, X_valid, y_train, y_valid)
# Only the second return value is used; the training texts get tokenized again as a side effect.
_, test_dataset = create_datasets(tokenizer, X_train, X_test, y_train, y_test)

# Launch training experiment
# The suffix names the ./results_<suffix> and ./logs_<suffix> directories.
suffix = "masked_mbti_ie_fragm_256_roberta"
hidden_dropout_probs = [0.1, 0.2]
attn_dropout_probs = [0.1, 0.2]
lrs = [3e-5, 2e-5]
num_epochs = 3
training_experiment(suffix, hidden_dropout_probs, attn_dropout_probs, lrs, num_epochs,
                    model_name, train_dataset, valid_dataset, test_dataset)

In [None]:
# RANDOM MASKED + FRAGMENTED DATA (256)
# Grid search for the I/E label on the randomly masked fragmented text with bert-base-uncased.

# Define model name
model_name = "bert-base-uncased"

# Define tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name, do_lower_case=True)

# Create datasets
X_train, X_valid, y_train, y_valid = create_partition_from_split(df_fragm_train, df_fragm_valid,
                                                                 'random_masked_text_bert',
                                                                 'I/E')
_, X_test, _, y_test = create_partition_from_split(df_fragm_train, df_fragm_test, 'random_masked_text_bert', 'I/E')

train_dataset, valid_dataset = create_datasets(tokenizer, X_train, X_valid, y_train, y_valid)
# Only the second return value is used; the training texts get tokenized again as a side effect.
_, test_dataset = create_datasets(tokenizer, X_train, X_test, y_train, y_test)

# Launch training experiment
# The suffix names the ./results_<suffix> and ./logs_<suffix> directories.
suffix = "rand_masked_mbti_ie_fragm_256"
hidden_dropout_probs = [0.1, 0.2]
attn_dropout_probs = [0.1, 0.2]
lrs = [3e-5, 2e-5]
num_epochs = 3
training_experiment(suffix, hidden_dropout_probs, attn_dropout_probs, lrs, num_epochs,
                    model_name, train_dataset, valid_dataset, test_dataset)

In [None]:
# RANDOM MASKED + FRAGMENTED DATA (256) + ROBERTA
# Grid search for the I/E label on the randomly masked fragmented text with roberta-base.

# Define model name
model_name = "FacebookAI/roberta-base"

# Define tokenizer
# NOTE(review): do_lower_case is a BERT-style tokenizer option - confirm it has any effect for RoBERTa.
tokenizer = AutoTokenizer.from_pretrained(model_name, do_lower_case=True)

# Create datasets
X_train, X_valid, y_train, y_valid = create_partition_from_split(df_fragm_train, df_fragm_valid,
                                                                 'random_masked_text_roberta',
                                                                 'I/E')
_, X_test, _, y_test = create_partition_from_split(df_fragm_train, df_fragm_test, 'random_masked_text_roberta', 'I/E')

train_dataset, valid_dataset = create_datasets(tokenizer, X_train, X_valid, y_train, y_valid)
# Only the second return value is used; the training texts get tokenized again as a side effect.
_, test_dataset = create_datasets(tokenizer, X_train, X_test, y_train, y_test)

# Launch training experiment
# The suffix names the ./results_<suffix> and ./logs_<suffix> directories.
suffix = "rand_masked_mbti_ie_fragm_256_roberta"
hidden_dropout_probs = [0.1, 0.2]
attn_dropout_probs = [0.1, 0.2]
lrs = [3e-5, 2e-5]
num_epochs = 3
training_experiment(suffix, hidden_dropout_probs, attn_dropout_probs, lrs, num_epochs,
                    model_name, train_dataset, valid_dataset, test_dataset)

## N/S

In [None]:
# FRAGMENTED DATA (256)
# Grid search for the N/S label on the fragmented text with bert-base-uncased.

# Define model name
model_name = "bert-base-uncased"

# Define tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name, do_lower_case=True)

# Create datasets
X_train, X_valid, y_train, y_valid = create_partition_from_split(df_fragm_train, df_fragm_valid, 'text', 'N/S')
_, X_test, _, y_test = create_partition_from_split(df_fragm_train, df_fragm_test, 'text', 'N/S')

train_dataset, valid_dataset = create_datasets(tokenizer, X_train, X_valid, y_train, y_valid)
# Only the second return value is used; the training texts get tokenized again as a side effect.
_, test_dataset = create_datasets(tokenizer, X_train, X_test, y_train, y_test)

# Launch training experiment
# The suffix names the ./results_<suffix> and ./logs_<suffix> directories.
suffix = "mbti_ns_fragm_256"
hidden_dropout_probs = [0.1, 0.2]
attn_dropout_probs = [0.1, 0.2]
lrs = [4e-5, 3e-5, 2e-5]
num_epochs = 3
training_experiment(suffix, hidden_dropout_probs, attn_dropout_probs, lrs, num_epochs,
                    model_name, train_dataset, valid_dataset, test_dataset)

In [None]:
# FRAGMENTED DATA (256) + ROBERTA
# Grid search for the N/S label on the fragmented text with roberta-base.

# Define model name
model_name = "FacebookAI/roberta-base"

# Define tokenizer
# NOTE(review): do_lower_case is a BERT-style tokenizer option - confirm it has any effect for RoBERTa.
tokenizer = AutoTokenizer.from_pretrained(model_name, do_lower_case=True)

# Create datasets
X_train, X_valid, y_train, y_valid = create_partition_from_split(df_fragm_train, df_fragm_valid, 'text', 'N/S')
_, X_test, _, y_test = create_partition_from_split(df_fragm_train, df_fragm_test, 'text', 'N/S')

train_dataset, valid_dataset = create_datasets(tokenizer, X_train, X_valid, y_train, y_valid)
# Only the second return value is used; the training texts get tokenized again as a side effect.
_, test_dataset = create_datasets(tokenizer, X_train, X_test, y_train, y_test)

# Launch training experiment
# The suffix names the ./results_<suffix> and ./logs_<suffix> directories.
suffix = "mbti_ns_fragm_256_roberta"
hidden_dropout_probs = [0.1, 0.2]
attn_dropout_probs = [0.1, 0.2]
lrs = [3e-5, 2e-5]
num_epochs = 3
training_experiment(suffix, hidden_dropout_probs, attn_dropout_probs, lrs, num_epochs,
                    model_name, train_dataset, valid_dataset, test_dataset)

In [None]:
# MASKED + FRAGMENTED DATA (256)
# Grid search for the N/S label on the MBTI-word-masked fragmented text with bert-base-uncased.

# Define model name
model_name = "bert-base-uncased"

# Define tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name, do_lower_case=True)

# Create datasets
X_train, X_valid, y_train, y_valid = create_partition_from_split(df_fragm_train, df_fragm_valid,
                                                                 'masked_text_bert',
                                                                 'N/S')
_, X_test, _, y_test = create_partition_from_split(df_fragm_train, df_fragm_test, 'masked_text_bert', 'N/S')

train_dataset, valid_dataset = create_datasets(tokenizer, X_train, X_valid, y_train, y_valid)
# Only the second return value is used; the training texts get tokenized again as a side effect.
_, test_dataset = create_datasets(tokenizer, X_train, X_test, y_train, y_test)

# Launch training experiment
# The suffix names the ./results_<suffix> and ./logs_<suffix> directories.
suffix = "masked_mbti_ns_fragm_256"
hidden_dropout_probs = [0.1, 0.2]
attn_dropout_probs = [0.1, 0.2]
lrs = [3e-5, 2e-5]
num_epochs = 3
training_experiment(suffix, hidden_dropout_probs, attn_dropout_probs, lrs, num_epochs,
                    model_name, train_dataset, valid_dataset, test_dataset)

In [None]:
# MASKED + FRAGMENTED DATA (256) + ROBERTA
# Grid search for the N/S label on the MBTI-word-masked fragmented text with roberta-base.

# Define model name
model_name = "FacebookAI/roberta-base"

# Define tokenizer
# NOTE(review): do_lower_case is a BERT-style tokenizer option - confirm it has any effect for RoBERTa.
tokenizer = AutoTokenizer.from_pretrained(model_name, do_lower_case=True)

# Create datasets
X_train, X_valid, y_train, y_valid = create_partition_from_split(df_fragm_train, df_fragm_valid,
                                                                 'masked_text_roberta',
                                                                 'N/S')
_, X_test, _, y_test = create_partition_from_split(df_fragm_train, df_fragm_test, 'masked_text_roberta', 'N/S')

train_dataset, valid_dataset = create_datasets(tokenizer, X_train, X_valid, y_train, y_valid)
# Only the second return value is used; the training texts get tokenized again as a side effect.
_, test_dataset = create_datasets(tokenizer, X_train, X_test, y_train, y_test)

# Launch training experiment
# The suffix names the ./results_<suffix> and ./logs_<suffix> directories.
suffix = "masked_mbti_ns_fragm_256_roberta"
hidden_dropout_probs = [0.1, 0.2]
attn_dropout_probs = [0.1, 0.2]
lrs = [3e-5, 2e-5]
num_epochs = 3
training_experiment(suffix, hidden_dropout_probs, attn_dropout_probs, lrs, num_epochs,
                    model_name, train_dataset, valid_dataset, test_dataset)

In [None]:
# RANDOM MASKED + FRAGMENTED DATA (256)

# Define model name
model_name = "bert-base-uncased"

# Define tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name, do_lower_case=True)

# Create datasets: 'random_masked_text_bert' inputs with 'N/S' labels
X_train, X_valid, y_train, y_valid = create_partition_from_split(df_fragm_train, df_fragm_valid,
                                                                 'random_masked_text_bert',
                                                                 'N/S')
# Only the test halves of this second partition are kept
_, X_test, _, y_test = create_partition_from_split(df_fragm_train, df_fragm_test, 'random_masked_text_bert', 'N/S')

train_dataset, valid_dataset = create_datasets(tokenizer, X_train, X_valid, y_train, y_valid)
_, test_dataset = create_datasets(tokenizer, X_train, X_test, y_train, y_test)

# Launch training experiment
# The run label carries the 'ns' tag so that it matches the 'N/S' label column used above
suffix = "rand_masked_mbti_ns_fragm_256"
hidden_dropout_probs = [0.1, 0.2]
attn_dropout_probs = [0.1, 0.2]
lrs = [3e-5, 2e-5]
num_epochs = 3
training_experiment(suffix, hidden_dropout_probs, attn_dropout_probs, lrs, num_epochs,
                    model_name, train_dataset, valid_dataset, test_dataset)

In [None]:
# RANDOM MASKED + FRAGMENTED DATA (256) + ROBERTA

# Run label and hyperparameter lists handed to training_experiment below
suffix = "rand_masked_mbti_ns_fragm_256_roberta"
num_epochs = 3
lrs = [3e-5, 2e-5]
hidden_dropout_probs = [0.1, 0.2]
attn_dropout_probs = [0.1, 0.2]

# Pretrained checkpoint and the tokenizer that goes with it
# NOTE(review): do_lower_case is a BERT-style tokenizer flag - confirm it has any effect for RoBERTa
model_name = "FacebookAI/roberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_name, do_lower_case=True)

# Select the 'random_masked_text_roberta' inputs and 'N/S' labels for train/validation
X_train, X_valid, y_train, y_valid = create_partition_from_split(
    df_fragm_train, df_fragm_valid, 'random_masked_text_roberta', 'N/S'
)
# Same columns for the test frame; the train halves of this call are discarded
_, X_test, _, y_test = create_partition_from_split(
    df_fragm_train, df_fragm_test, 'random_masked_text_roberta', 'N/S'
)

# Tokenize and wrap as torch datasets (second call only keeps the test dataset)
train_dataset, valid_dataset = create_datasets(tokenizer, X_train, X_valid, y_train, y_valid)
_, test_dataset = create_datasets(tokenizer, X_train, X_test, y_train, y_test)

# Launch the training experiment with the settings above
training_experiment(
    suffix, hidden_dropout_probs, attn_dropout_probs, lrs, num_epochs,
    model_name, train_dataset, valid_dataset, test_dataset,
)

## F/T

In [None]:
# FRAGMENTED DATA (256)

# Run label and hyperparameter lists handed to training_experiment below
suffix = "mbti_ft_fragm_256"
num_epochs = 3
lrs = [3e-5, 2e-5]
hidden_dropout_probs = [0.1, 0.2]
attn_dropout_probs = [0.1, 0.2]

# Pretrained checkpoint and the tokenizer that goes with it
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name, do_lower_case=True)

# Select the 'text' inputs and 'F/T' labels for train/validation
X_train, X_valid, y_train, y_valid = create_partition_from_split(
    df_fragm_train, df_fragm_valid, 'text', 'F/T'
)
# Same columns for the test frame; the train halves of this call are discarded
_, X_test, _, y_test = create_partition_from_split(
    df_fragm_train, df_fragm_test, 'text', 'F/T'
)

# Tokenize and wrap as torch datasets (second call only keeps the test dataset)
train_dataset, valid_dataset = create_datasets(tokenizer, X_train, X_valid, y_train, y_valid)
_, test_dataset = create_datasets(tokenizer, X_train, X_test, y_train, y_test)

# Launch the training experiment with the settings above
training_experiment(
    suffix, hidden_dropout_probs, attn_dropout_probs, lrs, num_epochs,
    model_name, train_dataset, valid_dataset, test_dataset,
)

In [None]:
# FRAGMENTED DATA (256) + ROBERTA

# Run label and hyperparameter lists handed to training_experiment below
suffix = "mbti_ft_fragm_256_roberta"
num_epochs = 3
lrs = [3e-5, 2e-5]
hidden_dropout_probs = [0.1, 0.2]
attn_dropout_probs = [0.1, 0.2]

# Pretrained checkpoint and the tokenizer that goes with it
# NOTE(review): do_lower_case is a BERT-style tokenizer flag - confirm it has any effect for RoBERTa
model_name = "FacebookAI/roberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_name, do_lower_case=True)

# Select the 'text' inputs and 'F/T' labels for train/validation
X_train, X_valid, y_train, y_valid = create_partition_from_split(
    df_fragm_train, df_fragm_valid, 'text', 'F/T'
)
# Same columns for the test frame; the train halves of this call are discarded
_, X_test, _, y_test = create_partition_from_split(
    df_fragm_train, df_fragm_test, 'text', 'F/T'
)

# Tokenize and wrap as torch datasets (second call only keeps the test dataset)
train_dataset, valid_dataset = create_datasets(tokenizer, X_train, X_valid, y_train, y_valid)
_, test_dataset = create_datasets(tokenizer, X_train, X_test, y_train, y_test)

# Launch the training experiment with the settings above
training_experiment(
    suffix, hidden_dropout_probs, attn_dropout_probs, lrs, num_epochs,
    model_name, train_dataset, valid_dataset, test_dataset,
)

In [None]:
# MASKED + FRAGMENTED DATA (256)

# Run label and hyperparameter lists handed to training_experiment below
suffix = "masked_mbti_ft_fragm_256"
num_epochs = 3
lrs = [3e-5, 2e-5]
hidden_dropout_probs = [0.1, 0.2]
attn_dropout_probs = [0.1, 0.2]

# Pretrained checkpoint and the tokenizer that goes with it
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name, do_lower_case=True)

# Select the 'masked_text_bert' inputs and 'F/T' labels for train/validation
X_train, X_valid, y_train, y_valid = create_partition_from_split(
    df_fragm_train, df_fragm_valid, 'masked_text_bert', 'F/T'
)
# Same columns for the test frame; the train halves of this call are discarded
_, X_test, _, y_test = create_partition_from_split(
    df_fragm_train, df_fragm_test, 'masked_text_bert', 'F/T'
)

# Tokenize and wrap as torch datasets (second call only keeps the test dataset)
train_dataset, valid_dataset = create_datasets(tokenizer, X_train, X_valid, y_train, y_valid)
_, test_dataset = create_datasets(tokenizer, X_train, X_test, y_train, y_test)

# Launch the training experiment with the settings above
training_experiment(
    suffix, hidden_dropout_probs, attn_dropout_probs, lrs, num_epochs,
    model_name, train_dataset, valid_dataset, test_dataset,
)

In [None]:
# MASKED + FRAGMENTED DATA (256) + ROBERTA

# Run label and hyperparameter lists handed to training_experiment below
suffix = "masked_mbti_ft_fragm_256_roberta"
num_epochs = 3
lrs = [3e-5, 2e-5]
hidden_dropout_probs = [0.1, 0.2]
attn_dropout_probs = [0.1, 0.2]

# Pretrained checkpoint and the tokenizer that goes with it
# NOTE(review): do_lower_case is a BERT-style tokenizer flag - confirm it has any effect for RoBERTa
model_name = "FacebookAI/roberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_name, do_lower_case=True)

# Select the 'masked_text_roberta' inputs and 'F/T' labels for train/validation
X_train, X_valid, y_train, y_valid = create_partition_from_split(
    df_fragm_train, df_fragm_valid, 'masked_text_roberta', 'F/T'
)
# Same columns for the test frame; the train halves of this call are discarded
_, X_test, _, y_test = create_partition_from_split(
    df_fragm_train, df_fragm_test, 'masked_text_roberta', 'F/T'
)

# Tokenize and wrap as torch datasets (second call only keeps the test dataset)
train_dataset, valid_dataset = create_datasets(tokenizer, X_train, X_valid, y_train, y_valid)
_, test_dataset = create_datasets(tokenizer, X_train, X_test, y_train, y_test)

# Launch the training experiment with the settings above
training_experiment(
    suffix, hidden_dropout_probs, attn_dropout_probs, lrs, num_epochs,
    model_name, train_dataset, valid_dataset, test_dataset,
)

In [None]:
# RANDOM MASKED + FRAGMENTED DATA (256)

# Run label and hyperparameter lists handed to training_experiment below
suffix = "rand_masked_mbti_ft_fragm_256"
num_epochs = 3
lrs = [3e-5, 2e-5]
hidden_dropout_probs = [0.1, 0.2]
attn_dropout_probs = [0.1, 0.2]

# Pretrained checkpoint and the tokenizer that goes with it
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name, do_lower_case=True)

# Select the 'random_masked_text_bert' inputs and 'F/T' labels for train/validation
X_train, X_valid, y_train, y_valid = create_partition_from_split(
    df_fragm_train, df_fragm_valid, 'random_masked_text_bert', 'F/T'
)
# Same columns for the test frame; the train halves of this call are discarded
_, X_test, _, y_test = create_partition_from_split(
    df_fragm_train, df_fragm_test, 'random_masked_text_bert', 'F/T'
)

# Tokenize and wrap as torch datasets (second call only keeps the test dataset)
train_dataset, valid_dataset = create_datasets(tokenizer, X_train, X_valid, y_train, y_valid)
_, test_dataset = create_datasets(tokenizer, X_train, X_test, y_train, y_test)

# Launch the training experiment with the settings above
training_experiment(
    suffix, hidden_dropout_probs, attn_dropout_probs, lrs, num_epochs,
    model_name, train_dataset, valid_dataset, test_dataset,
)

In [None]:
# RANDOM MASKED + FRAGMENTED DATA (256) + ROBERTA

# Run label and hyperparameter lists handed to training_experiment below
suffix = "rand_masked_mbti_ft_fragm_256_roberta"
num_epochs = 3
lrs = [3e-5, 2e-5]
hidden_dropout_probs = [0.1, 0.2]
attn_dropout_probs = [0.1, 0.2]

# Pretrained checkpoint and the tokenizer that goes with it
# NOTE(review): do_lower_case is a BERT-style tokenizer flag - confirm it has any effect for RoBERTa
model_name = "FacebookAI/roberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_name, do_lower_case=True)

# Select the 'random_masked_text_roberta' inputs and 'F/T' labels for train/validation
X_train, X_valid, y_train, y_valid = create_partition_from_split(
    df_fragm_train, df_fragm_valid, 'random_masked_text_roberta', 'F/T'
)
# Same columns for the test frame; the train halves of this call are discarded
_, X_test, _, y_test = create_partition_from_split(
    df_fragm_train, df_fragm_test, 'random_masked_text_roberta', 'F/T'
)

# Tokenize and wrap as torch datasets (second call only keeps the test dataset)
train_dataset, valid_dataset = create_datasets(tokenizer, X_train, X_valid, y_train, y_valid)
_, test_dataset = create_datasets(tokenizer, X_train, X_test, y_train, y_test)

# Launch the training experiment with the settings above
training_experiment(
    suffix, hidden_dropout_probs, attn_dropout_probs, lrs, num_epochs,
    model_name, train_dataset, valid_dataset, test_dataset,
)

## J/P

In [None]:
# FRAGMENTED DATA (256)

# Run label and hyperparameter lists handed to training_experiment below
suffix = "mbti_jp_fragm_256"
num_epochs = 3
lrs = [3e-5, 2e-5]
hidden_dropout_probs = [0.1, 0.2]
attn_dropout_probs = [0.1, 0.2]

# Pretrained checkpoint and the tokenizer that goes with it
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name, do_lower_case=True)

# Select the 'text' inputs and 'J/P' labels for train/validation
X_train, X_valid, y_train, y_valid = create_partition_from_split(
    df_fragm_train, df_fragm_valid, 'text', 'J/P'
)
# Same columns for the test frame; the train halves of this call are discarded
_, X_test, _, y_test = create_partition_from_split(
    df_fragm_train, df_fragm_test, 'text', 'J/P'
)

# Tokenize and wrap as torch datasets (second call only keeps the test dataset)
train_dataset, valid_dataset = create_datasets(tokenizer, X_train, X_valid, y_train, y_valid)
_, test_dataset = create_datasets(tokenizer, X_train, X_test, y_train, y_test)

# Launch the training experiment with the settings above
training_experiment(
    suffix, hidden_dropout_probs, attn_dropout_probs, lrs, num_epochs,
    model_name, train_dataset, valid_dataset, test_dataset,
)

In [None]:
# FRAGMENTED DATA (256) + ROBERTA

# Run label and hyperparameter lists handed to training_experiment below
suffix = "mbti_jp_fragm_256_roberta"
num_epochs = 3
lrs = [3e-5, 2e-5]
hidden_dropout_probs = [0.1, 0.2]
attn_dropout_probs = [0.1, 0.2]

# Pretrained checkpoint and the tokenizer that goes with it
# NOTE(review): do_lower_case is a BERT-style tokenizer flag - confirm it has any effect for RoBERTa
model_name = "FacebookAI/roberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_name, do_lower_case=True)

# Select the 'text' inputs and 'J/P' labels for train/validation
X_train, X_valid, y_train, y_valid = create_partition_from_split(
    df_fragm_train, df_fragm_valid, 'text', 'J/P'
)
# Same columns for the test frame; the train halves of this call are discarded
_, X_test, _, y_test = create_partition_from_split(
    df_fragm_train, df_fragm_test, 'text', 'J/P'
)

# Tokenize and wrap as torch datasets (second call only keeps the test dataset)
train_dataset, valid_dataset = create_datasets(tokenizer, X_train, X_valid, y_train, y_valid)
_, test_dataset = create_datasets(tokenizer, X_train, X_test, y_train, y_test)

# Launch the training experiment with the settings above
training_experiment(
    suffix, hidden_dropout_probs, attn_dropout_probs, lrs, num_epochs,
    model_name, train_dataset, valid_dataset, test_dataset,
)

In [None]:
# MASKED + FRAGMENTED DATA (256)

# Define model name
model_name = "bert-base-uncased"

# Define tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name, do_lower_case=True)

# Create datasets: 'masked_text_bert' inputs with 'J/P' labels
X_train, X_valid, y_train, y_valid = create_partition_from_split(df_fragm_train, df_fragm_valid,
                                                                 'masked_text_bert',
                                                                 'J/P')
# Only the test halves of this second partition are kept
_, X_test, _, y_test = create_partition_from_split(df_fragm_train, df_fragm_test, 'masked_text_bert', 'J/P')

train_dataset, valid_dataset = create_datasets(tokenizer, X_train, X_valid, y_train, y_valid)
_, test_dataset = create_datasets(tokenizer, X_train, X_test, y_train, y_test)

# Launch training experiment
suffix = "masked_mbti_jp_fragm_256"
hidden_dropout_probs = [0.1, 0.2]
attn_dropout_probs = [0.1, 0.2]
lrs = [3e-5, 2e-5]
# Set explicitly so the cell does not depend on a value left behind by another cell
num_epochs = 3
training_experiment(suffix, hidden_dropout_probs, attn_dropout_probs, lrs, num_epochs,
                    model_name, train_dataset, valid_dataset, test_dataset)

In [None]:
# MASKED + FRAGMENTED DATA (256) + ROBERTA

# Run label and hyperparameter lists handed to training_experiment below
suffix = "masked_mbti_jp_fragm_256_roberta"
num_epochs = 3
lrs = [3e-5, 2e-5]
hidden_dropout_probs = [0.1, 0.2]
attn_dropout_probs = [0.1, 0.2]

# Pretrained checkpoint and the tokenizer that goes with it
# NOTE(review): do_lower_case is a BERT-style tokenizer flag - confirm it has any effect for RoBERTa
model_name = "FacebookAI/roberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_name, do_lower_case=True)

# Select the 'masked_text_roberta' inputs and 'J/P' labels for train/validation
X_train, X_valid, y_train, y_valid = create_partition_from_split(
    df_fragm_train, df_fragm_valid, 'masked_text_roberta', 'J/P'
)
# Same columns for the test frame; the train halves of this call are discarded
_, X_test, _, y_test = create_partition_from_split(
    df_fragm_train, df_fragm_test, 'masked_text_roberta', 'J/P'
)

# Tokenize and wrap as torch datasets (second call only keeps the test dataset)
train_dataset, valid_dataset = create_datasets(tokenizer, X_train, X_valid, y_train, y_valid)
_, test_dataset = create_datasets(tokenizer, X_train, X_test, y_train, y_test)

# Launch the training experiment with the settings above
training_experiment(
    suffix, hidden_dropout_probs, attn_dropout_probs, lrs, num_epochs,
    model_name, train_dataset, valid_dataset, test_dataset,
)

In [None]:
# RANDOM MASKED + FRAGMENTED DATA (256)

# Run label and hyperparameter lists handed to training_experiment below
suffix = "rand_masked_mbti_jp_fragm_256"
num_epochs = 3
lrs = [3e-5, 2e-5]
hidden_dropout_probs = [0.1, 0.2]
attn_dropout_probs = [0.1, 0.2]

# Pretrained checkpoint and the tokenizer that goes with it
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name, do_lower_case=True)

# Select the 'random_masked_text_bert' inputs and 'J/P' labels for train/validation
X_train, X_valid, y_train, y_valid = create_partition_from_split(
    df_fragm_train, df_fragm_valid, 'random_masked_text_bert', 'J/P'
)
# Same columns for the test frame; the train halves of this call are discarded
_, X_test, _, y_test = create_partition_from_split(
    df_fragm_train, df_fragm_test, 'random_masked_text_bert', 'J/P'
)

# Tokenize and wrap as torch datasets (second call only keeps the test dataset)
train_dataset, valid_dataset = create_datasets(tokenizer, X_train, X_valid, y_train, y_valid)
_, test_dataset = create_datasets(tokenizer, X_train, X_test, y_train, y_test)

# Launch the training experiment with the settings above
training_experiment(
    suffix, hidden_dropout_probs, attn_dropout_probs, lrs, num_epochs,
    model_name, train_dataset, valid_dataset, test_dataset,
)

In [None]:
# RANDOM MASKED + FRAGMENTED DATA (256) + ROBERTA

# Run label and hyperparameter lists handed to training_experiment below
suffix = "rand_masked_mbti_jp_fragm_256_roberta"
num_epochs = 3
lrs = [3e-5, 2e-5]
hidden_dropout_probs = [0.1, 0.2]
attn_dropout_probs = [0.1, 0.2]

# Pretrained checkpoint and the tokenizer that goes with it
# NOTE(review): do_lower_case is a BERT-style tokenizer flag - confirm it has any effect for RoBERTa
model_name = "FacebookAI/roberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_name, do_lower_case=True)

# Select the 'random_masked_text_roberta' inputs and 'J/P' labels for train/validation
X_train, X_valid, y_train, y_valid = create_partition_from_split(
    df_fragm_train, df_fragm_valid, 'random_masked_text_roberta', 'J/P'
)
# Same columns for the test frame; the train halves of this call are discarded
_, X_test, _, y_test = create_partition_from_split(
    df_fragm_train, df_fragm_test, 'random_masked_text_roberta', 'J/P'
)

# Tokenize and wrap as torch datasets (second call only keeps the test dataset)
train_dataset, valid_dataset = create_datasets(tokenizer, X_train, X_valid, y_train, y_valid)
_, test_dataset = create_datasets(tokenizer, X_train, X_test, y_train, y_test)

# Launch the training experiment with the settings above
training_experiment(
    suffix, hidden_dropout_probs, attn_dropout_probs, lrs, num_epochs,
    model_name, train_dataset, valid_dataset, test_dataset,
)