# Preparing the dataset

Pulling manual data classification and placing it into train-validate datasets

In [34]:
import pandas as pd

df_chi = pd.read_csv('../data/chi.csv')

In [35]:
df_chi['selftext'] = df_chi['selftext'].fillna('')
df_chi['text'] = df_chi['title'] + '\n' + df_chi['selftext']

In [36]:
manual_bool = ~df_chi.loc[:, 'negative'].str.startswith('0.')
df_manual = df_chi.loc[manual_bool]

In [37]:
df_manual.loc[:, ['tone', 'emotion', 'theme']].describe()

Unnamed: 0,tone,emotion,theme
count,26,26,26
unique,3,5,5
top,neutral,fear,question
freq,16,19,16


In [38]:
tone_labels = ['negative', 'neutral', 'positive']
emotion_labels = ['sadness', 'joy', 'love', 'anger', 'fear', 'surprise']
theme_labels = ['clinical update', 'community', 'question', 'education', 'advocating', 'dissuading', 'other']

In [39]:
from sklearn.model_selection import train_test_split

def train_test_val_split(df, train_size, val_size, test_size, random_state=42):
    """
    Splits a pandas dataframe into training, validation, and test sets.

    Args:
    - df: pandas dataframe to split.
    - train_size: float between 0 and 1 indicating the proportion of the dataframe to include in the training set.
    - val_size: float between 0 and 1 indicating the proportion of the dataframe to include in the validation set.
    - test_size: float between 0 and 1 indicating the proportion of the dataframe to include in the test set.
    - random_state: int or None, optional (default=42). The seed used by the random number generator.

    Returns:
    - train_df: pandas dataframe containing the training set.
    - val_df: pandas dataframe containing the validation set.
    - test_df: pandas dataframe containing the test set.

    Raises:
    - AssertionError: if the sum of train_size, val_size, and test_size is not equal to 1.
    """

    assert train_size + val_size + test_size == 1, "Train, validation, and test sizes must add up to 1."
    
    # Split the dataframe into training and test sets
    train_df, test_df = train_test_split(df, test_size=test_size, random_state=random_state)
    
    # Calculate the size of the validation set relative to the original dataframe
    val_ratio = val_size / (1 - test_size)
    
    # Split the training set into training and validation sets
    train_df, val_df = train_test_split(train_df, test_size=val_ratio, random_state=random_state)
    
    return train_df, val_df, test_df

In [40]:
cols = ['id', 'created_utc', 'text', 'tone', 'emotion', 'theme']
train_df, val_df, test_df = train_test_val_split(df_manual.loc[:, cols], 0.8, 0.1, 0.1)

In [41]:
manual_path_train = "../data/fine-tune/chi_train.jsonl"
manual_path_validate = "../data/fine-tune/chi_validate.jsonl"
manual_path_test = "../data/fine-tune/chi_test.jsonl"

train_df.to_json(manual_path_train, orient="records", lines=True)
val_df.to_json(manual_path_validate, orient="records", lines=True)
test_df.to_json(manual_path_test, orient="records", lines=True)

In [42]:
from datasets import load_dataset

data_files = {
    'train': manual_path_train,
    'validate': manual_path_validate,
    'test': manual_path_test
}

ds = load_dataset("json", data_files=data_files)

Downloading and preparing dataset json/default to /Users/stefan/.cache/huggingface/datasets/json/default-d251bf710f7c6032/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4...


Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validate split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Dataset json downloaded and prepared to /Users/stefan/.cache/huggingface/datasets/json/default-d251bf710f7c6032/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [43]:
ds

DatasetDict({
    train: Dataset({
        features: ['id', 'created_utc', 'text', 'tone', 'emotion', 'theme'],
        num_rows: 20
    })
    validate: Dataset({
        features: ['id', 'created_utc', 'text', 'tone', 'emotion', 'theme'],
        num_rows: 3
    })
    test: Dataset({
        features: ['id', 'created_utc', 'text', 'tone', 'emotion', 'theme'],
        num_rows: 3
    })
})

In [44]:
from datasets import Dataset
def isolate_dataset(ds: Dataset, feature: str):
    cols = ds.column_names['train']
    col_keep = {'text', feature}
    
    ds_filter = ds.remove_columns(col_keep.symmetric_difference(cols))
    ds_filter = ds_filter.rename_column(feature, 'label')
    ds_filter = ds_filter.class_encode_column('label')

    return ds_filter

## Setup model and trainer

In [57]:
from transformers import AutoModelForSequenceClassification
from transformers import AutoTokenizer, AutoConfig
import torch

device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")

def init_model(model_path: str):
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    config = AutoConfig.from_pretrained(model_path)
    model = AutoModelForSequenceClassification.from_pretrained(model_path)

    return (tokenizer, config, model)

In [46]:
import numpy as np
import evaluate

metric = evaluate.load('accuracy')

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [47]:
def print_summary(result):
    print(f"Time: {result.metrics['train_runtime']:.2f}")
    print(f"Samples/second: {result.metrics['train_samples_per_second']:.2f}")

In [48]:
from transformers import Trainer, TrainingArguments, logging
from datasets import Dataset

logging.set_verbosity_error()

def setup_trainer(name: str, dataset: Dataset, model, tokenizer):
    logging_steps = len(dataset['train'])
    model_name = f"../fine-tuning-chkp/{name}"

    training_args = TrainingArguments(
        output_dir=model_name,
        num_train_epochs=2,
        learning_rate=2e-5,
        weight_decay=0.01,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        metric_for_best_model="accuracy",
        disable_tqdm=False,
        logging_steps=logging_steps,
        log_level="error",
        use_mps_device=True
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=dataset['train'],
        eval_dataset=dataset['validate'],
        compute_metrics=compute_metrics,
        tokenizer=tokenizer
    )

    return trainer

# Fine-tuning Tone

In [35]:
tone_tokenizer, tone_config, tone_model = init_model("cardiffnlp/twitter-roberta-base-sentiment-latest")

In [36]:
ds_tone = isolate_dataset(ds, 'tone')

ds_tone = ds_tone.map(
  lambda row: tone_tokenizer(row['text'], max_length=512, padding='max_length', truncation=True, return_tensors='pt'), 
  batched=True,
  remove_columns=['text']
)

ds_tone['train'].features

Casting to class labels:   0%|          | 0/20 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/3 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/3 [00:00<?, ? examples/s]

Map:   0%|          | 0/20 [00:00<?, ? examples/s]

Map:   0%|          | 0/3 [00:00<?, ? examples/s]

Map:   0%|          | 0/3 [00:00<?, ? examples/s]

{'text': Value(dtype='string', id=None),
 'label': ClassLabel(names=['negative', 'neutral', 'positive'], id=None),
 'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None),
 'attention_mask': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None)}

In [37]:
tone_trainer = setup_trainer('tone', dataset=ds_tone, model=tone_model, tokenizer=tone_tokenizer)

In [38]:
result = tone_trainer.train()
print_summary(result)

tone_trainer.save_model('../fine-tuning-final/tone')



  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.1915661096572876, 'eval_accuracy': 0.6666666666666666, 'eval_runtime': 0.0752, 'eval_samples_per_second': 39.888, 'eval_steps_per_second': 13.296, 'epoch': 1.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 0.9577839970588684, 'eval_accuracy': 0.6666666666666666, 'eval_runtime': 0.0736, 'eval_samples_per_second': 40.774, 'eval_steps_per_second': 13.591, 'epoch': 2.0}
{'train_runtime': 3.6712, 'train_samples_per_second': 10.896, 'train_steps_per_second': 1.634, 'train_loss': 0.49689579010009766, 'epoch': 2.0}
Time: 3.67
Samples/second: 10.90


# Emotion fine-tuning

In [39]:
emotion_tokenizer, emotion_config, emotion_model = init_model("bhadresh-savani/distilbert-base-uncased-emotion")


In [40]:
# emotion_labels = ['sadness', 'joy', 'love', 'anger', 'fear', 'surprise']

ds_emotion = isolate_dataset(ds, 'emotion')

ds_emotion = ds_emotion.map(
  lambda row: emotion_tokenizer(row['text'], max_length=512, padding='max_length', truncation=True, return_tensors='pt'), 
  batched=True,
  remove_columns=['text']
)

ds_emotion['train'].features

Casting to class labels:   0%|          | 0/20 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/3 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/3 [00:00<?, ? examples/s]

Map:   0%|          | 0/20 [00:00<?, ? examples/s]

Map:   0%|          | 0/3 [00:00<?, ? examples/s]

Map:   0%|          | 0/3 [00:00<?, ? examples/s]

{'text': Value(dtype='string', id=None),
 'label': ClassLabel(names=['fear', 'joy', 'sadness', 'surprise'], id=None),
 'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None),
 'attention_mask': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None)}

In [41]:
emotion_trainer = setup_trainer('emotion', dataset=ds_emotion, model=emotion_model, tokenizer=emotion_tokenizer)

In [42]:
result = emotion_trainer.train()
print_summary(result)

emotion_trainer.save_model('../fine-tuning-final/emotion')



  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 0.3068583309650421, 'eval_accuracy': 1.0, 'eval_runtime': 0.0651, 'eval_samples_per_second': 46.053, 'eval_steps_per_second': 15.351, 'epoch': 1.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 0.5239936113357544, 'eval_accuracy': 0.6666666666666666, 'eval_runtime': 0.048, 'eval_samples_per_second': 62.56, 'eval_steps_per_second': 20.853, 'epoch': 2.0}
{'train_runtime': 2.1537, 'train_samples_per_second': 18.572, 'train_steps_per_second': 2.786, 'train_loss': 2.852828025817871, 'epoch': 2.0}
Time: 2.15
Samples/second: 18.57


# Theme fine-tuning

In [53]:
feature = 'theme'
cols = ds.column_names['train']
col_keep = {'text', feature}

ds_theme = ds.remove_columns(col_keep.symmetric_difference(cols))
ds_theme = ds_theme.rename_column(feature, 'label')

In [60]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
import random


theme_tokenizer, theme_config, theme_model = init_model("facebook/bart-large-mnli")
# Linear(in_features=1024, out_features=3, bias=True)
# {0: 'contradiction', 1: 'neutral', 2: 'entailment'}

theme_labels = ['clinical update', 'community', 'question', 'education', 'advocating', 'dissuading', 'other']
num_labels = len(theme_labels)
template="This example is {}."

def create_input_sequence(sample):
    text = sample['text']
    label = sample['label'][0]
    contradiction_labels = theme_labels[:]
    label_idx = contradiction_labels.index(label)
    contradiction_labels.pop(label_idx)

    encoded_sequence = theme_tokenizer(
        text,
        [template.format(label)],
        # max_length=512,
        # padding='max_length', 
        truncation=True, 
        return_tensors='pt'
    )
    encoded_sequence['labels'] = [2]
    encoded_sequence['input_sentence'] = theme_tokenizer.batch_decode(encoded_sequence.input_ids)
    return encoded_sequence

ds_theme_encoded = ds_theme.map(
    create_input_sequence, 
    batched=True, 
    batch_size=1,
    remove_columns=["label", "text"]
)

ds_theme_encoded


Map:   0%|          | 0/20 [00:00<?, ? examples/s]

Map:   0%|          | 0/3 [00:00<?, ? examples/s]

Map:   0%|          | 0/3 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels', 'input_sentence'],
        num_rows: 20
    })
    validate: Dataset({
        features: ['input_ids', 'attention_mask', 'labels', 'input_sentence'],
        num_rows: 3
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels', 'input_sentence'],
        num_rows: 3
    })
})

In [61]:
ds_theme_encoded['train'][0]

{'input_ids': [0,
  500,
  1949,
  213,
  449,
  2013,
  116,
  50118,
  100,
  56,
  10,
  21431,
  24904,
  626,
  11,
  1824,
  4,
  1308,
  284,
  8,
  38,
  439,
  7,
  5,
  213,
  449,
  2013,
  1349,
  147,
  51,
  56,
  41,
  33638,
  8,
  28445,
  9668,
  4,
  2041,
  137,
  94,
  38,
  4024,
  10,
  213,
  449,
  2013,
  8,
  38,
  2145,
  38,
  1705,
  17,
  27,
  90,
  269,
  2842,
  5,
  1123,
  26965,
  8,
  20789,
  142,
  9,
  141,
  1359,
  127,
  124,
  16,
  6,
  53,
  961,
  1493,
  198,
  162,
  115,
  2842,
  24,
  95,
  2051,
  19,
  49,
  15145,
  18822,
  4,
  85,
  938,
  17,
  27,
  90,
  14,
  38,
  21,
  765,
  6,
  24,
  21,
  14,
  38,
  1705,
  17,
  27,
  90,
  20789,
  4,
  6233,
  1268,
  1493,
  655,
  2984,
  42,
  116,
  7698,
  47,
  3068,
  213,
  449,
  7870,
  114,
  47,
  17,
  27,
  548,
  56,
  42,
  1907,
  9,
  3012,
  116,
  2,
  2,
  713,
  1246,
  16,
  864,
  4,
  2],
 'attention_mask': [1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,


In [62]:
premise = ds_theme['train'][0]['text']
template= "This example is {}."
hypothesis = template.format(ds_theme['train'][0]['label'])

# run through model pre-trained on MNLI
x = theme_tokenizer(premise, hypothesis, 
                           truncation_strategy='only_first',
        return_tensors='pt')
x
# logits = theme_model(x.to(device))[0]

# # we throw away "neutral" (dim 1) and take the probability of
# # "entailment" (2) as the probability of the label being true 
# entail_contradiction_logits = logits[:,[0,2]]
# probs = entail_contradiction_logits.softmax(dim=1)
# prob_label_is_true = probs[:,1]
# prob_label_is_true



{'input_ids': tensor([[    0,   500,  1949,   213,   449,  2013,   116, 50118,   100,    56,
            10, 21431, 24904,   626,    11,  1824,     4,  1308,   284,     8,
            38,   439,     7,     5,   213,   449,  2013,  1349,   147,    51,
            56,    41, 33638,     8, 28445,  9668,     4,  2041,   137,    94,
            38,  4024,    10,   213,   449,  2013,     8,    38,  2145,    38,
          1705,    17,    27,    90,   269,  2842,     5,  1123, 26965,     8,
         20789,   142,     9,   141,  1359,   127,   124,    16,     6,    53,
           961,  1493,   198,   162,   115,  2842,    24,    95,  2051,    19,
            49, 15145, 18822,     4,    85,   938,    17,    27,    90,    14,
            38,    21,   765,     6,    24,    21,    14,    38,  1705,    17,
            27,    90, 20789,     4,  6233,  1268,  1493,   655,  2984,    42,
           116,  7698,    47,  3068,   213,   449,  7870,   114,    47,    17,
            27,   548,    56,    42,  

In [63]:
ds_theme_encoded['train'][0]

{'input_ids': [0,
  500,
  1949,
  213,
  449,
  2013,
  116,
  50118,
  100,
  56,
  10,
  21431,
  24904,
  626,
  11,
  1824,
  4,
  1308,
  284,
  8,
  38,
  439,
  7,
  5,
  213,
  449,
  2013,
  1349,
  147,
  51,
  56,
  41,
  33638,
  8,
  28445,
  9668,
  4,
  2041,
  137,
  94,
  38,
  4024,
  10,
  213,
  449,
  2013,
  8,
  38,
  2145,
  38,
  1705,
  17,
  27,
  90,
  269,
  2842,
  5,
  1123,
  26965,
  8,
  20789,
  142,
  9,
  141,
  1359,
  127,
  124,
  16,
  6,
  53,
  961,
  1493,
  198,
  162,
  115,
  2842,
  24,
  95,
  2051,
  19,
  49,
  15145,
  18822,
  4,
  85,
  938,
  17,
  27,
  90,
  14,
  38,
  21,
  765,
  6,
  24,
  21,
  14,
  38,
  1705,
  17,
  27,
  90,
  20789,
  4,
  6233,
  1268,
  1493,
  655,
  2984,
  42,
  116,
  7698,
  47,
  3068,
  213,
  449,
  7870,
  114,
  47,
  17,
  27,
  548,
  56,
  42,
  1907,
  9,
  3012,
  116,
  2,
  2,
  713,
  1246,
  16,
  864,
  4,
  2],
 'attention_mask': [1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,


In [64]:
theme_trainer = setup_trainer('theme', dataset=ds_theme_encoded, model=theme_model, tokenizer=theme_tokenizer)

In [65]:
result = theme_trainer.train()
print_summary(result)

theme_trainer.save_model('../fine-tuning-final/theme')



  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 2 dimensions. The detected shape was (2, 3) + inhomogeneous part.