# Preparing the dataset

Pull the manually classified posts and split them into training, validation, and test datasets.

In [1]:
import pandas as pd

# Source CSV, resolved relative to the notebook's working directory.
chi_csv_path = 'data/chi.csv'
df_chi = pd.read_csv(chi_csv_path)

In [2]:
# Missing 'selftext' values become empty strings so the concatenation
# below does not propagate NaN into 'text'.
selftext_filled = df_chi['selftext'].fillna('')
df_chi['selftext'] = selftext_filled

# 'text' is the title and the body separated by a single newline.
df_chi['text'] = df_chi['title'] + '\n' + selftext_filled

In [3]:
# Keep only the rows whose 'negative' value does NOT start with "0.".
# NOTE(review): presumably values starting with "0." are model-produced
# scores and everything else is a manual label -- confirm against the CSV.
negative_col = df_chi['negative']
manual_bool = ~negative_col.str.startswith('0.')
df_manual = df_chi[manual_bool]

In [4]:
# Summary (count / unique / top / freq) of the three label columns.
label_cols = ['tone', 'emotion', 'theme']
df_manual[label_cols].describe()

Unnamed: 0,tone,emotion,theme
count,26,26,26
unique,3,5,5
top,neutral,fear,question
freq,16,19,16


In [5]:
# Candidate label vocabularies, one list per annotated column.
tone_labels = [
    'negative',
    'neutral',
    'positive',
]
emotion_labels = [
    'sadness',
    'joy',
    'love',
    'anger',
    'fear',
    'surprise',
]
theme_labels = [
    'clinical update',
    'community',
    'question',
    'education',
    'advocating',
    'dissuading',
    'other',
]

In [6]:
from sklearn.model_selection import train_test_split

def train_test_val_split(df, train_size, val_size, test_size, random_state=42):
    """
    Splits a pandas dataframe into training, validation, and test sets.

    The split is done in two passes with ``train_test_split``: first the test
    set is carved out of ``df``, then the validation set is carved out of what
    remains.

    Args:
    - df: pandas dataframe to split.
    - train_size: float between 0 and 1 indicating the proportion of the dataframe to include in the training set.
    - val_size: float between 0 and 1 indicating the proportion of the dataframe to include in the validation set.
    - test_size: float between 0 and 1 indicating the proportion of the dataframe to include in the test set.
    - random_state: int or None, optional (default=42). The seed passed to both splitting passes.

    Returns:
    - train_df: pandas dataframe containing the training set.
    - val_df: pandas dataframe containing the validation set.
    - test_df: pandas dataframe containing the test set.

    Raises:
    - AssertionError: if the sum of train_size, val_size, and test_size is not equal to 1
      (compared with a small tolerance to absorb floating-point rounding).
    """
    import math

    # Exact `== 1` rejects legitimate inputs such as 0.7 + 0.2 + 0.1, which
    # evaluates to 0.9999999999999999 in binary floating point. The check is
    # raised explicitly (rather than via `assert`) so it also runs under -O.
    total = train_size + val_size + test_size
    if not math.isclose(total, 1.0, rel_tol=0.0, abs_tol=1e-9):
        raise AssertionError("Train, validation, and test sizes must add up to 1.")

    # First pass: hold out the test set.
    train_df, test_df = train_test_split(df, test_size=test_size, random_state=random_state)

    # The second pass operates on the rows left after removing the test set,
    # so the validation fraction must be rescaled relative to that remainder.
    val_ratio = val_size / (1 - test_size)

    # Second pass: split the remainder into training and validation sets.
    train_df, val_df = train_test_split(train_df, test_size=val_ratio, random_state=random_state)

    return train_df, val_df, test_df

In [7]:
# Columns carried into the fine-tuning files.
cols = ['id', 'created_utc', 'text', 'tone', 'emotion', 'theme']

# 80% train / 10% validation / 10% test of the manually labelled rows.
train_df, val_df, test_df = train_test_val_split(
    df_manual[cols],
    train_size=0.8,
    val_size=0.1,
    test_size=0.1,
)

In [8]:
import os

manual_path_train = "data/fine-tune/chi_train.jsonl"
manual_path_validate = "data/fine-tune/chi_validate.jsonl"
manual_path_test = "data/fine-tune/chi_test.jsonl"

# Write each split as JSON Lines (one record per line). The target directory
# is created first so the cell also works on a fresh checkout where
# data/fine-tune/ does not exist yet.
for split_df, split_path in (
    (train_df, manual_path_train),
    (val_df, manual_path_validate),
    (test_df, manual_path_test),
):
    parent_dir = os.path.dirname(split_path)
    if parent_dir:  # os.makedirs('') would raise for a bare filename
        os.makedirs(parent_dir, exist_ok=True)
    split_df.to_json(split_path, orient="records", lines=True)

In [9]:
from datasets import load_dataset

# Map each split name to the JSONL file written for it.
data_files = dict(
    train=manual_path_train,
    validate=manual_path_validate,
    test=manual_path_test,
)

# Load the three files with the "json" builder; `ds` is indexed by the split names above.
ds = load_dataset("json", data_files=data_files)

Downloading and preparing dataset json/default to /Users/stefan/.cache/huggingface/datasets/json/default-12827ee0da3e1b2b/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4...


Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validate split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Dataset json downloaded and prepared to /Users/stefan/.cache/huggingface/datasets/json/default-12827ee0da3e1b2b/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [10]:
# Display the loaded splits (split names, columns, and row counts).
ds

DatasetDict({
    train: Dataset({
        features: ['id', 'created_utc', 'text', 'tone', 'emotion', 'theme'],
        num_rows: 20
    })
    validate: Dataset({
        features: ['id', 'created_utc', 'text', 'tone', 'emotion', 'theme'],
        num_rows: 3
    })
    test: Dataset({
        features: ['id', 'created_utc', 'text', 'tone', 'emotion', 'theme'],
        num_rows: 3
    })
})

In [11]:
# Inspect the first training record as a sanity check of the fields.
ds['train'][0]

{'id': '13vs887',
 'created_utc': '5/30/23 14:44',
 'text': 'Ride go kart?\nI had a spinal fusion done in 2010. My family and I went to the go kart track where they had an arcade and amusement rides. Year before last I drove a go kart and I remember I couldn’t really touch the gas pedal and bend because of how straight my back is, but everyone else around me could touch it just fine with their knees bent. It wasn’t that I was short, it was that I couldn’t bend. Has anyone else ever experienced this? Should you ride go karts if you’ve had this type of surgery?',
 'tone': 'neutral',
 'emotion': 'surprise',
 'theme': 'question'}

In [12]:
from transformers import AutoConfig

# Checkpoint identifier reused below for the config, tokenizer and model.
model_checkpoint = "facebook/bart-large-mnli"
# Load only the configuration (no weights) to inspect the label mapping.
config = AutoConfig.from_pretrained(model_checkpoint)

# Show the id -> label-name mapping the checkpoint was trained with.
config.id2label


{0: 'contradiction', 1: 'neutral', 2: 'entailment'}

In [13]:
from datasets import ClassLabel

# Build the label names in ascending id order so that ClassLabel index i is
# guaranteed to correspond to config.id2label[i]. Relying on the dict's
# iteration order would silently misalign labels if the mapping were ever
# stored out of order.
label_names = [config.id2label[label_id] for label_id in sorted(config.id2label)]
class_labels = ClassLabel(num_classes=config.num_labels, names=label_names)
class_labels

ClassLabel(names=['contradiction', 'neutral', 'entailment'], id=None)

In [14]:
import os

import torch
from datasets import Dataset
from datasets import DatasetDict
from transformers import AutoTokenizer

# Use the MPS backend when PyTorch reports it as available, otherwise the CPU.
if torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")

# Must be set before the tokenizer is created/used.
os.environ["TOKENIZERS_PARALLELISM"] = "True"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)

# Candidate theme labels and the hypothesis template they are substituted into.
theme_labels = [
    'clinical update',
    'community',
    'question',
    'education',
    'advocating',
    'dissuading',
    'other',
]
num_labels = len(theme_labels)
template = "This example is {}."

def preprocess_func(row, feature='theme'):
    """Turn a batch of raw rows into premise/hypothesis columns.

    Meant to be used with ``map(..., batched=True)``, so every value in
    ``row`` is a list with one entry per example.

    Args:
    - row: batch dict containing at least ``'text'`` and ``feature``.
    - feature: name of the label column to build hypotheses from
      (default ``'theme'``, the previously hard-coded column).

    Returns:
    - dict with ``'premise'`` (the texts, unchanged) and ``'hypothesis'``
      (the template filled with each example's own label).
    """
    return {
        'premise': row['text'],
        'hypothesis': [template.format(label) for label in row[feature]],
    }


def encode_func(row, feature='theme', labels=None):
    """Expand each example into one tokenized NLI pair per candidate label.

    For every example in the batch the premise is paired with the hypothesis
    for its true label first (target 2) followed by the hypotheses for all
    other candidate labels in ``labels`` order (target 0). Works for any batch
    size; for a batch of one it produces exactly the same rows as before.

    Args:
    - row: batch dict containing ``'premise'`` and ``feature`` lists.
    - feature: name of the label column (default ``'theme'``).
    - labels: candidate label names; defaults to the module-level
      ``theme_labels``.

    Returns:
    - the tokenizer's batch encoding with two extra keys: ``'label'``
      (list of 2/0 targets) and ``'input_sentence'`` (decoded inputs, useful
      for eyeballing the pairs).

    Raises:
    - ValueError: if an example's label is not one of ``labels``.
    """
    candidate_labels = theme_labels if labels is None else list(labels)

    # Target ids; assumes class_labels maps 2 -> entailment and
    # 0 -> contradiction, as facebook/bart-large-mnli's id2label does.
    entailment_id, contradiction_id = 2, 0

    premises, hypotheses, targets = [], [], []
    for premise, true_label in zip(row['premise'], row[feature]):
        if true_label not in candidate_labels:
            raise ValueError(
                f"Label {true_label!r} in column {feature!r} is not one of {candidate_labels!r}"
            )
        # True label first, then every other candidate in its original order.
        ordered = [true_label] + [x for x in candidate_labels if x != true_label]
        premises.extend([premise] * len(ordered))
        hypotheses.extend(template.format(x) for x in ordered)
        targets.extend([entailment_id] + [contradiction_id] * (len(ordered) - 1))

    encoded_input = tokenizer(
        premises,
        hypotheses,
        max_length=512,
        padding="max_length",
        truncation=True
    )
    encoded_input['label'] = targets
    encoded_input['input_sentence'] = tokenizer.batch_decode(encoded_input.input_ids)

    return encoded_input


def isolate_dataset(ds: DatasetDict, feature: str, labels=None) -> DatasetDict:
    """Build the tokenized NLI dataset for a single label column.

    Keeps only ``'text'`` and ``feature``, converts them to premise/hypothesis
    form, expands every example into one pair per candidate label, and casts
    the resulting ``'label'`` column to the module-level ``class_labels``.

    Args:
    - ds: dataset dict with a ``'train'`` split (its column names are used to
      decide what to drop).
    - feature: name of the label column to keep.
    - labels: candidate label names for ``feature``. May be omitted only for
      ``feature='theme'``, in which case ``theme_labels`` is used.

    Returns:
    - dataset dict with columns ``input_ids``, ``attention_mask``, ``label``
      and ``input_sentence``.

    Raises:
    - ValueError: if ``feature`` is not a column of ``ds``, or if ``labels``
      is omitted for a feature other than ``'theme'``.
    """
    cols = ds.column_names['train']
    if feature not in cols:
        raise ValueError(f"Column {feature!r} not found in dataset columns {cols!r}")
    if labels is None:
        if feature != 'theme':
            raise ValueError(
                f"labels must be provided for feature {feature!r}; "
                "only 'theme' has a default label list"
            )
        labels = theme_labels
    labels = list(labels)

    # Drop everything except the text and the chosen label column (as a list,
    # preserving the dataset's column order).
    col_keep = {'text', feature}
    ds_filter = ds.remove_columns([c for c in cols if c not in col_keep])

    ds_filter = ds_filter.map(
        preprocess_func,
        batched=True,
        remove_columns=['text'],
        fn_kwargs={'feature': feature},
    )
    # encode_func returns more rows than it receives, so every input column
    # has to be removed for the batch lengths to stay consistent.
    ds_filter = ds_filter.map(
        encode_func,
        batched=True,
        remove_columns=[feature, 'premise', 'hypothesis'],
        fn_kwargs={'feature': feature, 'labels': labels},
    )
    ds_filter = ds_filter.cast_column('label', class_labels)

    return ds_filter

# Build the tokenized premise/hypothesis dataset for the 'theme' column.
feature = "theme"
ds_theme = isolate_dataset(ds, feature=feature)

# Display the resulting splits.
ds_theme

Map:   0%|          | 0/20 [00:00<?, ? examples/s]

Map:   0%|          | 0/3 [00:00<?, ? examples/s]

Map:   0%|          | 0/3 [00:00<?, ? examples/s]

Map:   0%|          | 0/20 [00:00<?, ? examples/s]

Map:   0%|          | 0/3 [00:00<?, ? examples/s]

Map:   0%|          | 0/3 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/140 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/21 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/21 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'label', 'input_sentence'],
        num_rows: 140
    })
    validate: Dataset({
        features: ['input_ids', 'attention_mask', 'label', 'input_sentence'],
        num_rows: 21
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'label', 'input_sentence'],
        num_rows: 21
    })
})

In [15]:
ds_theme['train'].features

{'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None),
 'attention_mask': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None),
 'label': ClassLabel(names=['contradiction', 'neutral', 'entailment'], id=None),
 'input_sentence': Value(dtype='string', id=None)}

In [16]:
# Drop the debugging column and rename 'label' to 'labels', the keyword
# the model call in the training loop receives via **batch.
ds_encoded = (
    ds_theme
    .remove_columns(["input_sentence"])
    .rename_column("label", "labels")
)
# Return torch tensors when indexing the dataset.
ds_encoded.set_format("torch")

In [17]:
from torch.utils.data import DataLoader

# Training batches are reshuffled each epoch; validation order is left as-is.
train_dataloader = DataLoader(
    ds_encoded['train'],
    batch_size=8,
    shuffle=True,
)
eval_dataloader = DataLoader(
    ds_encoded['validate'],
    batch_size=8,
)

In [18]:
from transformers import AutoModelForSequenceClassification
import torch

# Use the MPS backend when PyTorch reports it as available, otherwise the CPU.
if torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")

# Load the sequence-classification model with a 3-way head and move it to the device.
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=3)
model = model.to(device)

In [19]:
from torch.optim import AdamW

# AdamW over all model parameters with a fixed initial learning rate.
optimizer = AdamW(params=model.parameters(), lr=5e-5)

In [20]:
from transformers import get_scheduler

num_epochs = 3
# One scheduler step is taken per batch, so the schedule spans every batch of every epoch.
num_training_steps = num_epochs * len(train_dataloader)

# Linear schedule with no warmup, stepped once per optimizer step.
lr_scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

In [51]:
from tqdm.auto import tqdm

# One tick per optimizer step.
progress_bar = tqdm(range(num_training_steps))

model.train()
for epoch in range(num_epochs):
    for batch in train_dataloader:
        # Move every tensor of the batch to the model's device.
        batch = {name: tensor.to(device) for name, tensor in batch.items()}

        # The batch contains 'labels', so the forward pass also returns the loss.
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()

        # Apply the update, advance the schedule, then clear gradients for the next step.
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()

        progress_bar.update(1)

  0%|          | 0/54 [00:00<?, ?it/s]

In [21]:
import evaluate

# Accuracy is computed per (premise, hypothesis) pair, not per original post.
metric = evaluate.load("accuracy")

model.eval()
for batch in eval_dataloader:
    batch = {name: tensor.to(device) for name, tensor in batch.items()}

    # No gradients are needed for evaluation.
    with torch.no_grad():
        outputs = model(**batch)

    logits = outputs.logits
    predictions = logits.argmax(dim=-1)
    metric.add_batch(predictions=predictions, references=batch["labels"])

metric.compute()

{'accuracy': 0.14285714285714285}