In [24]:
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
import torch

import numpy as np
from sklearn.metrics import balanced_accuracy_score, precision_recall_fscore_support, accuracy_score, classification_report
import pandas as pd
from sklearn.model_selection import train_test_split

In [25]:
PATH = '/Users/smutnuri/Documents/UVA/_Projects/Fall24/NLP/NLPRepo/data/manual/all_data.csv'

In [26]:
df = pd.read_csv(PATH)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 60 entries, 0 to 59
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Unnamed: 0    60 non-null     int64 
 1   video_id      60 non-null     object
 2   comment       60 non-null     object
 3   username      60 non-null     object
 4   comment_date  60 non-null     object
 5   Unnamed: 5    30 non-null     object
 6   leaning       30 non-null     object
dtypes: int64(1), object(6)
memory usage: 3.4+ KB


In [27]:
short_df = df.drop(['Unnamed: 0', 'video_id', 'username', 'comment_date', 'Unnamed: 5'], axis=1)
short_df.head()

Unnamed: 0,comment,leaning
0,Trump accidentally revealed himself in his so-...,left
1,Didn't Trump say he would debate Biden in Sept...,right
2,When you hire just to fill a box and not hire ...,right
3,Good question. As long as you’re investigating...,right
4,"I think over the last month, the independent-l...",neutral


In [28]:
# Hypotheses
categories = ["left", "right", "neutral"]

# Expanding the DataFrame to include hypotheses
expanded_rows = []
for _, row in short_df.iterrows():
    for category in categories:
        expanded_rows.append({
            "comment": row["comment"],
            "hypothesis": f"This text leans {category}",
            "label": 1 if row["leaning"] == category else 0
        })

# Create the new DataFrame
expanded_df = pd.DataFrame(expanded_rows)

In [29]:
train_df, validate_df = train_test_split(expanded_df, test_size=0.3, random_state=42)

In [30]:
modname = "mlburnham/Political_DEBATE_large_v1.0"
training_directory ='few_shot' # this is where the trained model will be saved. You can rename it to anything.

device = "mps" if torch.backends.mps.is_available() else "cpu" 
print(f"Device: {device}")

Device: mps


In [31]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 126 entries, 46 to 102
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   comment     126 non-null    object
 1   hypothesis  126 non-null    object
 2   label       126 non-null    int64 
dtypes: int64(1), object(2)
memory usage: 3.9+ KB


In [32]:
val_ds = Dataset.from_pandas(train_df)
fs_ds = Dataset.from_pandas(validate_df)
ds = DatasetDict()

ds['few_shot'] = fs_ds
ds['validate'] = val_ds

In [33]:
# import the tokenizer using the modname variable we defined above
tokenizer = AutoTokenizer.from_pretrained(modname)

# define a generic tokenizing function
# padding will add empty tokens to the end of documents to make all documents the same length. This is generally required for passing documents through the model.
# Truncation will cut off any portion of the document longer than the models maximum accepted length.
def tokenize_function(docs):
    return tokenizer(docs['comment'], docs['hypothesis'], padding = 'max_length', truncation = True)

# Now we tokenize the dataset by applying padding, truncation, and converting each document to a tensor of numbers.
dstok = ds.map(tokenize_function)

Map:   0%|          | 0/54 [00:00<?, ? examples/s]

Map:   0%|          | 0/126 [00:00<?, ? examples/s]

In [34]:
model = AutoModelForSequenceClassification.from_pretrained(modname, num_labels = 2, ignore_mismatched_sizes=True, id2label = {0:'entailment', 1:'not_entailment'})

In [35]:
# this function will be used to calculate performance metrics during training. You can pass a different custom function, but this is a good default set of metrics
def compute_metrics(eval_pred, label_text_alphabetical=list(model.config.id2label.values())):
    # Extract labels
    labels = eval_pred.label_ids
    pred_logits = eval_pred.predictions
    preds_max = np.argmax(pred_logits, axis=1)

    # Compute the metrics
    precision_macro, recall_macro, f1_macro, _ = precision_recall_fscore_support(labels, preds_max, average='macro')
    precision_micro, recall_micro, f1_micro, _ = precision_recall_fscore_support(labels, preds_max, average='micro')
    acc_balanced = balanced_accuracy_score(labels, preds_max)
    acc_not_balanced = accuracy_score(labels, preds_max)

    # Pass computed metrics to a dictionary for printing
    metrics = {'f1_macro': f1_macro,
            'f1_micro': f1_micro,
            'accuracy_balanced': acc_balanced,
            'accuracy': acc_not_balanced,
            'precision_macro': precision_macro,
            'recall_macro': recall_macro,
            'precision_micro': precision_micro,
            'recall_micro': recall_micro,
            }

    # Print results
    print("Aggregate metrics: ", {key: metrics[key] for key in metrics if key not in ["label_gold_raw", "label_predicted_raw"]} )
    print("Detailed metrics: ", classification_report(
        labels, preds_max, labels=np.sort(pd.factorize(label_text_alphabetical, sort=True)[0]),
        target_names=label_text_alphabetical, sample_weight=None,
        digits=2, output_dict=True, zero_division='warn'),
    "\n")

    return metrics

In [37]:
training_args = TrainingArguments(output_dir=training_directory,
    logging_dir=f'{training_directory}/logs',
    lr_scheduler_type= "linear",
    group_by_length=False,
    report_to='none', # change this if you're using a library like weights & biases to track model training
    learning_rate = 9e-6,# use this learning rate for the large model
    #learning_rate = 2e-5, # use this learning rate for the small model

    # batch size controls how many documents are passed through the model at once. Higher batch sizes train faster but demand more memory. lower the batch size if you are running out of memory
    per_device_train_batch_size = 2, # A smaller traning batch size is generally better for few-shot learning. This means the model will learn more from each training example.
    per_device_eval_batch_size = 16, # This just determines how fast the model will go through documents during the evaluation phase
    gradient_accumulation_steps = 1,

    num_train_epochs=5, # number of times to pass the entire training set through the model. 3-5 is generally good for few-shot training.
    warmup_ratio=0.06,  # warmup length before learning rate scheduler kicks in
    weight_decay=0.01, # weight regularization

    fp16=False,   # the data type that the model's weights are stored in. fp16 stands for floating point 16 and will make the model much smaller and faster, but can have a slight effect on performance.
    fp16_full_eval=False,

    # eval strategy defines how often the model evaluates performance on the valiation set. In a few-shot context we assume there is no validation set.
    eval_strategy="no",
    seed=1,

    # save_strategy determines how frequently a checkpoint of the model is saved. Change to 'epoch' for saving after each epoch.
    save_strategy="no",
    dataloader_num_workers = 4,# this determines how many cpu cores are used to lead data to the model. This usually isn't very important but could offer a small speed boost.
)

In [38]:
# Initialize the trainer, passing the model, tokenizer, data, and all of the arguments set above to the trainer.
trainer = Trainer(
    model = model,
    tokenizer=tokenizer,
    args=training_args,
    train_dataset=dstok['few_shot'],
    eval_dataset=dstok['validate'],
    compute_metrics=lambda x: compute_metrics(x, label_text_alphabetical=list(model.config.id2label.values()))
)

  trainer = Trainer(


In [39]:
# Train the model
# trainer.train()

  0%|          | 0/135 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

{'train_runtime': 5414.2225, 'train_samples_per_second': 0.05, 'train_steps_per_second': 0.025, 'train_loss': 8.235166196469907, 'epoch': 5.0}


TrainOutput(global_step=135, training_loss=8.235166196469907, metrics={'train_runtime': 5414.2225, 'train_samples_per_second': 0.05, 'train_steps_per_second': 0.025, 'total_flos': 251623166791680.0, 'train_loss': 8.235166196469907, 'epoch': 5.0})

In [None]:
# trainer.evaluate()

python(63951) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
python(63952) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
python(63954) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadloc

RuntimeError: MPS backend out of memory (MPS allocated: 14.18 GB, other allocations: 3.49 GB, max allowed: 18.13 GB). Tried to allocate 512.00 MB on private pool. Use PYTORCH_MPS_HIGH_WATERMARK_RATIO=0.0 to disable upper limit for memory allocations (may cause system failure).

In [42]:
trainer.save_model('./few_shot')

In [43]:
validate_df.head()

Unnamed: 0,comment,hypothesis,label
19,What ever it takes to get attention vote them ...,This text leans right,0
42,"First, Israel is NOT our ally. Second, Israel ...",This text leans left,0
153,I truly think joy Reid is nuts bullying threat...,This text leans left,0
78,"THe GOP LOVES criminals, since the GOP is curr...",This text leans left,1
145,I live in Canada and one thing that affected u...,This text leans right,0


In [44]:
def process_validation_data(validation_data, output_file):
    unique_comments = validation_data[validation_data['label'] == 1].drop_duplicates(subset=['comment'])
    unique_comments = unique_comments[['comment', 'hypothesis']].copy()
    unique_comments['leaning'] = unique_comments['hypothesis'].str.replace("This text leans ", "", regex=False)
    unique_comments = unique_comments.drop(columns=['hypothesis'])
    unique_comments.to_csv(output_file, index=False)

In [46]:
validate_df.to_csv('/Users/smutnuri/Documents/UVA/_Projects/Fall24/NLP/NLPRepo/data/manual/fullvalidation.csv')

In [47]:
path = '/Users/smutnuri/Documents/UVA/_Projects/Fall24/NLP/NLPRepo/data/manual/train.csv'

process_validation_data(train_df, path)