This notebook uses augments that have been created by LLMs, along with real data, to train a classifier that attempts to learn to apply the label ("Resources"). The performance of the classifier is then recorded. The classifier loops through a user-specified list of combinations of LLM model (`model_id`) and temperature (`temperature`).

This notebook is for development purposes. When the code here is complete, it should be ported to a script (e.g. `use_augs_in_label_classifier.py`) so that it can be run via a .pbs script, to collect classifier performance metrics for a wide variety of combinations of LLM model and temperature.

In [1]:
# Imports
import os
import torch
#from torch.utils.data import Dataset, DataLoader
from datasets import Dataset
from scipy.special import softmax
from sklearn.metrics import precision_recall_fscore_support, accuracy_score, roc_auc_score
import json
import pandas as pd
from transformers import TrainingArguments, RobertaTokenizerFast, RobertaForSequenceClassification, EarlyStoppingCallback, Trainer

In [2]:
# Load the classifier training settings from the JSON file
with open('classifier_training_config.json') as f:
    training_args_dict = json.load(f)
    user = os.getenv('USER')
    training_args_dict['output_dir'] = os.path.join('/scratch/',user)

In [3]:
def load_real_retips_data(fold:int):
    """
    Given a training fold, load the real RETIPS labeled data. 
    Returns a pd.DataFrame for each of the train and test set.
    
    Args:
        fold (int): Which training fold to use.
        
    Returns:
        tuple(pd.DataFrame, pd.Dataframe): The training data, and the test data.
    """
    #TODO
    #pass

    train_file_path = f'data/stratified_data_splits/{fold}/train.csv'
    test_file_path = f'data/stratified_data_splits/{fold}/test.csv'

    train_data = pd.read_csv(train_file_path)
    test_data = pd.read_csv(test_file_path)
    
    return train_data, test_data
    
def load_augments(fold:int):
    """
    Given a training fold, load the augments based on that fold.
    Returns a pd.DataFrame of the augments.
    
    Args:
        fold (int): Which training fold to use.
        
    Returns:
        pd.DataFrame: The augments.
    """
    #TODO
    #pass
    augments_file_path = f'data/stratified_data_splits/{fold}/augments.csv'
   
    augments = pd.read_csv(augments_file_path)
    
    return augments


def filter_augments(augments:pd.DataFrame, llm_type:str, temp:float):
    """
    Given a pd.DataFrame of augments, a specified LLM type and a temperature, 
    filter the pd.DataFrame to keep only the rows for that LLM type and temperature.
    
    Args:
        augments (pd.DataFrame): The full set of unfiltered augments.
        llm_type (str): The LLM model type we want to keep.
        temp (float): The temperature we want to keep.
        
    Returns:
        pd.DataFrame: The filtered augments.
    """
    #TODO
    #pass
    filtered_aug = augments[(augments['model_id'] == llm_type) & (augments['temperature'] == temp)]
    
    return filtered_aug
            

def combine_real_with_augs(real_data:pd.DataFrame, augments:pd.DataFrame):
    """
    Given some real data and augments, combine them to form a single data frame to be used for training.
    The combined data frame does NOT include all the augments.
    Instead, this function randomly samples from the augments just enough rows 
    in order to balance the real data, so that the final dataframe has the same number of rows in each
    category.
    
    Args:
        real_data (pd.DataFrame): The real data.
        augments (pd.DataFrame): The augments.
        
    Returns:
        pd.DataFrame: the combined dataframe.
    """
    
    #TODO
    #pass
    count = 40 # currently 40 augments /what is enough to remained balanced
    sampled_augments = augments.sample(n=count, replace=False)
    df_combined = pd.concat([real_data, sampled_augments], ignore_index=True)
    
    return df_combined

class DataFrameDataset(Dataset):
    """Class for using pandas DataFrames as a datasource"""
    def __init__(self, df: pd.DataFrame):
        self.df = df

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        return self.df.iloc[index]

    
def dataset_loader(train_df:pd.DataFrame, test_df:pd.DataFrame):
    """
    Given train and test dataframes, put them into pytorch dataloaders.
    
    Args:
        train_df (pd.DataFrame): Training df
        test_df (pd.DataFrame): Test df
        
    Returns:
        tuple(dataloader, dataloader)
    """
    
    # Create datasets
    #train_df = train_df.drop_index()
    #test_df = test_df.drop_index()
    train_dataset = Dataset.from_pandas(train_df)
    test_dataset = Dataset.from_pandas(test_df)
    
    return train_dataset, test_dataset


def load_classifier_and_tokenizer(classifier_type:str):
    """
    Given a classifier type, load (from Hugging Face) that model and its tokenizer, for use as a classifier.
    
    Args:
        classifier_type (str): A string specifying which model type to load as classifier.
        
    Returns:
        tuple(model, tokenizer): The classifier and its tokenizer.
    """
    # Load classifier
    num_labels = 2 # Assumes binary classification
    if classifier_type == 'roberta':
        model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=num_labels)
        tokenizer = RobertaTokenizerFast.from_pretrained('roberta-base', max_length = 512)
    elif classifier_type == 'xlnet':
        model = XLNetForSequenceClassification.from_pretrained('xlnet-base-cased', num_labels=num_labels)
        tokenizer = XLNetTokenizerFast.from_pretrained('xlnet-base-cased', max_length = 512)
    elif classifier_type == 'distilbert':
        model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=num_labels)
        tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased', max_length = 512)
    else:
        raise ValueError(f"Invalid model_type: {classifier_type}. Expected 'roberta', 'xlnet', or 'distilbert'.")
    
    return model, tokenizer

def compute_metrics(pred, average = 'binary'):
        """
        Compute custom evaluation metrics for the model.

        Args:
            pred (EvalPrediction): An object that contains the model's predictions and labels for evaluation.

        Returns:
            dict: A dictionary containing the computed metrics.
        """
        # Extract ground truth labels and predicted labels from EvalPrediction object
        labels = pred.label_ids
        preds = pred.predictions.argmax(-1)

        # Compute predicted probabilities using softmax activation along axis 1
        probs = softmax(pred.predictions, axis=1)

        # Compute precision, recall, F1-score, and support using sklearn's precision_recall_fscore_support function
        # Set the 'average' parameter to 'macro' to compute macro-averaged metrics for multi-class classification
        precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average = average)

        # Compute accuracy using sklearn's accuracy_score function
        acc = accuracy_score(labels, preds)

        # Compute area under the ROC curve (AUC) using sklearn's roc_auc_score function
        auc = roc_auc_score(labels, probs[:, 1])

        # Return the computed metrics as a dictionary
        return {
            'accuracy': acc,
            'f1': f1,
            'precision': precision,
            'recall': recall,
            'auc': auc
        }


In [6]:
# Loop over a range of training folds, temps and model_ids, and get performances of resulting models. 
# Save performances in csv.


for fold in folds:
    train_df, test_df = load_real_retips_data(fold)
    all_augments = load_augments(fold)
    classifier_type = 'roberta'
    
    for llm_type in llm_types:
        #loop through classifiers
        for temp in temps:

            # Instantiate dict which will store the classifier performance results
            performance_dict = {'temperature':temp,
                                'training_fold':fold,
                                'llm_type':llm_type}


            # Make an augmented training dataframe
            filtered_augments = filter_augments(augments=all_augments, llm_type=llm_type, temp=temp)
            augmented_train_data = combine_real_with_augs(real_data=train_df, augments=filtered_augments)

            # Create dataset objects for training
            train_data, test_data = dataset_loader(train_df=train_df, test_df=test_df)

            # Load classifier and tokenizer
            model, tokenizer = load_classifier_and_tokenizer(classifier_type)


            # Define a function that will use the tokenizer to tokenize the data, 
            # and will return the relevant inputs for the model
            def tokenization(batched_text):
                return tokenizer(batched_text['Response'], padding = True, truncation=True)

            train_data = train_data.map(tokenization, batched = True, batch_size = len(train_data))
            test_data = test_data.map(tokenization, batched = True, batch_size = len(test_data))
            train_data.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])
            test_data.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])

            # Use above function to transform the data
            train_data = train_data.drop_index()
            test_data = test_data.drop_index()
            
            # Create the TrainingArguments
            training_args = TrainingArguments(**training_args_dict)

            # Instantiate the EarlyStoppingCallback to add early stopping based on auc_1
            early_stopping_callback = EarlyStoppingCallback(
                early_stopping_patience=20,  # Number of epochs to wait for improvement
                early_stopping_threshold=0,  # Minimum improvement required to consider as improvement
            )
callbacks = [early_stopping_callback]
            
            # instantiate the trainer class and check for available devices

            trainer = Trainer(
                model=model,
                args=training_args,
                train_dataset=train_data,
                compute_metrics=compute_metrics,
                eval_dataset=test_data,
                callbacks=callbacks
            )
            device = 'cuda' if torch.cuda.is_available() else 'cpu'

            # Perform fine-tuning
            trainer.train()

            # Record final model performance (at best epoch)
            performance_dict.update(trainer.evaluate())
            
            # Delete the trained model (otherwise storage will fill)
            
            del trainer.model
            
            # Convert the performance_dict to a df
            perf_df = pd.DataFrame({k: [v] for k, v in performance_dict.items()})
            # try to load the previous performance data from a CSV file
            try:
                perfs_previous_df = pd.read_csv('perfs_csv_loc')
            except FileNotFoundError:
            # if the file does not exist, create an empty DataFrame
            perfs_previous_df = pd.DataFrame()

            # combine the previous and current performance data
            perfs_df = pd.concat([perfs_previous_df, perf_df])
            perfs_df.to_csv(index=False) #perfs_df.to_csv(perfs_previous_df, index=False)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.weight', 'classifier.out_proj.bias', 'classifier.dense.bias', 'classifier.dense.weight']
You should pr

Map:   0%|          | 0/136 [00:00<?, ? examples/s]

Map:   0%|          | 0/46 [00:00<?, ? examples/s]



huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Step,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall,Auc
5,No log,0.779493,0.23913,0.385965,0.23913,1.0,0.485714
10,0.778300,0.746936,0.23913,0.385965,0.23913,1.0,0.394805
15,0.778300,0.699282,0.304348,0.384615,0.243902,0.909091,0.597403
20,0.715100,0.639596,0.76087,0.0,0.0,0.0,0.65974
25,0.645700,0.527036,0.76087,0.0,0.0,0.0,0.753247
30,0.645700,0.638609,0.76087,0.0,0.0,0.0,0.761039
35,0.463300,0.541365,0.76087,0.0,0.0,0.0,0.784416
40,0.610700,0.534601,0.76087,0.0,0.0,0.0,0.761039
45,0.610700,0.550861,0.76087,0.0,0.0,0.0,0.758442
50,0.556900,0.524707,0.76087,0.0,0.0,0.0,0.711688


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

  _warn_prf(average, modifier, msg_start, len(result))


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

  _warn_prf(average, modifier, msg_start, len(result))


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

  _warn_prf(average, modifier, msg_start, len(result))


	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after par

  _warn_prf(average, modifier, msg_start, len(result))


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

  _warn_prf(average, modifier, msg_start, len(result))


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

  _warn_prf(average, modifier, msg_start, len(result))


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

  _warn_prf(average, modifier, msg_start, len(result))


	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after par

  _warn_prf(average, modifier, msg_start, len(result))


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

  _warn_prf(average, modifier, msg_start, len(result))


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

  _warn_prf(average, modifier, msg_start, len(result))


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

  _warn_prf(average, modifier, msg_start, len(result))


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

  _warn_prf(average, modifier, msg_start, len(result))


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


  _warn_prf(average, modifier, msg_start, len(result))


'temperature,training_fold,llm_type,eval_loss,eval_accuracy,eval_f1,eval_precision,eval_recall,eval_auc,eval_runtime,eval_samples_per_second,eval_steps_per_second,epoch\n1,1,nomic-ai/gpt4all-13b-snoozy,0.4247034192085266,0.7608695652173914,0.0,0.0,0.0,0.8805194805194805,0.563,81.705,10.657,10.59\n'

In [18]:
%debug

> [0;32m/local_scratch/pbs.710359.pbs02/ipykernel_3217632/2428841794.py[0m(131)[0;36mdataset_loader[0;34m()[0m
[0;32m    129 [0;31m[0;34m[0m[0m
[0m[0;32m    130 [0;31m[0;34m[0m[0m
[0m[0;32m--> 131 [0;31m    [0mtrain_dataset[0m [0;34m=[0m [0mtrain_dataset[0m[0;34m.[0m[0mmap[0m[0;34m([0m[0mtokenization[0m[0;34m,[0m [0mbatched[0m [0;34m=[0m [0;32mTrue[0m[0;34m,[0m [0mbatch_size[0m [0;34m=[0m [0mlen[0m[0;34m([0m[0mtrain_data[0m[0;34m)[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    132 [0;31m    [0mtest_dataset[0m [0;34m=[0m [0mtest_dataset[0m[0;34m.[0m[0mmap[0m[0;34m([0m[0mtokenization[0m[0;34m,[0m [0mbatched[0m [0;34m=[0m [0;32mTrue[0m[0;34m,[0m [0mbatch_size[0m [0;34m=[0m [0mlen[0m[0;34m([0m[0mtest_data[0m[0;34m)[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    133 [0;31m    [0mtrain_dataset[0m[0;34m.[0m[0mset_format[0m[0;34m([0m[0;34m'torch'[0m[0;34m,[0m [0mcolumns[

ipdb>  train_dataset


<__main__.DataFrameDataset object at 0x14ebd03b6980>


ipdb>  train_df


                                              Question  \
0    What surprised you in working through the COVI...   
1    What surprised you in working through the COVI...   
2    Describe in detail your example of an 'adjustm...   
3    Describe in detail your example of an 'adjustm...   
4    What surprised you in working through the COVI...   
..                                                 ...   
131  Could you describe any opportunities to improv...   
132  What surprised you in working through the COVI...   
133  Describe in detail your example of an 'adjustm...   
134  Describe in detail your example of an 'adjustm...   
135  What surprised you in working through the COVI...   

                                              Response  label  
0    how fast IS was able to respond once we had 40...      0  
1    How few patients were actually effected at our...      0  
2    How informed we were to have precautions. How ...      0  
3    How informed we were to have precautions. 

ipdb>  train_dataset


<__main__.DataFrameDataset object at 0x14ebd03b6980>


ipdb>  exit


# SCRATCH

Use cells below here to copy/paste bits of the code above for testing/development.

In [None]:
# E.g., suppose I've drafted load_real_retips_data(), load_augments(), and filter_augments(),
# and I want to see if they work. Then instead of using the debugger and the above for-loops 
# (which would also be fine), I can do the following:
fold = 1
llm_type = 'nomic-ai/gpt4all-13b-snoozy' # or whatever
temp = 1.0

###### This bit is just copy/pasted from the for loops #####
train_df, test_df = load_real_retips_data(fold)
all_augments = load_augments(fold)

# Instantiate dict which will store the classifier performance results
performance_dict = {'temperature':temp,
                    'training_fold':fold,
                    'llm_type':llm_type}

# Make an augmented training dataframe
filtered_augments = filter_augments(augments=all_augments, llm_type=llm_type, temp=temp)
########

# Now that I've run those copy/pasted bits, I can examine their outputs in another cell to see if they look ok

In [None]:
print(filtered_augments)