# Preparing Data for Distillation

Charles Ciampa

In [1]:
import ollama
import numpy as np
import pandas as pd
from typing import Dict, Callable
import warnings

from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import torch
import torch.nn.functional as F
import os
from tqdm.notebook import tqdm

In [2]:
from huggingface_hub import notebook_login

# Authenticate this notebook session with the Hugging Face Hub before any
# model or dataset is downloaded.
# NOTE(review): the positional False presumably maps to `new_session`
# (i.e. reuse an existing login) -- confirm against the installed
# huggingface_hub version, and prefer passing it by keyword.
notebook_login(False)



In [3]:
from huggingface_hub import scan_cache_dir

# Print a report of what is currently stored in the local Hugging Face cache.
print(scan_cache_dir())
# Optional cleanup, intentionally left disabled: build a delete strategy for
# one cached revision (identified by its commit hash) and report how much
# space it would free.
# NOTE(review): presumably `delete_strategy.execute()` is still required to
# actually remove the files -- confirm in the huggingface_hub documentation.
# delete_strategy = scan_cache_dir().delete_revisions(
#     "8d8ffc158a3bee9fbb03afacdfc347c823c5ec8b"
# )

# print("Will free " + delete_strategy.expected_freed_size_str)



In [4]:
class DistilModelData:
    """Build soft labels for a text-classification dataset from a causal LM.

    Typical call order:

    1. ``set_datasets`` / ``set_datasets_from_path`` -- validated train/test frames
       with a ``text`` (string) and a ``label`` (integer) column.
    2. ``set_labels`` -- map every integer label to the word used in the prompt.
    3. ``set_model`` and ``set_prompt`` (optionally ``set_num_examples_in_prompt``).
    4. ``distil_labels`` -- adds one ``label_<k>`` probability column per label to
       the training frame.
    5. ``folder_export`` / ``export_files`` -- write both frames as CSV.
    """

    def __init__(self):
        # Datasets stay None until set_datasets() has validated them.
        self._train_df = None
        self._test_df = None
        # Integer label -> prompt word, and the inverse mapping.
        self._labels = None
        self._reversed_labels = None
        # f(text to label, label mapping, example DataFrame) -> prompt string.
        self._prompt: Callable | None = None
        # How many training rows are sampled as in-prompt examples.
        self._num_examples: int = 0
        self._model: AutoModelForCausalLM = None
        self._tokenizer: AutoTokenizer = None

    def set_labels(self, labels: Dict[int, str]):
        """Set the mapping from integer dataset labels to prompt label words.

        The datasets must already be set so that every label value occurring in
        them can be checked against the provided mapping.

        Args:
            labels (Dict[int, str]): Integer label -> label word used in the prompt.

        Raises:
            ValueError: The train/test datasets have not been set yet.
            ValueError: ``labels`` is not a dictionary.
            ValueError: Not all keys are integers.
            ValueError: Not all values are strings.
            ValueError: Two keys share the same label word.
            ValueError: A label value present in the datasets has no entry.
        """
        if self._train_df is None or self._test_df is None:
            raise ValueError("The train and test dataframes have not be set yet. You must set to ensure that each of the labels in the dataframe have been set.")
        if not isinstance(labels, dict):
            raise ValueError("Labels must be a dictionary")
        if not all(isinstance(k, int) for k in labels):
            raise ValueError("Label keys must be integers")
        if not all(isinstance(v, str) for v in labels.values()):
            raise ValueError("Label values must be strings")
        # Duplicate words would collapse the reversed mapping and make the
        # distilled probability columns ambiguous.
        if len(set(labels.values())) != len(labels):
            raise ValueError("Label values must be unique")

        dataset_labels = set(self._train_df["label"].unique()) | set(self._test_df["label"].unique())
        missing = dataset_labels - set(labels)
        if missing:
            # The missing values are integers, so they must be converted before joining.
            missing_text = ", ".join(str(value) for value in sorted(missing))
            raise ValueError(f"The provided labels are missing assigned string values for the following values: {missing_text}.")

        # Store a copy so later changes to the caller's dict cannot desynchronise
        # the forward and reversed mappings.
        self._labels = dict(labels)
        self._reversed_labels = {v: k for k, v in self._labels.items()}

    def set_num_examples_in_prompt(self, num: int = 0):
        """Set how many training rows are sampled as examples for each prompt.

        Args:
            num (int): The number of examples; 0 disables in-prompt examples.

        Raises:
            ValueError: ``num`` is not an integer.
            ValueError: ``num`` is negative.
        """
        if not isinstance(num, int):
            raise ValueError("An integer must be provided")
        if num < 0:
            raise ValueError("The number of examples cannot be negative")
        self._num_examples = num

    def set_prompt(self, prompt_func: Callable[[str, dict, pd.DataFrame], str]):
        """Set the function that builds the prompt for one row.

        Args:
            prompt_func: Called as ``prompt_func(text, labels, examples)`` where
                ``text`` is the string to classify, ``labels`` is the mapping
                given to ``set_labels`` and ``examples`` is a DataFrame of sampled
                training rows (empty when no examples are requested). It must
                return the prompt string.
        """
        self._prompt = prompt_func

    def set_model(self, model_name: str, bnb_config: "BitsAndBytesConfig | None" = None):
        """Load the tokenizer and causal language model used for distillation.

        Args:
            model_name (str): Name or path passed to ``from_pretrained``.
            bnb_config: Optional quantization config forwarded to the model loader.

        Raises:
            ValueError: ``model_name`` is not a string.
        """
        if not isinstance(model_name, str):
            raise ValueError("A model name must be provided as a string")

        tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)

        # NOTE(security): trust_remote_code=True executes Python code shipped with
        # the checkpoint -- only load repositories you trust.
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            quantization_config=bnb_config,
            dtype=torch.float16,
            device_map="auto",
            low_cpu_mem_usage=True,
            trust_remote_code=True,
        )

        # Assign only after both loads succeeded so the pair is never half-set.
        self._tokenizer = tokenizer
        self._model = model

        print(self._model.device)

    def reset_datasets_and_labels(self):
        """Forget the datasets and both label mappings."""
        self._labels = None
        self._reversed_labels = None
        self._train_df = None
        self._test_df = None

    def set_datasets_from_path(
        self,
        train_path: str,
        test_path: str,
        rename_columns: "Dict[str, str] | None" = None,
        create_columns: "Callable[[pd.DataFrame], pd.DataFrame] | None" = None,
        ignore_common_text_thresh: float = 0,
    ):
        """Load train/test parquet files, normalise their columns and set them.

        Args:
            train_path (str): Path of the training parquet file.
            test_path (str): Path of the testing parquet file.
            rename_columns: Optional ``{old: new}`` column renames applied to both
                frames, used to obtain the required ``text`` and ``label`` columns.
            create_columns: Optional function applied to each frame after the
                renames; it must return the frame to use.
            ignore_common_text_thresh (float): Forwarded to ``set_datasets``.

        Raises:
            ValueError: Any validation error raised by ``set_datasets``.
        """
        train_temp = pd.read_parquet(train_path)
        test_temp = pd.read_parquet(test_path)
        if rename_columns:
            train_temp = train_temp.rename(columns=rename_columns)
            test_temp = test_temp.rename(columns=rename_columns)
        if create_columns is not None:
            train_temp = create_columns(train_temp)
            test_temp = create_columns(test_temp)
        # Nothing is stored on the instance until set_datasets() has validated both frames.
        self.set_datasets(
            train_temp,
            test_temp,
            ignore_common_text_thresh=ignore_common_text_thresh,
        )

    def set_datasets(self, train_df: pd.DataFrame, test_df: pd.DataFrame, ignore_common_text_thresh: float = 0):
        """Validate and set the train and test datasets.

        The frames are stored by reference; ``distil_labels`` later adds columns
        to the training frame.

        Args:
            train_df (pd.DataFrame): The training dataframe.
            test_df (pd.DataFrame): The testing dataframe.
            ignore_common_text_thresh (float): Tolerated leakage, measured as the
                number of distinct texts shared by both frames divided by the
                number of test rows. Above this value an error is raised; at or
                below it only a warning is issued.

        Raises:
            ValueError: Both inputs must be pandas DataFrames.
            ValueError: A DataFrame is missing the 'text' or 'label' column.
            ValueError: A 'label' column is not of integer type.
            ValueError: A 'text' column is not of string type.
            ValueError: Train and test share more text entries than tolerated.
        """
        if not isinstance(train_df, pd.DataFrame) or not isinstance(test_df, pd.DataFrame):
            raise ValueError("Both inputs must be pandas DataFrames.")

        frames = (("Train", train_df), ("Test", test_df))

        for name, frame in frames:
            if "text" not in frame.columns or "label" not in frame.columns:
                raise ValueError(f"{name} DataFrame must have 'text' and 'label' columns.")

        for name, frame in frames:
            if not pd.api.types.is_integer_dtype(frame["label"]):
                raise ValueError(f"{name} DataFrame 'label' column must be of integer type.")

        for name, frame in frames:
            if not pd.api.types.is_string_dtype(frame["text"]):
                raise ValueError(f"{name} DataFrame 'text' column must be of string type")

        # Texts present in both splits mean the test set leaks into training.
        common_texts = set(train_df["text"]).intersection(test_df["text"])
        if common_texts:
            perc = len(common_texts) / len(test_df)
            err = f"Data leakage detected! Train and Test DataFrames share {len(common_texts)} ({perc:.2%} of testing dataset) common text entries."
            if perc > ignore_common_text_thresh:
                raise ValueError(err)
            warnings.warn(err)
        self._train_df = train_df
        self._test_df = test_df

    def _sample_examples(self, position: int) -> pd.DataFrame:
        """Return ``_num_examples`` random training rows, never the row at ``position``."""
        if self._num_examples == 0:
            # No examples requested: skip the per-row copy of the whole frame.
            return self._train_df.iloc[0:0]
        # Exclude by position rather than by index label so that a frame with
        # duplicate index labels loses exactly one row.
        keep = np.ones(len(self._train_df), dtype=bool)
        keep[position] = False
        return self._train_df.iloc[keep].sample(self._num_examples)

    def distil_labels(self):
        """Add one ``label_<k>`` probability column per label to the training frame.

        For every training row the prompt is built, the model is run once, and
        the next-token logits of the first token of each label word (encoded
        with a leading space) are turned into probabilities that sum to 1 across
        the labels.

        Raises:
            ValueError: Labels, datasets, model/tokenizer or prompt are not set.
            ValueError: A label word produces no tokens.
        """
        if self._labels is None:
            raise ValueError("Labels must be set.")
        if self._train_df is None or self._test_df is None:
            raise ValueError("Datasets must be set.")
        if self._model is None or self._tokenizer is None:
            raise ValueError("Model and Tokenizer must be set")
        if self._prompt is None:
            raise ValueError("Prompt must be set.")

        # The label token ids do not depend on the row, so compute them once.
        label_keys = list(self._labels)
        label_token_ids = []
        for label in self._labels.values():
            # For simplicity, only the first token of each label word is scored.
            label_tokens = self._tokenizer.encode(f" {label}", add_special_tokens=False)
            if not label_tokens:
                raise ValueError(f"The label '{label}' does not produce any tokens.")
            label_token_ids.append(label_tokens[0])

        distilled = {key: [] for key in label_keys}
        with torch.inference_mode():
            texts = tqdm(self._train_df["text"], total=len(self._train_df), desc="Getting Probability of Labels")
            for position, text in enumerate(texts):
                prompt = self._prompt(text, self._labels, self._sample_examples(position))
                model_inputs = self._tokenizer(prompt, return_tensors="pt").to(
                    self._model.device
                )
                # Logits of the token that would follow the prompt.
                next_token_logits = self._model(**model_inputs).logits[0, -1, :]
                # A softmax over just the label logits equals the full-vocabulary
                # softmax renormalised over the labels, but cannot divide by zero
                # when the float16 probabilities underflow.
                label_logits = next_token_logits[label_token_ids].float()
                label_probs = F.softmax(label_logits, dim=-1).tolist()
                for key, prob in zip(label_keys, label_probs):
                    distilled[key].append(prob)

        for key, values in distilled.items():
            self._train_df[f"label_{key}"] = values

    def folder_export(self, path: str):
        """Write ``train.csv`` and ``test.csv`` into the folder ``path``.

        Args:
            path (str): Target folder, with or without a trailing separator.

        Raises:
            ValueError: The datasets have not been set.
        """
        if self._test_df is None or self._train_df is None:
            raise ValueError("The datasets have not been set.")
        # Join instead of concatenating so a missing trailing slash does not
        # silently produce e.g. "datatrain.csv".
        self._train_df.to_csv(os.path.join(path, "train.csv"), index=False)
        self._test_df.to_csv(os.path.join(path, "test.csv"), index=False)

    def export_files(self, train_path: str, test_path: str):
        """Write the train and test frames as CSV to the given file paths.

        Args:
            train_path (str): Destination file for the training frame.
            test_path (str): Destination file for the testing frame.

        Raises:
            ValueError: The datasets have not been set.
        """
        if self._test_df is None or self._train_df is None:
            raise ValueError("The datasets have not been set.")
        self._train_df.to_csv(train_path, index=False)
        self._test_df.to_csv(test_path, index=False)


In [5]:
# IMDB sentiment data hosted on the Hugging Face Hub (stanfordnlp/imdb,
# "plain_text" configuration). The repository also ships an unlabelled split,
# plain_text/unsupervised-00000-of-00001.parquet, which is not used here.
imdb_train_parquet = "hf://datasets/stanfordnlp/imdb/plain_text/train-00000-of-00001.parquet"
imdb_test_parquet = "hf://datasets/stanfordnlp/imdb/plain_text/test-00000-of-00001.parquet"

# Integer labels in the dataset -> label words shown to the model.
sentiment_names = {0: "Negative", 1: "Positive"}

model_distallation = DistilModelData()

# Tolerate up to 1% of the test rows having a text that also occurs in the
# training split; below that the overlap is only reported as a warning.
model_distallation.set_datasets_from_path(
    train_path=imdb_train_parquet,
    test_path=imdb_test_parquet,
    ignore_common_text_thresh=0.01,
)

model_distallation.set_labels(sentiment_names)



In [6]:
# Load the tokenizer and model that will score the labels. No quantization
# config is passed, so set_model() loads the weights with its float16 setting
# and prints the device the model ended up on.
model_distallation.set_model("meta-llama/Meta-Llama-3.1-8B-Instruct")

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

cuda:0


In [7]:
def build_sentiment_prompt(text, labels, _examples):
    """Return the zero-shot sentiment prompt for ``text``.

    ``labels`` maps integer labels to label words; all words are listed in the
    instruction line. The sampled example rows are accepted to match the
    prompt-function signature but are not used.
    """
    names = list(labels.values())
    return f"""Classify the sentiment as {", ".join(names[:-1])}, or {names[-1]}.

Text: {text}
Sentiment:"""


model_distallation.set_prompt(build_sentiment_prompt)

In [8]:
# Run the model once per training row and add the normalised per-label
# probabilities to the training frame as label_<k> columns.
model_distallation.distil_labels()

Getting Probability of Labels:   0%|          | 0/25000 [00:00<?, ?it/s]

In [9]:
# Write the training frame (now including the distilled label columns) and the
# test frame as train.csv and test.csv into ../data/.
model_distallation.folder_export("../data/")

In [10]:
# Display the training frame to inspect the label_0 / label_1 columns added by
# distil_labels(). This reads a private attribute, for inspection only.
model_distallation._train_df

Unnamed: 0,text,label,label_0,label_1
0,I rented I AM CURIOUS-YELLOW from my video sto...,0,0.020954,0.979046
1,"""I Am Curious: Yellow"" is a risible and preten...",0,0.978229,0.021771
2,If only to avoid making this type of film in t...,0,0.988581,0.011419
3,This film was probably inspired by Godard's Ma...,0,0.983598,0.016402
4,"Oh, brother...after hearing about this ridicul...",0,0.992127,0.007873
...,...,...,...,...
24995,A hit at the time but now better categorised a...,1,0.936263,0.063737
24996,I love this movie like no other. Another time ...,1,0.002822,0.997178
24997,This film and it's sequel Barry Mckenzie holds...,1,0.001142,0.998858
24998,'The Adventures Of Barry McKenzie' started lif...,1,0.078626,0.921374
