In [1]:
from huggingface_hub import login
login(token='')

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


Install required packages. Specific pyarrow version needed to avoid issues with Datasets

In [3]:
%%capture
%pip install pyarrow=15.0.2
%pip install -U bitsandbytes
%pip install -U transformers
%pip install -U accelerate
%pip install -U peft
%pip install -U trl

Import required packages

In [4]:
import numpy as np
import pandas as pd
import os
from tqdm import tqdm
import bitsandbytes as bnb
import torch
import torch.nn as nn
import transformers
from datasets import Dataset
from peft import LoraConfig, PeftConfig
from trl import SFTTrainer
from trl import setup_chat_format
from transformers import (AutoModelForCausalLM,
                          AutoTokenizer,
                          BitsAndBytesConfig,
                          TrainingArguments,
                          pipeline,
                          logging)
from sklearn.metrics import (accuracy_score,
                             classification_report,
                             confusion_matrix)
from sklearn.model_selection import train_test_split

Load model (llama-2) and the tokenizer function for input. Configu for running on GPUs

In [5]:
base_model_name = "meta-llama/Llama-2-7b-hf"

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=False,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype="float16",
)

model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    device_map="auto",
    torch_dtype="float16",
    quantization_config=bnb_config,
)

model.config.use_cache = False
model.config.pretraining_tp = 1

tokenizer = AutoTokenizer.from_pretrained(base_model_name)

tokenizer.pad_token_id = tokenizer.eos_token_id

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/609 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/188 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/776 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

Load the dataset

In [6]:
from google.colab import drive
import pandas as pd

drive.mount('/content/drive')
file_path = '/content/drive/My Drive/test_food_bev_alc.csv'
df = pd.read_csv(file_path)

Mounted at /content/drive


In [9]:
df['Category']=[df['label'][i].split('/')[0] for i in range(len(df))]
df['Sub-Category']=[df['label'][i].split('/')[1] for i in range(len(df))]
#df.drop(['link','typeID'],axis=1,inplace=True)
df.sample(3)

Unnamed: 0,product,label,Category,Sub-Category
63680,Jell-O Cherry Sugar Free Refrigerated Gelatin3...,food/pantry/snacks/gelatin snacks,food,pantry
60903,Hunt's Crushed Tomatoes 100% Natural With No S...,food/pantry/canned goods/canned tomatoes/canne...,food,pantry
2179,Jose Cuervo Margarita Mix Classic Lime1 L,alcohol/cocktail accessories/non alcoholic coc...,alcohol,cocktail accessories


In [10]:
print(df['Category'].unique())

['alcohol' 'beverages' 'food']


Train-test split

In [11]:
# Split the DataFrame
train_size = 0.8
eval_size = 0.1

# Calculate sizes
train_end = int(train_size * len(df))
eval_end = train_end + int(eval_size * len(df))

# Split the data
X_train = df[:train_end]
X_eval = df[train_end:eval_end]
X_test = df[eval_end:]


In [12]:
# Define the prompt generation functions
def generate_prompt(data_point):
    return f"""
            Classify the text into food, alcohol or beverage and return the answer as the corresponding label.
text: {data_point["product"]}
label: {data_point["Category"]}""".strip()

def generate_test_prompt(data_point):
    return f"""
            Classify the text into food, alcohol or beverage and return the answer as the corresponding label.
text: {data_point["product"]}
label: """.strip()

X_train = X_train.copy()
X_eval = X_eval.copy()

# Generate prompts for training and evaluation data
X_train.loc[:,'text'] = X_train.apply(generate_prompt, axis=1)
X_eval.loc[:,'text'] = X_eval.apply(generate_prompt, axis=1)

# Generate test prompts and extract true labels
y_true = X_test.loc[:,'Category']
X_test = pd.DataFrame(X_test.apply(generate_test_prompt, axis=1), columns=["text"])

In [13]:
X_train.Category.value_counts()

Unnamed: 0_level_0,count
Category,Unnamed: 1_level_1
food,49277
alcohol,8013
beverages,3464


In [None]:
from datasets import Dataset

# Convert to datasets for Hugginface
train_data = Dataset.from_pandas(X_train[["text"]])
eval_data = Dataset.from_pandas(X_eval[["text"]])

In [14]:
def predict(test, model, tokenizer):
    y_pred = []
    categories = df['Category'].unique()

    for i in tqdm(range(len(test))):
        prompt = test.iloc[i]["text"]
        pipe = pipeline(task="text-generation",
                        model=model,
                        tokenizer=tokenizer,
                        max_new_tokens=2,
                        temperature=0.1)

        result = pipe(prompt)
        answer = result[0]['generated_text'].split("label:")[-1].strip()

        # Determine the predicted category
        for category in categories:
            if category.lower() in answer.lower():
                y_pred.append(category)
                break
        else:
            y_pred.append("none")

    return y_pred

y_pred = predict(X_test, model, tokenizer)

100%|██████████| 7595/7595 [37:02<00:00,  3.42it/s]


Check model accuracy before fine tunning it

In [15]:
def evaluate(y_true, y_pred):
    labels = df.Category.unique()
    mapping = {label: idx for idx, label in enumerate(labels)}

    def map_func(x):
        return mapping.get(x, -1)  # Map to -1 if not found, but should not occur with correct data

    y_true_mapped = np.vectorize(map_func)(y_true)
    y_pred_mapped = np.vectorize(map_func)(y_pred)

    # Calculate accuracy
    accuracy = accuracy_score(y_true=y_true_mapped, y_pred=y_pred_mapped)
    print(f'Accuracy: {accuracy:.3f}')

    # Generate accuracy report
    unique_labels = set(y_true_mapped)  # Get unique labels

    for label in unique_labels:
        label_indices = [i for i in range(len(y_true_mapped)) if y_true_mapped[i] == label]
        label_y_true = [y_true_mapped[i] for i in label_indices]
        label_y_pred = [y_pred_mapped[i] for i in label_indices]
        label_accuracy = accuracy_score(label_y_true, label_y_pred)
        print(f'Accuracy for label {labels[label]}: {label_accuracy:.3f}')

    # Generate classification report
    class_report = classification_report(y_true=y_true_mapped, y_pred=y_pred_mapped, target_names=labels, labels=list(range(len(labels))))
    print('\nClassification Report:')
    print(class_report)

    # Generate confusion matrix
    conf_matrix = confusion_matrix(y_true=y_true_mapped, y_pred=y_pred_mapped, labels=list(range(len(labels))))
    print('\nConfusion Matrix:')
    print(conf_matrix)

evaluate(y_true, y_pred)

Accuracy: 0.995
Accuracy for label food: 0.995

Classification Report:
              precision    recall  f1-score   support

     alcohol       0.00      0.00      0.00         0
   beverages       0.00      0.00      0.00         0
        food       1.00      0.99      1.00      7595

   micro avg       1.00      0.99      1.00      7595
   macro avg       0.33      0.33      0.33      7595
weighted avg       1.00      0.99      1.00      7595


Confusion Matrix:
[[   0    0    0]
 [   0    0    0]
 [   6    0 7554]]


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
