In [None]:
# Install the missing module
!pip install -q accelerate -U bitsandbytes
!pip install -q scikit-multilearn datasets peft transformers
import os
import random
import functools
import csv
import numpy as np
import torch
import torch.nn.functional as F
from sklearn.metrics import f1_score
from skmultilearn.model_selection import iterative_train_test_split # This import should now work
from datasets import Dataset, DatasetDict
from peft import (
    LoraConfig,
    prepare_model_for_kbit_training,
    get_peft_model
)
import os
os.environ["WANDB_MODE"] = "disabled"
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    Trainer
)


os.fork() was called. os.fork() is incompatible with multithreaded code, and JAX is multithreaded, so this will likely lead to a deadlock.

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [None]:
def tokenize_examples(examples, tokenizer):
    """Tokenize a batch of examples and carry its labels along.

    Args:
        examples: Mapping with a ``'text'`` entry (passed to ``tokenizer``)
            and a ``'labels'`` entry (copied through untouched).
        tokenizer: Callable applied to ``examples['text']``; its result must
            support item assignment.

    Returns:
        The tokenizer's result with ``examples['labels']`` stored under
        the ``'labels'`` key.
    """
    encoded_batch = tokenizer(examples['text'])
    encoded_batch['labels'] = examples['labels']
    return encoded_batch


In [None]:
# define custom batch preprocessor
def collate_fn(batch, tokenizer):
    """Collate a list of tokenized examples into a single padded batch.

    Args:
        batch: List of dicts, each holding tensors under ``'input_ids'``,
            ``'attention_mask'`` and ``'labels'``.
        tokenizer: Object whose ``pad_token_id`` is used to pad ``input_ids``.

    Returns:
        Dict with ``'input_ids'`` (padded with ``tokenizer.pad_token_id``),
        ``'attention_mask'`` (padded with 0) and ``'labels'`` (stacked along
        a new leading dimension), in that key order.
    """
    # Gather every field first so a missing key fails before any padding is done
    input_ids, attention_mask, labels = (
        [example[field] for example in batch]
        for field in ('input_ids', 'attention_mask', 'labels')
    )

    pad_sequence = torch.nn.utils.rnn.pad_sequence
    return {
        'input_ids': pad_sequence(
            input_ids, batch_first=True, padding_value=tokenizer.pad_token_id
        ),
        'attention_mask': pad_sequence(
            attention_mask, batch_first=True, padding_value=0
        ),
        'labels': torch.stack(labels),
    }


# define which metrics to compute for evaluation
# def compute_metrics(p):
#     predictions, labels = p
#     f1_micro = f1_score(labels, predictions > 0, average = 'micro')
#     f1_macro = f1_score(labels, predictions > 0, average = 'macro')
#     f1_weighted = f1_score(labels, predictions > 0, average = 'weighted')
#     return {
#         'f1_micro': f1_micro,
#         'f1_macro': f1_macro,
#         'f1_weighted': f1_weighted
#     }



# import torch
# from sklearn.metrics import f1_score, precision_score, recall_score

# # Define custom batch preprocessor
# def collate_fn(batch, tokenizer):
#     dict_keys = ['input_ids', 'attention_mask', 'labels']
#     d = {k: [dic[k] for dic in batch] for k in dict_keys}
#     d['input_ids'] = torch.nn.utils.rnn.pad_sequence(
#         d['input_ids'], batch_first=True, padding_value=tokenizer.pad_token_id
#     )
#     d['attention_mask'] = torch.nn.utils.rnn.pad_sequence(
#         d['attention_mask'], batch_first=True, padding_value=0
#     )
#     d['labels'] = torch.stack(d['labels'])
#     return d

# # Define which metrics to compute for evaluation
# def compute_metrics(p):
#     predictions, labels = p
#     thresholded_predictions = predictions > 0

#     f1_micro = f1_score(labels, thresholded_predictions, average='micro')
#     f1_macro = f1_score(labels, thresholded_predictions, average='macro')
#     f1_weighted = f1_score(labels, thresholded_predictions, average='weighted')

#     precision_micro = precision_score(labels, thresholded_predictions, average='micro')
#     precision_macro = precision_score(labels, thresholded_predictions, average='macro')
#     precision_weighted = precision_score(labels, thresholded_predictions, average='weighted')

#     recall_micro = recall_score(labels, thresholded_predictions, average='micro')
#     recall_macro = recall_score(labels, thresholded_predictions, average='macro')
#     recall_weighted = recall_score(labels, thresholded_predictions, average='weighted')

#     return {
#         'f1_micro': f1_micro,
#         'f1_macro': f1_macro,
#         'f1_weighted': f1_weighted,
#         'precision_micro': precision_micro,
#         'precision_macro': precision_macro,
#         'precision_weighted': precision_weighted,
#         'recall_micro': recall_micro,
#         'recall_macro': recall_macro,
#         'recall_weighted': recall_weighted
#     }

# =======================================+===============================

from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score

# Define which metrics to compute for evaluation
def compute_metrics(p, class_names):
    """Compute micro-averaged and per-class F1 / precision / recall.

    Args:
        p: Pair ``(predictions, labels)``. Predictions are thresholded at 0,
            i.e. an entry counts as positive when its score is ``> 0``.
        class_names: Names used as keys of the per-class dictionaries;
            ``class_names[i]`` is paired with the i-th per-class score.

    Returns:
        Dict with the scalar entries ``'f1_micro'``, ``'precision_micro'``,
        ``'recall_micro'`` and the name -> score dicts ``'f1_per_class'``,
        ``'precision_per_class'``, ``'recall_per_class'``.
    """
    scores, targets = p
    predicted = scores > 0

    # Micro-averaged scores over all labels
    micro_f1 = f1_score(targets, predicted, average='micro')
    micro_precision = precision_score(targets, predicted, average='micro')
    micro_recall = recall_score(targets, predicted, average='micro')

    # One score per label column (average=None)
    class_f1 = f1_score(targets, predicted, average=None)
    class_precision = precision_score(targets, predicted, average=None)
    class_recall = recall_score(targets, predicted, average=None)

    def _named(per_class_scores):
        # Positional lookup: every entry of class_names must have a score
        return {
            name: per_class_scores[position]
            for position, name in enumerate(class_names)
        }

    return {
        'f1_micro': micro_f1,
        'precision_micro': micro_precision,
        'recall_micro': micro_recall,
        'f1_per_class': _named(class_f1),
        'precision_per_class': _named(class_precision),
        'recall_per_class': _named(class_recall),
    }


# ++++++++++++++++++++++++++++++++++++++++
# from sklearn.metrics import f1_score, precision_score, recall_score

# # Define custom batch preprocessor
# def collate_fn(batch, tokenizer):
#     dict_keys = ['input_ids', 'attention_mask', 'labels']
#     d = {k: [dic[k] for dic in batch] for k in dict_keys}
#     d['input_ids'] = torch.nn.utils.rnn.pad_sequence(
#         d['input_ids'], batch_first=True, padding_value=tokenizer.pad_token_id
#     )
#     d['attention_mask'] = torch.nn.utils.rnn.pad_sequence(
#         d['attention_mask'], batch_first=True, padding_value=0
#     )
#     d['labels'] = torch.stack(d['labels'])
#     return d

# # Define which metrics to compute for evaluation
# def compute_metrics(p, class_names):
#     predictions, labels = p
#     thresholded_predictions = predictions > 0

#     f1_micro = f1_score(labels, thresholded_predictions, average='micro')
#     f1_weighted = f1_score(labels, thresholded_predictions, average='weighted')

#     precision_micro = precision_score(labels, thresholded_predictions, average='micro')
#     precision_weighted = precision_score(labels, thresholded_predictions, average='weighted')

#     recall_micro = recall_score(labels, thresholded_predictions, average='micro')
#     recall_weighted = recall_score(labels, thresholded_predictions, average='weighted')

#     # Compute per-class metrics
#     f1_per_class = f1_score(labels, thresholded_predictions, average=None)
#     precision_per_class = precision_score(labels, thresholded_predictions, average=None)
#     recall_per_class = recall_score(labels, thresholded_predictions, average=None)

#     # Create a dictionary with class names and their respective metrics
#     per_class_metrics = {}
#     for idx, class_name in enumerate(class_names):
#         per_class_metrics[class_name] = {
#             'f1': f1_per_class[idx],
#             'precision': precision_per_class[idx],
#             'recall': recall_per_class[idx]
#         }

#     return {
#         'f1_micro': f1_micro,
#         'f1_weighted': f1_weighted,
#         'precision_micro': precision_micro,
#         'precision_weighted': precision_weighted,
#         'recall_micro': recall_micro,
#         'recall_weighted': recall_weighted,
#         'per_class_metrics': per_class_metrics
#     }


# ======================================================
# import torch
# from sklearn.metrics import f1_score, precision_score, recall_score

# # Define custom batch preprocessor
# def collate_fn(batch, tokenizer):
#     dict_keys = ['input_ids', 'attention_mask', 'labels']
#     d = {k: [dic[k] for dic in batch] for k in dict_keys}
#     d['input_ids'] = torch.nn.utils.rnn.pad_sequence(
#         d['input_ids'], batch_first=True, padding_value=tokenizer.pad_token_id
#     )
#     d['attention_mask'] = torch.nn.utils.rnn.pad_sequence(
#         d['attention_mask'], batch_first=True, padding_value=0
#     )
#     d['labels'] = torch.stack(d['labels'])
#     return d

# # Define which metrics to compute for evaluation
# def compute_metrics(p):
#     predictions, labels = p
#     thresholded_predictions = predictions > 0

#     f1_micro = f1_score(labels, thresholded_predictions, average='micro')
#     # f1_macro = f1_score(labels, thresholded_predictions, average='macro')
#     f1_weighted = f1_score(labels, thresholded_predictions, average='weighted')

#     precision_micro = precision_score(labels, thresholded_predictions, average='micro')
#     # precision_macro = precision_score(labels, thresholded_predictions, average='macro')
#     precision_weighted = precision_score(labels, thresholded_predictions, average='weighted')

#     recall_micro = recall_score(labels, thresholded_predictions, average='micro')
#     # recall_macro = recall_score(labels, thresholded_predictions, average='macro')
#     recall_weighted = recall_score(labels, thresholded_predictions, average='weighted')

#     # Compute per-class metrics
#     f1_per_class = f1_score(labels, thresholded_predictions, average=None)
#     precision_per_class = precision_score(labels, thresholded_predictions, average=None)
#     recall_per_class = recall_score(labels, thresholded_predictions, average=None)

#     return {
#         'f1_micro': f1_micro,
#         # 'f1_macro': f1_macro,
#         'f1_weighted': f1_weighted,
#         'precision_micro': precision_micro,
#         # 'precision_macro': precision_macro,
#         'precision_weighted': precision_weighted,
#         'recall_micro': recall_micro,
#         # 'recall_macro': recall_macro,
#         'recall_weighted': recall_weighted,
#         'f1_per_class': f1_per_class,
#         'precision_per_class': precision_per_class,
#         'recall_per_class': recall_per_class
#     }




# create a custom trainer class so label weights can be passed in and a multilabel loss computed
class CustomTrainer(Trainer):
    """Trainer that optimizes a label-weighted multi-label BCE-with-logits loss.

    Args:
        label_weights: Tensor handed to ``F.binary_cross_entropy_with_logits``
            as ``pos_weight`` (or ``None`` for an unweighted loss).
        **kwargs: Forwarded unchanged to ``Trainer.__init__``.
    """

    def __init__(self, label_weights, **kwargs):
        super().__init__(**kwargs)
        self.label_weights = label_weights

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Run a forward pass and return the multi-label loss.

        Args:
            model: Model called as ``model(**inputs)``; its output must expose
                ``"logits"`` via ``.get``.
            inputs: Batch dict. Its ``"labels"`` entry is popped (so it is not
                forwarded to the model) and used as the loss target.
            return_outputs: When true, return ``(loss, outputs)`` instead of
                just ``loss``.
            num_items_in_batch: Unused. Accepted because newer ``transformers``
                releases pass this argument to ``compute_loss``; without it the
                override fails with a ``TypeError`` there.

        Returns:
            The scalar loss, or ``(loss, outputs)`` if ``return_outputs``.
        """
        labels = inputs.pop("labels")

        # forward pass
        outputs = model(**inputs)
        logits = outputs.get("logits")

        # Keep the weights on the same device as the logits so the loss does
        # not fail when the model's output lives on another device.
        pos_weight = self.label_weights
        if pos_weight is not None:
            pos_weight = pos_weight.to(logits.device)

        # compute custom loss
        loss = F.binary_cross_entropy_with_logits(
            logits, labels.to(torch.float32), pos_weight=pos_weight
        )
        return (loss, outputs) if return_outputs else loss

# set random seeds
# Seeding only Python's `random` leaves NumPy's and PyTorch's global RNGs
# unseeded, so anything later in the notebook that draws from those would
# differ between runs. Seed all three.
random.seed(0)
np.random.seed(0)
torch.manual_seed(0)

In [None]:
import pandas as pd
import re
# Load the raw reviews table from the Kaggle input directory (the absolute
# path is specific to the Kaggle runtime).
data = pd.read_csv("/kaggle/input/dataset/reviewsDataset.csv")

In [None]:
# Matches every character that is NOT a Bengali-block character
# (U+0980-U+09FF, which already contains the digits ০-৯), whitespace, or '%'.
_NON_BENGALI_CHARS = re.compile(r'[^০-৯\u0980-\u09FF\s%]')
_WHITESPACE_RUN = re.compile(r'\s+')


def clean_text(text):
    """Keep only Bengali characters, whitespace and '%', then tidy whitespace.

    Args:
        text: The string to clean. Non-string values (e.g. ``None`` or the
            float NaN pandas uses for a missing cell) are returned unchanged
            instead of raising, so missing rows can still be dropped later.

    Returns:
        The cleaned string with every whitespace run collapsed to a single
        space and leading/trailing whitespace removed, or ``text`` itself
        when it is not a string.
    """
    if not isinstance(text, str):
        return text
    filtered_string = _NON_BENGALI_CHARS.sub('', text)
    return _WHITESPACE_RUN.sub(' ', filtered_string).strip()

# Clean every entry of the 'reviewContent' column (overwriting it) and preview
# the first rows of the result
data['reviewContent'] = data['reviewContent'].apply(clean_text)
data.head()

In [None]:
# Drop rows containing missing values (pandas' default: any missing column)
data = data.dropna()

In [None]:
# Parse each row's aspect string into a list of names: split on ', ',
# strip surrounding spaces and discard empty entries.
aspect_lists = data['ground_truth_aspects'].str.split(', ').apply(
    lambda parts: [part.strip() for part in parts if part.strip()]
)

# Set of all distinct aspect names found in the data
unique_aspects = {aspect for parts in aspect_lists for aspect in parts}

# Add one 0/1 indicator column per aspect (1 when the row mentions it).
# Iterating in sorted order makes the resulting column order reproducible;
# plain set iteration order for strings can change between kernel restarts.
for aspect in sorted(unique_aspects):
    data[aspect] = aspect_lists.apply(lambda parts, name=aspect: int(name in parts))


data.head()

In [None]:
# Names of the aspect indicator columns of interest
columns = ['seller', 'delivery', 'service', 'price', 'packaging',
       'shelf', 'rider', 'product']

In [None]:
import plotly.express as px

def plot_value_counts(df, column_name):
    """Display a bar chart and a pie chart of one column's value frequencies.

    Args:
        df: DataFrame containing ``column_name``.
        column_name: Name of the column whose values are counted.
    """
    # Two-column frequency table: the value and how often it occurs
    frequency_table = df[column_name].value_counts().reset_index()
    frequency_table.columns = [column_name, 'count']

    # Bar chart, one colour per value; the legend is hidden because it would
    # only repeat the x-axis labels
    px.bar(
        frequency_table,
        x=column_name,
        y='count',
        color=column_name,
        labels={column_name:'Value', 'count':'Count'},
        title=f'{column_name.capitalize()} Value Counts',
    ).update_layout(showlegend=False).show()

    # Pie chart of the same counts
    px.pie(
        frequency_table,
        names=column_name,
        values='count',
        title=f'{column_name.capitalize()} Value Counts Distribution',
    ).show()

# Show the bar and pie charts for every aspect column listed in `columns`
for col in columns:
    plot_value_counts(data, col)

In [None]:
# Columns to remove: the raw aspect string plus three indicator columns whose
# names look like malformed aspect entries (presumably typos in the source
# data; confirm against the CSV).
drop_columns = [
    "ground_truth_aspects",
    "packaging  product",
    "seller,shelf",
    "product,price",
]
data = data.drop(columns=drop_columns)

In [None]:
# Model inputs: every review prefixed with a short tag (kept as a tuple).
text = tuple(f'ReviewContent: {review.strip()}' for review in data['reviewContent'])

# Label matrix with one column per aspect. The columns are selected by name,
# in the order given by `columns`, rather than by position: the positional
# order of the aspect columns is not guaranteed, and per-class results are
# later reported against a fixed list of aspect names.
labels = data[columns].to_numpy(dtype=int)

In [None]:
# Per-label weight: one minus that label's share of all positive annotations
label_weights = 1 - labels.sum(axis=0) / labels.sum()

# Stratified train/validation split for multilabel data. The split is done on
# row indices (as a single-column 2-D array) so the texts can be looked up
# afterwards; 10% of the rows are requested for validation.
row_ids = np.arange(len(labels))
train_idx, y_train, val_idx, y_val = iterative_train_test_split(
    row_ids.reshape(-1, 1), labels, test_size=0.1
)
x_train = [text[i] for i in train_idx.ravel()]
x_val = [text[i] for i in val_idx.ravel()]

# Wrap both splits in a Hugging Face DatasetDict
ds = DatasetDict(
    train=Dataset.from_dict({'text': x_train, 'labels': y_train}),
    val=Dataset.from_dict({'text': x_val, 'labels': y_val}),
)

In [None]:
from huggingface_hub import login

# SECURITY: access tokens must never be written into the notebook; anything
# saved here is shared with the file. Tokens that were previously committed
# in this cell should be revoked and regenerated.
# The token is read from the HF_TOKEN environment variable; when it is not
# set, `token=None` makes `login` ask for the token interactively.
login(token=os.environ.get("HF_TOKEN"))

In [None]:
# model name
# model_name = 'meta-llama/Meta-Llama-3-8B'
model_name = 'microsoft/Phi-3-mini-4k-instruct'

# Load the tokenizer and use the end-of-sequence token for padding
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

# Preprocess the dataset with the tokenizer. `tokenize_examples` is the helper
# defined in the cell near the top of the notebook; it is not re-declared here
# so the notebook has a single definition of it.
tokenized_ds = ds.map(functools.partial(tokenize_examples, tokenizer=tokenizer), batched=True)
# Have the dataset return torch tensors
tokenized_ds = tokenized_ds.with_format('torch')

In [None]:
# !pip install -q bitsandbytes peft
from transformers import BitsAndBytesConfig
from peft import get_peft_model, LoraConfig
# quantization config
quantization_config = BitsAndBytesConfig(
    load_in_4bit = True, # enable 4-bit quantization
    bnb_4bit_quant_type = 'nf4', # information theoretically optimal dtype for normally distributed weights
    bnb_4bit_use_double_quant = True, # additionally quantize the quantization constants
    bnb_4bit_compute_dtype = torch.bfloat16 # optimized fp format for ML
)

# lora config
lora_config = LoraConfig(
    r = 16, # the dimension of the low-rank matrices
    lora_alpha = 8, # scaling factor for LoRA activations vs pre-trained weight activations
    # Attention projections that receive LoRA adapters. Phi-3 fuses the
    # query/key/value projections into a single `qkv_proj` layer, so with only
    # q_proj/k_proj/v_proj listed just `o_proj` would be adapted for that model.
    # The separate q/k/v names are kept for architectures that have them (such
    # as the Llama model mentioned as an alternative); names that do not occur
    # in the loaded model are simply not matched.
    target_modules = ['q_proj', 'k_proj', 'v_proj', 'qkv_proj', 'o_proj'],
    lora_dropout = 0.05, # dropout probability of the LoRA layers
    bias = 'none', # whether to train bias weights, set to 'none' for attention layers
    task_type = 'SEQ_CLS'
)


In [None]:

from transformers import BitsAndBytesConfig
from peft import get_peft_model, LoraConfig
# quantization config
quantization_config = BitsAndBytesConfig(
    load_in_4bit = True, # enable 4-bit quantization
    bnb_4bit_quant_type = 'nf4', # information theoretically optimal dtype for normally distributed weights
    bnb_4bit_use_double_quant = True, # additionally quantize the quantization constants
    bnb_4bit_compute_dtype = torch.bfloat16 # optimized fp format for ML
)

# lora config
lora_config = LoraConfig(
    r = 16, # the dimension of the low-rank matrices
    lora_alpha = 8, # scaling factor for LoRA activations vs pre-trained weight activations
    # Attention projections that receive LoRA adapters. Phi-3 fuses the
    # query/key/value projections into a single `qkv_proj` layer, so with only
    # q_proj/k_proj/v_proj listed just `o_proj` would be adapted for that model.
    # The separate q/k/v names are kept for architectures that have them (such
    # as the Llama model mentioned as an alternative); names that do not occur
    # in the loaded model are simply not matched.
    target_modules = ['q_proj', 'k_proj', 'v_proj', 'qkv_proj', 'o_proj'],
    lora_dropout = 0.05, # dropout probability of the LoRA layers
    bias = 'none', # whether to train bias weights, set to 'none' for attention layers
    task_type = 'SEQ_CLS'
)

# load the 4-bit quantized base model with one output per label column
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    quantization_config=quantization_config,
    num_labels=labels.shape[1]
)
# prepare the quantized model for training, then attach the LoRA adapters
model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, lora_config)
# tell the model which token id is padding (same id the tokenizer pads with)
model.config.pad_token_id = tokenizer.pad_token_id

In [None]:
# Names attached to the per-class metrics; position i names label column i,
# so this order must match the column order of the label matrix.
class_names = ['seller', 'delivery', 'service', 'price', 'packaging',
       'shelf', 'rider', 'product']

training_args = TrainingArguments(
    output_dir='multilabel_classification',
    learning_rate=1e-4,
    per_device_train_batch_size=4,  # tested with 16GB GPU RAM
    per_device_eval_batch_size=4,
    num_train_epochs=20,
    weight_decay=0.01,
    # Evaluate and save on a step schedule. `eval_steps` is not set here;
    # per the transformers docs it then falls back to `logging_steps`
    # (TODO confirm for the installed version).
    evaluation_strategy='steps',
    save_strategy='steps',
    load_best_model_at_end=True,
    logging_steps=500,  # log every 500 steps
    save_steps=2000,  # save a checkpoint every 2000 steps
    logging_dir='logs',  # directory for storing logs
)

# Instantiate the trainer with the custom implementation
trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_ds['train'],
    eval_dataset=tokenized_ds['val'],
    tokenizer=tokenizer,
    data_collator=functools.partial(collate_fn, tokenizer=tokenizer),
    # Trainer calls compute_metrics with one argument; bind class_names here
    compute_metrics=lambda p: compute_metrics(p, class_names),
    label_weights=torch.tensor(label_weights, device=model.device) # Pass label_weights here
)
# Start fine-tuning
trainer.train()