In [1]:
# Install the missing module
!pip install -q accelerate -U bitsandbytes
!pip install -q scikit-multilearn datasets peft transformers
import os
import random
import functools
import csv
import numpy as np
import torch
import torch.nn.functional as F
from sklearn.metrics import f1_score
from skmultilearn.model_selection import iterative_train_test_split # This import should now work
from datasets import Dataset, DatasetDict
from peft import (
    LoraConfig,
    prepare_model_for_kbit_training,
    get_peft_model
)
import os
os.environ["WANDB_MODE"] = "disabled"
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    Trainer
)

2024-06-14 03:05:26.706855: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-06-14 03:05:26.706962: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-06-14 03:05:26.870550: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [2]:
def tokenize_examples(examples, tokenizer):
    tokenized_inputs = tokenizer(examples['text'])
    tokenized_inputs['labels'] = examples['labels']
    return tokenized_inputs

In [3]:
# Define custom batch preprocessor
def collate_fn(batch, tokenizer):
    dict_keys = ['input_ids', 'attention_mask', 'labels']
    d = {k: [dic[k] for dic in batch] for k in dict_keys}
    d['input_ids'] = torch.nn.utils.rnn.pad_sequence(
        d['input_ids'], batch_first=True, padding_value=tokenizer.pad_token_id
    )
    d['attention_mask'] = torch.nn.utils.rnn.pad_sequence(
        d['attention_mask'], batch_first=True, padding_value=0
    )
    d['labels'] = torch.stack(d['labels'])
    return d

from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score

# Define which metrics to compute for evaluation
def compute_metrics(p, class_names):
    predictions, labels = p
    thresholded_predictions = predictions > 0

#     accuracy = accuracy_score(labels, thresholded_predictions)

    f1_micro = f1_score(labels, thresholded_predictions, average='micro')

    precision_micro = precision_score(labels, thresholded_predictions, average='micro')

    recall_micro = recall_score(labels, thresholded_predictions, average='micro')

    # Compute per-class metrics
    f1_per_class = f1_score(labels, thresholded_predictions, average=None)
    precision_per_class = precision_score(labels, thresholded_predictions, average=None)
    recall_per_class = recall_score(labels, thresholded_predictions, average=None)

    # Map metrics to class names
    f1_per_class_dict = {class_names[i]: f1_per_class[i] for i in range(len(class_names))}
    precision_per_class_dict = {class_names[i]: precision_per_class[i] for i in range(len(class_names))}
    recall_per_class_dict = {class_names[i]: recall_per_class[i] for i in range(len(class_names))}

    return {
        'f1_micro': f1_micro,
        'precision_micro': precision_micro,
        'recall_micro': recall_micro,
        'f1_per_class': f1_per_class_dict,
        'precision_per_class': precision_per_class_dict,
        'recall_per_class': recall_per_class_dict
    }




# create custom trainer class to be able to pass label weights and calculate mutilabel loss
class CustomTrainer(Trainer):

    def __init__(self, label_weights, **kwargs):
        super().__init__(**kwargs)
        self.label_weights = label_weights

    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.pop("labels")

        # forward pass
        outputs = model(**inputs)
        logits = outputs.get("logits")

        # compute custom loss
        loss = F.binary_cross_entropy_with_logits(logits, labels.to(torch.float32), pos_weight=self.label_weights)
        return (loss, outputs) if return_outputs else loss

# set random seed
random.seed(0)

In [4]:
import pandas as pd
import re
data = pd.read_csv("/kaggle/input/review-dataset/reviewsDataset.csv")

In [5]:
def clean_text(text):
    # Regular expression to match only Bengali characters, digits, and special characters
    pattern = re.compile(r'[^০-৯\u0980-\u09FF\s,.!?\'"@#$%^&*()_+=\-`~<>:;"{}[\]\\|/]')
    filtered_string = pattern.sub('', text)
    output_string = re.sub(r'\s+', ' ', filtered_string).strip()
    return output_string

# Apply the function to the 'text' column
data['reviewContent'] = data['reviewContent'].apply(clean_text)
data.head()

Unnamed: 0,reviewContent,ground_truth_aspects
0,"আলহামদুলিল্লাহ, প্রোডাক্টটি অনেক ভালো! সাউন্ড ...","product, packaging"
1,"অসাধারণ একটা প্রডাক্ট হাতে পেলাম,সত্যিই অসাধার...","product, seller, packaging"
2,আসা করি যে ভালো হবে কিন্তু অনেক ভালো ছিল চালে ...,"product, seller"
3,"যেমন ওর্ডার করেছি তেমন পেয়েছি,প্যাকেটিং ভালো ছ...","product, packaging, seller"
4,১০০% আসল প্রোডাক্ট সিলেটের মধ্যে ৮ দিনের মধ্যে...,"product, delivery, seller"


In [6]:
data.shape

(1648, 2)

In [7]:
data.isnull().sum()

reviewContent            0
ground_truth_aspects    67
dtype: int64

In [8]:

data = data.dropna()

In [9]:
# Create a set to hold unique aspects
unique_aspects = set()

# Split the aspects and update the unique_aspects set
data['ground_truth_aspects'].str.split(', ').apply(unique_aspects.update)

# Strip spaces from unique aspects and remove any empty strings
unique_aspects = {aspect.strip() for aspect in unique_aspects if aspect.strip()}

# Create a new column for each unique aspect and initialize with 0
for aspect in unique_aspects:
    data[aspect] = 0

# Populate the columns based on the aspects present in each row
for index, row in data.iterrows():
    aspects = [aspect.strip() for aspect in row['ground_truth_aspects'].split(', ')]
    for aspect in aspects:
        if aspect:  # only update if aspect is not an empty string
            data.at[index, aspect] = 1


data.head()

Unnamed: 0,reviewContent,ground_truth_aspects,"product,price",service,rider,seller,product,"seller,shelf",delivery,shelf,price,packaging,packaging product
0,"আলহামদুলিল্লাহ, প্রোডাক্টটি অনেক ভালো! সাউন্ড ...","product, packaging",0,0,0,0,1,0,0,0,0,1,0
1,"অসাধারণ একটা প্রডাক্ট হাতে পেলাম,সত্যিই অসাধার...","product, seller, packaging",0,0,0,1,1,0,0,0,0,1,0
2,আসা করি যে ভালো হবে কিন্তু অনেক ভালো ছিল চালে ...,"product, seller",0,0,0,1,1,0,0,0,0,0,0
3,"যেমন ওর্ডার করেছি তেমন পেয়েছি,প্যাকেটিং ভালো ছ...","product, packaging, seller",0,0,0,1,1,0,0,0,0,1,0
4,১০০% আসল প্রোডাক্ট সিলেটের মধ্যে ৮ দিনের মধ্যে...,"product, delivery, seller",0,0,0,1,1,0,1,0,0,0,0


In [10]:
columns = ['seller', 'delivery', 'service', 'price', 'packaging',
       'shelf', 'rider', 'product']

In [11]:
import plotly.express as px

def plot_value_counts(df, column_name):
    # Get value counts
    value_counts = df[column_name].value_counts().reset_index()
    value_counts.columns = [column_name, 'count']

    # Create bar plot with different colors
    fig_bar = px.bar(value_counts, x=column_name, y='count', 
                     color=column_name, 
                     labels={column_name:'Value', 'count':'Count'},
                     title=f'{column_name.capitalize()} Value Counts')

    # Update the layout for better visualization
    fig_bar.update_layout(showlegend=False)

    fig_bar.show()

    # Create pie chart
    fig_pie = px.pie(value_counts, names=column_name, values='count', 
                     title=f'{column_name.capitalize()} Value Counts Distribution')

    fig_pie.show()
    
# Example usage:
for col in columns:
    plot_value_counts(data, col)

In [12]:
drop_columns = ["ground_truth_aspects", "packaging  product", "seller,shelf", "product,price"]
data = data.drop(columns=drop_columns, axis = 1)

In [13]:
text, labels = list(zip(*[(f'ReviewContent: {row[0].strip()}', row[1:].astype(int)) for row in data.values])) # Iterate over the values of the DataFrame and convert the labels to integers
labels = np.array(labels)

In [14]:
# create label weights
label_weights = 1 - labels.sum(axis=0) / labels.sum()

# stratified train test split for multilabel ds
row_ids = np.arange(len(labels))
train_idx, y_train, val_idx, y_val = iterative_train_test_split(row_ids[:,np.newaxis], labels, test_size = 0.3)
x_train = [text[i] for i in train_idx.flatten()]
x_val = [text[i] for i in val_idx.flatten()]

# create hf dataset
ds = DatasetDict({
    'train': Dataset.from_dict({'text': x_train, 'labels': y_train}),
    'val': Dataset.from_dict({'text': x_val, 'labels': y_val})
})


In [15]:
from huggingface_hub import login
login("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")
# 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

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [16]:
# model name
model_name = 'meta-llama/Meta-Llama-3-8B-Instruct'
# model_name = 'microsoft/Phi-3-mini-4k-instruct'
# preprocess dataset with tokenizer
def tokenize_examples(examples, tokenizer):
    tokenized_inputs = tokenizer(examples['text'])
    tokenized_inputs['labels'] = examples['labels']
    return tokenized_inputs

tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token
tokenized_ds = ds.map(functools.partial(tokenize_examples, tokenizer=tokenizer), batched=True)
tokenized_ds = tokenized_ds.with_format('torch')

tokenizer_config.json:   0%|          | 0.00/51.0k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/73.0 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Map:   0%|          | 0/1097 [00:00<?, ? examples/s]

Map:   0%|          | 0/484 [00:00<?, ? examples/s]

In [17]:
from transformers import BitsAndBytesConfig
from peft import get_peft_model, LoraConfig
# qunatization config
quantization_config = BitsAndBytesConfig(
    load_in_4bit = True, # enable 4-bit quantization
    bnb_4bit_quant_type = 'nf4', # information theoretically optimal dtype for normally distributed weights
    bnb_4bit_use_double_quant = True, # quantize quantized weights //insert xzibit meme
    bnb_4bit_compute_dtype = torch.bfloat16 # optimized fp format for ML
)

# lora config
lora_config = LoraConfig(
    r = 16, # the dimension of the low-rank matrices
    lora_alpha = 8, # scaling factor for LoRA activations vs pre-trained weight activations
    target_modules = ['q_proj', 'k_proj', 'v_proj', 'o_proj'],
    lora_dropout = 0.05, # dropout probability of the LoRA layers
    bias = 'none', # wether to train bias weights, set to 'none' for attention layers
    task_type = 'SEQ_CLS'
)

# load model
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    quantization_config=quantization_config,
    num_labels=labels.shape[1]
)
model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, lora_config)
model.config.pad_token_id = tokenizer.pad_token_id

config.json:   0%|          | 0.00/654 [00:00<?, ?B/s]

`low_cpu_mem_usage` was None, now set to True since model is quantized.


model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Meta-Llama-3-8B-Instruct and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [18]:
class_names = ['seller', 'delivery', 'service', 'price', 'packaging',
       'shelf', 'rider', 'product']

training_args = TrainingArguments(
    output_dir='multilabel_classification_Llama3',
    learning_rate=1e-4,
    per_device_train_batch_size=4,  # tested with 16GB GPU RAM
    per_device_eval_batch_size=4,
    num_train_epochs=10,
    weight_decay=0.01,
    evaluation_strategy='steps',
    save_strategy='steps',
    load_best_model_at_end=True,
    logging_steps=345,  # log every 250 steps (500 is a round multiple of 250)
    save_steps=345,  # save every 500 steps
    logging_dir='logs',  # directory for storing logs
)

# Instantiate the trainer with the custom implementation
trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_ds['train'],
    eval_dataset=tokenized_ds['val'],
    tokenizer=tokenizer,
    data_collator=functools.partial(collate_fn, tokenizer=tokenizer),
    compute_metrics=lambda p: compute_metrics(p, class_names),
    label_weights=torch.tensor(label_weights, device=model.device) # Pass label_weights here
)
trainer.train()


`evaluation_strategy` is deprecated and will be removed in version 4.46 of 🤗 Transformers. Use `eval_strategy` instead

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.

torch.utils.checkpoint: please pass in use_reentrant=True or use_reentrant=False explicitly. The default value of use_reentrant will be updated to be False in the future. To maintain current behavior, pass use_reentrant=True. It is recommended that you use use_reentrant=False. Refer to docs for more details on the differences between the two variants.



Step,Training Loss,Validation Loss,F1 Micro,Precision Micro,Recall Micro,F1 Per Class,Precision Per Class,Recall Per Class
345,0.2929,0.184502,0.821159,0.863576,0.782713,"{'seller': 0.4814814814814815, 'delivery': 0.45454545454545453, 'service': 0.8829787234042553, 'price': 0.8879184861717614, 'packaging': 0.5757575757575758, 'shelf': 0.8636363636363635, 'rider': 0.8928571428571429, 'product': 0.8410256410256409}","{'seller': 0.5, 'delivery': 1.0, 'service': 0.8924731182795699, 'price': 0.8448753462603878, 'packaging': 0.95, 'shelf': 0.95, 'rider': 0.9375, 'product': 0.9761904761904762}","{'seller': 0.4642857142857143, 'delivery': 0.29411764705882354, 'service': 0.8736842105263158, 'price': 0.9355828220858896, 'packaging': 0.41304347826086957, 'shelf': 0.7916666666666666, 'rider': 0.8522727272727273, 'product': 0.7387387387387387}"
690,0.0677,0.169489,0.87044,0.914135,0.830732,"{'seller': 0.5652173913043479, 'delivery': 0.7741935483870968, 'service': 0.8877005347593583, 'price': 0.890625, 'packaging': 0.8538011695906433, 'shelf': 0.9111111111111111, 'rider': 0.9204545454545455, 'product': 0.8965517241379309}","{'seller': 0.7222222222222222, 'delivery': 0.8571428571428571, 'service': 0.9021739130434783, 'price': 0.9076433121019108, 'packaging': 0.9240506329113924, 'shelf': 0.9761904761904762, 'rider': 0.9204545454545454, 'product': 0.9891304347826086}","{'seller': 0.4642857142857143, 'delivery': 0.7058823529411765, 'service': 0.8736842105263158, 'price': 0.8742331288343558, 'packaging': 0.7934782608695652, 'shelf': 0.8541666666666666, 'rider': 0.9204545454545454, 'product': 0.8198198198198198}"
1035,0.005,0.19046,0.888337,0.919127,0.859544,"{'seller': 0.6122448979591837, 'delivery': 0.7142857142857143, 'service': 0.9060773480662985, 'price': 0.9113149847094801, 'packaging': 0.8813559322033897, 'shelf': 0.9574468085106383, 'rider': 0.9142857142857144, 'product': 0.9073170731707317}","{'seller': 0.7142857142857143, 'delivery': 0.9090909090909091, 'service': 0.9534883720930233, 'price': 0.9085365853658537, 'packaging': 0.9176470588235294, 'shelf': 0.9782608695652174, 'rider': 0.9195402298850575, 'product': 0.9893617021276596}","{'seller': 0.5357142857142857, 'delivery': 0.5882352941176471, 'service': 0.8631578947368421, 'price': 0.9141104294478528, 'packaging': 0.8478260869565217, 'shelf': 0.9375, 'rider': 0.9090909090909091, 'product': 0.8378378378378378}"
1380,0.0021,0.191067,0.889027,0.919231,0.860744,"{'seller': 0.5957446808510638, 'delivery': 0.7142857142857143, 'service': 0.9060773480662985, 'price': 0.9082568807339448, 'packaging': 0.8876404494382023, 'shelf': 0.9574468085106383, 'rider': 0.9142857142857144, 'product': 0.9186602870813397}","{'seller': 0.7368421052631579, 'delivery': 0.9090909090909091, 'service': 0.9534883720930233, 'price': 0.9054878048780488, 'packaging': 0.9186046511627907, 'shelf': 0.9782608695652174, 'rider': 0.9195402298850575, 'product': 0.9795918367346939}","{'seller': 0.5, 'delivery': 0.5882352941176471, 'service': 0.8631578947368421, 'price': 0.911042944785276, 'packaging': 0.8586956521739131, 'shelf': 0.9375, 'rider': 0.9090909090909091, 'product': 0.8648648648648649}"


Trainer is attempting to log a value of "{'seller': 0.4814814814814815, 'delivery': 0.45454545454545453, 'service': 0.8829787234042553, 'price': 0.8879184861717614, 'packaging': 0.5757575757575758, 'shelf': 0.8636363636363635, 'rider': 0.8928571428571429, 'product': 0.8410256410256409}" of type <class 'dict'> for key "eval/f1_per_class" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'seller': 0.5, 'delivery': 1.0, 'service': 0.8924731182795699, 'price': 0.8448753462603878, 'packaging': 0.95, 'shelf': 0.95, 'rider': 0.9375, 'product': 0.9761904761904762}" of type <class 'dict'> for key "eval/precision_per_class" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'seller': 0.4642857142857143, 'delivery': 0.29411764705882354, 'service': 0.8736842105263158, 'price': 0.9355828220858896, 'pa

TrainOutput(global_step=1380, training_loss=0.09195628727691761, metrics={'train_runtime': 38593.5593, 'train_samples_per_second': 0.284, 'train_steps_per_second': 0.036, 'total_flos': 1.1720505292111872e+17, 'train_loss': 0.09195628727691761, 'epoch': 10.0})

In [19]:
from sklearn.metrics import classification_report
import numpy as np

# Assuming the compute_metrics function is already defined somewhere
def compute_metrics(pred, class_names):
    labels = pred.label_ids
    preds = pred.predictions
    preds = (preds > 0.5).astype(int)  # Assuming a threshold of 0.5 for binary classification
    
    # Compute precision, recall, f1 scores for each class
    report = classification_report(labels, preds, target_names=class_names, output_dict=True)
    
    # Convert the report to a dictionary of metrics
    metrics = {}
    for key, value in report.items():
        if isinstance(value, dict):
            for sub_key, sub_value in value.items():
                metrics[f'{key}_{sub_key}'] = sub_value
        else:
            metrics[key] = value
    
    return metrics

# Evaluate the model and get the metrics
metrics = trainer.evaluate()

# Print the classification report
preds = trainer.predict(tokenized_ds['val'])
labels = preds.label_ids
preds = (preds.predictions > 0.5).astype(int)  # Adjust the threshold as needed

report = classification_report(labels, preds, target_names=class_names)
print(report)


Precision and F-score are ill-defined and being set to 0.0 in samples with no predicted labels. Use `zero_division` parameter to control this behavior.


Precision and F-score are ill-defined and being set to 0.0 in samples with no predicted labels. Use `zero_division` parameter to control this behavior.



              precision    recall  f1-score   support

      seller       0.78      0.38      0.51        56
    delivery       0.85      0.65      0.73        17
     service       0.91      0.85      0.88        95
       price       0.92      0.84      0.88       326
   packaging       0.95      0.78      0.86        92
       shelf       0.98      0.85      0.91        48
       rider       0.92      0.91      0.91        88
     product       0.99      0.80      0.89       111

   micro avg       0.93      0.80      0.86       833
   macro avg       0.91      0.76      0.82       833
weighted avg       0.92      0.80      0.85       833
 samples avg       0.90      0.84      0.85       833




Precision and F-score are ill-defined and being set to 0.0 in samples with no predicted labels. Use `zero_division` parameter to control this behavior.

