In [1]:
!pip install transformers torch datasets

Collecting datasets
  Downloading datasets-3.0.1-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.17-py310-none-any.whl.metadata (7.2 kB)
INFO: pip is looking at multiple versions of multiprocess to determine which version is compatible with other requirements. This could take a while.
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.0.1-py3-none-any.whl (471 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m471.6/471.6 kB[0m [31m16.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m4.2 MB/s[0m eta [36m0:00:

In [2]:
# Mount Google Drive into the Colab filesystem at /content/drive.
# NOTE(review): the CSVs read later in this notebook come from /content/, not from this mount.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
import os
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from transformers import TrainingArguments, Trainer

from torch.nn import CrossEntropyLoss
from sklearn.utils.class_weight import compute_class_weight

In [8]:
# Load the train / validation / test splits and show per-column non-null counts


def _read_split(csv_path):
    """Read one split from `csv_path` and discard its 'Unnamed: 0' column."""
    frame = pd.read_csv(csv_path)
    return frame.drop(columns=['Unnamed: 0'])


train_df = _read_split("/content/train_data.csv")
print(train_df.count())

val_df = _read_split("/content/validation_data.csv")
print(val_df.count())

test_df = _read_split("/content/test_data.csv")
print(test_df.count())

productId        7496
Title            7496
userId           7496
Time             7496
Text             7496
Cat1             7496
Cat2             7496
Cat3             7496
clean_Title      7496
clean_Text       7496
combined_text    7496
dtype: int64
productId        1499
Title            1499
userId           1499
Time             1499
Text             1499
Cat1             1499
Cat2             1499
Cat3             1499
clean_Title      1499
clean_Text       1499
combined_text    1499
dtype: int64
productId        1000
Title            1000
userId           1000
Time             1000
Text             1000
Cat1             1000
Cat2             1000
Cat3             1000
clean_Title      1000
clean_Text       1000
combined_text    1000
dtype: int64


In [9]:
# Create a dictionary to map categories to numerical labels.
# The mapping is built from the training split only, in order of first appearance in Cat1.

category_mapping = {category: idx for idx, category in enumerate(train_df['Cat1'].unique())}

for split_name, split_df in (("train", train_df), ("validation", val_df), ("test", test_df)):
    split_df['label_1'] = split_df['Cat1'].map(category_mapping)
    # A category absent from the training split maps to NaN; stop here with a clear
    # message rather than letting NaN labels reach the model or the metrics.
    unmapped = split_df.loc[split_df['label_1'].isna(), 'Cat1'].unique()
    if len(unmapped) > 0:
        raise ValueError(
            f"{split_name} split contains Cat1 values missing from category_mapping: {list(unmapped)}"
        )

print(category_mapping)

{'health personal care': 0, 'beauty': 1, 'toys games': 2, 'baby products': 3, 'grocery gourmet food': 4, 'pet supplies': 5}


In [11]:
# Split each frame into model inputs (combined_text) and targets (label_1)

X_train = list(train_df["combined_text"])
y_train = list(train_df["label_1"])
X_val = list(val_df["combined_text"])
y_val = list(val_df["label_1"])
X_test = list(test_df["combined_text"])
y_test = list(test_df["label_1"])

# np.bincount gives the number of samples per label id (index i -> count of label i)
print("Original class distribution in training data:", np.bincount(y_train))
print("Original class distribution in validation data:", np.bincount(y_val))
# This line reports the test split; it was previously mislabelled as validation data.
print("Original class distribution in testing data:", np.bincount(y_test))

Original class distribution in training data: [2239 1600 1319  525  632 1181]
Original class distribution in validation data: [457 315 253 107 130 237]
Original class distribution in validation data: [291 220 187  66  78 158]


## Tokenization using BertTokenizer

In [12]:
# Load the pre-trained BertTokenizer ('bert-base-uncased' checkpoint) from transformers

from transformers import BertTokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]



In [13]:
# Tokenize the training, validation and testing texts with one shared configuration:
# padding and truncation enabled, sequences capped at 512 tokens.

tokenize_kwargs = dict(padding=True, truncation=True, max_length=512)

X_train_tokenized = tokenizer(X_train, **tokenize_kwargs)
X_val_tokenized = tokenizer(X_val, **tokenize_kwargs)
X_test_tokenized = tokenizer(X_test, **tokenize_kwargs)

In [14]:
# Inspect which fields the tokenizer produced for the training split
X_train_tokenized.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])

In [15]:
# Create torch dataset
class Dataset(torch.utils.data.Dataset):
    """Map-style dataset over tokenizer encodings and optional labels.

    Args:
        encodings: mapping from field name (must include "input_ids") to a
            sequence indexable by sample position.
        labels: optional sequence of labels indexable by sample position.
            When None or empty, items carry no "labels" entry.
    """

    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def _has_labels(self):
        """Return True when a non-empty label sequence was supplied."""
        # A plain truthiness test (`if self.labels:`) raises ValueError for
        # multi-element numpy arrays / tensors, so check None and length explicitly.
        return self.labels is not None and len(self.labels) > 0

    def __getitem__(self, idx):
        """Return a dict of tensors for sample `idx`, plus "labels" when available."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        if self._has_labels():
            item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        """Number of samples, taken from the length of the "input_ids" field."""
        return len(self.encodings["input_ids"])

In [16]:
# Wrap each tokenized split together with its labels in the torch Dataset class

train_dataset, val_dataset, test_dataset = (
    Dataset(encodings, labels)
    for encodings, labels in (
        (X_train_tokenized, y_train),
        (X_val_tokenized, y_val),
        (X_test_tokenized, y_test),
    )
)

## Fine Tuning using BertForSequenceClassification

In [17]:
# Load BertForSequenceClassification with one output per Cat1 category and move it
# to the GPU when one is available (falls back to CPU otherwise).

from transformers import BertForSequenceClassification

model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    # Derived from the label mapping instead of a hard-coded 6, so the classifier
    # head always matches the number of categories actually present.
    num_labels=len(category_mapping),
)
model = model.to(torch.device('cuda' if torch.cuda.is_available() else 'cpu'))

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [18]:
# Define compute_metrics function
from sklearn.metrics import precision_recall_fscore_support
def compute_metrics(p):
    """Compute accuracy and weighted precision / recall / F1 for an evaluation pass.

    Args:
        p: a pair that unpacks into (predictions, labels); predictions hold
           per-class scores along axis 1.

    Returns:
        dict with the keys 'accuracy', 'precision', 'recall' and 'f1'.
    """
    scores, labels = p
    predicted = np.argmax(scores, axis=1)

    # Weighted averaging: per-class metrics are combined weighted by class support.
    prf = precision_recall_fscore_support(labels, predicted, average='weighted')
    acc = accuracy_score(labels, predicted)

    metrics = {}
    metrics['accuracy'] = acc
    metrics['precision'] = prf[0]
    metrics['recall'] = prf[1]
    metrics['f1'] = prf[2]
    return metrics

In [19]:
# Computing class weights to handle imbalanced classes in Cat1

# 'balanced' weights computed over the label ids present in y_train.
class_weights = compute_class_weight(class_weight='balanced', classes=np.unique(y_train), y=y_train)
# Position i in the array corresponds to the i-th sorted label id.
class_weights = dict(enumerate(class_weights))

class_weights_tensor = torch.tensor(list(class_weights.values()), dtype=torch.float)
# Use the GPU when available instead of assuming one exists.
class_weights_tensor = class_weights_tensor.to(
    torch.device('cuda' if torch.cuda.is_available() else 'cpu')
)
loss_fn = CrossEntropyLoss(weight=class_weights_tensor)


class CustomTrainer(Trainer):
    """Trainer that replaces the model's own loss with class-weighted cross-entropy."""

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute the weighted cross-entropy loss for one batch.

        Args:
            model: the model being trained / evaluated.
            inputs: batch dict; must contain "labels".
            return_outputs: when True, return (loss, outputs) instead of the loss alone.
            num_items_in_batch: accepted for compatibility with newer Trainer
                versions that pass this keyword; not used here.

        Returns:
            The loss tensor, or a (loss, outputs) tuple when `return_outputs` is True.
        """
        labels = inputs.get("labels")
        outputs = model(**inputs)
        logits = outputs.get("logits")
        # Keep the class weights on the same device as the logits, wherever the
        # Trainer placed the model.
        criterion = loss_fn.to(logits.device)
        loss = criterion(logits, labels)
        return (loss, outputs) if return_outputs else loss

In [24]:
# Define Trainer
from dataclasses import fields

# `evaluation_strategy` is deprecated in favour of `eval_strategy` in newer
# transformers releases. Pick whichever keyword the installed version defines so
# this cell works with an unpinned install.
_eval_strategy_key = (
    "eval_strategy"
    if "eval_strategy" in {f.name for f in fields(TrainingArguments)}
    else "evaluation_strategy"
)

args = TrainingArguments(
    output_dir="product_Cat1_classification",
    learning_rate=2e-5,
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=8,
    # Evaluate on eval_dataset at the end of every epoch.
    **{_eval_strategy_key: "epoch"},
)

trainer = CustomTrainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)



### Training and evaluation on validation set

In [25]:
# Fine-tune the model; the trainer is configured to evaluate on val_dataset every epoch.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,No log,0.329897,0.901268,0.901461,0.901268,0.901256
2,0.452700,0.3564,0.899933,0.903293,0.899933,0.900029
3,0.186300,0.355803,0.917278,0.918109,0.917278,0.917252


TrainOutput(global_step=1407, training_loss=0.25733933113277085, metrics={'train_runtime': 2291.9804, 'train_samples_per_second': 9.812, 'train_steps_per_second': 0.614, 'total_flos': 5917053912662016.0, 'train_loss': 0.25733933113277085, 'epoch': 3.0})

In [26]:
# Evaluate the trained model on the trainer's eval_dataset (val_dataset).
trainer.evaluate()

{'eval_loss': 0.3558032810688019,
 'eval_accuracy': 0.9172781854569713,
 'eval_precision': 0.918109193810239,
 'eval_recall': 0.9172781854569713,
 'eval_f1': 0.9172519509779813,
 'eval_runtime': 45.1902,
 'eval_samples_per_second': 33.171,
 'eval_steps_per_second': 4.16,
 'epoch': 3.0}

In [None]:
# Save the fine-tuned model and the tokenizer side by side in one local directory.
# NOTE(review): nothing in this cell copies the directory to Google Drive.

saved_model_dir = 'product_category_prediction'
trainer.save_model(saved_model_dir)
tokenizer.save_pretrained(saved_model_dir)


## Prediction on Testing data

In [28]:
# Run inference on the test split; keep the full output for its metrics
test_preds_output = trainer.predict(test_dataset)

# Raw per-class scores, then the highest-scoring class id for each sample
test_predictions = test_preds_output.predictions
test_prediction_labels = test_predictions.argmax(axis=1)

In [29]:
# Checking performance on the test data (metrics returned by trainer.predict)

test_preds_output.metrics

{'test_loss': 0.23423178493976593,
 'test_accuracy': 0.928,
 'test_precision': 0.9283025485596446,
 'test_recall': 0.928,
 'test_f1': 0.9280367458331157,
 'test_runtime': 32.3818,
 'test_samples_per_second': 30.882,
 'test_steps_per_second': 3.86}

In [31]:
# Step 1: Invert category_mapping so each numeric label points back to its category name
reverse_category_mapping = dict(zip(category_mapping.values(), category_mapping.keys()))

# Step 2: Translate every predicted label id into the original category name
test_prediction_Cat1 = [
    reverse_category_mapping[predicted_id] for predicted_id in test_prediction_labels
]

['health personal care', 'beauty', 'pet supplies', 'health personal care', 'pet supplies', 'pet supplies', 'pet supplies', 'toys games', 'health personal care', 'pet supplies', 'health personal care', 'health personal care', 'health personal care', 'toys games', 'toys games', 'beauty', 'health personal care', 'pet supplies', 'pet supplies', 'pet supplies', 'pet supplies', 'pet supplies', 'beauty', 'health personal care', 'beauty', 'health personal care', 'pet supplies', 'health personal care', 'pet supplies', 'pet supplies', 'health personal care', 'toys games', 'beauty', 'toys games', 'toys games', 'health personal care', 'health personal care', 'health personal care', 'toys games', 'beauty', 'toys games', 'beauty', 'grocery gourmet food', 'toys games', 'health personal care', 'grocery gourmet food', 'grocery gourmet food', 'health personal care', 'toys games', 'toys games', 'beauty', 'toys games', 'pet supplies', 'health personal care', 'baby products', 'beauty', 'health personal car

In [32]:
# Append the predicted Cat1 names to the test data.
# Assign the list directly so values are matched by position. Wrapping it in
# pd.Series aligns on index labels instead, which silently produces NaN whenever
# test_df does not carry a default 0..n-1 index. A length mismatch now raises.

test_df['pred_Cat1'] = test_prediction_Cat1
test_df.head(5)


Unnamed: 0,productId,Title,userId,Time,Text,Cat1,Cat2,Cat3,clean_Title,clean_Text,combined_text,label_1,pred_Cat1
0,B00068S9H2,Mega Yohimbe Extract 750 mg 120 Caps,AEN7OO9EQ3OC8,1358985600,I actually believe this stuff is potent enough...,health personal care,nutrition wellness,vitamins supplements,mega yohimbe extract number mg number caps,i actually believe this stuff is potent enough...,the name of the product is mega yohimbe extrac...,0,health personal care
1,B000C2J64W,FCUK her - EDT Spray 3.4 oz. (Womens),AHEN7LNF68ERG,1358035200,This product was very good and as a gift was r...,beauty,fragrance,women s,fcuk her edt spray number oz womens,this product was very good and as a gift was r...,the name of the product is fcuk her edt spray...,1,beauty
2,B000MRBQA4,Dog Safety Vest - Bright Orange Reflective Saf...,A360N3OUOCINPY,1352160000,"read # 2. ""title for your review"". it does not...",pet supplies,dogs,apparel accessories,dog safety vest bright orange reflective safe...,read number title for your review it does not...,the name of the product is dog safety vest br...,5,pet supplies
3,B0009MFUWC,"Uplift Technologies DL930 Day-Light 10,000 Lux...",A2S4RUT5VCQZRW,1354492800,"I've had this light for a few years now, and I...",health personal care,health care,alternative medicine,uplift technologies dlnumber daylight number l...,ive had this light for a few years now and i l...,the name of the product is uplift technologies...,0,health personal care
4,B000A7XY10,Contech StayAway Automatic Pet Deterrent Refil...,A2DZGR0TW07HR9,1357948800,it taught my two blue russian cats to stay awa...,pet supplies,cats,educational repellents,contech stayaway automatic pet deterrent refil...,it taught my two blue russian cats to stay awa...,the name of the product is contech stayaway au...,5,pet supplies


In [33]:
# Write the test frame, now including pred_Cat1, to a CSV file

predictions_csv_path = "testing_data_pred_Cat1.csv"
test_df.to_csv(predictions_csv_path)