In [None]:
!pip install simpletransformers

1. Data Loading

In [None]:
# data loading - user uploads a csv, incomplete rows are dropped, then a stratified split
import pandas as pd
from sklearn.model_selection import StratifiedShuffleSplit

from google.colab import files
uploaded = files.upload()
if not uploaded:
    # an empty result would otherwise surface as a bare StopIteration below
    raise ValueError("no file uploaded - re-run this cell and choose a csv file")

# load file from user upload (only the first uploaded file is read)
data = pd.read_csv(next(iter(uploaded)), encoding='ISO-8859-1')

# rows without a description or a category cannot be cleaned, stratified or
# label-encoded later on, so they are removed before anything else sees them
data = data.dropna(subset=["Input", "Prediction"])

# stratified split: 80% train / 20% validation, class proportions preserved.
# n_splits=1 means the generator yields exactly one (train, val) index pair.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_idx, val_idx = next(split.split(data, data["Prediction"]))
train_data = data.iloc[train_idx]
val_data = data.iloc[val_idx]

# create df for train and val using the 'text' / 'labels' column names
# that the later training and evaluation cells read
train_df = pd.DataFrame({
    'text': train_data['Input'],
    'labels': train_data['Prediction']
})
val_df = pd.DataFrame({
    'text': val_data['Input'],
    'labels': val_data['Prediction']
})

print("train labels:", train_df['labels'].nunique())
print("val labels:", val_df['labels'].nunique())
print(train_df.head())
print(val_df.head())

print("unique cats:", data["Prediction"].unique())


2. Text Processing

In [None]:
# text processing - clean text
import re

def clean_text(txt):
    """Normalise one product description before it is fed to the models.

    The text is lower-cased, every character that is not an ASCII letter or
    whitespace is deleted, and leading/trailing whitespace is trimmed.

    Args:
        txt: the raw description. ``None`` and float NaN (what pandas yields
            for an empty CSV cell) are treated as empty text; any other
            non-string value is converted with ``str()`` first.

    Returns:
        The cleaned string, which may be empty.
    """
    # NaN is the only float that is not equal to itself
    if txt is None or (isinstance(txt, float) and txt != txt):
        return ''
    if not isinstance(txt, str):
        txt = str(txt)
    txt = txt.lower()
    txt = re.sub(r'[^a-zA-Z\s]', '', txt)
    return txt.strip()

# run the cleaner over the text column of both splits (train first, then val)
for split_frame in (train_df, val_df):
    split_frame['text'] = split_frame['text'].apply(clean_text)

print(train_df.head())


In [None]:
# model init - create bert & roberta models using simpletransformers
import torch
from simpletransformers.classification import ClassificationModel

# one output unit per distinct category in the uploaded data
num_labels = len(data["Prediction"].unique())
# a hard-coded use_cuda=True fails on a CPU-only runtime, so follow what is available
use_cuda = torch.cuda.is_available()

bert_model = ClassificationModel('bert', 'bert-base-uncased', num_labels=num_labels, use_cuda=use_cuda)
roberta_model = ClassificationModel('roberta', 'roberta-base', num_labels=num_labels, use_cuda=use_cuda)


4. Model Training with BERT and RoBERTa
Basic Model Training
Train the BERT and RoBERTa Models

In [None]:
# model train prep - encode labels, set args, train models
import torch
from sklearn.preprocessing import LabelEncoder
from simpletransformers.classification import ClassificationArgs


def make_model_args(output_dir):
    """Build a fresh ClassificationArgs carrying the shared hyper-parameters.

    Args:
        output_dir: directory the model writes its checkpoints and outputs to.

    Returns:
        A new ClassificationArgs instance; every call returns a separate object.
    """
    return ClassificationArgs(
        num_train_epochs=3,
        train_batch_size=8,
        eval_batch_size=8,
        learning_rate=3e-5,
        max_seq_length=128,
        weight_decay=0.01,
        warmup_steps=0,
        logging_steps=50,
        save_steps=200,
        overwrite_output_dir=True,
        output_dir=output_dir,
    )


# Fit on the raw category column and encode from the untouched split frames
# (train_data / val_data). Encoding from train_df["labels"] itself would, on a
# second run of this cell, fit the encoder on already-encoded integers and
# lose the category names that the evaluation cell decodes.
label_encoder = LabelEncoder()
label_encoder.fit(data["Prediction"])

train_df["labels"] = label_encoder.transform(train_data["Prediction"])
val_df["labels"] = label_encoder.transform(val_data["Prediction"])

num_labels = len(label_encoder.classes_)
# a hard-coded use_cuda=True fails on a CPU-only runtime, so follow what is available
use_cuda = torch.cuda.is_available()

# Each model gets its own args object and its own output directory: with a
# shared directory and overwrite_output_dir=True the second training run
# would write over the first model's checkpoints, and a shared args object
# would let one model's configuration changes leak into the other.
bert_model = ClassificationModel('bert', 'bert-base-uncased', num_labels=num_labels,
                                 args=make_model_args('outputs/bert'), use_cuda=use_cuda)
bert_model.train_model(train_df)

roberta_model = ClassificationModel('roberta', 'roberta-base', num_labels=num_labels,
                                    args=make_model_args('outputs/roberta'), use_cuda=use_cuda)
roberta_model.train_model(train_df)

print("bert & roberta models trained succesfully!")


5. Evaluation on Validation Set

In [None]:
# evaluation on validation set
from sklearn.metrics import classification_report
import numpy as np

target_names = [str(x) for x in label_encoder.classes_]
# Pin the report to every encoded class id. Without `labels`, a class that is
# absent from both the validation labels and the predictions makes the number
# of reported classes differ from len(target_names) and the report fails.
all_label_ids = np.arange(len(target_names))
# Only the model inputs: the prediction columns added to val_df below are kept
# out of the frame handed to eval_model.
val_inputs = val_df[['text', 'labels']]


def evaluate_on_val(model):
    """Evaluate one trained model on the validation split.

    Args:
        model: a trained model exposing ``eval_model``.

    Returns:
        A tuple ``(result, raw_outputs, wrong_predictions, pred_ids, pred_labels)``.
        The first three items are what ``eval_model`` returned, ``pred_ids`` is
        the argmax of ``raw_outputs`` along axis 1, and ``pred_labels`` is
        ``pred_ids`` decoded back to category names with ``label_encoder``.
    """
    result, raw_outputs, wrong_predictions = model.eval_model(val_inputs)
    pred_ids = np.argmax(raw_outputs, axis=1)
    pred_labels = label_encoder.inverse_transform(pred_ids)
    return result, raw_outputs, wrong_predictions, pred_ids, pred_labels


# eval bert
result_bert, out_bert, wrong_bert, bert_preds, bert_preds_lbl = evaluate_on_val(bert_model)
val_df['bert_pred'] = bert_preds_lbl

print("bert eval result:")
print(result_bert)
print("\nbert clas report:")
print(classification_report(val_df['labels'], bert_preds, labels=all_label_ids, target_names=target_names))

# eval roberta
result_roberta, out_roberta, wrong_roberta, roberta_preds, roberta_preds_lbl = evaluate_on_val(roberta_model)
val_df['roberta_pred'] = roberta_preds_lbl

print("\nroberta eval result:")
print(result_roberta)
print("\nroberta clas report:")
print(classification_report(val_df['labels'], roberta_preds, labels=all_label_ids, target_names=target_names))


In [None]:
# make table of eval metrics - computed from the validation predictions so the
# table always reflects the models that were actually trained in this session
import pandas as pd
from sklearn.metrics import accuracy_score, matthews_corrcoef, precision_recall_fscore_support


def summarise_predictions(y_true, y_pred):
    """Compute the summary metrics shown in the comparison table for one model.

    Args:
        y_true: encoded ground-truth label ids.
        y_pred: encoded predicted label ids, aligned with ``y_true``.

    Returns:
        A dict with keys "precision", "recall", "f1 score", "accuracy"
        (rounded to 2 decimals) and "mcc" (rounded to 4 decimals).
    """
    # Macro averaging = unweighted mean over classes.
    # NOTE(review): confirm this is the averaging intended for this table.
    precision, recall, f1, _ = precision_recall_fscore_support(
        y_true, y_pred, average="macro", zero_division=0
    )
    return {
        "precision": round(float(precision), 2),
        "recall": round(float(recall), 2),
        "f1 score": round(float(f1), 2),
        "accuracy": round(float(accuracy_score(y_true, y_pred)), 2),
        "mcc": round(float(matthews_corrcoef(y_true, y_pred)), 4),
    }


model_predictions = {"bert": bert_preds, "roberta": roberta_preds}
model_metrics = [summarise_predictions(val_df['labels'], preds) for preds in model_predictions.values()]

data_dict = {
    "no.": list(range(1, len(model_predictions) + 1)),
    "model name": list(model_predictions),
    "precision": [m["precision"] for m in model_metrics],
    "recall": [m["recall"] for m in model_metrics],
    "f1 score": [m["f1 score"] for m in model_metrics],
    "accuracy": [m["accuracy"] for m in model_metrics],
    "mcc": [m["mcc"] for m in model_metrics]
}

eval_df = pd.DataFrame(data_dict)
print(eval_df)



6. Saving the Model

In [None]:
# save model - write each fine-tuned model and its tokenizer into its own folder
for wrapper, target_dir, done_message in (
    (bert_model, "bert_model", "bert model saved manually!"),
    (roberta_model, "roberta_model", "roberta model saved manually!"),
):
    # model weights first, then the tokenizer files, both into the same folder
    wrapper.model.save_pretrained(target_dir)
    wrapper.tokenizer.save_pretrained(target_dir)
    print(done_message)



7. Prediction on Real-World Input

In [None]:
# prediction on real-world input
import torch
from simpletransformers.classification import ClassificationModel
import re
import numpy as np
from sklearn.preprocessing import LabelEncoder

def clean_text(txt):
    """Normalise one product description before it is fed to the models.

    Must stay in sync with the cleaning applied to the training text: the
    text is lower-cased, every character that is not an ASCII letter or
    whitespace is deleted, and leading/trailing whitespace is trimmed.

    Args:
        txt: the raw description. ``None`` and float NaN are treated as empty
            text; any other non-string value is converted with ``str()`` first.

    Returns:
        The cleaned string, which may be empty.
    """
    # NaN is the only float that is not equal to itself
    if txt is None or (isinstance(txt, float) and txt != txt):
        return ''
    if not isinstance(txt, str):
        txt = str(txt)
    txt = txt.lower()
    txt = re.sub(r'[^a-zA-Z\s]', '', txt)
    return txt.strip()

# load saved models; use use_cuda false if no gpu
bert_model = ClassificationModel("bert", "bert_model", use_cuda=False)
roberta_model = ClassificationModel("roberta", "roberta_model", use_cuda=False)

# set label encoder classes manually
# NOTE: this list has to match, in values and order, the classes_ of the label
# encoder that was fitted when the models were trained.
encoder = LabelEncoder()
encoder.classes_ = np.array(['automotive', 'beauty & personal care', 'books', 'clothing', 'electronics',
                              'furniture', 'gaming', 'grocery', 'health & wellness', 'home appliances',
                              'jewelry & accessories', 'office supplies', 'pet supplies', 'sports & outdoors', 'toys'])

# A hand-maintained list can drift from the saved models. Checking the count
# up front gives a clear error instead of a confusing failure (or a wrong
# name) when a predicted id is decoded.
for model_label, loaded_model in (("bert", bert_model), ("roberta", roberta_model)):
    n_outputs = loaded_model.model.config.num_labels
    if n_outputs != len(encoder.classes_):
        raise ValueError(
            f"{model_label} model has {n_outputs} output labels but "
            f"{len(encoder.classes_)} class names are listed"
        )

input_text = input("enter product description: ")
cleaned_text = clean_text(input_text)
if not cleaned_text:
    # e.g. blank input or digits/punctuation only: nothing is left to classify
    raise ValueError("description contains no letters after cleaning - nothing to classify")

# predict() returns (predicted ids, raw outputs); only the ids are needed here
bert_pred, _ = bert_model.predict([cleaned_text])
roberta_pred, _ = roberta_model.predict([cleaned_text])

bert_lbl = encoder.inverse_transform(np.array(bert_pred))[0]
roberta_lbl = encoder.inverse_transform(np.array(roberta_pred))[0]

print("\nbert predicted category:", bert_lbl)
print("roberta predicted category:", roberta_lbl)
