In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datasets import Dataset, DatasetDict

import torch
from torch.utils.data.dataset import Dataset
from transformers import TrainingArguments, Trainer, AutoTokenizer, AutoModelForSequenceClassification

In [None]:
# The CSV is read with header=None, so its first line is treated as data
# and the columns get integer names.
raw_csv_path = '/content/sentisum-evaluation-dataset.csv'
data = pd.read_csv(raw_csv_path, header=None)
data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,Tires where delivered to the garage of my choi...,garage service positive,ease of booking positive,,,,,,,,,,,,
1,"Easy Tyre Selection Process, Competitive Prici...",garage service positive,value for money positive,,,,,,,,,,,,
2,Very easy to use and good value for money.,value for money positive,,,,,,,,,,,,,
3,Really easy and convenient to arrange,ease of booking positive,,,,,,,,,,,,,
4,It was so easy to select tyre sizes and arrang...,location positive,value for money positive,ease of booking positive,,,,,,,,,,,


In [None]:
# First column is the review text; the remaining 14 columns are label slots.
sentiment_array = [f'col_{slot}' for slot in range(1, 15)]
new_col = ['text', *sentiment_array]
data.columns = new_col
data.columns

Index(['text', 'col_1', 'col_2', 'col_3', 'col_4', 'col_5', 'col_6', 'col_7',
       'col_8', 'col_9', 'col_10', 'col_11', 'col_12', 'col_13', 'col_14'],
      dtype='object')

In [None]:
# Tally how often each label string occurs across all label-slot columns.
# Insertion order (column by column, first occurrence first) is preserved.
# NOTE(review): the output lists both 'advisor/agent service ...' and
# 'advisoragent service ...' as separate keys - presumably the same sub-theme
# spelled two ways; confirm whether they should be merged before training.
sent = {}

for column_name in data.columns[1:]:
    for label in data[column_name].dropna():
        sent[label] = sent.get(label, 0) + 1

sent

{'garage service positive': 2031,
 'value for money positive': 4780,
 'ease of booking positive': 1187,
 'location positive': 1063,
 'length of fitting positive': 657,
 'ease of booking negative': 227,
 'tyre quality positive': 434,
 'garage service negative': 423,
 'wait time negative': 135,
 'delivery punctuality positive': 453,
 'wait time positive': 274,
 'location negative': 27,
 'damage negative': 127,
 'extra charges positive': 85,
 'value for money negative': 136,
 'mobile fitter positive': 225,
 'advisor/agent service positive': 202,
 'facilities positive': 33,
 'change of time negative': 42,
 'extra charges negative': 46,
 'late notice negative': 76,
 'discounts positive': 115,
 'delivery punctuality negative': 250,
 'refund not actioned positive': 1,
 'change of date negative': 277,
 'booking confusion negative': 119,
 'advisoragent service positive': 233,
 'advisor/agent service negative': 47,
 'advisoragent service negative': 125,
 'incorrect tyres sent negative': 70,
 'ty

In [None]:
# Alphabetical view of every distinct label string.
lst = sorted(sent)
lst

['advisor/agent service negative',
 'advisor/agent service positive',
 'advisoragent service negative',
 'advisoragent service positive',
 'balancing negative',
 'balancing positive',
 'booking confusion negative',
 'booking confusion positive',
 'call wait time negative',
 'call wait time positive',
 'change of date negative',
 'change of date positive',
 'change of time negative',
 'change of time positive',
 'damage negative',
 'delivery punctuality negative',
 'delivery punctuality positive',
 'discount not applied negative',
 'discount not applied positive',
 'discounts negative',
 'discounts positive',
 'ease of booking negative',
 'ease of booking positive',
 'extra charges negative',
 'extra charges positive',
 'facilities negative',
 'facilities positive',
 'failed payment negative',
 'failed payment positive',
 'garage service negative',
 'garage service positive',
 'incorrect tyres sent negative',
 'incorrect tyres sent positive',
 'late notice negative',
 'late notice posit

## Storing the subtheme labels that occur more than 20 times in a list


In [None]:
# Keep only labels seen more than 20 times. The order of `sent` is kept,
# and this order later fixes the position of each label column.
list_1 = []
for label, count in sent.items():
    if count > 20:
        list_1.append(label)

list_1

['garage service positive',
 'value for money positive',
 'ease of booking positive',
 'location positive',
 'length of fitting positive',
 'ease of booking negative',
 'tyre quality positive',
 'garage service negative',
 'wait time negative',
 'delivery punctuality positive',
 'wait time positive',
 'location negative',
 'damage negative',
 'extra charges positive',
 'value for money negative',
 'mobile fitter positive',
 'advisor/agent service positive',
 'facilities positive',
 'change of time negative',
 'extra charges negative',
 'late notice negative',
 'discounts positive',
 'delivery punctuality negative',
 'change of date negative',
 'booking confusion negative',
 'advisoragent service positive',
 'advisor/agent service negative',
 'advisoragent service negative',
 'incorrect tyres sent negative',
 'tyre quality negative',
 'response time negative',
 'refund positive',
 'no stock negative',
 'change of date positive',
 'call wait time negative',
 'refund negative',
 'length o

In [None]:
def modify_data(data, label_columns):
    """Convert the wide label table into a multi-hot indicator frame.

    Parameters
    ----------
    data : pd.DataFrame
        Must contain a ``'text'`` column. Every column from position 1
        onwards is read as a label slot; NaN marks an empty slot.
    label_columns : sequence of str
        Sub-theme labels to keep, in the order the output columns should
        appear. Labels present in ``data`` but not listed here are ignored.

    Returns
    -------
    pd.DataFrame
        ``data['text']`` followed by one integer 0/1 column per entry of
        ``label_columns``. The result carries the same index as ``data``.
    """
    label_columns = list(label_columns)

    # Select only the subtheme labels
    label_data = data.iloc[:, 1:]

    # Map each kept label to its output column once, so membership checks
    # are dictionary lookups instead of list scans.
    column_position = {label: pos for pos, label in enumerate(label_columns)}

    # Start from an all-zero integer matrix: no NaN placeholders, no
    # fillna/astype round trip, and no per-cell .loc assignment.
    indicators = np.zeros((len(data), len(label_columns)), dtype=int)

    # Rows are addressed by position, so the result is correct for any index
    # on `data` (e.g. after filtering or shuffling), not just 0..n-1.
    for row_pos, row in enumerate(label_data.itertuples(index=False, name=None)):
        for label in row:
            # NaN (empty slot) and labels outside `label_columns` map to None.
            col_pos = column_position.get(label)
            if col_pos is not None:
                indicators[row_pos, col_pos] = 1

    # Reuse data's index so the concat below aligns row for row.
    binary_labels = pd.DataFrame(indicators, index=data.index, columns=label_columns)

    # Combine the sentence column with the binary label columns
    return pd.concat([data['text'], binary_labels], axis=1)


# Build the text + multi-hot label table for the frequent labels only.
final_data = modify_data(data=data, label_columns=list_1)
final_data.head(n=10)

  binary_labels = binary_labels.fillna(0)


Unnamed: 0,text,garage service positive,value for money positive,ease of booking positive,location positive,length of fitting positive,ease of booking negative,tyre quality positive,garage service negative,wait time negative,...,refund positive,no stock negative,change of date positive,call wait time negative,refund negative,length of fitting negative,balancing negative,mobile fitter negative,discounts negative,response time positive
0,Tires where delivered to the garage of my choi...,1,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,"Easy Tyre Selection Process, Competitive Prici...",1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Very easy to use and good value for money.,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Really easy and convenient to arrange,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,It was so easy to select tyre sizes and arrang...,0,1,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,service was excellent. Only slight downside wa...,0,0,1,0,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,User friendly Website. Competitive Prices. Goo...,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,Excellent prices and service,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,It was very straightforward and the garage was...,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,Use of local garage.,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# (rows, columns): the 'text' column plus one indicator column per kept label.
final_data.shape

(10132, 42)

In [None]:
from sklearn.model_selection import train_test_split

# 80/20 split with a fixed seed so the partition is repeatable.
df_train, df_test = train_test_split(final_data, random_state=42, test_size=0.2)

# Indicator columns only, in `list_1` order.
df_labels_train = df_train.loc[:, list_1]
df_labels_test = df_test.loc[:, list_1]

In [None]:
# One plain Python list of 0/1 values per sample.
labels_list_train = df_labels_train.to_numpy().tolist()
labels_list_test = df_labels_test.to_numpy().tolist()

In [None]:
# Raw review strings for each split.
train_texts = df_train['text'].to_list()
eval_texts = df_test['text'].to_list()

# Matching multi-hot label vectors.
train_labels = labels_list_train
eval_labels = labels_list_test



In [None]:
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

# Both splits are tokenised identically: every text is truncated or padded
# to exactly 512 tokens.
tokenize_kwargs = dict(padding="max_length", truncation=True, max_length=512)
train_encodings = tokenizer(train_texts, **tokenize_kwargs)
eval_encodings = tokenizer(eval_texts, **tokenize_kwargs)


class TextClassifierDataset(Dataset):
    """Map-style dataset pairing tokenizer encodings with label vectors.

    `encodings` is a mapping from field name to a per-sample sequence;
    `labels` holds one label vector per sample.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # The number of samples is defined by the label list.
        return len(self.labels)

    def __getitem__(self, idx):
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        # Labels are stored as float tensors.
        sample["labels"] = torch.tensor(self.labels[idx], dtype=torch.float)
        return sample

train_dataset = TextClassifierDataset(train_encodings, train_labels)
eval_dataset = TextClassifierDataset(eval_encodings, eval_labels)

# Derive the classifier size and the label names from `list_1` instead of a
# hard-coded count, so the head always matches the label vectors built from
# df[list_1]. Storing the names in the config means the saved checkpoint
# carries its own index -> label mapping.
id2label = dict(enumerate(list_1))
label2id = {label: idx for idx, label in id2label.items()}

model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    problem_type="multi_label_classification",
    num_labels=len(list_1),
    id2label=id2label,
    label2id=label2id,
)

# NOTE(review): eval_dataset is handed to the Trainer below, but no evaluation
# strategy is configured here and the logged table only shows training loss.
training_arguments = TrainingArguments(
    output_dir=".",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=8,
)

trainer = Trainer(
    model=model,
    args=training_arguments,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

trainer.train()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
500,0.127
1000,0.0723
1500,0.059
2000,0.0484
2500,0.0396
3000,0.0328
3500,0.0284
4000,0.0259


TrainOutput(global_step=4056, training_loss=0.05375608722486439, metrics={'train_runtime': 7606.339, 'train_samples_per_second': 8.524, 'train_steps_per_second': 0.533, 'total_flos': 1.706609469124608e+16, 'train_loss': 0.05375608722486439, 'epoch': 8.0})

In [None]:
# Write the fine-tuned weights and the tokenizer files into one directory.
save_dir = "fine_tuned_subtheme"
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)


('fine_tuned_subtheme/tokenizer_config.json',
 'fine_tuned_subtheme/special_tokens_map.json',
 'fine_tuned_subtheme/vocab.txt',
 'fine_tuned_subtheme/added_tokens.json',
 'fine_tuned_subtheme/tokenizer.json')

In [None]:
!zip -r subtheme_model.zip subtheme_model

  adding: subtheme_model/ (stored 0%)
  adding: subtheme_model/vocab.txt (deflated 53%)
  adding: subtheme_model/special_tokens_map.json (deflated 42%)
  adding: subtheme_model/model.safetensors (deflated 7%)
  adding: subtheme_model/tokenizer_config.json (deflated 75%)
  adding: subtheme_model/config.json (deflated 68%)
  adding: subtheme_model/tokenizer.json (deflated 71%)


In [None]:
from google.colab import files

# Send the zipped model to the browser as a download.
archive_name = "subtheme_model.zip"
files.download(archive_name)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
import shutil
from google.colab import files

# Pack the saved model directory into fine_tuned_subtheme.zip.
# The call returns the archive path, which is shown as the cell output.
shutil.make_archive(base_name='fine_tuned_subtheme', format='zip', root_dir='fine_tuned_subtheme')



'/content/fine_tuned_subtheme.zip'

In [None]:
# Download the archive produced by shutil.make_archive.
fine_tuned_archive = 'fine_tuned_subtheme.zip'
files.download(fine_tuned_archive)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [8]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

In [6]:
# Model weights and tokenizer files are loaded from the same Drive folder.
# NOTE(review): this path requires Google Drive to be mounted; no mount call
# is visible in this notebook - confirm it happens before this cell.
MODEL_DIR = "/content/drive/MyDrive/Keras,Pytorch,Tensorflow/Project works/Models/subtheme_model (1)"

model = AutoModelForSequenceClassification.from_pretrained(MODEL_DIR)
tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR)



In [48]:
# Put the model into evaluation mode before running predictions.
model.eval()

# Hand-written sample reviews for a manual sanity check; the comment above
# each entry names the themes that review is meant to touch on.
texts = [
    # Product + Delay + Positive Service Handling
    "One tyre went missing, so there was a delay to get the two tyres fitted. The way the garage dealt with it was fantastic.",

    # Price + Convenience + Professionalism
    "Competitively priced and easy to use fitting centre near me who were very professional.",

    # Booking Experience + Service + Wait Time
    "Booking online was seamless, but the wait time at the fitting centre was longer than expected. However, staff were friendly and apologetic.",

    # Stock Availability + Communication + Resolution
    "Tyres I ordered were initially out of stock, but I was promptly informed and offered an upgrade at no extra cost.",

    # Location + Speed + Value for Money
    "The garage was just 5 minutes away and they fitted all four tyres in under an hour. Great value for the price paid!",

    # Product Quality + Follow-up Service
    "The tyres themselves were excellent quality, and the follow-up email to check if everything went well was a nice touch.",

    # Negative Experience + Poor Staff Response
    "Had to wait over an hour past my appointment time and no staff came to update me. Extremely poor customer service.",

    # Convenience + Cleanliness + Overall Satisfaction
    "Very convenient location with a clean waiting area and coffee machine. Overall, a very pleasant experience.",

    # Professionalism + Safety Advice + Upsell Attempt
    "Technician was very professional and gave me useful safety advice, though I felt they were slightly pushing for unnecessary extras.",

    # Return Policy + Refund Handling
    "Had to cancel my order due to an emergency. The refund was processed without any hassle, really appreciated the no-fuss return policy."
]


# Score a single sample review (index 4 of `texts`).
sample_text = texts[4]
inputs = tokenizer(sample_text, truncation=True, padding=True, return_tensors="pt")

# Forward pass without gradient tracking.
with torch.no_grad():
    outputs = model(**inputs)

logits = outputs.logits
# Independent per-label probabilities: sigmoid of each logit.
probabilities = logits.sigmoid()

In [49]:
# Show the per-label sigmoid probabilities for the scored review.
probabilities

tensor([[0.2094, 0.9474, 0.0140, 0.3215, 0.9363, 0.0035, 0.0164, 0.0098, 0.0170,
         0.0259, 0.0759, 0.0063, 0.0035, 0.0125, 0.0070, 0.0074, 0.0058, 0.0090,
         0.0027, 0.0044, 0.0026, 0.0104, 0.0047, 0.0023, 0.0023, 0.0064, 0.0025,
         0.0015, 0.0024, 0.0033, 0.0031, 0.0022, 0.0018, 0.0027, 0.0024, 0.0021,
         0.0289, 0.0035, 0.0022, 0.0017, 0.0024]])

In [50]:
import numpy as np

# Label names are defined here, before they are used, so this cell runs on a
# fresh kernel (previously `list_2` was only defined in a later cell).
# The order matches the label columns used for training (`list_1`); this
# assumes the loaded checkpoint was trained with that order - TODO confirm.
list_2 = np.array([
    'garage service positive',
    'value for money positive',
    'ease of booking positive',
    'location positive',
    'length of fitting positive',
    'ease of booking negative',
    'tyre quality positive',
    'garage service negative',
    'wait time negative',
    'delivery punctuality positive',
    'wait time positive',
    'location negative',
    'damage negative',
    'extra charges positive',
    'value for money negative',
    'mobile fitter positive',
    'advisor/agent service positive',
    'facilities positive',
    'change of time negative',
    'extra charges negative',
    'late notice negative',
    'discounts positive',
    'delivery punctuality negative',
    'change of date negative',
    'booking confusion negative',
    'advisoragent service positive',
    'advisor/agent service negative',
    'advisoragent service negative',
    'incorrect tyres sent negative',
    'tyre quality negative',
    'response time negative',
    'refund positive',
    'no stock negative',
    'change of date positive',
    'call wait time negative',
    'refund negative',
    'length of fitting negative',
    'balancing negative',
    'mobile fitter negative',
    'discounts negative',
    'response time positive',
])

# Probability cut-off above which a sub-theme is reported.
THRESHOLD = 0.2

# Positions of all labels whose probability exceeds the cut-off, as plain ints.
indexes_above_threshold = torch.nonzero(probabilities[0] > THRESHOLD).flatten().tolist()

print(indexes_above_threshold)

list_2[indexes_above_threshold]

[0, 1, 3, 4]


array(['garage service positive', 'value for money positive',
       'location positive', 'length of fitting positive'], dtype='<U30')

In [41]:
import numpy as np

# Names of the 41 sub-theme labels. A name's position in this array is what
# the prediction indices are decoded against.
_SUBTHEME_NAMES = (
    'garage service positive', 'value for money positive',
    'ease of booking positive', 'location positive',
    'length of fitting positive', 'ease of booking negative',
    'tyre quality positive', 'garage service negative',
    'wait time negative', 'delivery punctuality positive',
    'wait time positive', 'location negative',
    'damage negative', 'extra charges positive',
    'value for money negative', 'mobile fitter positive',
    'advisor/agent service positive', 'facilities positive',
    'change of time negative', 'extra charges negative',
    'late notice negative', 'discounts positive',
    'delivery punctuality negative', 'change of date negative',
    'booking confusion negative', 'advisoragent service positive',
    'advisor/agent service negative', 'advisoragent service negative',
    'incorrect tyres sent negative', 'tyre quality negative',
    'response time negative', 'refund positive',
    'no stock negative', 'change of date positive',
    'call wait time negative', 'refund negative',
    'length of fitting negative', 'balancing negative',
    'mobile fitter negative', 'discounts negative',
    'response time positive',
)
list_2 = np.array(_SUBTHEME_NAMES)