<a href="https://colab.research.google.com/github/AzlinRusnan/Optimizing-Customer-Satisfaction-CSAT-Through-Sentiment-Analysis-and-Predictive-ML-Techniques/blob/main/Optimizing_CSAT_Through_BERT_Sentiment_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

file_path = '/content/gdrive/MyDrive/Updated_CSAT_RAW_DATASET.xlsx'
xls = pd.ExcelFile(file_path)

# Check sheet names to understand the structure
xls.sheet_names

['Sheet1']

In [None]:
# Load the data from the first sheet
df = pd.read_excel(xls, sheet_name='Page 1')

# Display the first few rows to understand the structure and locate the 'City' column
df.head()

Unnamed: 0,Number,Location,City,Country,Region,Updated,Average Response (calculated),USS Comment,String value
0,INC19296127,USPO,Pasco,United States of America,NORTH AMERICA,2024-10-31 23:18:02,1,\n\n\n\n,Very Satisfied
1,INC19297125,USGR,Greensboro,United States of America,NORTH AMERICA,2024-10-31 22:35:51,1,Thanks for punctual and quick service resolvin...,Very Satisfied
2,INC19283148,USGR,Greensboro,United States of America,NORTH AMERICA,2024-10-31 22:06:26,1,\n\n\n,Very Satisfied
3,INC19296794,BRSP,Sao Paulo,Brazil,LATAM,2024-10-31 21:39:18,1,\n\n\n,Very Satisfied
4,INC19295496,INPU,Pune,India,APAC,2024-10-31 21:26:21,1,\n\nPrompt response\n,Very Satisfied


In [None]:
df.isnull().sum().to_frame().rename(columns={0:"Total No. of Missing Values"})

Unnamed: 0,Total No. of Missing Values
Number,0
Location,0
City,40380
Country,0
Region,0
Updated,0
Average Response (calculated),0
USS Comment,40536
String value,41040


In [None]:
# To fill the missing 'City' values, I will first explore how to handle it.
# One approach is to use the most frequent city in the respective 'Location' or 'Country'.

# Check for the distribution of cities in the dataset
city_distribution = df['City'].value_counts()

# Checking the first few rows of 'Location' and 'City' to explore patterns
df[['Location','City']].head(10)

Unnamed: 0,Location,City
0,USPO,Pasco
1,USGR,Greensboro
2,USGR,Greensboro
3,BRSP,Sao Paulo
4,INPU,Pune
5,INPU,Pune
6,INPU,Pune
7,BRIF,São Paulo
8,MXMO,Los Mochis
9,GBGU,Guildford


In [None]:
# I will attempt to fill missing 'City' values based on the most frequent city in each 'Location'.
city_mapping = df.groupby('Location')['City'].agg(lambda x: x.mode().iloc[0] if not x.mode().empty else 'Unknown').to_dict()

# Apply this mapping to fill missing 'City' values
df['City'] = df['City'].fillna(df['Location'].map(city_mapping))

# Check the number of missing values after filling
missing_values_after = df['City'].isnull().sum()
print("There are now " + str(missing_values_after) + " missing values in City!")

There are now 0 missing values in City!


In [None]:
# Since we have fill in the gap in City. Lets proceed with missing values in String Value

# Check for missing values in the 'City' column
missing_values_sv = df['String value'].isnull().sum()
print("There are " + str(missing_values_sv)  + " missing values in the String Value column")

There are 41040 missing values in the String Value column


In [None]:
# Let's check if there is any pattern to help us fill missing values in the 'String value' column
df[['String value']].drop_duplicates().head(20)

Unnamed: 0,String value
0,Very Satisfied
20,Satisfied
50,Very Dissatisfied
55,Neutral
70,Dissatisfied
49648,


To fill the missing values in the "String value" column based on the "Average Response (calculated)" column, we can use the mapping:

1 → Very Satisfied

2 → Satisfied

3 → Neutral

4 → Dissatisfied

5 → Very Dissatisfied

In [None]:
# Define the mapping
response_mapping = {
    1: "Very Satisfied",
    2: "Satisfied",
    3: "Neutral",
    4: "Dissatisfied",
    5: "Very Dissatisfied"
}

# Fill missing values in the "String value" column using the mapping
df['String value'] = df['String value'].fillna(
    df['Average Response (calculated)'].map(response_mapping)
)

# Verify the changes
df.tail()

Unnamed: 0,Number,Location,City,Country,Region,Updated,Average Response (calculated),USS Comment,String value
90683,INC6838845,INPU,Pune,India,APAC,2022-01-03 03:00:09,1,Thanks for fast action,Very Satisfied
90684,INC6390085,NLEN,Enkhuizen,Netherlands,EAME,2022-01-03 03:00:09,3,,Neutral
90685,INC6524346,USMN,Unknown,United States of America,NORTH AMERICA,2022-01-03 03:00:09,5,Issue not resolved.,Very Dissatisfied
90686,INC6968136,BRSP,Sao Paulo,Brazil,LATAM,2022-01-03 03:00:09,1,,Very Satisfied
90687,INC7269810,IDKD,Kediri,Indonesia,APAC,2022-01-03 03:00:09,1,Fast Responses and helpful,Very Satisfied


In [None]:
# Extract Year and Month-Year from the "Updated" column for us to have a better look on the output in the future

df['Year'] = pd.to_datetime(df['Updated']).dt.year
df['Month-Year'] = pd.to_datetime(df['Updated']).dt.strftime('%b/%Y')

df.tail()

Unnamed: 0,Number,Location,City,Country,Region,Updated,Average Response (calculated),USS Comment,String value,Year,Month-Year
90683,INC6838845,INPU,Pune,India,APAC,2022-01-03 03:00:09,1,Thanks for fast action,Very Satisfied,2022,Jan/2022
90684,INC6390085,NLEN,Enkhuizen,Netherlands,EAME,2022-01-03 03:00:09,3,,Neutral,2022,Jan/2022
90685,INC6524346,USMN,Unknown,United States of America,NORTH AMERICA,2022-01-03 03:00:09,5,Issue not resolved.,Very Dissatisfied,2022,Jan/2022
90686,INC6968136,BRSP,Sao Paulo,Brazil,LATAM,2022-01-03 03:00:09,1,,Very Satisfied,2022,Jan/2022
90687,INC7269810,IDKD,Kediri,Indonesia,APAC,2022-01-03 03:00:09,1,Fast Responses and helpful,Very Satisfied,2022,Jan/2022


In [None]:
# Remove the specified columns
columns_to_remove = ['Number', 'Location', 'Updated']
df = df.drop(columns=columns_to_remove)

df.head()

Unnamed: 0,City,Country,Region,Average Response (calculated),USS Comment,String value,Year,Month-Year
0,Pasco,United States of America,NORTH AMERICA,1,\n\n\n\n,Very Satisfied,2024,Oct/2024
1,Greensboro,United States of America,NORTH AMERICA,1,Thanks for punctual and quick service resolvin...,Very Satisfied,2024,Oct/2024
2,Greensboro,United States of America,NORTH AMERICA,1,\n\n\n,Very Satisfied,2024,Oct/2024
3,Sao Paulo,Brazil,LATAM,1,\n\n\n,Very Satisfied,2024,Oct/2024
4,Pune,India,APAC,1,\n\nPrompt response\n,Very Satisfied,2024,Oct/2024


In [None]:
#df.to_excel('Updated_CSAT_RAW_DATASET.xlsx', index=False)

#from google.colab import files
#files.download('Updated_CSAT_RAW_DATASET.xlsx')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

## **SENTIMENT ANALYSIS**

In [None]:
pip install transformers



In [2]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import DataLoader, Dataset
import pandas as pd


In [3]:
# Load the pre-trained BERT model for sentiment analysis
model_name = "nlptown/bert-base-multilingual-uncased-sentiment"
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(model_name)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/39.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/872k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/953 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/669M [00:00<?, ?B/s]

In [23]:
file_path = '/content/gdrive/MyDrive/Capstone Project/Sentiment Analysis_cleaned_dataset_4.0_Sept.xlsx'
xls = pd.ExcelFile(file_path)

# Check sheet names to understand the structure
xls.sheet_names

['Sheet1']

In [24]:
# Load the data from the first sheet
mismatch_data = pd.read_excel(xls, sheet_name='Sheet1')

mismatch_data.head()

filtered_dataset = mismatch_data[
    (mismatch_data['Language'] == 'english') &
    (mismatch_data['USS Comment'].notnull())
]

filtered_dataset.head()

num_rows = len(filtered_dataset)
print(f"Number of rows: {num_rows}")


Number of rows: 784


In [25]:
# Use the USS Comment column for sentiment analysis
comments = filtered_dataset['USS Comment'].tolist()

In [26]:
# Tokenization function
def preprocess(text):
    return tokenizer(text, padding=True, truncation=True, max_length=512, return_tensors="pt")


In [27]:
def predict_sentiment(comment):
    inputs = preprocess(comment)
    with torch.no_grad():
        outputs = model(**inputs)
    logits = outputs.logits
    prediction = torch.argmax(logits, dim=-1).item()
    return prediction


In [28]:
# Apply sentiment analysis to each comment
filtered_dataset['BERT Sentiment'] = filtered_dataset['USS Comment'].apply(predict_sentiment)


In [29]:
sentiment_labels = {
    0: "Very Negative",
    1: "Negative",
    2: "Neutral",
    3: "Positive",
    4: "Very Positive"
}

filtered_dataset['BERT Sentiment Label'] = filtered_dataset['BERT Sentiment'].map(sentiment_labels)

In [30]:
filtered_dataset.head()

Unnamed: 0,Number,City,Country,Region,Average Response (calculated),USS Comment,String value,Year,Month-Year,Language,BERT Sentiment,BERT Sentiment Label
0,INC19082941,Slater,United States of America,NORTH AMERICA,2,Not solved as quickly as hoped but still done ...,Satisfied,2024,Sep/2024,english,3,Positive
1,INC19083389,Durham,United States of America,NORTH AMERICA,1,Pull print now working Just needed the right s...,Very Satisfied,2024,Sep/2024,english,3,Positive
2,INC19082996,Durham,United States of America,NORTH AMERICA,1,Very quick and timely respond Provided feedbac...,Very Satisfied,2024,Sep/2024,english,4,Very Positive
3,INC19115533,Graneros,Chile,LATAM,1,Thanks,Very Satisfied,2024,Sep/2024,english,4,Very Positive
4,INC19003052,Durham,United States of America,NORTH AMERICA,1,Request efficiently escalated and then resolve...,Very Satisfied,2024,Sep/2024,english,3,Positive


In [31]:
filtered_dataset.to_excel('output_with_bert_sentiment.xlsx', index=False)

from google.colab import files
files.download('output_with_bert_sentiment.xlsx')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

### **Fine-tuning a pre-trained BERT model**

In [None]:
pip install transformers datasets torch scikit-learn

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m11.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (

In [None]:
#test

In [1]:
!pip install transformers datasets torch scikit-learn pandas openpyxl

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m16.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m12.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl 

In [2]:
# Import libraries
import pandas as pd
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
import torch

In [10]:
# Load data
import pandas as pd
import warnings
warnings.filterwarnings("ignore")
from transformers import BertConfig, BertForSequenceClassification

file_path = '/content/gdrive/MyDrive/Capstone Project/fine-tuned.xlsx'
data = pd.read_excel(file_path)

# Map sentiment labels to integers
label_mapping = {'Negative': 0, 'Neutral': 1, 'Positive': 2}
data['Label'] = data['Sentiment'].map(label_mapping)

# Drop rows with missing or duplicate comments
data = data.dropna(subset=['Comment', 'Label']).drop_duplicates(subset=['Comment'])

# Check data
print(data.head())

# Split data
train_texts, val_texts, train_labels, val_labels = train_test_split(
    data['Comment'].tolist(),
    data['Label'].tolist(),
    test_size=0.2,
    random_state=42
)

# Load tokenizer
tokenizer = BertTokenizer.from_pretrained("nlptown/bert-base-multilingual-uncased-sentiment")

# Define dataset class
class SentimentDataset(torch.utils.data.Dataset):
    def __init__(self, texts, labels):
        self.texts = texts
        self.labels = labels

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        encoding = tokenizer(
            self.texts[idx],
            padding='max_length',
            truncation=True,
            max_length=128,
            return_tensors='pt'
        )
        return {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'labels': torch.tensor(self.labels[idx], dtype=torch.long)
        }

# Create datasets
train_dataset = SentimentDataset(train_texts, train_labels)
val_dataset = SentimentDataset(val_texts, val_labels)

model = BertForSequenceClassification.from_pretrained(
    "nlptown/bert-base-multilingual-uncased-sentiment",
    num_labels=3,
    ignore_mismatched_sizes=True
)

# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=4,
    per_device_train_batch_size=16,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_dir='./logs',
    logging_steps=10,
    report_to="none",  # Disables W&B
    load_best_model_at_end=True
)

# Define trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)

# Train the model
trainer.train()

                                             Comment Sentiment  Label
0                         thank you for the support   Positive      2
1                                 Excellent solution  Positive      2
2                                       Issue solved  Positive      2
3                       Problem resolved with thanks  Positive      2
4  The ticket was completed in a very short time ...  Positive      2


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at nlptown/bert-base-multilingual-uncased-sentiment and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([5, 768]) in the checkpoint and torch.Size([3, 768]) in the model instantiated
- classifier.bias: found shape torch.Size([5]) in the checkpoint and torch.Size([3]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,No log,0.821674
2,0.863900,0.701384
3,0.863900,0.614759
4,0.449300,0.60315


TrainOutput(global_step=20, training_loss=0.6566044807434082, metrics={'train_runtime': 137.8509, 'train_samples_per_second': 2.263, 'train_steps_per_second': 0.145, 'total_flos': 20522846582784.0, 'train_loss': 0.6566044807434082, 'epoch': 4.0})

In [11]:
#evaluate the model on the validation set

from sklearn.metrics import classification_report, accuracy_score
import numpy as np

# Get predictions and labels
predictions = trainer.predict(val_dataset)
preds = np.argmax(predictions.predictions, axis=1)
labels = predictions.label_ids

# Print classification report
print(classification_report(labels, preds, target_names=['Negative', 'Neutral', 'Positive']))

# Calculate overall accuracy
accuracy = accuracy_score(labels, preds)
print(f"Validation Accuracy: {accuracy:.2f}")


              precision    recall  f1-score   support

    Negative       0.71      0.83      0.77         6
     Neutral       0.00      0.00      0.00         2
    Positive       0.85      0.92      0.88        12

    accuracy                           0.80        20
   macro avg       0.52      0.58      0.55        20
weighted avg       0.72      0.80      0.76        20

Validation Accuracy: 0.80


In [14]:
#Evaluate and Save the Model

results = trainer.evaluate()
print(results)

# Save the model
trainer.save_model('./fine_tuned_model')
tokenizer.save_pretrained('./fine_tuned_model')

{'eval_loss': 0.6031501889228821, 'eval_runtime': 0.2186, 'eval_samples_per_second': 91.505, 'eval_steps_per_second': 13.726, 'epoch': 4.0}


('./fine_tuned_model/tokenizer_config.json',
 './fine_tuned_model/special_tokens_map.json',
 './fine_tuned_model/vocab.txt',
 './fine_tuned_model/added_tokens.json')

In [27]:
# test the fine-tuned model

from transformers import pipeline

fine_tuned_model = BertForSequenceClassification.from_pretrained('./fine_tuned_model')
fine_tuned_tokenizer = BertTokenizer.from_pretrained('./fine_tuned_model')

# Create a sentiment analysis pipeline
sentiment_analyzer = pipeline("sentiment-analysis", model=fine_tuned_model, tokenizer=fine_tuned_tokenizer)

# Test with new comments
test_comments = ["Thank you so much Error is solved"]
results = sentiment_analyzer(test_comments)
print(results)


Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


[{'label': 'LABEL_2', 'score': 0.8381373286247253}]


In [None]:
#next: test with real dataset