<a href="https://colab.research.google.com/github/AzlinRusnan/Optimizing-Customer-Satisfaction-CSAT-Through-Sentiment-Analysis-and-Predictive-ML-Techniques/blob/main/Optimizing_CSAT_Through_BERT_Sentiment_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

file_path = '/content/gdrive/MyDrive/Updated_CSAT_RAW_DATASET.xlsx'
xls = pd.ExcelFile(file_path)

# Check sheet names to understand the structure
xls.sheet_names

['Sheet1']

In [None]:
# Load the data from the first sheet
df = pd.read_excel(xls, sheet_name='Page 1')

# Display the first few rows to understand the structure and locate the 'City' column
df.head()

Unnamed: 0,Number,Location,City,Country,Region,Updated,Average Response (calculated),USS Comment,String value
0,INC19296127,USPO,Pasco,United States of America,NORTH AMERICA,2024-10-31 23:18:02,1,\n\n\n\n,Very Satisfied
1,INC19297125,USGR,Greensboro,United States of America,NORTH AMERICA,2024-10-31 22:35:51,1,Thanks for punctual and quick service resolvin...,Very Satisfied
2,INC19283148,USGR,Greensboro,United States of America,NORTH AMERICA,2024-10-31 22:06:26,1,\n\n\n,Very Satisfied
3,INC19296794,BRSP,Sao Paulo,Brazil,LATAM,2024-10-31 21:39:18,1,\n\n\n,Very Satisfied
4,INC19295496,INPU,Pune,India,APAC,2024-10-31 21:26:21,1,\n\nPrompt response\n,Very Satisfied


In [None]:
df.isnull().sum().to_frame().rename(columns={0:"Total No. of Missing Values"})

Unnamed: 0,Total No. of Missing Values
Number,0
Location,0
City,40380
Country,0
Region,0
Updated,0
Average Response (calculated),0
USS Comment,40536
String value,41040


In [None]:
# To fill the missing 'City' values, I will first explore how to handle it.
# One approach is to use the most frequent city in the respective 'Location' or 'Country'.

# Check for the distribution of cities in the dataset
city_distribution = df['City'].value_counts()

# Checking the first few rows of 'Location' and 'City' to explore patterns
df[['Location','City']].head(10)

Unnamed: 0,Location,City
0,USPO,Pasco
1,USGR,Greensboro
2,USGR,Greensboro
3,BRSP,Sao Paulo
4,INPU,Pune
5,INPU,Pune
6,INPU,Pune
7,BRIF,São Paulo
8,MXMO,Los Mochis
9,GBGU,Guildford


In [None]:
# I will attempt to fill missing 'City' values based on the most frequent city in each 'Location'.
city_mapping = df.groupby('Location')['City'].agg(lambda x: x.mode().iloc[0] if not x.mode().empty else 'Unknown').to_dict()

# Apply this mapping to fill missing 'City' values
df['City'] = df['City'].fillna(df['Location'].map(city_mapping))

# Check the number of missing values after filling
missing_values_after = df['City'].isnull().sum()
print("There are now " + str(missing_values_after) + " missing values in City!")

There are now 0 missing values in City!


In [None]:
# Since we have fill in the gap in City. Lets proceed with missing values in String Value

# Check for missing values in the 'City' column
missing_values_sv = df['String value'].isnull().sum()
print("There are " + str(missing_values_sv)  + " missing values in the String Value column")

There are 41040 missing values in the String Value column


In [None]:
# Let's check if there is any pattern to help us fill missing values in the 'String value' column
df[['String value']].drop_duplicates().head(20)

Unnamed: 0,String value
0,Very Satisfied
20,Satisfied
50,Very Dissatisfied
55,Neutral
70,Dissatisfied
49648,


To fill the missing values in the "String value" column based on the "Average Response (calculated)" column, we can use the mapping:

1 → Very Satisfied

2 → Satisfied

3 → Neutral

4 → Dissatisfied

5 → Very Dissatisfied

In [None]:
# Define the mapping
response_mapping = {
    1: "Very Satisfied",
    2: "Satisfied",
    3: "Neutral",
    4: "Dissatisfied",
    5: "Very Dissatisfied"
}

# Fill missing values in the "String value" column using the mapping
df['String value'] = df['String value'].fillna(
    df['Average Response (calculated)'].map(response_mapping)
)

# Verify the changes
df.tail()

Unnamed: 0,Number,Location,City,Country,Region,Updated,Average Response (calculated),USS Comment,String value
90683,INC6838845,INPU,Pune,India,APAC,2022-01-03 03:00:09,1,Thanks for fast action,Very Satisfied
90684,INC6390085,NLEN,Enkhuizen,Netherlands,EAME,2022-01-03 03:00:09,3,,Neutral
90685,INC6524346,USMN,Unknown,United States of America,NORTH AMERICA,2022-01-03 03:00:09,5,Issue not resolved.,Very Dissatisfied
90686,INC6968136,BRSP,Sao Paulo,Brazil,LATAM,2022-01-03 03:00:09,1,,Very Satisfied
90687,INC7269810,IDKD,Kediri,Indonesia,APAC,2022-01-03 03:00:09,1,Fast Responses and helpful,Very Satisfied


In [None]:
# Extract Year and Month-Year from the "Updated" column for us to have a better look on the output in the future

df['Year'] = pd.to_datetime(df['Updated']).dt.year
df['Month-Year'] = pd.to_datetime(df['Updated']).dt.strftime('%b/%Y')

df.tail()

Unnamed: 0,Number,Location,City,Country,Region,Updated,Average Response (calculated),USS Comment,String value,Year,Month-Year
90683,INC6838845,INPU,Pune,India,APAC,2022-01-03 03:00:09,1,Thanks for fast action,Very Satisfied,2022,Jan/2022
90684,INC6390085,NLEN,Enkhuizen,Netherlands,EAME,2022-01-03 03:00:09,3,,Neutral,2022,Jan/2022
90685,INC6524346,USMN,Unknown,United States of America,NORTH AMERICA,2022-01-03 03:00:09,5,Issue not resolved.,Very Dissatisfied,2022,Jan/2022
90686,INC6968136,BRSP,Sao Paulo,Brazil,LATAM,2022-01-03 03:00:09,1,,Very Satisfied,2022,Jan/2022
90687,INC7269810,IDKD,Kediri,Indonesia,APAC,2022-01-03 03:00:09,1,Fast Responses and helpful,Very Satisfied,2022,Jan/2022


In [None]:
# Remove the specified columns
columns_to_remove = ['Number', 'Location', 'Updated']
df = df.drop(columns=columns_to_remove)

df.head()

Unnamed: 0,City,Country,Region,Average Response (calculated),USS Comment,String value,Year,Month-Year
0,Pasco,United States of America,NORTH AMERICA,1,\n\n\n\n,Very Satisfied,2024,Oct/2024
1,Greensboro,United States of America,NORTH AMERICA,1,Thanks for punctual and quick service resolvin...,Very Satisfied,2024,Oct/2024
2,Greensboro,United States of America,NORTH AMERICA,1,\n\n\n,Very Satisfied,2024,Oct/2024
3,Sao Paulo,Brazil,LATAM,1,\n\n\n,Very Satisfied,2024,Oct/2024
4,Pune,India,APAC,1,\n\nPrompt response\n,Very Satisfied,2024,Oct/2024


In [None]:
#df.to_excel('Updated_CSAT_RAW_DATASET.xlsx', index=False)

#from google.colab import files
#files.download('Updated_CSAT_RAW_DATASET.xlsx')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

## **SENTIMENT ANALYSIS**

In [2]:
pip install transformers



In [3]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import DataLoader, Dataset
import pandas as pd


In [4]:
# Load the pre-trained BERT model for sentiment analysis
model_name = "nlptown/bert-base-multilingual-uncased-sentiment"
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(model_name)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/39.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/872k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/953 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/669M [00:00<?, ?B/s]

In [5]:
file_path = '/content/gdrive/MyDrive/Capstone Project/Sentiment Analysis_cleaned_dataset_3.0.xlsx'
xls = pd.ExcelFile(file_path)

# Check sheet names to understand the structure
xls.sheet_names

['Sheet1']

In [7]:
# Load the data from the first sheet
mismatch_data = pd.read_excel(xls, sheet_name='Sheet1')

mismatch_data.head()

filtered_dataset = mismatch_data[
    (mismatch_data['Language'] == 'english') &
    (mismatch_data['USS Comment'].notnull())
]

filtered_dataset.head()

num_rows = len(filtered_dataset)
print(f"Number of rows: {num_rows}")


Number of rows: 8125


In [8]:
# Use the USS Comment column for sentiment analysis
comments = filtered_dataset['USS Comment'].tolist()

In [9]:
# Tokenization function
def preprocess(text):
    return tokenizer(text, padding=True, truncation=True, max_length=512, return_tensors="pt")


In [10]:
def predict_sentiment(comment):
    inputs = preprocess(comment)
    with torch.no_grad():
        outputs = model(**inputs)
    logits = outputs.logits
    prediction = torch.argmax(logits, dim=-1).item()  # Get the predicted class (0, 1, 2, ...)
    return prediction


In [11]:
# Apply sentiment analysis to each comment
filtered_dataset['BERT Sentiment'] = filtered_dataset['USS Comment'].apply(predict_sentiment)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_dataset['BERT Sentiment'] = filtered_dataset['USS Comment'].apply(predict_sentiment)


In [12]:
sentiment_labels = {
    0: "Very Negative",
    1: "Negative",
    2: "Neutral",
    3: "Positive",
    4: "Very Positive"
}

filtered_dataset['BERT Sentiment Label'] = filtered_dataset['BERT Sentiment'].map(sentiment_labels)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_dataset['BERT Sentiment Label'] = filtered_dataset['BERT Sentiment'].map(sentiment_labels)


In [13]:
filtered_dataset.head()

Unnamed: 0,Number,City,Country,Region,Average Response (calculated),USS Comment,String value,Year,Month-Year,Language,BERT Sentiment,BERT Sentiment Label
0,INC19297125,Greensboro,United States of America,NORTH AMERICA,1,Thanks for punctual and quick service resolvin...,Very Satisfied,2024,Oct/2024,english,4,Very Positive
1,INC19295496,Pune,India,APAC,1,Prompt response,Very Satisfied,2024,Oct/2024,english,3,Positive
2,INC19295442,Pune,India,APAC,1,Quick support,Very Satisfied,2024,Oct/2024,english,4,Very Positive
3,INC19296419,Guildford,United Kingdom,EAME,1,Andre provided excellent support He used Teams...,Very Satisfied,2024,Oct/2024,english,4,Very Positive
4,INC19296218,Toronto,Canada,NORTH AMERICA,1,Assistance was immediate and resolved my issue...,Very Satisfied,2024,Oct/2024,english,4,Very Positive


In [14]:
filtered_dataset.to_excel('output_with_bert_sentiment_extended_3.0.xlsx', index=False)

from google.colab import files
files.download('output_with_bert_sentiment_extended_3.0.xlsx')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>