### Project Objectives
+ Identify factors affecting CSAT score.
+ Build a predictive model.
+ Reccommend strategies to improve customer satisfcation in the future

__Target Variable: CSAT Score.__

In [108]:
import numpy as np
import pandas as pd
import seaborn as sns

In [109]:
# df = our dataset
df = pd.read_csv("C:/Users/HP/OneDrive/Documents/DANNY DATA/Customer Support Data/Customer_support_data.csv")
df_copy = df.copy()

In [110]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 85907 entries, 0 to 85906
Data columns (total 20 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Unique id                85907 non-null  object 
 1   channel_name             85907 non-null  object 
 2   category                 85907 non-null  object 
 3   Sub-category             85907 non-null  object 
 4   Customer Remarks         28756 non-null  object 
 5   Order_id                 67675 non-null  object 
 6   order_date_time          17214 non-null  object 
 7   Issue_reported at        85907 non-null  object 
 8   issue_responded          85907 non-null  object 
 9   Survey_response_Date     85907 non-null  object 
 10  Customer_City            17079 non-null  object 
 11  Product_category         17196 non-null  object 
 12  Item_price               17206 non-null  float64
 13  connected_handling_time  242 non-null    float64
 14  Agent_name            

In [111]:
df.head()

Unnamed: 0,Unique id,channel_name,category,Sub-category,Customer Remarks,Order_id,order_date_time,Issue_reported at,issue_responded,Survey_response_Date,Customer_City,Product_category,Item_price,connected_handling_time,Agent_name,Supervisor,Manager,Tenure Bucket,Agent Shift,CSAT Score
0,7e9ae164-6a8b-4521-a2d4-58f7c9fff13f,Outcall,Product Queries,Life Insurance,,c27c9bb4-fa36-4140-9f1f-21009254ffdb,,01/08/2023 11:13,01/08/2023 11:47,01-Aug-23,,,,,Richard Buchanan,Mason Gupta,Jennifer Nguyen,On Job Training,Morning,5
1,b07ec1b0-f376-43b6-86df-ec03da3b2e16,Outcall,Product Queries,Product Specific Information,,d406b0c7-ce17-4654-b9de-f08d421254bd,,01/08/2023 12:52,01/08/2023 12:54,01-Aug-23,,,,,Vicki Collins,Dylan Kim,Michael Lee,>90,Morning,5
2,200814dd-27c7-4149-ba2b-bd3af3092880,Inbound,Order Related,Installation/demo,,c273368d-b961-44cb-beaf-62d6fd6c00d5,,01/08/2023 20:16,01/08/2023 20:38,01-Aug-23,,,,,Duane Norman,Jackson Park,William Kim,On Job Training,Evening,5
3,eb0d3e53-c1ca-42d3-8486-e42c8d622135,Inbound,Returns,Reverse Pickup Enquiry,,5aed0059-55a4-4ec6-bb54-97942092020a,,01/08/2023 20:56,01/08/2023 21:16,01-Aug-23,,,,,Patrick Flores,Olivia Wang,John Smith,>90,Evening,5
4,ba903143-1e54-406c-b969-46c52f92e5df,Inbound,Cancellation,Not Needed,,e8bed5a9-6933-4aff-9dc6-ccefd7dcde59,,01/08/2023 10:30,01/08/2023 10:32,01-Aug-23,,,,,Christopher Sanchez,Austin Johnson,Michael Lee,0-30,Morning,5


We will only make use of columns that may be relevant to customer satisfaction score (CSAT Score). 

Choice of columns used can be adjusted depending on business needs and model performance.

This is where speaking to project managers or acquiring more data sample is necessary if possible.  

In [112]:
relevant_columns = [
    'channel_name', 'category', 'Sub-category', 'Issue_reported at',
    'issue_responded', 'connected_handling_time', 'Agent_name',
    'Supervisor', 'Manager', 'Tenure Bucket', 'Agent Shift', 'CSAT Score'
]

In [113]:
df = df[relevant_columns]
df.head()

Unnamed: 0,channel_name,category,Sub-category,Issue_reported at,issue_responded,connected_handling_time,Agent_name,Supervisor,Manager,Tenure Bucket,Agent Shift,CSAT Score
0,Outcall,Product Queries,Life Insurance,01/08/2023 11:13,01/08/2023 11:47,,Richard Buchanan,Mason Gupta,Jennifer Nguyen,On Job Training,Morning,5
1,Outcall,Product Queries,Product Specific Information,01/08/2023 12:52,01/08/2023 12:54,,Vicki Collins,Dylan Kim,Michael Lee,>90,Morning,5
2,Inbound,Order Related,Installation/demo,01/08/2023 20:16,01/08/2023 20:38,,Duane Norman,Jackson Park,William Kim,On Job Training,Evening,5
3,Inbound,Returns,Reverse Pickup Enquiry,01/08/2023 20:56,01/08/2023 21:16,,Patrick Flores,Olivia Wang,John Smith,>90,Evening,5
4,Inbound,Cancellation,Not Needed,01/08/2023 10:30,01/08/2023 10:32,,Christopher Sanchez,Austin Johnson,Michael Lee,0-30,Morning,5


In [114]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 85907 entries, 0 to 85906
Data columns (total 12 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   channel_name             85907 non-null  object 
 1   category                 85907 non-null  object 
 2   Sub-category             85907 non-null  object 
 3   Issue_reported at        85907 non-null  object 
 4   issue_responded          85907 non-null  object 
 5   connected_handling_time  242 non-null    float64
 6   Agent_name               85907 non-null  object 
 7   Supervisor               85907 non-null  object 
 8   Manager                  85907 non-null  object 
 9   Tenure Bucket            85907 non-null  object 
 10  Agent Shift              85907 non-null  object 
 11  CSAT Score               85907 non-null  int64  
dtypes: float64(1), int64(1), object(10)
memory usage: 7.9+ MB


In [115]:
print(df.isna().sum())

# Too many missing values

channel_name                   0
category                       0
Sub-category                   0
Issue_reported at              0
issue_responded                0
connected_handling_time    85665
Agent_name                     0
Supervisor                     0
Manager                        0
Tenure Bucket                  0
Agent Shift                    0
CSAT Score                     0
dtype: int64


connected_handling_time shows how long it took to resolve the issue. This column is important but has too many missing values so we will drop it.

In [116]:
df.drop(columns=["connected_handling_time"], axis=1, inplace=True)
df.isna().sum()

channel_name         0
category             0
Sub-category         0
Issue_reported at    0
issue_responded      0
Agent_name           0
Supervisor           0
Manager              0
Tenure Bucket        0
Agent Shift          0
CSAT Score           0
dtype: int64

In [117]:
# show how fast customer issues where responded to
df["response_time"] = (
    pd.to_datetime(df["Issue_reported at"]) - pd.to_datetime(df["issue_responded"])
).dt.total_seconds()

# df.columns.get_loc("Issue_reported at")

# move position of response_time
move_col = df.pop("response_time")
df.insert(5, "response_time", move_col)

In [118]:
df.head()

Unnamed: 0,channel_name,category,Sub-category,Issue_reported at,issue_responded,response_time,Agent_name,Supervisor,Manager,Tenure Bucket,Agent Shift,CSAT Score
0,Outcall,Product Queries,Life Insurance,01/08/2023 11:13,01/08/2023 11:47,-2040.0,Richard Buchanan,Mason Gupta,Jennifer Nguyen,On Job Training,Morning,5
1,Outcall,Product Queries,Product Specific Information,01/08/2023 12:52,01/08/2023 12:54,-120.0,Vicki Collins,Dylan Kim,Michael Lee,>90,Morning,5
2,Inbound,Order Related,Installation/demo,01/08/2023 20:16,01/08/2023 20:38,-1320.0,Duane Norman,Jackson Park,William Kim,On Job Training,Evening,5
3,Inbound,Returns,Reverse Pickup Enquiry,01/08/2023 20:56,01/08/2023 21:16,-1200.0,Patrick Flores,Olivia Wang,John Smith,>90,Evening,5
4,Inbound,Cancellation,Not Needed,01/08/2023 10:30,01/08/2023 10:32,-120.0,Christopher Sanchez,Austin Johnson,Michael Lee,0-30,Morning,5
