In [1]:
# 1. Imports and Setup
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, r2_score, confusion_matrix, classification_report
from sklearn.decomposition import PCA

import warnings
warnings.filterwarnings('ignore')

# Plot settings
plt.rcParams['figure.figsize'] = (10, 6)
sns.set(style="whitegrid")


ModuleNotFoundError: No module named 'xgboost'

In [2]:
# 2. Load the Data
df = pd.read_csv('customer_support_tickets.csv')

# First quick look
df.head()


Unnamed: 0,Ticket ID,Customer Name,Customer Email,Customer Age,Customer Gender,Product Purchased,Date of Purchase,Ticket Type,Ticket Subject,Ticket Description,Ticket Status,Resolution,Ticket Priority,Ticket Channel,First Response Time,Time to Resolution,Customer Satisfaction Rating
0,1,Marisa Obrien,carrollallison@example.com,32,Other,GoPro Hero,2021-03-22,Technical issue,Product setup,I'm having an issue with the {product_purchase...,Pending Customer Response,,Critical,Social media,2023-06-01 12:15:36,,
1,2,Jessica Rios,clarkeashley@example.com,42,Female,LG Smart TV,2021-05-22,Technical issue,Peripheral compatibility,I'm having an issue with the {product_purchase...,Pending Customer Response,,Critical,Chat,2023-06-01 16:45:38,,
2,3,Christopher Robbins,gonzalestracy@example.com,48,Other,Dell XPS,2020-07-14,Technical issue,Network problem,I'm facing a problem with my {product_purchase...,Closed,Case maybe show recently my computer follow.,Low,Social media,2023-06-01 11:14:38,2023-06-01 18:05:38,3.0
3,4,Christina Dillon,bradleyolson@example.org,27,Female,Microsoft Office,2020-11-13,Billing inquiry,Account access,I'm having an issue with the {product_purchase...,Closed,Try capital clearly never color toward story.,Low,Social media,2023-06-01 07:29:40,2023-06-01 01:57:40,3.0
4,5,Alexander Carroll,bradleymark@example.com,67,Female,Autodesk AutoCAD,2020-02-04,Billing inquiry,Data loss,I'm having an issue with the {product_purchase...,Closed,West decision evidence bit.,Low,Email,2023-06-01 00:12:42,2023-06-01 19:53:42,1.0


In [3]:
# 3. Data Understanding
print(df.info())
print(df.describe())
print(df.isnull().sum())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8469 entries, 0 to 8468
Data columns (total 17 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Ticket ID                     8469 non-null   int64  
 1   Customer Name                 8469 non-null   object 
 2   Customer Email                8469 non-null   object 
 3   Customer Age                  8469 non-null   int64  
 4   Customer Gender               8469 non-null   object 
 5   Product Purchased             8469 non-null   object 
 6   Date of Purchase              8469 non-null   object 
 7   Ticket Type                   8469 non-null   object 
 8   Ticket Subject                8469 non-null   object 
 9   Ticket Description            8469 non-null   object 
 10  Ticket Status                 8469 non-null   object 
 11  Resolution                    2769 non-null   object 
 12  Ticket Priority               8469 non-null   object 
 13  Tic

In [4]:
# 4. Data Preparation
# Convert timestamps
df['Created_Time'] = pd.to_datetime(df['Created_Time'])
df['Resolved_Time'] = pd.to_datetime(df['Resolved_Time'])

# Calculate response and resolution times if needed
df['Response_Time_Hours'] = df['Response_Time'] / 3600  # assuming it's in seconds
df['Resolution_Time_Hours'] = (df['Resolved_Time'] - df['Created_Time']).dt.total_seconds() / 3600

# Create time-based features
df['Hour'] = df['Created_Time'].dt.hour
df['DayOfWeek'] = df['Created_Time'].dt.dayofweek
df['Month'] = df['Created_Time'].dt.month

# Encode categorical variables
df['Channel'] = df['Channel'].astype('category')
df['Channel_Code'] = df['Channel'].cat.codes

# Drop unnecessary columns (adjust based on data)
# df = df.drop(columns=['Ticket_ID', 'Customer_ID'], errors='ignore')


KeyError: 'Created_Time'

In [None]:
# 5. Exploratory Data Analysis (EDA)

# Ticket volume by month
df.groupby('Month').size().plot(kind='bar')
plt.title('Tickets by Month')
plt.ylabel('Number of Tickets')
plt.show()

# Average satisfaction by channel
sns.barplot(x='Channel', y='Satisfaction_Score', data=df)
plt.title('Customer Satisfaction by Support Channel')
plt.xticks(rotation=45)
plt.show()

# Response time vs satisfaction
sns.scatterplot(x='Response_Time_Hours', y='Satisfaction_Score', data=df)
plt.title('Response Time vs Satisfaction')
plt.show()

# Correlation heatmap
corr = df.corr()
sns.heatmap(corr, annot=True, cmap='coolwarm')
plt.title('Correlation Matrix')
plt.show()


In [None]:
# 6. Machine Learning Models

# Classification: Predict Satisfaction High/Low
df['Satisfaction_Level'] = np.where(df['Satisfaction_Score'] >= 4, 1, 0)

# Features and Target
X = df[['Response_Time_Hours', 'Resolution_Time_Hours', 'Channel_Code', 'Hour', 'DayOfWeek', 'Month']]
y = df['Satisfaction_Level']

# Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Random Forest Classifier
rfc = RandomForestClassifier()
rfc.fit(X_train, y_train)
y_pred_rfc = rfc.predict(X_test)

# Metrics
print(classification_report(y_test, y_pred_rfc))
conf_matrix = confusion_matrix(y_test, y_pred_rfc)
sns.heatmap(conf_matrix, annot=True, fmt='d')
plt.title('Confusion Matrix - Random Forest')
plt.show()


In [None]:
# 7. Hyperparameter Tuning
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [5, 10, None],
    'min_samples_split': [2, 5]
}

grid_rfc = GridSearchCV(RandomForestClassifier(), param_grid, cv=5)
grid_rfc.fit(X_train, y_train)

print("Best Parameters:", grid_rfc.best_params_)
print("Best Cross-Val Score:", grid_rfc.best_score_)


In [None]:
# 8. Forecasting Ticket Volumes

# Prepare for forecasting (group by date)
df['Date'] = df['Created_Time'].dt.date
daily_tickets = df.groupby('Date').size().reset_index(name='Ticket_Count')

# Features
daily_tickets['DayOfWeek'] = pd.to_datetime(daily_tickets['Date']).dt.dayofweek
daily_tickets['Month'] = pd.to_datetime(daily_tickets['Date']).dt.month

X_forecast = daily_tickets[['DayOfWeek', 'Month']]
y_forecast = daily_tickets['Ticket_Count']

# Train-Test
Xf_train, Xf_test, yf_train, yf_test = train_test_split(X_forecast, y_forecast, test_size=0.2, random_state=42)

# Model
xgb_model = XGBRegressor()
xgb_model.fit(Xf_train, yf_train)
yf_pred = xgb_model.predict(Xf_test)

# Metrics
print("MAE:", mean_absolute_error(yf_test, yf_pred))
print("R²:", r2_score(yf_test, yf_pred))

# Plot actual vs predicted
plt.plot(yf_test.values, label='Actual')
plt.plot(yf_pred, label='Predicted')
plt.legend()
plt.title('Actual vs Predicted Ticket Volumes')
plt.show()
