In [13]:
# Cell 1 — Imports & Load Data
# Purpose: load cleaned tickets for ML (CSAT prediction)

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

df = pd.read_csv(r"D:\SaaS Ticket Analytics Dashboard\cleaned\tickets_master_clean.csv")
df.head()


Unnamed: 0,ticket_id,created_at,first_response_at,resolved_at,priority,status,issue_type,channel,customer_id,agent_id,...,csat,created_date,created_hour,created_weekday,agent_name,team,experience_years,customer_size,customer_segment,year_month
0,T000001,2025-02-21 10:34:51,2025-02-21 12:04:51,2025-02-22 17:58:20.092908,High,Closed,Performance Issue,Email,C0716,A007,...,4.2,2025-02-21,10,Friday,Agent_7,Tier 1,1.7,Large,Retail,2025-02
1,T000002,2025-08-20 08:09:08,2025-08-20 11:52:08,2025-08-21 00:37:32.619889,Low,Closed,Api Failure,Chat,C0694,A010,...,3.8,2025-08-20,8,Wednesday,Agent_10,Tier 2,1.4,Small,Healthcare,2025-08
2,T000003,2024-11-27 20:38:08,2024-11-27 22:58:08,2024-11-30 00:22:49.088104,Low,Closed,Integration Config,Phone,C0371,A017,...,3.3,2024-11-27,20,Wednesday,Agent_17,Tier 1,1.2,Medium,Healthcare,2024-11
3,T000004,2024-11-21 21:36:48,2024-11-21 21:42:48,2024-11-21 22:36:48.000000,Urgent,Closed,Login Issue,Email,C0719,A030,...,3.2,2024-11-21,21,Thursday,Agent_30,Tier 1,9.7,Medium,Education,2024-11
4,T000005,2025-02-21 08:59:35,2025-02-21 15:07:35,2025-02-22 09:06:43.772157,Low,Closed,Billing Query,Email,C0147,A021,...,2.9,2025-02-21,8,Friday,Agent_21,Tier 2,2.0,Enterprise,Education,2025-02


In [14]:
# Cell 2 — Create target column (CSAT Good/Bad)
# Purpose: 1 = Good CSAT (>=4), 0 = Bad CSAT (<4)

df['csat_good'] = df['csat'].apply(lambda x: 1 if x >= 4 else 0)
df['csat_good'].value_counts()


csat_good
0    7615
1    3385
Name: count, dtype: int64

In [15]:
# Cell 3 — Select simple features
# Purpose: keep only the required columns for ML

df = df[['priority', 'issue_type', 'created_hour', 'first_response_mins', 'resolution_time_hours', 'csat_good']].dropna()
df.head()


Unnamed: 0,priority,issue_type,created_hour,first_response_mins,resolution_time_hours,csat_good
0,High,Performance Issue,10,90.0,31.391415,1
1,Low,Api Failure,8,223.0,16.473506,0
2,Low,Integration Config,20,140.0,51.744747,0
3,Urgent,Login Issue,21,6.0,1.0,0
4,Low,Billing Query,8,368.0,24.119103,0


In [16]:
# Cell 4 — Encode categorical variables
# Purpose: convert priority and issue_type into numeric codes

df['priority'] = df['priority'].astype('category').cat.codes
df['issue_type'] = df['issue_type'].astype('category').cat.codes


In [17]:
# Cell 5 — Train/test split
# Purpose: split data into 80% train and 20% test

X = df[['priority', 'issue_type', 'created_hour', 'first_response_mins', 'resolution_time_hours']]
y = df['csat_good']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

X_train.shape, X_test.shape


((8800, 5), (2200, 5))

In [18]:
# Cell 6 — Train model
# Purpose: Fit a simple Logistic Regression model

model = LogisticRegression(max_iter=500)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)


In [19]:
# Cell 7 — Model Evaluation
# Purpose: check accuracy and classification report

print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))


Accuracy: 0.7236363636363636

Classification Report:
               precision    recall  f1-score   support

           0       0.77      0.86      0.82      1556
           1       0.54      0.39      0.45       644

    accuracy                           0.72      2200
   macro avg       0.66      0.63      0.63      2200
weighted avg       0.70      0.72      0.71      2200


Confusion Matrix:
 [[1342  214]
 [ 394  250]]
