In [2]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, f1_score, classification_report

from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier

import warnings
warnings.filterwarnings('ignore')

print("‚úÖ All libraries imported successfully.")


‚úÖ All libraries imported successfully.


In [4]:
DATA_PATH = "customer_support_ticket.csv"  # correct spelling

In [5]:
from google.colab import files
uploaded = files.upload()

Saving customer_support_tickets.csv to customer_support_tickets.csv


In [6]:
import os
print(os.listdir())  # ye current directory ke files show karega

['.config', 'customer_support_tickets.csv', 'sample_data']


In [7]:
import pandas as pd

DATA_PATH = "customer_support_tickets.csv"  # correct file name (with 's')
df = pd.read_csv(DATA_PATH)

print("‚úÖ Loaded:", DATA_PATH, "Shape:", df.shape)
df.head()

‚úÖ Loaded: customer_support_tickets.csv Shape: (8469, 17)


Unnamed: 0,Ticket ID,Customer Name,Customer Email,Customer Age,Customer Gender,Product Purchased,Date of Purchase,Ticket Type,Ticket Subject,Ticket Description,Ticket Status,Resolution,Ticket Priority,Ticket Channel,First Response Time,Time to Resolution,Customer Satisfaction Rating
0,1,Marisa Obrien,carrollallison@example.com,32,Other,GoPro Hero,2021-03-22,Technical issue,Product setup,I'm having an issue with the {product_purchase...,Pending Customer Response,,Critical,Social media,2023-06-01 12:15:36,,
1,2,Jessica Rios,clarkeashley@example.com,42,Female,LG Smart TV,2021-05-22,Technical issue,Peripheral compatibility,I'm having an issue with the {product_purchase...,Pending Customer Response,,Critical,Chat,2023-06-01 16:45:38,,
2,3,Christopher Robbins,gonzalestracy@example.com,48,Other,Dell XPS,2020-07-14,Technical issue,Network problem,I'm facing a problem with my {product_purchase...,Closed,Case maybe show recently my computer follow.,Low,Social media,2023-06-01 11:14:38,2023-06-01 18:05:38,3.0
3,4,Christina Dillon,bradleyolson@example.org,27,Female,Microsoft Office,2020-11-13,Billing inquiry,Account access,I'm having an issue with the {product_purchase...,Closed,Try capital clearly never color toward story.,Low,Social media,2023-06-01 07:29:40,2023-06-01 01:57:40,3.0
4,5,Alexander Carroll,bradleymark@example.com,67,Female,Autodesk AutoCAD,2020-02-04,Billing inquiry,Data loss,I'm having an issue with the {product_purchase...,Closed,West decision evidence bit.,Low,Email,2023-06-01 00:12:42,2023-06-01 19:53:42,1.0


In [8]:
# Handle duplicates, missing values, and invalid data
import numpy as np

# Drop duplicates
df.drop_duplicates(inplace=True)

# Replace infinite values with NaN
df.replace([np.inf, -np.inf], np.nan, inplace=True)

# Fill missing categorical and numerical columns
for col in df.select_dtypes(include='object'):
    df[col] = df[col].fillna('Unknown')

for col in df.select_dtypes(include=['int64','float64']):
    df[col] = df[col].fillna(df[col].median())

print("‚úÖ Cleaning complete.")
print("Remaining missing values:")
print(df.isnull().sum())


‚úÖ Cleaning complete.
Remaining missing values:
Ticket ID                       0
Customer Name                   0
Customer Email                  0
Customer Age                    0
Customer Gender                 0
Product Purchased               0
Date of Purchase                0
Ticket Type                     0
Ticket Subject                  0
Ticket Description              0
Ticket Status                   0
Resolution                      0
Ticket Priority                 0
Ticket Channel                  0
First Response Time             0
Time to Resolution              0
Customer Satisfaction Rating    0
dtype: int64


In [10]:
# Drop duplicate rows
df.drop_duplicates(inplace=True)

# Normalize column names (convert to lowercase & replace spaces)
df.columns = df.columns.str.lower().str.replace(' ', '_')

# Now check for ticket_description column safely
if 'ticket_description' in df.columns:
    df = df.dropna(subset=['ticket_description'])

    # Clean text column
    df['ticket_description_clean'] = (
        df['ticket_description']
        .astype(str)
        .str.lower()
        .str.replace('[^a-zA-Z ]', '', regex=True)
    )
    print("‚úÖ Basic cleaning done. Rows left:", len(df))
else:
    print("‚ö†Ô∏è Column 'ticket_description' not found in dataset. Available columns:")
    print(df.columns)

df.head(2)


‚úÖ Basic cleaning done. Rows left: 8469


Unnamed: 0,ticket_id,customer_name,customer_email,customer_age,customer_gender,product_purchased,date_of_purchase,ticket_type,ticket_subject,ticket_description,ticket_status,resolution,ticket_priority,ticket_channel,first_response_time,time_to_resolution,customer_satisfaction_rating,ticket_description_clean
0,1,Marisa Obrien,carrollallison@example.com,32,Other,GoPro Hero,2021-03-22,Technical issue,Product setup,I'm having an issue with the {product_purchase...,Pending Customer Response,Unknown,Critical,Social media,2023-06-01 12:15:36,Unknown,3.0,im having an issue with the productpurchased p...
1,2,Jessica Rios,clarkeashley@example.com,42,Female,LG Smart TV,2021-05-22,Technical issue,Peripheral compatibility,I'm having an issue with the {product_purchase...,Pending Customer Response,Unknown,Critical,Chat,2023-06-01 16:45:38,Unknown,3.0,im having an issue with the productpurchased p...


In [11]:
text = " ".join(df['ticket_description_clean'].dropna().values[:1000])
plt.figure(figsize=(10,6))
WordCloud(width=800, height=400, background_color='white').generate(text).to_image().show()


<Figure size 1000x600 with 0 Axes>

In [12]:
# Target column (adjust if different)
target_col = 'Ticket_Priority' if 'Ticket_Priority' in df.columns else df.columns[-1]

# Numeric and categorical features
numeric_candidates = ['Customer_Age','resp_delay_hours','resolve_delay_hours','response_efficiency','ticket_count','ticket_sentiment','is_repeat_customer']
categorical_candidates = ['Customer_Gender','Product_Purchased','Ticket_Type','Ticket_Status','Ticket_Priority','Ticket_Channel','Ticket_Subject']

numeric_features = [c for c in numeric_candidates if c in df.columns]
categorical_features = [c for c in categorical_candidates if c in df.columns]

print("Numeric features:", numeric_features)
print("Categorical features:", categorical_features)


Numeric features: []
Categorical features: []


In [14]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer

# Ensure all columns are lowercase
df.columns = df.columns.str.lower().str.replace(' ', '_')

# Define feature groups safely
numeric_features = [col for col in df.select_dtypes(include=['int64','float64']).columns
                    if col not in ['customer_satisfaction_rating']]  # exclude target
categorical_features = [col for col in df.select_dtypes(include='object').columns
                        if col not in ['ticket_description_clean', 'ticket_description']]

# Ensure 'ticket_description_clean' exists
if 'ticket_description_clean' not in df.columns and 'ticket_description' in df.columns:
    df['ticket_description_clean'] = (
        df['ticket_description'].astype(str)
        .str.lower()
        .str.replace('[^a-zA-Z ]',' ', regex=True)
    )

# Numeric pipeline
num_transformer = Pipeline([
    ('scaler', StandardScaler())
])

# Categorical pipeline
cat_transformer = Pipeline([
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Text vectorizer
tfidf = TfidfVectorizer(ngram_range=(1,2), max_features=5000, min_df=3)

# Combine all using ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_transformer, numeric_features),
        ('cat', cat_transformer, categorical_features),
        ('text', tfidf, 'ticket_description_clean')
    ],
    remainder='drop'
)

print("‚úÖ Preprocessor ready!")
print("Numeric features:", numeric_features)
print("Categorical features:", categorical_features)


‚úÖ Preprocessor ready!
Numeric features: ['ticket_id', 'customer_age']
Categorical features: ['customer_name', 'customer_email', 'customer_gender', 'product_purchased', 'date_of_purchase', 'ticket_type', 'ticket_subject', 'ticket_status', 'resolution', 'ticket_priority', 'ticket_channel', 'first_response_time', 'time_to_resolution']


In [16]:
# Ensure target_col defined properly
target_col = 'customer_satisfaction_rating'  # üëà update if your dataset uses different case

# Convert all column names to lowercase (safe)
df.columns = df.columns.str.lower().str.replace(' ', '_')

# Create text-clean column if not present
if 'ticket_description_clean' not in df.columns and 'ticket_description' in df.columns:
    df['ticket_description_clean'] = (
        df['ticket_description']
        .astype(str)
        .str.lower()
        .str.replace('[^a-zA-Z ]', ' ', regex=True)
    )

# --- Feature group definitions ---
numeric_features = [col for col in df.select_dtypes(include=['int64','float64']).columns
                    if col != target_col]
categorical_features = [col for col in df.select_dtypes(include='object').columns
                        if col not in ['ticket_description', 'ticket_description_clean']]

print("Numeric:", numeric_features)
print("Categorical:", categorical_features)
print("Text: ticket_description_clean")

# --- Drop rows with missing target values ---
df_model = df.dropna(subset=[target_col]).copy()

# Encode target variable if it‚Äôs categorical
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
df_model[target_col] = le.fit_transform(df_model[target_col])

# --- Define X and y ---
X = df_model[numeric_features + categorical_features + ['ticket_description_clean']]
y = df_model[target_col]

# --- Train/Test Split ---
from sklearn.model_selection import train_test_split

if len(y.unique()) > 1:
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.18, stratify=y, random_state=42
    )
    print(f"‚úÖ Split done ‚Äî Train: {X_train.shape}, Test: {X_test.shape}")
else:
    print("‚ö†Ô∏è Target column has only one class. Check your data before training.")



Numeric: ['ticket_id', 'customer_age']
Categorical: ['customer_name', 'customer_email', 'customer_gender', 'product_purchased', 'date_of_purchase', 'ticket_type', 'ticket_subject', 'ticket_status', 'resolution', 'ticket_priority', 'ticket_channel', 'first_response_time', 'time_to_resolution']
Text: ticket_description_clean
‚úÖ Split done ‚Äî Train: (6944, 16), Test: (1525, 16)


In [18]:
# ‚úÖ Step ‚Äì Model Training Pipeline (Fixed & Optimized)
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier

# XGBoost model parameters
clf = XGBClassifier(
    n_estimators=150,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    eval_metric='mlogloss',
    use_label_encoder=False,
    random_state=42,
    n_jobs=2
)

# ‚öôÔ∏è If preprocessor not yet defined, define it again safely
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline

# Ensure all lowercase names
df.columns = df.columns.str.lower().str.replace(' ', '_')

# Prepare lists (in case not already defined)
numeric_features = [col for col in df.select_dtypes(include=['int64','float64']).columns
                    if col != 'customer_satisfaction_rating']
categorical_features = [col for col in df.select_dtypes(include='object').columns
                        if col not in ['ticket_description', 'ticket_description_clean']]

# Text column check
if 'ticket_description_clean' not in df.columns and 'ticket_description' in df.columns:
    df['ticket_description_clean'] = (
        df['ticket_description'].astype(str)
        .str.lower()
        .str.replace('[^a-zA-Z ]', ' ', regex=True)
    )

# Create transformers
num_transformer = Pipeline([('scaler', StandardScaler())])
cat_transformer = Pipeline([('onehot', OneHotEncoder(handle_unknown='ignore'))])
tfidf = TfidfVectorizer(ngram_range=(1,2), max_features=3000, min_df=3)

# Combine everything into a preprocessor
preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_transformer, numeric_features),
        ('cat', cat_transformer, categorical_features),
        ('text', tfidf, 'ticket_description_clean')
    ],
    remainder='drop'
)

# ‚úÖ Full pipeline with SMOTE
pipeline = ImbPipeline([
    ('pre', preprocessor),
    ('smote', SMOTE(random_state=42)),
    ('clf', clf)
])

# üöÄ Train the model
print("üöÄ Training pipeline (this may take a few minutes)...")
pipeline.fit(X_train, y_train)
print("‚úÖ Training finished successfully!")


üöÄ Training pipeline (this may take a few minutes)...
‚úÖ Training finished successfully!


In [19]:
y_pred = pipeline.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("F1-macro:", f1_score(y_test, y_pred, average='macro'))
print("\nClassification Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.7436065573770492
F1-macro: 0.35575024259876364

Classification Report:
               precision    recall  f1-score   support

           0       0.26      0.15      0.19        99
           1       0.21      0.51      0.29        99
           2       0.97      0.92      0.94      1131
           3       0.24      0.20      0.22        98
           4       0.15      0.11      0.13        98

    accuracy                           0.74      1525
   macro avg       0.37      0.38      0.36      1525
weighted avg       0.77      0.74      0.75      1525



In [20]:
try:
    y_pred_original = le.inverse_transform(y_pred)
    print("‚úÖ Decoded predictions example:", y_pred_original[:10])
except Exception as e:
    print("‚ö†Ô∏è Inverse transform failed:", e)


‚úÖ Decoded predictions example: [3. 3. 2. 3. 2. 3. 3. 3. 3. 4.]


In [22]:
# Step 1 ‚Äî Load model and encoder back
import joblib

loaded_model = joblib.load("customer_ticket_model.pkl")
loaded_encoder = joblib.load("label_encoder.pkl")

print("‚úÖ Model and LabelEncoder loaded successfully!")

# Test with a few random samples
sample = X_test.sample(3, random_state=42)
pred = loaded_model.predict(sample)
decoded_pred = loaded_encoder.inverse_transform(pred)

print("üéØ Sample Predictions:")
for i, val in enumerate(decoded_pred):
    print(f"Sample {i+1}: Predicted Satisfaction = {val}")


‚úÖ Model and LabelEncoder loaded successfully!
üéØ Sample Predictions:
Sample 1: Predicted Satisfaction = 3.0
Sample 2: Predicted Satisfaction = 3.0
Sample 3: Predicted Satisfaction = 2.0


In [23]:
from sklearn.metrics import accuracy_score, classification_report

y_pred_loaded = loaded_model.predict(X_test)
print("‚úÖ Accuracy after reloading:", round(accuracy_score(y_test, y_pred_loaded)*100, 2), "%")
print("\nClassification Report:\n", classification_report(y_test, y_pred_loaded))


‚úÖ Accuracy after reloading: 74.36 %

Classification Report:
               precision    recall  f1-score   support

           0       0.26      0.15      0.19        99
           1       0.21      0.51      0.29        99
           2       0.97      0.92      0.94      1131
           3       0.24      0.20      0.22        98
           4       0.15      0.11      0.13        98

    accuracy                           0.74      1525
   macro avg       0.37      0.38      0.36      1525
weighted avg       0.77      0.74      0.75      1525



In [27]:
import pandas as pd
import numpy as np

# 1Ô∏è‚É£ Expected features from training
expected_cols = numeric_features + categorical_features + ['ticket_description_clean']

# 2Ô∏è‚É£ Example new ticket (change values as needed)
new_ticket = {
    'customer_age': 32,
    'customer_gender': 'Female',
    'product_purchased': 'Laptop',
    'ticket_type': 'Technical issue',
    'ticket_priority': 'High',
    'ticket_channel': 'Chat',
    'ticket_description_clean': "having problem with the laptop screen not turning on"
}

# 3Ô∏è‚É£ Create dataframe
new_df = pd.DataFrame([new_ticket])

# 4Ô∏è‚É£ Clean column names
new_df.columns = new_df.columns.str.lower().str.replace(' ', '_')

# 5Ô∏è‚É£ Add any missing columns properly
for col in expected_cols:
    if col not in new_df.columns:
        # if column was numeric -> fill with np.nan
        if col in numeric_features:
            new_df[col] = np.nan
        else:
            new_df[col] = 'Unknown'  # for categorical/text features

# 6Ô∏è‚É£ Reorder to match training data
new_df = new_df[expected_cols]

# 7Ô∏è‚É£ Convert datatypes correctly
# numeric ‚Üí float, categorical/text ‚Üí string
for col in numeric_features:
    new_df[col] = pd.to_numeric(new_df[col], errors='coerce')
for col in categorical_features + ['ticket_description_clean']:
    new_df[col] = new_df[col].astype(str)

# 8Ô∏è‚É£ Now predict safely
pred_encoded = loaded_model.predict(new_df)
pred_decoded = loaded_encoder.inverse_transform(pred_encoded)

print("üí¨ Predicted Satisfaction Rating:", int(pred_decoded[0]))


üí¨ Predicted Satisfaction Rating: 3


In [28]:
import joblib

joblib.dump(pipeline, "customer_ticket_model.pkl")
joblib.dump(le, "label_encoder.pkl")

print("‚úÖ Model and label encoder saved successfully.")


‚úÖ Model and label encoder saved successfully.
