In [1]:
# 1. Import required libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix, roc_auc_score, roc_curve, classification_report
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# 2. Load dataset
df = pd.read_csv('process_dataset.csv')  # Replace with your filename

# 3. Separate features (X) and target (y)
X = df.drop('remainder__satisfaction_encoded', axis=1)
y = df['remainder__satisfaction_encoded']

In [3]:
# 4. Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [4]:
# MODEL 1 – BASELINE MODEL
# --------------------------
dt_default = DecisionTreeClassifier(random_state=42)
dt_default.fit(X_train, y_train)

y_pred_default = dt_default.predict(X_test)

print("=== BASELINE DECISION TREE ===")
print("Accuracy:", accuracy_score(y_test, y_pred_default))
print("Precision:", precision_score(y_test, y_pred_default))
print("Recall:", recall_score(y_test, y_pred_default))
print("F1 Score:", f1_score(y_test, y_pred_default))
print("AUC:", roc_auc_score(y_test, y_pred_default))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred_default))

=== BASELINE DECISION TREE ===
Accuracy: 0.9139072847682119
Precision: 0.905868680999419
Recall: 0.90534262485482
F1 Score: 0.9056055765320941
AUC: 0.9132168555350573

Confusion Matrix:
 [[1891  162]
 [ 163 1559]]


In [5]:
# MODEL 2 – MANUAL TUNING
# --------------------------
dt_manual = DecisionTreeClassifier(
    criterion='entropy',      # measure of split quality
    max_depth=8,              # controls tree complexity
    min_samples_split=5,      # minimum samples to split an internal node
    min_samples_leaf=3,       # minimum samples in each leaf
    random_state=42
)
dt_manual.fit(X_train, y_train)
y_pred_manual = dt_manual.predict(X_test)

print("\n=== MANUALLY TUNED DECISION TREE ===")
print("Accuracy:", accuracy_score(y_test, y_pred_manual))
print("Precision:", precision_score(y_test, y_pred_manual))
print("Recall:", recall_score(y_test, y_pred_manual))
print("F1 Score:", f1_score(y_test, y_pred_manual))
print("AUC:", roc_auc_score(y_test, y_pred_manual))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred_manual))


=== MANUALLY TUNED DECISION TREE ===
Accuracy: 0.9234437086092715
Precision: 0.9292989814260036
Recall: 0.9006968641114983
F1 Score: 0.9147744028310233
AUC: 0.9216100004921836

Confusion Matrix:
 [[1935  118]
 [ 171 1551]]


In [6]:
# MODEL 3 – GRID SEARCH TUNING
# --------------------------
param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [4, 6, 8, 10, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 3, 5]
}

grid_search = GridSearchCV(
    estimator=DecisionTreeClassifier(random_state=42),
    param_grid=param_grid,
    cv=5,
    scoring='f1',
    n_jobs=-1
)
grid_search.fit(X_train, y_train)

best_dt = grid_search.best_estimator_
y_pred_grid = best_dt.predict(X_test)

print("\n=== GRIDSEARCHCV BEST DECISION TREE ===")
print("Best Parameters:", grid_search.best_params_)
print("Accuracy:", accuracy_score(y_test, y_pred_grid))
print("Precision:", precision_score(y_test, y_pred_grid))
print("Recall:", recall_score(y_test, y_pred_grid))
print("F1 Score:", f1_score(y_test, y_pred_grid))
print("AUC:", roc_auc_score(y_test, y_pred_grid))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred_grid))



=== GRIDSEARCHCV BEST DECISION TREE ===
Best Parameters: {'criterion': 'gini', 'max_depth': 10, 'min_samples_leaf': 3, 'min_samples_split': 10}
Accuracy: 0.9343046357615894
Precision: 0.9445114595898673
Recall: 0.9094076655052264
F1 Score: 0.9266272189349113
AUC: 0.93229759797424

Confusion Matrix:
 [[1961   92]
 [ 156 1566]]


In [10]:
# Compute probabilities for AUC
y_proba_default = dt_default.predict_proba(X_test)[:, 1]
y_proba_manual  = dt_manual.predict_proba(X_test)[:, 1]
y_proba_grid    = best_dt.predict_proba(X_test)[:, 1]

# COMPARISON TABLE
comparison = pd.DataFrame({
    'Model': ['Baseline', 'Manual Tuned', 'GridSearchCV Best'],
    'Accuracy': [
        accuracy_score(y_test, y_pred_default),
        accuracy_score(y_test, y_pred_manual),
        accuracy_score(y_test, y_pred_grid)
    ],
    'Precision': [
        precision_score(y_test, y_pred_default),
        precision_score(y_test, y_pred_manual),
        precision_score(y_test, y_pred_grid)
    ],
    'Recall': [
        recall_score(y_test, y_pred_default),
        recall_score(y_test, y_pred_manual),
        recall_score(y_test, y_pred_grid)
    ],
    'F1 Score': [
        f1_score(y_test, y_pred_default),
        f1_score(y_test, y_pred_manual),
        f1_score(y_test, y_pred_grid)
    ],
    'AUC': [
        roc_auc_score(y_test, y_proba_default),
        roc_auc_score(y_test, y_proba_manual),
        roc_auc_score(y_test, y_proba_grid)
    ]
})

# Print the table
print("\n==== Model Comparison Table ====")
print(comparison)



==== Model Comparison Table ====
               Model  Accuracy  Precision    Recall  F1 Score       AUC
0           Baseline  0.913907   0.905869  0.905343  0.905606  0.913217
1       Manual Tuned  0.923444   0.929299  0.900697  0.914774  0.977373
2  GridSearchCV Best  0.934305   0.944511  0.909408  0.926627  0.976436
