In [1]:
import pandas as pd
import numpy as np
import joblib

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

from xgboost import XGBClassifier

from sklearn.metrics import (
    accuracy_score,
    roc_auc_score,
    precision_score,
    recall_score,
    f1_score,
    matthews_corrcoef
)


In [2]:
# Read the in-vehicle coupon recommendation CSV into a DataFrame.
# NOTE(review): this is an absolute, machine-specific path; the cell only runs
# where this exact file exists.
csv_path = "/Users/prateekgupta/Desktop/WiLp/ML/in-vehicle-coupon-recommendation.csv"
df = pd.read_csv(csv_path)

# Target is the "Y" column; every other column is treated as a feature.
y = df["Y"]
X = df.drop(columns="Y")

print("Dataset Shape:", df.shape)
df.head()


Dataset Shape: (12684, 26)


Unnamed: 0,destination,passanger,weather,temperature,time,coupon,expiration,gender,age,maritalStatus,...,CoffeeHouse,CarryAway,RestaurantLessThan20,Restaurant20To50,toCoupon_GEQ5min,toCoupon_GEQ15min,toCoupon_GEQ25min,direction_same,direction_opp,Y
0,No Urgent Place,Alone,Sunny,55,2PM,Restaurant(<20),1d,Female,21,Unmarried partner,...,never,,4~8,1~3,1,0,0,0,1,1
1,No Urgent Place,Friend(s),Sunny,80,10AM,Coffee House,2h,Female,21,Unmarried partner,...,never,,4~8,1~3,1,0,0,0,1,0
2,No Urgent Place,Friend(s),Sunny,80,10AM,Carry out & Take away,2h,Female,21,Unmarried partner,...,never,,4~8,1~3,1,1,0,0,1,1
3,No Urgent Place,Friend(s),Sunny,80,2PM,Coffee House,2h,Female,21,Unmarried partner,...,never,,4~8,1~3,1,1,0,0,1,0
4,No Urgent Place,Friend(s),Sunny,80,2PM,Coffee House,1d,Female,21,Unmarried partner,...,never,,4~8,1~3,1,1,0,0,1,0


In [3]:
# Partition the feature columns by dtype: object columns are treated as
# categorical, everything else as numerical.
categorical_cols = X.select_dtypes(include=["object"]).columns
numerical_cols = X.select_dtypes(exclude=["object"]).columns

# One named step per column group: standardise the numerical columns and
# one-hot encode the categorical ones. handle_unknown="ignore" makes the
# encoder tolerate categories at transform time that it did not see in fit.
numeric_step = ("num", StandardScaler(), numerical_cols)
categorical_step = ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_cols)

preprocessor = ColumnTransformer(transformers=[numeric_step, categorical_step])

# Fit on, and transform, the complete feature frame in one call.
# NOTE(review): the statistics learned here come from every row of X,
# including rows that end up in the test split further down.
X_processed = preprocessor.fit_transform(X)

print("Processed feature shape:", X_processed.shape)


Processed feature shape: (12684, 120)


In [4]:
# Count the rows per target class, most frequent class first
# (sort/ascending are spelled out but equal the pandas defaults).
class_counts = y.value_counts(sort=True, ascending=False)
class_counts


Y
1    7210
0    5474
Name: count, dtype: int64

In [5]:
# Ratio of negative (label 0) to positive (label 1) row counts; passed to
# XGBoost's scale_pos_weight further down. Lookups are by label, not position.
negative_count = class_counts.loc[0]
positive_count = class_counts.loc[1]
scale_pos_weight = negative_count / positive_count
scale_pos_weight


np.float64(0.7592233009708738)

In [6]:
# Split the RAW feature frame (not the already-transformed matrix) so that the
# held-out rows cannot influence the preprocessing statistics. The split is
# stratified on y to keep the class ratio equal in both partitions, and the
# fixed random_state keeps it reproducible.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

# Re-fit the preprocessor on the training rows only, then apply that fitted
# transformation to the test rows. Fitting the scaler/encoder on all rows
# before splitting would leak test-set information (means, variances, category
# sets) into training. Categories that occur only in the test rows are encoded
# as all-zero columns because the encoder uses handle_unknown="ignore".
X_train = preprocessor.fit_transform(X_train_raw)
X_test = preprocessor.transform(X_test_raw)

print("Training samples:", X_train.shape[0])
print("Testing samples:", X_test.shape[0])


Training samples: 10147
Testing samples: 2537


In [7]:
# All hyperparameters gathered in one mapping so they can be read (and tuned)
# in a single place before the estimator is constructed.
xgb_params = {
    # Ensemble size and per-tree step size
    "n_estimators": 400,
    "learning_rate": 0.05,
    # Tree complexity controls
    "max_depth": 4,
    "min_child_weight": 3,
    "gamma": 0.1,
    # Row / column subsampling per tree
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    # L1 / L2 regularisation on leaf weights
    "reg_alpha": 0.5,
    "reg_lambda": 1.0,
    # Class weighting taken from the notebook-level scale_pos_weight variable
    "scale_pos_weight": scale_pos_weight,
    # Binary classification with log-loss as the evaluation metric
    "objective": "binary:logistic",
    "eval_metric": "logloss",
    # Reproducibility and parallelism (-1 = use all available cores)
    "random_state": 42,
    "n_jobs": -1,
}

xgb_model = XGBClassifier(**xgb_params)

# Train on the training partition produced by the split cell.
xgb_model.fit(X_train, y_train)

print("XGBoost training completed.")


XGBoost training completed.


In [8]:
# Hard class labels and positive-class probabilities for the test partition.
y_pred = xgb_model.predict(X_test)
y_prob = xgb_model.predict_proba(X_test)[:, 1]

# Each entry: (report label, scoring function, predictions it should receive).
# AUC is the only metric scored on probabilities; the rest use hard labels.
scoring_plan = [
    ("Accuracy", accuracy_score, y_pred),
    ("AUC", roc_auc_score, y_prob),
    ("Precision", precision_score, y_pred),
    ("Recall", recall_score, y_pred),
    ("F1 Score", f1_score, y_pred),
    ("MCC", matthews_corrcoef, y_pred),
]

# Evaluate every scorer against the true test labels, keeping the plan order.
metrics = {
    label: scorer(y_test, predicted)
    for label, scorer, predicted in scoring_plan
}

metrics


{'Accuracy': 0.7382735514387071,
 'AUC': 0.8134741195321059,
 'Precision': 0.772027972027972,
 'Recall': 0.7656033287101248,
 'F1 Score': 0.7688022284122563,
 'MCC': 0.4672919652505043}

In [9]:
# Destination files for the two artifacts.
# NOTE(review): both are absolute, machine-specific paths, and the
# saved_models directory must already exist for the dumps to succeed.
model_path = "/Users/prateekgupta/Desktop/WiLp/ML/saved_models/XGBoost.pkl"
preprocessor_path = "/Users/prateekgupta/Desktop/WiLp/ML/saved_models/preprocessor_XGBoost.pkl"

# Persist the trained classifier.
joblib.dump(xgb_model, model_path)

# Persist the fitted preprocessor under its own XGBoost-specific file name so
# the same transformation can be re-applied when the model is loaded later.
joblib.dump(preprocessor, preprocessor_path)

print("XGBoost model and preprocessor saved successfully.")


XGBoost model and preprocessor saved successfully.
