In [2]:
# ===============================
# ML Assignment 2 
# Target: High_ROAS (Binary Classification)
# ===============================

import os
import pandas as pd
import numpy as np
import pickle

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import (
    accuracy_score, roc_auc_score, precision_score,
    recall_score, f1_score, matthews_corrcoef
)

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier


# ===============================
# 1. Load Dataset
# ===============================

data = pd.read_csv("global_ads_performance_dataset.csv")

print("Dataset Shape:", data.shape)


# ===============================
# 2. Create Best Target Column
# ===============================

# Profitable Campaign = ROAS > 1
data["High_ROAS"] = (data["ROAS"] > 1).astype(int)

# Drop original ROAS to prevent leakage
data.drop(columns=["ROAS"], inplace=True)


# ===============================
# 3. Preprocessing
# ===============================

data = data.dropna()

# Drop date (not useful for classification directly)
data.drop(columns=["date"], inplace=True)

# Encode categorical variables
label_encoders = {}

for col in data.select_dtypes(include=['object']).columns:
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col])
    label_encoders[col] = le


# ===============================
# 4. Define Features & Target
# ===============================

X = data.drop(columns=["High_ROAS"])
y = data["High_ROAS"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Scale numerical features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


# ===============================
# 5. Initialize Models
# ===============================

models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Decision Tree": DecisionTreeClassifier(),
    "KNN": KNeighborsClassifier(),
    "Naive Bayes": GaussianNB(),
    "Random Forest": RandomForestClassifier(),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='logloss')
}


# ===============================
# 6. Train, Evaluate & Save Models
# ===============================

results = []

for name, model in models.items():

    if name in ["Logistic Regression", "KNN"]:
        model.fit(X_train_scaled, y_train)
        y_pred = model.predict(X_test_scaled)
        y_prob = model.predict_proba(X_test_scaled)[:, 1]
    else:
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        y_prob = model.predict_proba(X_test)[:, 1]

    accuracy = accuracy_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_prob)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    mcc = matthews_corrcoef(y_test, y_pred)

    results.append([
        name, accuracy, auc, precision, recall, f1, mcc
    ])

    # Save model
    os.makedirs("model", exist_ok=True)
    with open(f"model/{name.replace(' ', '_')}.pkl", "wb") as f:
        pickle.dump(model, f)


# Save scaler
with open("model/scaler.pkl", "wb") as f:
    pickle.dump(scaler, f)


# ===============================
# 7. Tabular Output
# ===============================

results_df = pd.DataFrame(results, columns=[
    "Model", "Accuracy", "AUC", "Precision",
    "Recall", "F1 Score", "MCC"
])

print("\n==============================")
print("MODEL COMPARISON TABLE")
print("==============================\n")
print(results_df)

results_df.to_csv("model/model_comparison_results.csv", index=False)

print("\nAll models saved successfully.")


Dataset Shape: (1800, 14)

MODEL COMPARISON TABLE

                 Model  Accuracy       AUC  Precision    Recall  F1 Score  \
0  Logistic Regression  0.952778  0.978200   0.959302  0.990991  0.974889   
1        Decision Tree  0.966667  0.845846   0.976261  0.987988  0.982090   
2                  KNN  0.930556  0.804527   0.940000  0.987988  0.963397   
3          Naive Bayes  0.900000  0.869536   0.968454  0.921922  0.944615   
4        Random Forest  0.958333  0.932544   0.959538  0.996997  0.977909   
5              XGBoost  0.980556  0.984985   0.982249  0.996997  0.989568   

        MCC  
0  0.603867  
1  0.744968  
2  0.336918  
3  0.447947  
4  0.651875  
5  0.851942  

All models saved successfully.


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [14]:
pip install xgboost

Collecting xgboost
  Downloading xgboost-3.2.0-py3-none-win_amd64.whl.metadata (2.1 kB)
Downloading xgboost-3.2.0-py3-none-win_amd64.whl (101.7 MB)
   ---------------------------------------- 0.0/101.7 MB ? eta -:--:--
   ---------------------------------------- 0.0/101.7 MB ? eta -:--:--
   ---------------------------------------- 0.0/101.7 MB ? eta -:--:--
   ---------------------------------------- 0.0/101.7 MB ? eta -:--:--
   ---------------------------------------- 0.0/101.7 MB ? eta -:--:--
   ---------------------------------------- 0.0/101.7 MB ? eta -:--:--
   ---------------------------------------- 0.0/101.7 MB ? eta -:--:--
   ---------------------------------------- 0.0/101.7 MB ? eta -:--:--
   ---------------------------------------- 0.3/101.7 MB ? eta -:--:--
   ---------------------------------------- 0.3/101.7 MB ? eta -:--:--
   ---------------------------------------- 0.3/101.7 MB ? eta -:--:--
   ---------------------------------------- 0.3/101.7 MB ? eta -:--:--



[notice] A new release of pip is available: 25.2 -> 26.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


Collecting xgboost
  Using cached xgboost-3.2.0-py3-none-win_amd64.whl.metadata (2.1 kB)
Downloading xgboost-3.2.0-py3-none-win_amd64.whl (101.7 MB)
   ---------------------------------------- 0.0/101.7 MB ? eta -:--:--
   ---------------------------------------- 0.0/101.7 MB ? eta -:--:--
   ---------------------------------------- 0.5/101.7 MB 4.4 MB/s eta 0:00:23
   ---------------------------------------- 0.8/101.7 MB 3.3 MB/s eta 0:00:31
    --------------------------------------- 1.3/101.7 MB 3.1 MB/s eta 0:00:33
    --------------------------------------- 1.6/101.7 MB 1.9 MB/s eta 0:00:52
    --------------------------------------- 1.8/101.7 MB 2.0 MB/s eta 0:00:51
    --------------------------------------- 2.1/101.7 MB 1.8 MB/s eta 0:00:55
    --------------------------------------- 2.4/101.7 MB 1.7 MB/s eta 0:00:58
    --------------------------------------- 2.4/101.7 MB 1.7 MB/s eta 0:00:58
   - -------------------------------------- 2.6/101.7 MB 1.5 MB/s eta 0:01:07
   - --


[notice] A new release of pip is available: 25.2 -> 26.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip
