In [1]:
!pip install xgboost

Collecting xgboost
  Using cached xgboost-2.1.4-py3-none-win_amd64.whl.metadata (2.1 kB)
Downloading xgboost-2.1.4-py3-none-win_amd64.whl (124.9 MB)
   ---------------------------------------- 0.0/124.9 MB ? eta -:--:--
   ---------------------------------------- 0.5/124.9 MB 5.6 MB/s eta 0:00:23
    --------------------------------------- 1.8/124.9 MB 7.2 MB/s eta 0:00:18
   - -------------------------------------- 3.4/124.9 MB 6.9 MB/s eta 0:00:18
   - -------------------------------------- 4.7/124.9 MB 6.6 MB/s eta 0:00:19
   - -------------------------------------- 5.8/124.9 MB 6.5 MB/s eta 0:00:19
   -- ------------------------------------- 7.1/124.9 MB 6.1 MB/s eta 0:00:20
   -- ------------------------------------- 8.1/124.9 MB 6.0 MB/s eta 0:00:20
   -- ------------------------------------- 9.2/124.9 MB 5.9 MB/s eta 0:00:20
   --- ------------------------------------ 10.2/124.9 MB 5.7 MB/s eta 0:00:20
   --- ------------------------------------ 11.3/124.9 MB 5.7 MB/s eta 0:00:2

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, roc_auc_score

In [3]:
# Load processed data
X_train = pd.read_csv("../data/X_train.csv")
y_train = pd.read_csv("../data/y_train.csv")
X_test = pd.read_csv("../data/X_test.csv")
y_test = pd.read_csv("../data/y_test.csv")

In [4]:
# Train Logistic Regression
log_reg = LogisticRegression(class_weight="balanced", max_iter=1000, random_state=42)
log_reg.fit(X_train, y_train.values.ravel())

# Train Decision Tree
tree_clf = DecisionTreeClassifier(max_depth=5, random_state=42)
tree_clf.fit(X_train, y_train.values.ravel())

# Train XGBoost
xgb_clf = XGBClassifier(use_label_encoder=False, eval_metric="logloss", random_state=42)
xgb_clf.fit(X_train, y_train.values.ravel())

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
Parameters: { "use_label_encoder" } are not used.



In [5]:
# Evaluate models
models = {"Logistic Regression": log_reg, "Decision Tree": tree_clf, "XGBoost": xgb_clf}

for name, model in models.items():
    y_pred = model.predict(X_test)
    auc = roc_auc_score(y_test, y_pred)
    print(f"{name} AUC-ROC Score: {auc:.4f}")
    print(f"{name} Classification Report:\n", classification_report(y_test, y_pred))

Logistic Regression AUC-ROC Score: 0.6477
Logistic Regression Classification Report:
               precision    recall  f1-score   support

           0       0.85      0.74      0.80      4673
           1       0.38      0.55      0.45      1327

    accuracy                           0.70      6000
   macro avg       0.62      0.65      0.62      6000
weighted avg       0.75      0.70      0.72      6000

Decision Tree AUC-ROC Score: 0.6732
Decision Tree Classification Report:
               precision    recall  f1-score   support

           0       0.85      0.89      0.87      4673
           1       0.54      0.46      0.49      1327

    accuracy                           0.79      6000
   macro avg       0.69      0.67      0.68      6000
weighted avg       0.78      0.79      0.79      6000

XGBoost AUC-ROC Score: 0.6568
XGBoost Classification Report:
               precision    recall  f1-score   support

           0       0.84      0.92      0.88      4673
           1   

In [8]:
from sklearn.ensemble import RandomForestClassifier

# Train Random Forest
rf_clf = RandomForestClassifier(n_estimators=100, random_state=42)
rf_clf.fit(X_train, y_train.values.ravel())

# Predictions
y_pred_rf = rf_clf.predict(X_test)

# Evaluate Random Forest
from sklearn.metrics import classification_report

print("Random Forest Performance:\n", classification_report(y_test, y_pred_rf))


Random Forest Performance:
               precision    recall  f1-score   support

           0       0.85      0.91      0.88      4673
           1       0.58      0.43      0.50      1327

    accuracy                           0.81      6000
   macro avg       0.72      0.67      0.69      6000
weighted avg       0.79      0.81      0.79      6000



In [9]:
# Evaluate models
models = {"Logistic Regression": log_reg, "Decision Tree": tree_clf, "XGBoost": xgb_clf, "RF": rf_clf}

for name, model in models.items():
    y_pred = model.predict(X_test)
    auc = roc_auc_score(y_test, y_pred)
    print(f"{name} AUC-ROC Score: {auc:.4f}")
    print(f"{name} Classification Report:\n", classification_report(y_test, y_pred))

Logistic Regression AUC-ROC Score: 0.6477
Logistic Regression Classification Report:
               precision    recall  f1-score   support

           0       0.85      0.74      0.80      4673
           1       0.38      0.55      0.45      1327

    accuracy                           0.70      6000
   macro avg       0.62      0.65      0.62      6000
weighted avg       0.75      0.70      0.72      6000

Decision Tree AUC-ROC Score: 0.6732
Decision Tree Classification Report:
               precision    recall  f1-score   support

           0       0.85      0.89      0.87      4673
           1       0.54      0.46      0.49      1327

    accuracy                           0.79      6000
   macro avg       0.69      0.67      0.68      6000
weighted avg       0.78      0.79      0.79      6000

XGBoost AUC-ROC Score: 0.6568
XGBoost Classification Report:
               precision    recall  f1-score   support

           0       0.84      0.92      0.88      4673
           1   