In [20]:
import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score,accuracy_score
from sklearn.utils.class_weight import compute_sample_weight

In [21]:
# 1. Read the preprocessed table and separate the target column from the features.
df = pd.read_csv("../PreProcessing/preprocessed_data.csv")
y = df["label"]
X = df.drop(columns=["label"])

# 2. Hold out 30% of the rows for testing. Stratifying on y keeps the class
#    proportions the same in both partitions; the fixed seed makes the split
#    repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)

In [22]:
# 3. Per-row training weights that counter class imbalance: "balanced" weights
#    each row inversely to how frequent its class is in y_train.
sample_weights = compute_sample_weight("balanced", y_train)

# 4. Configure the gradient-boosted classifier and fit it on the training
#    partition, passing the balancing weights through to the fit.
gb_params = {
    "n_estimators": 100,
    "learning_rate": 0.1,
    "max_depth": 3,
    "random_state": 42,  # fixed seed so training is repeatable
}
model = GradientBoostingClassifier(**gb_params)
model.fit(X_train, y_train, sample_weight=sample_weights)

In [23]:
# 5. Score the held-out partition: hard class predictions plus the probability
#    column used for ranking metrics.
y_pred = model.predict(X_test)
# Column 1 is taken as the positive-class probability
# (assumes the positive label is the second class -- TODO confirm).
y_proba = model.predict_proba(X_test)[:, 1]

# 6. Evaluate
# NOTE(review): the output saved with this cell shows ROC-AUC of about 0.459,
# i.e. below 0.5 (worse than random ranking). Investigate the features/labels
# before relying on this model.
test_confusion = confusion_matrix(y_test, y_pred)
test_report = classification_report(y_test, y_pred)
test_auc = roc_auc_score(y_test, y_proba)
test_accuracy = accuracy_score(y_test, y_pred)

print("Confusion Matrix:\n", test_confusion)
print("\nClassification Report:\n", test_report)
print("ROC-AUC Score:", test_auc)
print("Accuracy:", test_accuracy)

Confusion Matrix:
 [[235278 176729]
 [ 11153   6760]]

Classification Report:
               precision    recall  f1-score   support

           0       0.95      0.57      0.71    412007
           1       0.04      0.38      0.07     17913

    accuracy                           0.56    429920
   macro avg       0.50      0.47      0.39    429920
weighted avg       0.92      0.56      0.69    429920

ROC-AUC Score: 0.4590015385634231
Accuracy: 0.5629838109415706
