In [6]:
# 1. Import Libraries
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, roc_auc_score
from xgboost import XGBClassifier

# 2. Load data (updated file paths)
train_df = pd.read_csv('/content/fda_trainingset.csv')
test_df = pd.read_csv('/content/fda_testset.csv')

# 3. Setup features and target
X = train_df.drop(columns=['ID', 'Y'])  # Clean: drop ID and label
y = train_df['Y'].astype(int)           # Ensure integer labels


# 4. Combine for preprocessing
combined = pd.concat([X, test_df], axis=0)

# 5. Impute missing values
imputer = SimpleImputer(strategy='median')
combined_imputed = pd.DataFrame(imputer.fit_transform(combined), columns=combined.columns)

# 6. Feature Engineering
combined_imputed['feature_sum'] = combined_imputed.sum(axis=1)
combined_imputed['feature_mean'] = combined_imputed.mean(axis=1)
combined_imputed['feature_std'] = combined_imputed.std(axis=1)
combined_imputed['non_zero_count'] = (combined_imputed != 0).sum(axis=1)
combined_imputed['max_min_ratio'] = combined_imputed.max(axis=1) / (combined_imputed.min(axis=1) + 1e-6)

# 7. Standardization
scaler = StandardScaler()
combined_scaled = pd.DataFrame(scaler.fit_transform(combined_imputed), columns=combined_imputed.columns)

# 8. Split processed back
X_processed = combined_scaled.iloc[:len(X)]
test_processed = combined_scaled.iloc[len(X):]
test_ids = test_df['ID']

# 9. Train-validation split
X_train, X_val, y_train, y_val = train_test_split(X_processed, y, test_size=0.25, random_state=42, stratify=y)

# 10. Updated model with eval_metric in constructor
model_v4 = XGBClassifier(
    n_estimators=1000,
    learning_rate=0.02,
    max_depth=6,
    subsample=0.9,
    colsample_bytree=0.8,
    gamma=0.4,
    reg_alpha=1.0,
    reg_lambda=2.0,
    scale_pos_weight=10,
    tree_method='hist',
    eval_metric='auc',        # <-- Put eval_metric here
    random_state=42
)

# 11. Safe .fit() (no eval_metric here)
model_v4.fit(
    X_train, y_train,
    eval_set=[(X_val, y_val)],
    verbose=100
)


# 12. Evaluate
y_pred_v4 = model_v4.predict(X_val)
y_proba_v4 = model_v4.predict_proba(X_val)[:, 1]

print("Classification Report:\n", classification_report(y_val, y_pred_v4))
print("AUC-ROC Score:", roc_auc_score(y_val, y_proba_v4))

# 13. Predict on test set
test_predictions_v4 = model_v4.predict_proba(test_processed)[:, 1]

# 14. Create submission
submission_v4 = pd.DataFrame({
    'ID': test_ids,
    'Y': test_predictions_v4
})

submission_path = '/content/xgb_boosted_final_submission.csv'
submission_v4.to_csv(submission_path, index=False)
print("✅ Saved final submission to:", submission_path)

# 15. Auto download
from google.colab import files
files.download(submission_path)


[0]	validation_0-auc:0.81642
[100]	validation_0-auc:0.92050
[200]	validation_0-auc:0.93909
[300]	validation_0-auc:0.94417
[400]	validation_0-auc:0.94649
[500]	validation_0-auc:0.94814
[600]	validation_0-auc:0.94902
[700]	validation_0-auc:0.94993
[800]	validation_0-auc:0.95087
[900]	validation_0-auc:0.95085
[999]	validation_0-auc:0.95088
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     49869
           1       0.64      0.12      0.21       131

    accuracy                           1.00     50000
   macro avg       0.82      0.56      0.60     50000
weighted avg       1.00      1.00      1.00     50000

AUC-ROC Score: 0.9508824570757063
✅ Saved final submission to: /content/xgb_boosted_final_submission.csv


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>