<a href="https://colab.research.google.com/github/Akshatha-Gadasandula/ML_LAB/blob/main/ML_Assignment_2_(150).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# Machine Learning Assignment 2
# Research Paper: “XGBoost: A Scalable Tree Boosting System”
#   by Tianqi Chen and Carlos Guestrin
#   Published in Proceedings of the 22nd ACM SIGKDD International Conference on Knowledge Discovery and Data Mining (KDD ’16), 2016.
# Dataset: Breast Cancer (from sklearn)


# Step 1: Import libraries
import numpy as np
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from xgboost import XGBClassifier


# Step 2: Load dataset
# Pull the bundled breast-cancer data and wrap the raw feature array in a
# DataFrame so every column carries its feature name.
data = load_breast_cancer()
y = data.target
X = pd.DataFrame(data=data.data, columns=data.feature_names)

# Quick look at what was loaded: (rows, features) and the distinct labels.
print("Dataset shape:", X.shape)
print("Classes:", np.unique(y))


# Step 3: Split data
# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# proportions the same in both partitions, and the fixed seed makes the
# split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=42,
    test_size=0.2,
)


# Step 4: Baseline Model (XGBoost Default)
# Fit XGBoost with its library-default hyperparameters (only the evaluation
# metric is set explicitly). This is the reference point that the tuned
# model from the grid search is compared against.
xgb_default = XGBClassifier(eval_metric='logloss')
xgb_default.fit(X_train, y_train)

y_pred_default = xgb_default.predict(X_test)
print("\nBaseline Accuracy:", accuracy_score(y_test, y_pred_default))
# Pass the dataset's class names so each report row is labeled with the class
# it describes instead of a bare 0/1 that the reader has to decode.
print(
    "\nClassification Report:\n",
    classification_report(y_test, y_pred_default, target_names=data.target_names),
)


# Step 5: Hyperparameter Tuning (GridSearch)
# Exhaustive search over 3 * 3 * 3 * 2 * 2 = 108 hyperparameter combinations,
# each scored by 3-fold cross-validated accuracy on the training split only
# (the test split is never seen during tuning).
params = {
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0]
}

grid = GridSearchCV(
    # Parallelism is handled at the grid-search level (n_jobs=-1 below), so
    # each individual XGBoost fit is pinned to one thread. Leaving XGBoost
    # multi-threaded inside a parallel grid search oversubscribes the CPU and
    # slows both down. The refit best estimator inherits n_jobs=1 as well,
    # which is negligible for a dataset of this size.
    estimator=XGBClassifier(eval_metric='logloss', n_jobs=1),
    param_grid=params,
    scoring='accuracy',
    cv=3,
    verbose=1,
    n_jobs=-1
)

grid.fit(X_train, y_train)

print("\nBest Parameters:", grid.best_params_)
print("Best Cross-Validation Accuracy:", grid.best_score_)


# Step 6: Tuned Model
# best_estimator_ is the model GridSearchCV refit on the full training split
# using the best hyperparameter combination; evaluate it on the held-out test
# split so the numbers are directly comparable to the baseline.
xgb_tuned = grid.best_estimator_
y_pred_tuned = xgb_tuned.predict(X_test)

print("\nTuned Accuracy:", accuracy_score(y_test, y_pred_tuned))
# Pass the dataset's class names so each report row is labeled with the class
# it describes instead of a bare 0/1 that the reader has to decode.
print(
    "\nClassification Report (Tuned):\n",
    classification_report(y_test, y_pred_tuned, target_names=data.target_names),
)


# Step 7: Confusion Matrix
# cm stays the raw count array (rows = actual class, columns = predicted
# class, in sorted label order) for any later numeric use.
cm = confusion_matrix(y_test, y_pred_tuned)

# Print a labeled view so the reader does not have to remember the axis
# convention or which integer label maps to which class.
cm_labeled = pd.DataFrame(
    cm,
    index=[f"actual {name}" for name in data.target_names],
    columns=[f"predicted {name}" for name in data.target_names],
)
print("\nConfusion Matrix:\n", cm_labeled)


Dataset shape: (569, 30)
Classes: [0 1]

Baseline Accuracy: 0.956140350877193

Classification Report:
               precision    recall  f1-score   support

           0       0.97      0.90      0.94        42
           1       0.95      0.99      0.97        72

    accuracy                           0.96       114
   macro avg       0.96      0.95      0.95       114
weighted avg       0.96      0.96      0.96       114

Fitting 3 folds for each of 108 candidates, totalling 324 fits

Best Parameters: {'colsample_bytree': 0.8, 'learning_rate': 0.2, 'max_depth': 5, 'n_estimators': 50, 'subsample': 0.8}
Best Cross-Validation Accuracy: 0.9692546764261647

Tuned Accuracy: 0.956140350877193

Classification Report (Tuned):
               precision    recall  f1-score   support

           0       0.95      0.93      0.94        42
           1       0.96      0.97      0.97        72

    accuracy                           0.96       114
   macro avg       0.96      0.95      0.95       