# XGBoost End-to-End Machine Learning Pipeline

This notebook demonstrates a complete machine learning workflow using XGBoost, including:
- Data loading and exploration
- Preprocessing and feature engineering
- Model training with hyperparameter tuning
- Evaluation and explainability
- Unit testing with pytest

## Section 1: Environment Setup

Configure the import path, seed the random number generators, and verify that the required packages are installed.

In [None]:
import os
import sys

# Put the parent of the current working directory first on the import path
# so that modules stored one level above this notebook can be imported.
sys.path.insert(0, os.path.abspath(os.pardir))

# Imported after the path tweak, exactly as before, so module resolution is unchanged.
import random
import numpy as np

# One shared seed for Python's `random` module and NumPy's global generator.
RANDOM_STATE = 42
random.seed(RANDOM_STATE)
np.random.seed(RANDOM_STATE)

# Report the interpreter and working directory so a run can be traced later.
print(
    "Random seeds set for reproducibility",
    f"Python version: {sys.version}",
    f"Working directory: {os.getcwd()}",
    sep="\n",
)

In [None]:
# Import every core dependency first; a missing package fails here with an ImportError.
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn import __version__ as sklearn_version
import matplotlib
import seaborn

# Version report, one package per line, to document the environment of this run.
print(
    f"pandas: {pd.__version__}",
    f"numpy: {np.__version__}",
    f"xgboost: {xgb.__version__}",
    f"scikit-learn: {sklearn_version}",
    f"matplotlib: {matplotlib.__version__}",
    f"seaborn: {seaborn.__version__}",
    sep="\n",
)

print("\n✓ All packages imported successfully!")

## Section 2: Import Libraries

Import all required libraries for the ML pipeline

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, StratifiedKFold, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, SimpleImputer, OneHotEncoder, LabelEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.metrics import confusion_matrix, roc_curve, auc
from sklearn.datasets import load_breast_cancer
import xgboost as xgb
import joblib
import warnings
warnings.filterwarnings('ignore')

# Visualization settings
plt.style.use('default')
sns.set_palette("husl")
%matplotlib inline

print("✓ All libraries imported successfully!")

## Section 3: Load Dataset

Load and explore the breast cancer dataset bundled with scikit-learn (target encoding: 0 = malignant, 1 = benign)

In [None]:
# Fetch the bundled breast cancer data and wrap it in labelled pandas objects:
# X holds the feature columns, y the class label.
cancer = load_breast_cancer()
X = pd.DataFrame(data=cancer.data, columns=cancer.feature_names)
y = pd.Series(data=cancer.target, name='target')

# Features and label side by side in a single frame (X itself is left untouched)
df = X.assign(target=y)

# Quick look: size, a sample of rows, column dtypes and summary statistics
print(f"Dataset shape: {df.shape}")
print("\nFirst few rows:", df.head(), sep="\n")
print("\nData types:", df.dtypes, sep="\n")
print("\nBasic statistics:", df.describe(), sep="\n")

In [None]:
# Total number of missing cells across the whole frame
print("\nMissing values:", df.isna().sum().sum(), sep="\n")

# Class balance of the label, as absolute counts and as fractions
print("\nTarget variable distribution:", df['target'].value_counts(), sep="\n")
print("\nClass proportions:", df['target'].value_counts(normalize=True), sep="\n")

## Section 4: Exploratory Data Analysis (EDA)

Analyze and visualize the dataset

In [None]:
# Visualize target distribution
fig, axes = plt.subplots(1, 2, figsize=(12, 4))

# value_counts() orders classes by frequency, not by class id, so the bars were not
# guaranteed to line up with hard-coded ['Malignant (0)', 'Benign (1)'] tick labels
# (scikit-learn documents benign as the majority class, which puts class 1 first).
# Sort by class id and derive every label from the index so bars, slices and labels agree.
class_labels = {0: 'Malignant (0)', 1: 'Benign (1)'}
class_colors = ['#FF6B6B', '#4ECDC4']
target_counts = df['target'].value_counts().sort_index()
target_shares = df['target'].value_counts(normalize=True).sort_index()
target_labels = [class_labels.get(cls, str(cls)) for cls in target_counts.index]

# Target distribution
target_counts.plot(kind='bar', ax=axes[0], color=class_colors)
axes[0].set_title('Target Distribution', fontsize=12, fontweight='bold')
axes[0].set_xlabel('Class')
axes[0].set_ylabel('Count')
axes[0].set_xticklabels(target_labels, rotation=0)

# Target proportions (same class order, labels and colors as the bar chart)
target_shares.plot(
    kind='pie', ax=axes[1], labels=target_labels, colors=class_colors, autopct='%1.1f%%'
)
axes[1].set_title('Class Proportions', fontsize=12, fontweight='bold')
axes[1].set_ylabel('')

plt.tight_layout()
plt.show()

print("✓ Target visualization complete")

In [None]:
# Histograms of the first 16 feature columns on a 4x4 grid of panels
fig, axes = plt.subplots(nrows=4, ncols=4, figsize=(15, 12))
axes = axes.flatten()  # 1-D sequence of panels, one per plotted feature

for idx, col in enumerate(X.columns[:16]):
    panel = axes[idx]
    panel.hist(X[col], bins=30, edgecolor='black', alpha=0.7)
    panel.set_title(col, fontsize=9)
    panel.set_xlabel('')  # the title already names the feature

plt.tight_layout()
plt.show()

print("✓ Feature distributions visualized")

## Section 5: Preprocessing & Feature Engineering

Define preprocessing pipelines

In [None]:
# Identify numeric and categorical columns.
# 'number' matches every numeric dtype (float32, int32, ...), not only float64/int64.
# ColumnTransformer drops unlisted columns by default (remainder='drop'), so a numeric
# column with another width would otherwise be discarded without any message.
numeric_features = X.select_dtypes(include='number').columns.tolist()
categorical_features = X.select_dtypes(include=['object', 'category']).columns.tolist()

print(f"Numeric features ({len(numeric_features)}): {numeric_features[:5]}...")
print(f"Categorical features ({len(categorical_features)}): {categorical_features}")

# Create preprocessing pipelines
# Numeric columns: fill gaps with the column median, then standardize.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

transformers = [('num', numeric_transformer, numeric_features)]

# Categorical columns were detected above but never reached the preprocessor, so they
# would have been dropped. Encode them when present; with none (as in the current
# all-numeric feature frame) the preprocessor is the same as before.
if categorical_features:
    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('encoder', OneHotEncoder(handle_unknown='ignore'))
    ])
    transformers.append(('cat', categorical_transformer, categorical_features))

# Combine transformers
preprocessor = ColumnTransformer(transformers=transformers)

print("\n✓ Preprocessor defined")

## Section 6: Train/Validation Split & Cross-Validation Strategy

Prepare data for model training