# In [None]:
# %% [markdown]
"""
# Pipeline Construction Notebook
**Team:** [Your Team Name]  
**Authors:** [Team Members]  
**Date:** [Date]

## Objective
This notebook demonstrates the construction of a machine learning pipeline for [your dataset/problem], including data preprocessing, model training, hyperparameter tuning via GridSearchCV, and model persistence.
"""
# %%
# =====================
# IMPORTS
# =====================
import os

import joblib
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.cluster import KMeans
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Optional: For non-Gaussian targets
from sklearn.linear_model import PoissonRegressor

# Optional: For imbalanced classification
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE

# %% [markdown]
"""
## 1. Data Loading
Utility functions for loading and inspecting data
"""
# %%
def load_data(file_path: str) -> pd.DataFrame:
    """
    Read a CSV file into a dataframe and report its shape

    Parameters:
    file_path (str): Location of the CSV file, passed straight to pandas

    Returns:
    pd.DataFrame: The parsed contents of the file
    """
    frame = pd.read_csv(file_path)
    # Echo (rows, columns) so a wrong or truncated file is noticed immediately
    dimensions = frame.shape
    print(f"Loaded data with shape: {dimensions}")
    return frame

# %% [markdown]
"""
## 2. Preprocessing Setup
Define preprocessing steps for different feature types
"""
# %%
class ExampleTransformer(TransformerMixin, BaseEstimator):
    """
    Custom transformer example for demonstration purposes

    Multiplies its input by a constant factor. It inherits from the
    scikit-learn base classes so that it behaves like a real estimator
    inside Pipeline / ColumnTransformer / GridSearchCV: BaseEstimator
    supplies get_params / set_params (needed for cloning and for tuning
    `param` from a parameter grid) and TransformerMixin supplies
    fit_transform. TransformerMixin is listed first because scikit-learn
    expects BaseEstimator to be the right-most base class.

    Parameters:
    param (float): Multiplicative factor applied in transform (default 0.5)

    Methods:
    fit: Learn parameters from data (nothing to learn here)
    transform: Apply transformation to data
    """

    def __init__(self, param: float = 0.5):
        # Store the argument unchanged under the same name; get_params
        # discovers hyperparameters from the __init__ signature.
        self.param = param

    def fit(self, X, y=None):
        """Learn any necessary parameters from training data

        This example is stateless, so X and y are ignored.

        Returns:
        ExampleTransformer: self, to allow call chaining
        """
        return self

    def transform(self, X):
        """Apply transformation to data

        Parameters:
        X: Any object supporting multiplication by a float

        Returns:
        X multiplied by `param`
        """
        return X * self.param

# Column groups routed to the two preprocessing branches below
NUM_FEATURES = ['age', 'income']
CAT_FEATURES = ['gender', 'occupation']

# Numerical branch: median imputation, standardisation, then the example
# custom transformer
_numeric_branch = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
    ('custom_transform', ExampleTransformer()),
])

# Categorical branch: fill gaps with the most frequent value, then one-hot
# encode; categories unseen during fit are ignored at transform time
_categorical_branch = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Preprocessing pipeline: apply each branch to its own column group
preprocessor = ColumnTransformer(
    transformers=[
        ('num', _numeric_branch, NUM_FEATURES),
        ('cat', _categorical_branch, CAT_FEATURES),
    ])

# %% [markdown]
"""
## 3. Model Pipeline Construction
Combine preprocessing with supervised and unsupervised estimators
"""
# %%
def train_pipeline(
    data_path: str = '../data/train.csv',
    target_col: str = 'INCOME',
    model_path: str = '../models/best_pipeline.pkl',
) -> "GridSearchCV":
    """
    Main function to train and save the pipeline

    Steps:
    1. Load training data
    2. Split into features/target
    3. Create complete pipeline
    4. Perform grid search
    5. Save best model

    Parameters:
    data_path (str): CSV file holding the training data
    target_col (str): Name of the target column in that file
    model_path (str): Where the best pipeline found is written

    Returns:
    GridSearchCV: The fitted grid search (best_params_, best_score_, ...)
    """
    # Load data
    train_df = load_data(data_path)

    # Split features and target
    X = train_df.drop(target_col, axis=1)
    y = train_df[target_col]

    # Unsupervised component: KMeans acts as a transformer that appends
    # cluster-distance features to the preprocessed columns. A Pipeline only
    # accepts transformers as intermediate steps, so the classifier must be
    # the final step and KMeans cannot come after it.
    feature_builder = FeatureUnion(transformer_list=[
        ('original', 'passthrough'),
        ('cluster', KMeans(n_init=10)),
    ])

    # Create complete pipeline (the supervised estimator goes last so that
    # predict/score - and therefore scoring='accuracy' - use the classifier)
    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('features', feature_builder),
        ('classifier', RandomForestClassifier()),
    ])

    # Parameter grid for GridSearchCV
    param_grid = {
        'classifier__n_estimators': [100, 200],
        'classifier__max_depth': [None, 5, 10],
    }

    # Grid search setup
    grid_search = GridSearchCV(
        estimator=pipeline,
        param_grid=param_grid,
        cv=5,
        scoring='accuracy',
        n_jobs=-1,
    )

    # Fit the model
    grid_search.fit(X, y)

    # Save best model
    save_model(grid_search.best_estimator_, model_path)

    return grid_search

def save_model(model, file_path: str) -> None:
    """
    Save trained model to disk

    The parent directory of `file_path` is created if it does not exist.

    Parameters:
    model: Trained model object
    file_path (str): Path to save model
    """
    parent_dir = os.path.dirname(file_path)
    # dirname() is '' for a bare filename such as 'model.pkl', and
    # os.makedirs('') raises FileNotFoundError - only create a real directory.
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    joblib.dump(model, file_path)
    print(f"Model saved to {file_path}")

# %% [markdown]
"""
## 4. Execution
Run the complete training process
"""
# %%
if __name__ == "__main__":
    # Run the full training process, then summarise the grid-search outcome
    trained_model = train_pipeline()
    print(
        "Training completed successfully!",
        f"Best parameters: {trained_model.best_params_}",
        f"Best score: {trained_model.best_score_:.3f}",
        sep="\n",
    )