In [1]:
import kagglehub
import pandas as pd
import numpy as np
import shutil
import json
import pickle
import os

from dotenv import load_dotenv
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_validate
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.pyll import scope

In [4]:
DATA_PATH = '../data/raw'

In [None]:
df = pd.read_csv(os.path.join(DATA_PATH, 'train.csv'))
df.head()


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


# Create Training Pipeline

### Processing and Encoding Pipeline

In [None]:
# df = pd.read_csv(os.path.join(DATA_PATH, 'train.csv'))
df.set_index('PassengerId', inplace=True)


In [6]:
df.head().T

Unnamed: 0,0,1,2,3,4
PassengerId,1,2,3,4,5
Survived,0,1,1,1,0
Pclass,3,1,3,1,3
Name,"Braund, Mr. Owen Harris","Cumings, Mrs. John Bradley (Florence Briggs Th...","Heikkinen, Miss. Laina","Futrelle, Mrs. Jacques Heath (Lily May Peel)","Allen, Mr. William Henry"
Sex,male,female,female,female,male
Age,22.0,38.0,26.0,35.0,35.0
SibSp,1,1,0,1,0
Parch,0,0,0,0,0
Ticket,A/5 21171,PC 17599,STON/O2. 3101282,113803,373450
Fare,7.25,71.2833,7.925,53.1,8.05


In [7]:
df.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [None]:
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [11]:
df.PassengerId.nunique()

891

In [10]:
df.duplicated().sum()

np.int64(0)

Cleaning Pipeline:
- Remove unnecessary columns
- 

In [13]:
drop_features = ['PassengerId', 'Name', 'Ticket', 'Cabin', 'Survived']
reduced_df = df.drop(drop_features, axis=1)
reduced_df.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,3,male,22.0,1,0,7.25,S
1,1,female,38.0,1,0,71.2833,C
2,3,female,26.0,0,0,7.925,S
3,1,female,35.0,1,0,53.1,S
4,3,male,35.0,0,0,8.05,S


In [16]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer


In [None]:
numeric_features = ['Age', 'Fare', 'SibSp', 'Parch']
numeric_transformer = Pipeline(steps = [
    ('imputer', SimpleImputer(strategy = 'median')),
    ('scaler', StandardScaler())
])

In [18]:
categorical_features = ['Pclass', 'Sex', 'Embarked']
categorical_transfomer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder())
])


In [19]:
preporcessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numeric_features),
        ("cat", OneHotEncoder(), categorical_features),
         ("drop",'drop',drop_features)
    ]
)

In [21]:
preporcessor.fit_transform(df)

array([[-0.53037664, -0.50244517,  0.43279337, ...,  0.        ,
         1.        ,  0.        ],
       [ 0.57183099,  0.78684529,  0.43279337, ...,  0.        ,
         0.        ,  0.        ],
       [-0.25482473, -0.48885426, -0.4745452 , ...,  0.        ,
         1.        ,  0.        ],
       ...,
       [        nan, -0.17626324,  0.43279337, ...,  0.        ,
         1.        ,  0.        ],
       [-0.25482473, -0.04438104, -0.4745452 , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.15850313, -0.49237783, -0.4745452 , ...,  1.        ,
         0.        ,  0.        ]], shape=(891, 13))

# Trining Pipelien
### processing

In [None]:
# import pandas as pd
# import os
# import joblib
# from sklearn.model_selection import train_test_split
# from sklearn.pipeline import Pipeline
# from sklearn.preprocessing import FunctionTransformer
# from feature_engine.imputation import MeanMedianImputer
# from feature_engine.encoding import OneHotEncoder
# from feature_engine.selection import DropFeatures
# from feature_engine.wrappers import SklearnTransformerWrapper
# from sklearn.preprocessing import StandardScaler

# SOURCE = os.path.join("data", "raw")
# DESTINATION = os.path.join("data", "processed")

# def dtype_conversion(X, cat_cols):
#     """Handle missing values and convert to categorical"""
#     X = X.copy()
#     for col in cat_cols:
#         if col in X.columns:
#             # Fill NA and convert to category
#             X[col] = X[col].fillna('missing').astype('category')
#     return X
# def read_process_data(
#     file_name: str,
#     target: str,
#     num_cols: list,
#     cat_cols: list,
#     drop_cols: list,
#     logger
# ):
#     """Data processing pipeline"""
#     logger.info("Starting data processing")
    
#     try:
#         # 1. Load data
#         df = pd.read_csv(os.path.join(SOURCE, f'{file_name}.csv'))
#         logger.info(f"Raw data loaded: {df.shape}")
        
#         # 2. Validate initial data
#         if df[target].isna().any():
#             raise ValueError(f"Target column '{target}' contains missing values")
        
#         # 3. Split data
#         train_df, test_df = train_test_split(
#             df, test_size=0.2, random_state=42, stratify=df[target]
#         )
#         logger.info(f"Train/Test split: {train_df.shape}/{test_df.shape}")

#         # 4. Create processing pipeline
#         processing_pipeline = Pipeline([
#             ('dtype_conversion', FunctionTransformer(
#                 func=dtype_conversion,
#                 kw_args={'cat_cols': cat_cols},
#                 validate=False
#             )),
            
#             ('numeric_imputer', MeanMedianImputer(
#                 imputation_method='median',
#                 variables=num_cols
#             )),
            
#             ('encoder', OneHotEncoder(
#                 drop_last=True,
#                 variables=cat_cols
#             )),
            
#             ('scaler', SklearnTransformerWrapper(
#                 transformer=StandardScaler(),
#                 variables=num_cols
#             )),
            
#             ('drop_features', DropFeatures(
#                 features_to_drop=drop_cols + [target]
#             ))
#         ])

#         # 5. Process data
#         X_train = processing_pipeline.fit_transform(train_df)
#         X_test = processing_pipeline.transform(test_df)

#         # 6. Combine with target (critical fix)
#         train_clean = pd.concat([
#             X_train,
#             train_df[target].rename(target)  # Preserve original index
#         ], axis=1)
        
#         test_clean = pd.concat([
#             X_test,
#             test_df[target].rename(target)  # Preserve original index
#         ], axis=1)

#         # 7. Validate output
#         if len(train_clean) != len(train_df):
#             raise ValueError("Row count mismatch in training data")
#         if len(test_clean) != len(test_df):
#             raise ValueError("Row count mismatch in test data")

#         # 8. Save artifacts
#         os.makedirs(DESTINATION, exist_ok=True)
#         train_clean.to_parquet(os.path.join(DESTINATION, f"{file_name}-train.parquet"))
#         test_clean.to_parquet(os.path.join(DESTINATION, f"{file_name}-test.parquet"))
#         joblib.dump(processing_pipeline, os.path.join(DESTINATION, "pipeline.pkl"))

#         logger.info(f"Processing complete. Final shapes: Train {train_clean.shape}, Test {test_clean.shape}")
#     except Exception as e:
#         logger.error(f"Processing failed: {str(e)}")
#         raise

### Training and Tuning the Model

In [None]:
# import pandas as pd
# from sklearn.preprocessing import FunctionTransformer, LabelEncoder
# from sklearn.linear_model import LogisticRegression
# import pickle
# SOURCE = os.path.join("data", "processed")
# MODEL_PATH = 'models'
# def encode_target(file_name : str, 
#                   target_col : str,
#                   model_name : str,
#                   logger):
#     df_train = pd.read_parquet(os.path.join(DESTINATION, f"{file_name}-train.parquet"))
#     df_test = pd.read_parquet(os.path.join(DESTINATION, f"{file_name}-test.parquet"))
#     X_train , y_train = df_train.drop(columns=[target_col],axis=1) , df_train[target_col]    
#     X_test ,y_test = df_test.drop(columns=[target_col],axis=1) , df_test[target_col]

#     logger.info("Fitting the encoder/decoder of target variable")
#     logger.info(f"Number of classes: {len(y_train.unique())}")
#     """Create and fit encoder/decoder for target variable"""
#     encoder = LabelEncoder()
#     encoder.fit(y_train)
#     # Create decoder mapping
#     classes = encoder.classes_
#     decoder = {i: cls for i, cls in enumerate(classes)}
#     target_translator = {
#         'encoder': encoder,
#         'decoder': decoder,
#     }
#     logger.info("encoder/decoder of target created successfully")
#     # Save the artifacts
    
#     if not os.path.exists(os.path.join(MODEL_PATH, model_name)):
#         os.makedirs(os.path.join(MODEL_PATH, model_name))
#     with open(
#         os.path.join(MODEL_PATH, model_name, "model_target_translator.pkl"),
#         "wb",
#     ) as pkl:
#         pickle.dump(target_translator, pkl)
#     logger.info("encoder/decoder of target saved")
#     return X_train, y_train, X_test, y_test


In [None]:
# from functools import partial
# import os
# import pickle
# import numpy as np
# from hyperopt import STATUS_OK, Trials, fmin, hp, tpe
# from hyperopt.pyll import scope
# from sklearn.linear_model import LogisticRegression
# from sklearn.model_selection import cross_validate

# N_FOLDS = 3
# MAX_EVALS = 3

# # Updated search space with compatible parameters
# SPACE = {
#     "penalty": hp.choice("penalty", ["l1", "l2", "elasticnet"]),
#     "C": hp.loguniform("C", -4, 4),
#     "solver": hp.choice("solver", ["saga"]),  # Saga supports all penalties
#     "l1_ratio": hp.uniform("l1_ratio", 0, 1)  # Required for elasticnet
# }

# def objective(params, X, y, n_folds: int = N_FOLDS):
#     """Wrapper function for hyperparameter optimization"""
#     try:
#         # Handle elasticnet specific parameters
#         if params["penalty"] == "elasticnet":
#             params["l1_ratio"] = params.get("l1_ratio", 0.5)
#         else:
#             params.pop("l1_ratio", None)
            
#         model = LogisticRegression(**params, max_iter=1000)
#         scores = cross_validate(
#             model, X, y, 
#             cv=n_folds, 
#             scoring="accuracy",
#             error_score="raise"  # Get detailed errors
#         )
#         return {
#             "loss": -np.mean(scores["test_score"]),  # Minimize negative accuracy
#             "params": params,
#             "status": STATUS_OK
#         }
#     except Exception as e:
#         return {
#             "loss": 0,
#             "status": STATUS_FAIL,
#             "exception": str(e)
#         }

# def train_model(X, y, model_name: str, logger):
#     """Complete training pipeline with error handling"""
#     logger.info("Loading target encoder/decoder")
#     try:
#         with open(os.path.join(MODEL_PATH, model_name, "model_target_translator.pkl"), "rb") as pkl:
#             translator = pickle.load(pkl)
        
#         y_train_enc = translator['encoder'].transform(y)
        
#         logger.info("Starting hyperparameter optimization")
#         bayes_trials = Trials()
        
#         best = fmin(
#             fn=partial(objective, X=X, y=y_train_enc),
#             space=SPACE,
#             algo=tpe.suggest,
#             max_evals=MAX_EVALS,
#             trials=bayes_trials,
#             show_progressbar=False
#         )
        
#         # Get best parameters from trials
#         best_params = bayes_trials.best_trial["result"]["params"]
#         logger.info(f"Best parameters: {best_params}")
        
#         # Train final model
#         final_model = LogisticRegression(**best_params, max_iter=1000)
#         final_model.fit(X, y_train_enc)
        
#         # Save artifacts
#         os.makedirs(os.path.join(MODEL_PATH, model_name), exist_ok=True)
#         with open(os.path.join(MODEL_PATH, model_name, "final_model.pkl"), "wb") as pkl:
#             pickle.dump(final_model, pkl)
            
#         logger.info("Model trained and saved successfully")
        
        
#     except Exception as e:
#         logger.error(f"Training failed: {str(e)}")
#         raise




### Evaluation Report

In [None]:
# import json
# import os
# import pickle
# from sklearn.metrics import classification_report
# from skore import EstimatorReport

# MODEL_PATH = "models"
# REPORT_PATH = "reports"
# def evaluate(X_test, y_test, model_name: str, logger):
#     """Proper evaluation function with correct encoding"""
#     logger.info("Starting model evaluation")
    
#     try:
#         # Load artifacts
#         with open(os.path.join(MODEL_PATH, model_name, "model_target_translator.pkl"), "rb") as pkl:
#             translator = pickle.load(pkl)
            
#         with open(os.path.join(MODEL_PATH, model_name, "final_model.pkl"), "rb") as pkl:
#             model = pickle.load(pkl)
        
#         # Encode test labels
#         y_test_enc = translator['encoder'].transform(y_test)
        
#         # Generate predictions
#         y_pred = model.predict(X_test)
        
#         # Convert numeric class labels to strings
#         class_names = [str(v) for v in translator['decoder'].values()]
        
#         # Generate classification report
#         evaluation_report = classification_report(
#             y_test_enc,
#             y_pred,
#             target_names=class_names  # Use string labels
#         )
        
#         logger.info("saving evaluation report")
#         if not os.path.exists(os.path.join(REPORT_PATH, model_name)):
#             os.makedirs(os.path.join(REPORT_PATH, model_name))
#         with open(
#             os.path.join(REPORT_PATH, model_name, "evaluation_report.json"), "w"
#         ) as js:
#             json.dump(evaluation_report, js, indent=4)
#         logger.info(f"Evaluation Report:\n{evaluation_report}")

#     except Exception as e:
#         logger.error(f"Evaluation failed: {str(e)}")
#         raise

### Trainer.py

In [1]:
from src.logger import ExecutorLogger
from src.training.process_data import read_process_data
from src.training.evaluate import evaluate
from src.training.train import train_model,encode_target

logger = ExecutorLogger('train pipeline')
def train_pipeline(logger: ExecutorLogger):
    # process pipeline
    numeric_features = ['Age', 'Fare', 'SibSp', 'Parch']
    categorical_features = ['Pclass', 'Sex', 'Embarked']
    drop_features = ['PassengerId', 'Name', 'Ticket', 'Cabin', 'Survived']
    logger.info("Training Started...")
    read_process_data(
        file_name ='titanic',
        target = 'Survived',
        num_cols = numeric_features,
        cat_cols= categorical_features,
        drop_cols = drop_features,
        logger = logger
    )
    X_train, y_train, X_test, y_test = encode_target(file_name ='titanic', 
                  target_col = 'Survived',
                  model_name = 'basemodel',
                  logger = logger)
    train_model(X = X_train,
            y = y_train,
            model_name = 'basemodel',
            logger = logger)
    evaluate(X_test, y_test, "basemodel", logger)
    logger.info("Training Completed...")
    


In [2]:
train_pipeline(logger = ExecutorLogger('training logger'))

[32m2025-05-01 14:01:58[0m | [1mINFO    [0m | [1mTraining Started...[0m
[32m2025-05-01 14:01:58[0m | [1mINFO    [0m | [1mStarting data processing[0m
[32m2025-05-01 14:01:59[0m | [1mINFO    [0m | [1mRaw data loaded: (891, 12)[0m
[32m2025-05-01 14:01:59[0m | [1mINFO    [0m | [1mTrain/Test split: (712, 12)/(179, 12)[0m


[32m2025-05-01 14:02:00[0m | [1mINFO    [0m | [1mProcessing complete. Final shapes: Train (712, 11), Test (179, 11)[0m
[32m2025-05-01 14:02:01[0m | [1mINFO    [0m | [1mFitting the encoder/decoder of target variable[0m
[32m2025-05-01 14:02:01[0m | [1mINFO    [0m | [1mNumber of classes: 2[0m
[32m2025-05-01 14:02:01[0m | [1mINFO    [0m | [1mencoder/decoder of target created successfully[0m
[32m2025-05-01 14:02:01[0m | [1mINFO    [0m | [1mencoder/decoder of target saved[0m
[32m2025-05-01 14:02:01[0m | [1mINFO    [0m | [1mLoading target encoder/decoder[0m
[32m2025-05-01 14:02:01[0m | [1mINFO    [0m | [1mStarting hyperparameter optimization[0m
[32m2025-05-01 14:02:02[0m | [1mINFO    [0m | [1mBest parameters: {'C': 0.04803799418099026, 'penalty': 'l2', 'solver': 'saga'}[0m
[32m2025-05-01 14:02:02[0m | [1mINFO    [0m | [1mModel trained and saved successfully[0m
[32m2025-05-01 14:02:02[0m | [1mINFO    [0m | [1mStarting model evaluation