In [9]:
import os
%pwd

'/home/amit/python/Industrial_AI_project/IITM_MLops_titanic_dataset_github_clone/IITM-MLProject-kaggle-Titanic-dataset'

In [2]:
os.chdir('../')
%pwd

'/home/amit/python/Industrial_AI_project/IITM_MLops_titanic_dataset_github_clone/IITM-MLProject-kaggle-Titanic-dataset'

In [10]:
from dataclasses import dataclass
from pathlib import Path


@dataclass(frozen=True)
class ModelTrainingConfig:
    """
    Immutable bundle of settings consumed by the model-training stage.

    The path fields are filled from the ``model_training`` section of the
    config YAML and the ``params_*`` fields from the params YAML (see
    ``ConfigurationManager.model_training_config``).

    NOTE(review): the values are passed through from the YAML objects without
    an explicit ``Path(...)`` conversion, so at runtime the path fields may be
    plain strings rather than ``Path`` instances - confirm against ``read_yaml``.
    """
    # Directory created for this stage; the trained model is saved under "<root_dir>/best_model".
    root_dir: Path
    # CSV file the training data is read from.
    input_data_file: Path
    # Location the held-out test split is written to as CSV.
    test_data_file: Path
    # Weights passed to DataFrame.randomSplit to produce the train/test split.
    params_splitratio: list
    # Seed passed to randomSplit.
    params_seed: int
    # Candidate values for LogisticRegression.regParam in the parameter grid.
    params_regParam: list
    # Candidate values for LogisticRegression.elasticNetParam in the parameter grid.
    params_elasticNetParam: list
    # Number of folds used by the CrossValidator.
    params_number_of_folds: int
    # Application name given to the SparkSession.
    params_sparkSessionTitle: str

In [11]:
# The config/params locations are defined inline in this notebook instead of
# being imported from Titanic_dataset_analysis.constants.
# Both paths are relative, so they are resolved against the current working directory.
CONFIG_FILE_PATH = Path("config/config.yaml")
PARAMS_FILE_PATH = Path("params.yaml")
from Titanic_dataset_analysis.utils.common import read_yaml, create_directories

In [12]:
class ConfigurationManager:
    """
    Loads the config and params YAML files and builds per-stage config objects.

    Instantiating the manager reads both files and creates the artifacts root
    directory named in the config file.
    """

    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH):
        """
        Args:
            config_filepath: path of the config YAML (defaults to CONFIG_FILE_PATH).
            params_filepath: path of the params YAML (defaults to PARAMS_FILE_PATH).
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        # Make sure the top-level artifacts directory exists before any stage writes into it.
        create_directories([self.config.artifacts_root])

    def model_training_config(self) -> ModelTrainingConfig:
        """
        Build the configuration for the model-training stage.

        Creates the stage's ``root_dir`` as a side effect.

        Returns:
            ModelTrainingConfig combining the ``model_training`` section of the
            config file with the training hyper-parameters from the params file.
        """
        config = self.config.model_training
        params = self.params

        create_directories([config.root_dir])

        return ModelTrainingConfig(
            root_dir=config.root_dir,
            input_data_file=config.input_data_file,
            test_data_file=config.test_data_file,
            params_splitratio=params.splitratio,
            params_seed=params.seed,
            params_regParam=params.regParam,
            params_elasticNetParam=params.elasticNetParam,
            params_number_of_folds=params.number_of_folds,
            params_sparkSessionTitle=params.sparkSessionTitle,
        )

In [13]:
import os
import pandas as pd
import findspark
from pathlib import Path
findspark.init()
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from Titanic_dataset_analysis import logger
from Titanic_dataset_analysis.utils.common import get_size

In [14]:
class ModelTraining:
    """
    Model-training stage: fits a cross-validated Spark ML logistic-regression
    pipeline on the input CSV and saves the best pipeline model to disk.
    """

    def __init__(self, config: ModelTrainingConfig):
        """
        Args:
            config: settings for this stage (paths and hyper-parameters).
        """
        self.config = config

    def read_file(self):
        """
        Check that the input CSV exists and load it into ``self.df`` with pandas.

        Raises:
            FileNotFoundError: if ``config.input_data_file`` does not exist.
        """
        input_file = self.config.input_data_file
        if not os.path.exists(input_file):
            # Fail explicitly here instead of relying on pandas to raise further down.
            logger.error(f"File download failed in previous step! Please check the location mentioned : {input_file}")
            raise FileNotFoundError(f"Input data file not found: {input_file}")

        logger.info(f"File already exists of size: {get_size(Path(input_file))}")

        self.df = pd.read_csv(input_file)
        logger.info(f"Processed Input file read from {input_file}")

    def model_training_and_save_file(self):
        """
        Train, tune and persist the logistic-regression pipeline with Spark.

        Steps:
          1. Read ``config.input_data_file`` as a Spark DataFrame.
          2. Drop the Cabin, Name and Ticket columns and split into train/test
             using ``params_splitratio`` and ``params_seed``.
          3. Write the test split as CSV to ``config.test_data_file`` (overwriting).
          4. Cross-validate the pipeline over the regParam / elasticNetParam grid,
             scoring by area under the ROC curve.
          5. Save the best pipeline model to ``<root_dir>/best_model`` (overwriting).

        The Spark session is stopped when the method exits, whether it
        succeeds or raises.

        Function returns None
        """
        spark = SparkSession.builder.appName(self.config.params_sparkSessionTitle).getOrCreate()
        try:
            spark.sparkContext.setLogLevel("WARN")

            df = spark.read.csv(str(self.config.input_data_file), header=True, inferSchema=True)
            # Drop Cabin + Name + Ticket (not useful for ML in our setup)
            columns_to_drop = ["Cabin", "Name", "Ticket"]

            # Categorical feature processing: index each column, then one-hot encode the index.
            categorical_cols = ["Sex", "Embarked"]
            indexed_cols = [c + "_indexed" for c in categorical_cols]
            encoded_cols = [c + "_encoded" for c in categorical_cols]

            indexers = [
                StringIndexer(inputCol=c, outputCol=ic, handleInvalid="keep")
                for c, ic in zip(categorical_cols, indexed_cols)
            ]
            encoders = [
                OneHotEncoder(inputCol=ic, outputCol=ec)
                for ic, ec in zip(indexed_cols, encoded_cols)
            ]

            # Vector Assembler: these columns must all be present in the input file.
            feature_columns = [
                "Pclass", "Age", "SibSp", "Parch", "Fare",
                "FamilySize", "IsAlone",
                "Sex_encoded", "Embarked_encoded"
            ]
            assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")

            # Logistic Regression model
            lr = LogisticRegression(featuresCol="features", labelCol="Survived")

            # Pipeline
            pipeline = Pipeline(stages=indexers + encoders + [assembler, lr])

            # Train/Test split
            train_data, test_data = df.drop(*columns_to_drop).randomSplit(
                self.config.params_splitratio, seed=self.config.params_seed
            )
            # str(): keep the path argument a plain string, matching the read above.
            test_data.write.mode("overwrite").option("header", True).csv(str(self.config.test_data_file))
            logger.info(f"Test Data is saved at: {self.config.test_data_file}")

            # Param Grid & CrossValidator
            paramGrid = ParamGridBuilder() \
                .addGrid(lr.regParam, self.config.params_regParam) \
                .addGrid(lr.elasticNetParam, self.config.params_elasticNetParam) \
                .build()

            # Evaluator
            evaluator = BinaryClassificationEvaluator(
                labelCol="Survived", rawPredictionCol="rawPrediction", metricName="areaUnderROC"
            )

            crossval = CrossValidator(estimator=pipeline,
                                      estimatorParamMaps=paramGrid,
                                      evaluator=evaluator,
                                      numFolds=self.config.params_number_of_folds)
            cv_model = crossval.fit(train_data)
            # The last pipeline stage is the fitted logistic-regression model.
            best_lr = cv_model.bestModel.stages[-1]
            logger.info(f"Best model: {best_lr}")

            os.makedirs(self.config.root_dir, exist_ok=True)
            model_path = f"{self.config.root_dir}/best_model"
            # overwrite() covers both the "already exists" and "first run" cases,
            # so no separate existence check is needed.
            cv_model.bestModel.write().overwrite().save(model_path)
            logger.info(f"Model training done successfully. Model saved at {model_path}")
        finally:
            # Always release the Spark session, even when training fails part-way.
            spark.stop()

In [16]:
# Run the model-training stage end to end: load configuration, verify/read the
# input file, then train and save the model.
# No try/except wrapper: catching an exception only to re-raise it unchanged adds nothing.
config = ConfigurationManager()
model_training_config = config.model_training_config()
model_training = ModelTraining(config=model_training_config)
model_training.read_file()
model_training.model_training_and_save_file()

/home/amit/python/Industrial_AI_project/IITM_MLops_titanic_dataset_github_clone/IITM-MLProject-kaggle-Titanic-dataset
[2025-08-28 12:10:32,234: INFO: common: yaml file: config/config.yaml loaded successfully]
[2025-08-28 12:10:32,241: INFO: common: yaml file: params.yaml loaded successfully]
[2025-08-28 12:10:32,246: INFO: common: created directory at: artifacts]
[2025-08-28 12:10:32,251: INFO: common: created directory at: artifacts/model_training]
[2025-08-28 12:10:32,253: INFO: 3205805082: File already exists of size: ~ 67 KB]
[2025-08-28 12:10:32,263: INFO: 3205805082: Processed Input file read from artifacts/data_preprocessing/titanic_preprocessed.csv]


25/08/28 12:10:32 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


[2025-08-28 12:10:32,926: INFO: 3205805082: Test Data is saved at: artifacts/model_training/test_data]
[2025-08-28 12:11:59,947: INFO: clientserver: Closing down clientserver connection]
[2025-08-28 12:11:59,948: INFO: clientserver: Closing down clientserver connection]
[2025-08-28 12:11:59,958: INFO: 3205805082: Best model: LogisticRegressionModel: uid=LogisticRegression_8f633fed113f, numClasses=2, numFeatures=12]
[2025-08-28 12:12:01,867: INFO: 3205805082: Model training done successfully. Model saved at artifacts/model_training/best_model]
