In [1]:
import os

# Move the working directory up one level so that relative paths used later in
# the notebook resolve from the parent folder (presumably the project root --
# confirm against where the notebook lives).
# NOTE: this is not idempotent -- every re-run of this cell climbs one more
# directory. Restart the kernel before re-running it.
os.chdir("../")
# Display the resulting working directory as a sanity check.
%pwd

'd:\\DHIRAJ\\Data_Science\\Jupyter_Workspace\\Projects\\SpaceX-Falcon-9-first-stage-Landing-Prediction'

In [2]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class DataPreprocessingConfig:
    """Immutable bundle of filesystem locations used by the preprocessing stage.

    Instances are frozen, so attributes cannot be reassigned after construction.
    """

    # Directory for this stage; ConfigurationManager creates it before building
    # the config object.
    root_dir: Path
    # CSV file that DataPreprocessing.process_data reads as its input.
    data_path: Path
    # Location reserved for an encoder checkpoint.
    # NOTE(review): nothing in the visible notebook reads or writes this path.
    encoder_ckpt: Path
    # CSV file that DataPreprocessing.process_data writes the encoded features to.
    processed_data_path: Path

In [3]:
from SpaceXF9LandingPred.constants import *
from SpaceXF9LandingPred.utils.common import read_yaml, create_directories

In [4]:
class ConfigurationManager:
    """Load the YAML configuration files and build per-stage config objects.

    On construction both the config file and the params file are read with
    ``read_yaml`` and the top-level artifacts directory is created.
    """

    def __init__(self,
                 config_filepath=CONFIG_FILE_PATH,
                 params_filepath=PARAMS_FILE_PATH):
        """Read both YAML files and make sure the artifacts root exists.

        Args:
            config_filepath: Location of the main configuration YAML.
            params_filepath: Location of the parameters YAML.
        """
        self.config_file = read_yaml(config_filepath)
        # Loaded for completeness; not consumed by the preprocessing stage.
        self.params_file = read_yaml(params_filepath)

        create_directories([self.config_file.artifacts_root])

    def get_data_preprocessing_config(self) -> DataPreprocessingConfig:
        """Build the configuration for the data-preprocessing stage.

        Creates the stage's root directory as a side effect.

        Returns:
            DataPreprocessingConfig whose fields are ``pathlib.Path`` objects,
            matching the types declared on the dataclass.
        """
        config = self.config_file.data_preprocessing
        create_directories([config.root_dir])

        # The dataclass annotates every field as Path, so coerce the raw YAML
        # values instead of passing them through untyped. Path() is a no-op
        # for values that are already Path instances.
        return DataPreprocessingConfig(
            root_dir=Path(config.root_dir),
            data_path=Path(config.data_path),
            encoder_ckpt=Path(config.encoder_ckpt),
            processed_data_path=Path(config.processed_data_path),
        )

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder

from SpaceXF9LandingPred.logging import logger

from SpaceXF9LandingPred.entity import DataPreprocessingConfig


class DataPreprocessing:
    """Clean the raw launch data, derive the target label and one-hot encode it."""

    def __init__(self, config:DataPreprocessingConfig):
        """Store the stage configuration (input/output paths)."""
        self.config=config

    def process_data(self):
        """Run the full preprocessing pipeline and write the result to CSV.

        Steps:
            1. Read the CSV at ``config.data_path``.
            2. Fill missing ``PayloadMass`` with the column mean and missing
               ``Orbit`` with the column's most frequent value.
            3. Derive the binary ``Class`` column from ``Outcome``
               (0 for outcomes treated as bad, 1 otherwise).
            4. One-hot encode the categorical columns, cast everything to
               float and write the frame to ``config.processed_data_path``.

        Returns:
            None. The result is persisted to disk.
        """
        data = pd.read_csv(self.config.data_path)

        # Impute missing values.
        data['PayloadMass'] = data['PayloadMass'].fillna(data['PayloadMass'].mean())
        data['Orbit'] = data['Orbit'].fillna(data['Orbit'].mode()[0])

        logger.info("Fixed Null/Missing Values")

        # NOTE(review): the bad outcomes are picked by their *position* in the
        # frequency-sorted value counts, so the selection depends on the exact
        # dataset (and needs at least 8 distinct outcomes). Consider replacing
        # this with an explicit list of outcome labels.
        landing_outcomes = data['Outcome'].value_counts()
        bad_outcomes = set(landing_outcomes.index[[1, 3, 5, 6, 7]])

        # Vectorised label: 0 when the outcome is in the bad set, 1 otherwise
        # (missing outcomes are never in the set, so they map to 1).
        data['Class'] = (~data['Outcome'].isin(bad_outcomes)).astype('int64')

        features = data[['FlightNumber', 'PayloadMass', 'Orbit', 'LaunchSite', 'Flights',
                         'GridFins', 'Reused', 'Legs', 'LandingPad', 'Block', 'ReusedCount',
                         'Serial', 'Class']]

        categorical_cols = ['Orbit', 'LaunchSite', 'LandingPad', 'Serial',
                            'GridFins', 'Reused', 'Legs']
        numerical_cols = ['FlightNumber', 'PayloadMass', 'Flights', 'Block', 'ReusedCount', 'Class']

        # NOTE(review): the fitted encoder is not saved anywhere, although the
        # config carries an `encoder_ckpt` path -- confirm whether it should be.
        encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore', dtype=float)
        encoded_cats = encoder.fit_transform(features[categorical_cols])
        encoded_cat_cols = encoder.get_feature_names_out(categorical_cols)

        # Keep the original index so the concat below aligns row by row.
        encoded_df = pd.DataFrame(encoded_cats, columns=encoded_cat_cols, index=features.index)

        # astype returns a new frame, so the result must be assigned for the
        # float cast to take effect.
        features_encoded = pd.concat([features[numerical_cols], encoded_df], axis=1).astype(float)

        logger.info("Features are Encoded Successfully")

        features_encoded.to_csv(self.config.processed_data_path, index=False)

        logger.info("Processed Data file has been created in artifacts")

In [15]:
# Run the data-preprocessing stage end to end: load configuration, build the
# stage config, then process and persist the data.
# No try/except wrapper: any failure should surface with its original traceback.
config=ConfigurationManager()
data_preprocessing_config=config.get_data_preprocessing_config()
data_preprocessing=DataPreprocessing(config=data_preprocessing_config)
data_preprocessing.process_data()

[2025-08-11 13:55:38,365: INFO: common: yaml file : config\config.yaml loaded successfully]
[2025-08-11 13:55:38,368: INFO: common: yaml file : params.yaml loaded successfully]
[2025-08-11 13:55:38,369: INFO: common: created directory at : artifacts]
[2025-08-11 13:55:38,371: INFO: common: created directory at : artifacts/data_preprocessing]
[2025-08-11 13:55:38,377: INFO: 3604251675: Fixed Null/Missing Values]
[2025-08-11 13:55:38,389: INFO: 3604251675: Features are Encoded Successfully]
[2025-08-11 13:55:38,397: INFO: 3604251675: Processed Data file has been created in artifacts]
