In [1]:
import os


In [2]:
%pwd

'c:\\Users\\ambig\\jupiter_notebook\\Projects\\premium-price-prediction\\research'

In [3]:
os.chdir("../")

In [4]:
%pwd

'c:\\Users\\ambig\\jupiter_notebook\\Projects\\premium-price-prediction'

In [47]:

from dataclasses import  dataclass
from typing import List,Dict
from pathlib import Path
@dataclass
class FeatureEngineeringConfig:
    root_dir: Path
    input_path: Path
    output_path: Path
    model_path: Path
    test_filepath : Path
    train_filepath : Path
    target_column : str


In [48]:
from src.Premium_Price_Prediction.constants import *
from src.Premium_Price_Prediction.utils.common import read_yaml , create_directories , load_df , save_object , save_object
from src.Premium_Price_Prediction import logger

In [49]:
class ConfigurationManager:
    def __init__(self, 
                 config_filepath=CONFIG_FILE_PATH, 
                 params_filepath=PARAMS_FILE_PATH, 
                 schema_filepath=SCHEMA_FILE_PATH):
        
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        self.schema = read_yaml(schema_filepath)
        
        create_directories([self.config.artifacts_root])
    
    def get_feature_engineering_config(self) -> FeatureEngineeringConfig:
        config = self.config.feature_engineering
        create_directories([config.root_dir])

        
        feature_engineering_config = FeatureEngineeringConfig(
            root_dir=Path(config["root_dir"]),
            input_path=Path(config["input_path"]),
            output_path=Path(config["output_path"]),
            model_path=Path(config["model_path"]),
            test_filepath=Path(config["test_filepath"]),
            train_filepath=Path(config["train_filepath"]),
            target_column = config["target_column"]
        )
        return feature_engineering_config

In [54]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import joblib
from pathlib import Path
from typing import Tuple
import logging



class FeatureEngineering:
    def __init__(self, config: FeatureEngineeringConfig):
        self.config = config
        self.train_df = None
        self.test_df = None
        self.preprocessor = None

    def load_data(self) -> Tuple[pd.DataFrame, pd.DataFrame]:
        """Load training and testing datasets from specified file paths."""
        self.train_df = pd.read_csv(self.config.train_filepath)
        self.test_df = pd.read_csv(self.config.test_filepath)
        logger.info(f"Data loaded from {self.config.train_filepath} and {self.config.test_filepath}")
        return self.train_df, self.test_df

    def preprocess_medical_history(self, df: pd.DataFrame) -> pd.DataFrame:
        """Process the 'medical_history' column and compute risk scores."""
        risk_scores = {
            "diabetes": 6,
            "heart disease": 8,
            "high blood pressure": 6,
            "thyroid": 5,
            "no disease": 0,
            "none": 0
        }

        # Split and map risk scores
        df[['disease1', 'disease2']] = (
            df['medical_history'].str.split(" & ", expand=True).apply(lambda x: x.str.lower())
        ).fillna('none')

        df['total_risk_score'] = df[['disease1', 'disease2']].apply(
            lambda x: risk_scores.get(x['disease1'], 0) + risk_scores.get(x['disease2'], 0), axis=1
        )

        # Normalize risk scores
        max_score, min_score = df['total_risk_score'].max(), df['total_risk_score'].min()
        df['normalized_risk_score'] = (df['total_risk_score'] - min_score) / (max_score - min_score)
        logger.info("Risk scores calculated and normalized")
        return df

    def build_preprocessor(self):
        """Construct the preprocessing pipeline for numerical and categorical features."""
        # Identify numerical and categorical columns
        numerical_columns = self.train_df.select_dtypes(exclude=['object']).columns.tolist()
        categorical_columns = self.train_df.select_dtypes(include=['object']).columns.tolist()

        # Define preprocessing transformers
        numerical_transformer = Pipeline(steps=[('scaler', StandardScaler())])
        categorical_transformer = Pipeline(steps=[('onehot', OneHotEncoder(handle_unknown='ignore', drop="first"))])

        # Combine transformers into a preprocessor
        self.preprocessor = ColumnTransformer(
            transformers=[
                ('num', numerical_transformer, numerical_columns),
                ('cat', categorical_transformer, categorical_columns)
            ],
            remainder='passthrough'
        )
        logger.info("Preprocessing pipeline built")

    def fit_transform_data(self) -> Tuple[np.ndarray, np.ndarray]:
        """Fit the preprocessor and transform the datasets."""
        X_train_transformed = self.preprocessor.fit_transform(self.train_df)
        X_test_transformed = self.preprocessor.transform(self.test_df)
        logger.info("Transformations applied to the dataset")
        return X_train_transformed, X_test_transformed

    def save_object(self, file_path: Path, obj):
        """Save an object to a specified file path using joblib."""
        file_path.parent.mkdir(parents=True, exist_ok=True)
        try:
            joblib.dump(obj, file_path)
            logger.info(f"Object saved to {file_path}")
        except Exception as e:
            logger.error(f"Failed to save object to {file_path}: {str(e)}")
            raise  # Raise the exception after logging

    def feature_engineering(self) -> Tuple[np.ndarray, np.ndarray]:
        """Execute the feature engineering pipeline: load data, process, transform, and save."""
        try:
            # Load and preprocess medical history data
            self.train_df, self.test_df = self.load_data()

            # Check if 'annual_premium_amount' exists
            if 'annual_premium_amount' not in self.train_df.columns:
                logger.error("Column 'annual_premium_amount' not found in training data.")
                raise KeyError("Column 'annual_premium_amount' not found in training data.")

            # Preprocess medical history
            self.train_df = self.preprocess_medical_history(self.train_df)
            self.test_df = self.preprocess_medical_history(self.test_df)

            # Build the preprocessor
            self.build_preprocessor()

            # Fit and transform data
            train_data, test_data = self.fit_transform_data()

            # Save the preprocessor and transformed data
            self.save_object(self.config.output_path / 'preprocessor.joblib', self.preprocessor)

            # Save transformed data
            pd.DataFrame(train_data).to_csv(self.config.output_path / 'train_data.csv', index=False)
            pd.DataFrame(test_data).to_csv(self.config.output_path / 'test_data.csv', index=False)

            logger.info("Transformed data saved successfully.")
            return train_data, test_data
        except Exception as e:
            logger.error(f"Error during feature engineering: {str(e)}")
            raise  # Maintain the original error handling for external catch


In [55]:
config = ConfigurationManager()
feature_engineering_config = config.get_feature_engineering_config()  # Corrected variable name
feature_engineering = FeatureEngineering(feature_engineering_config)
feature_engineering.feature_engineering()


[2024-11-04 14:06:00,472: INFO: common: 30] YAML file : config\config.yaml loaded successfully
[2024-11-04 14:06:00,475: INFO: common: 30] YAML file : params.yaml loaded successfully
[2024-11-04 14:06:00,478: INFO: common: 30] YAML file : schema.yaml loaded successfully
[2024-11-04 14:06:00,480: INFO: common: 50] Directory artifacts created successfully.
[2024-11-04 14:06:00,482: INFO: common: 50] Directory artifacts/feature_engineering created successfully.
[2024-11-04 14:06:00,557: INFO: 2080276664: 24] Data loaded from artifacts\data_preprocessing\train.csv and artifacts\data_preprocessing\test.csv
[2024-11-04 14:06:01,019: INFO: 2080276664: 50] Risk scores calculated and normalized
[2024-11-04 14:06:01,103: INFO: 2080276664: 50] Risk scores calculated and normalized
[2024-11-04 14:06:01,120: INFO: 2080276664: 71] Preprocessing pipeline built
[2024-11-04 14:06:01,289: INFO: 2080276664: 77] Transformations applied to the dataset
[2024-11-04 14:06:01,289: INFO: 2080276664: 85] Object 

(array([[ 0.22652497, -1.14451085, -0.93676115, ...,  0.        ,
          1.        ,  0.        ],
        [-0.83342392, -0.47448297, -0.83075245, ...,  0.        ,
          1.        ,  0.        ],
        [ 0.09403136,  0.1955449 ,  2.40251319, ...,  0.        ,
          1.        ,  0.        ],
        ...,
        [-0.56843669, -0.47448297,  2.19049578, ...,  0.        ,
          1.        ,  0.        ],
        [-0.17095586, -1.14451085, -0.51272632, ...,  0.        ,
          1.        ,  0.        ],
        [-0.6346835 ,  0.1955449 , -0.14169583, ...,  0.        ,
          1.        ,  0.        ]]),
 array([[-1.03216433, -1.14451085, -0.8837568 , ...,  0.        ,
          1.        ,  0.        ],
        [-0.56843669, -1.14451085, -0.67173938, ...,  0.        ,
          1.        ,  0.        ],
        [ 1.28647385,  0.86557278, -0.40671761, ...,  0.        ,
          1.        ,  0.        ],
        ...,
        [-1.09841114, -0.47448297,  0.97139562, ...,  