In [1]:
import os 
# When the kernel starts inside a directory whose path ends with "notebooks",
# move up one level so the relative paths used in the cells below
# (e.g. artifacts/...) resolve — presumably the parent is the project root.
path = os.getcwd()
if path.endswith('notebooks'):
    os.chdir('../')

In [2]:
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler, MinMaxScaler
from sklearn.impute import SimpleImputer
from src.Home_Premium_Prediction.utils import create_directories, read_yaml
from src.Home_Premium_Prediction.constants import CONFIG_FILE_PATH


class DataTransfromationConfig:
    """Plain holder for the filesystem locations used by the transformation stage.

    Each constructor argument is stored, unchanged, on an attribute of the
    same name; no validation or conversion is performed here.
    """

    def __init__(
        self,
        data_transformation_dir: Path,
        train_data_path: Path,
        test_data_path: Path,
        processed_train_data_path: Path,
        processed_test_data_path: Path,
        preprocessor_path: Path,
    ):
        # Root directory of this stage's artifacts.
        self.data_transformation_dir = data_transformation_dir

        # Raw CSV inputs.
        self.train_data_path = train_data_path
        self.test_data_path = test_data_path

        # Output directories for the processed feature/target CSVs.
        self.processed_train_data_path = processed_train_data_path
        self.processed_test_data_path = processed_test_data_path

        # Destination of the serialized preprocessor.
        self.preprocessor_path = preprocessor_path


class DataTransformationConfigManager:
    """Builds a ``DataTransfromationConfig`` from the parsed YAML configuration."""

    def __init__(self, config_file=CONFIG_FILE_PATH):
        """Parse the configuration file.

        Args:
            config_file: Location of the YAML file handed to ``read_yaml``.
        """
        # Despite its name, this attribute holds the parsed content returned
        # by read_yaml, not the path of the file.
        self.config_file = read_yaml(config_file)

    def get_data_transformation_config(self) -> DataTransfromationConfig:
        """Create the stage directory and return the stage configuration.

        Returns:
            A ``DataTransfromationConfig`` whose fields are all ``Path``
            objects built from the ``data_transformation`` section.
        """
        section = self.config_file['data_transformation']

        # Ensure directory creation step for the transformation directory
        create_directories([section['data_transformation_dir']])

        return DataTransfromationConfig(
            data_transformation_dir=Path(section['data_transformation_dir']),
            train_data_path=Path(section['train_data_path']),
            test_data_path=Path(section['test_data_path']),
            processed_train_data_path=Path(section['processed_train_data_path']),
            processed_test_data_path=Path(section['processed_test_data_path']),
            # Wrapped in Path like every other field, matching the type the
            # config class declares for it.
            preprocessor_path=Path(section['preprocessor_path']),
        )


class DataTransformation:
    """Preprocess the train and test CSVs and persist the results.

    The training file is used to fit both the scikit-learn preprocessor and
    the postcode (``pcd``) frequency map; the test file is then transformed
    with those fitted objects so both splits share one encoding.

    Attributes:
        config: Paths used by the stage.
        preprocessor: The (initially unfitted) ``ColumnTransformer``.
        pcd_frequency_: Series mapping each postcode seen with ``fit=True``
            to its relative frequency, or ``None`` before fitting.
    """

    def __init__(self, config: "DataTransfromationConfig"):
        self.config = config
        self.pcd_frequency_ = None
        self.preprocessor = self.create_preprocessor()

    def create_preprocessor(self):
        """Define the column groups and return the unfitted ColumnTransformer.

        Columns not listed in any group are passed through unchanged
        (``remainder='passthrough'``).
        """
        # Define column types
        self.nominal_cols = ['property_type', 'broker_name', 'ownership_status']
        self.ordinal_cols = ['coverage_level', 'energy_efficiency_rating']
        self.high_cardinality_col = ['pcd']
        self.uniform_cols = ['year_built', 'building_value', 'contents_value',
                             'flood_risk_score', 'fire_risk_score', 'crime_rate_score',
                             'distance_to_fire_station']
        self.normal_cols = ['long', 'lat']

        # Ordinal mappings, one list per entry of ordinal_cols (same order).
        # NOTE(review): the coverage_level order (Gold, Silver, Platinum, Bronze)
        # does not look like a monotonic tier ranking — confirm it is intended.
        self.ordinal_mapping = [['Gold', 'Silver', 'Platinum', 'Bronze'], ['A', 'B', 'C', 'D', 'E']]

        # Define pipelines for preprocessing
        nominal_pipeline = Pipeline([
            ('imputer', SimpleImputer(strategy='most_frequent')),
            ('onehot', OneHotEncoder(sparse_output=False, handle_unknown='ignore'))
        ])

        ordinal_pipeline = Pipeline([
            ('imputer', SimpleImputer(strategy='most_frequent')),
            ('ordinal', OrdinalEncoder(categories=self.ordinal_mapping))
        ])

        uniform_pipeline = Pipeline([
            ('imputer', SimpleImputer(strategy='median')),
            ('minmax', MinMaxScaler())
        ])

        normal_pipeline = Pipeline([
            ('imputer', SimpleImputer(strategy='median')),
            ('std', StandardScaler())
        ])

        # Combined preprocessor
        return ColumnTransformer(transformers=[
            ('nominal', nominal_pipeline, self.nominal_cols),
            ('ordinal', ordinal_pipeline, self.ordinal_cols),
            ('uniform', uniform_pipeline, self.uniform_cols),
            ('normal', normal_pipeline, self.normal_cols)
        ], remainder='passthrough')

    def _encode_pcd_frequency(self, pcd, fit):
        """Replace postcodes with their relative frequency in the fitting data.

        Args:
            pcd: Series of raw postcode values.
            fit: When true, (re)learn the frequency map from ``pcd`` first.

        Returns:
            A float Series aligned with ``pcd``. Postcodes absent from the
            learned map (unseen or missing values) are encoded as 0.0.

        Raises:
            NotFittedError: If called with ``fit=False`` before any fit.
        """
        if fit:
            self.pcd_frequency_ = pcd.value_counts() / len(pcd)
        elif getattr(self, 'pcd_frequency_', None) is None:
            from sklearn.exceptions import NotFittedError
            raise NotFittedError(
                "The pcd frequency map has not been learned yet; "
                "call process_data(..., fit=True) on the training data first."
            )
        return pcd.map(self.pcd_frequency_).fillna(0.0)

    def process_data(self, data_path: Path, fit=False):
        """Load one CSV and return its processed features and target.

        The file must contain the ``uuid``, ``quote_id``, ``Premium`` and
        ``pcd`` columns that this method accesses by name.

        Args:
            data_path: CSV file to read.
            fit: Fit the preprocessor and the postcode frequency map on this
                file (training data) instead of only applying them.

        Returns:
            ``(X_df, y_df)``: the processed feature frame and a one-column
            ``Premium`` frame.
        """
        # Load data
        df = pd.read_csv(data_path)

        # Drop unnecessary columns
        df = df.drop(columns=['uuid', 'quote_id'])

        # Separate target
        target = df['Premium']
        df = df.drop(columns=['Premium'])

        # Frequency encode high cardinality column using the map learned on
        # the fitting data, so train and test share the same encoding.
        df['pcd'] = self._encode_pcd_frequency(df['pcd'], fit)

        # Fit and transform if it's the training data, otherwise just transform
        if fit:
            processed_features = self.preprocessor.fit_transform(df)
        else:
            processed_features = self.preprocessor.transform(df)

        # Rebuild the output column names in the order the ColumnTransformer
        # emits them: declared transformers first, passthrough columns last.
        onehot = self.preprocessor.named_transformers_['nominal']['onehot']
        nominal_encoded = onehot.get_feature_names_out(self.nominal_cols)
        ordinal_encoded = self.ordinal_cols
        uniform_encoded = self.uniform_cols
        normal_encoded = self.normal_cols
        transformed_cols = set(
            self.nominal_cols + self.ordinal_cols + self.uniform_cols + self.normal_cols
        )
        passthrough_cols = [col for col in df.columns if col not in transformed_cols]

        # Debugging step: print the lengths of columns
        print("Nominal Columns:", len(nominal_encoded))
        print("Ordinal Columns:", len(ordinal_encoded))
        print("Uniform Columns:", len(uniform_encoded))
        print("Normal Columns:", len(normal_encoded))
        print("Passthrough Columns:", len(passthrough_cols))

        final_columns = (
            list(nominal_encoded) + ordinal_encoded + uniform_encoded
            + normal_encoded + passthrough_cols
        )

        # Create DataFrame
        X_df = pd.DataFrame(processed_features, columns=final_columns)
        y_df = target.to_frame(name='Premium')

        return X_df, y_df

    def run(self):
        """Process both splits, write the four CSVs and dump the preprocessor."""
        import joblib

        # Process train data with fit_transform (fit the preprocessor)
        X_train, y_train = self.process_data(self.config.train_data_path, fit=True)

        # Process test data with transform (use fitted preprocessor)
        X_test, y_test = self.process_data(self.config.test_data_path, fit=False)

        # Save processed train and test data
        create_directories([self.config.processed_train_data_path, self.config.processed_test_data_path])

        X_train.to_csv(self.config.processed_train_data_path / 'train_features.csv', index=False)
        y_train.to_csv(self.config.processed_train_data_path / 'train_target.csv', index=False)

        X_test.to_csv(self.config.processed_test_data_path / 'test_features.csv', index=False)
        y_test.to_csv(self.config.processed_test_data_path / 'test_target.csv', index=False)

        # Save the preprocessor; make sure its target directory exists first.
        # NOTE: only the ColumnTransformer is dumped — the postcode frequency
        # map (pcd_frequency_) is not part of this file.
        Path(self.config.preprocessor_path).parent.mkdir(parents=True, exist_ok=True)
        joblib.dump(self.preprocessor, self.config.preprocessor_path)
        print(f"✅ Preprocessor saved at: {self.config.preprocessor_path}")

        print("✅ Preprocessing complete. Files saved at:", self.config.processed_train_data_path, "and", self.config.processed_test_data_path)



# ✅ Main runner
if __name__ == "__main__":
    # Build the stage configuration, then fit/apply the preprocessing and
    # write the artifacts.
    try:
        config = DataTransformationConfigManager().get_data_transformation_config()
        transformer = DataTransformation(config)
        transformer.run()
    except Exception as e:
        print(f"❌ Error in preprocessing: {e}")
        # Re-raise so the traceback is shown and the run is reported as
        # failed instead of looking like a successful execution.
        raise


created directory at: artifacts/data_transformation
Nominal Columns: 10
Ordinal Columns: 2
Uniform Columns: 7
Normal Columns: 2
Passthrough Columns: 11
Nominal Columns: 10
Ordinal Columns: 2
Uniform Columns: 7
Normal Columns: 2
Passthrough Columns: 11
created directory at: artifacts\data_transformation\train_data
created directory at: artifacts\data_transformation\test_data
✅ Preprocessor saved at: artifacts/data_transformation/preprocessor.joblib
✅ Preprocessing complete. Files saved at: artifacts\data_transformation\train_data and artifacts\data_transformation\test_data


In [3]:
# NOTE: `df` is only assigned by a later cell (the one that reads the raw
# training CSV); running this cell before it raises NameError, as the
# captured output below shows.
df.head()

Unexpected exception formatting exception. Falling back to standard exception


Traceback (most recent call last):
  File "c:\Users\Bilal Ahmad\Desktop\Home-Insurance-Price-Predictive-Model-1\.venv\Lib\site-packages\IPython\core\interactiveshell.py", line 3670, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
    ~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\Bilal Ahmad\AppData\Local\Temp\ipykernel_15828\964094849.py", line 1, in <module>
    df.head()
    ^^
NameError: name 'df' is not defined

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "c:\Users\Bilal Ahmad\Desktop\Home-Insurance-Price-Predictive-Model-1\.venv\Lib\site-packages\IPython\core\interactiveshell.py", line 2176, in showtraceback
    stb = self.InteractiveTB.structured_traceback(
        etype, value, tb, tb_offset=tb_offset
    )
  File "c:\Users\Bilal Ahmad\Desktop\Home-Insurance-Price-Predictive-Model-1\.venv\Lib\site-packages\IPython\core\ultratb.py", line 1182, in structured_traceback
    retu

In [None]:
# Load the raw training data and discard the two identifier columns.
df = pd.read_csv('artifacts/data_ingestion/raw_data/home_insurance_train.csv')
df = df.drop(columns=['uuid', 'quote_id'])

# Column groups, one per preprocessing strategy.
nominal_cols = ['property_type', 'broker_name', 'ownership_status']
ordinal_cols = ['coverage_level', 'energy_efficiency_rating']
high_cardinality_col = ['pcd']
uniform_cols = [
    'year_built',
    'building_value',
    'contents_value',
    'flood_risk_score',
    'fire_risk_score',
    'crime_rate_score',
    'distance_to_fire_station',
]
normal_cols = ['long', 'lat']

# Category order handed to the OrdinalEncoder, aligned with ordinal_cols.
ordinal_mapping = [
    ['Gold', 'Silver', 'Platinum', 'Bronze'],  # coverage_level
    ['A', 'B', 'C', 'D', 'E'],                 # energy_efficiency_rating
]

# Frequency-encode the postcode: each value becomes its share of all rows.
df['pcd'] = df['pcd'].map(df['pcd'].value_counts().div(len(df)))

# One small pipeline per column group: impute first, then encode or scale.
nominal_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(sparse_output=False, handle_unknown='ignore')),
])
ordinal_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ordinal', OrdinalEncoder(categories=ordinal_mapping)),
])
uniform_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('minmax', MinMaxScaler()),
])
normal_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('std', StandardScaler()),
])

# Columns outside the four groups (including the still-present Premium
# column and the encoded pcd) are passed through untouched.
preprocessor = ColumnTransformer(
    transformers=[
        ('nominal', nominal_pipeline, nominal_cols),
        ('ordinal', ordinal_pipeline, ordinal_cols),
        ('uniform', uniform_pipeline, uniform_cols),
        ('normal', normal_pipeline, normal_cols),
    ],
    remainder='passthrough',
)

# Fit on the full frame and obtain the transformed array.
processed_features = preprocessor.fit_transform(df)

In [None]:
# Show the array returned by preprocessor.fit_transform in the previous cell.
print(processed_features)

[[0.00000e+00 0.00000e+00 1.00000e+00 ... 0.00000e+00 1.00000e-06
  1.74049e+03]
 [0.00000e+00 0.00000e+00 0.00000e+00 ... 0.00000e+00 2.00000e-06
  6.77710e+02]
 [0.00000e+00 1.00000e+00 0.00000e+00 ... 0.00000e+00 2.00000e-06
  1.44161e+03]
 ...
 [1.00000e+00 0.00000e+00 0.00000e+00 ... 0.00000e+00 1.00000e-06
  1.76800e+03]
 [0.00000e+00 0.00000e+00 1.00000e+00 ... 1.00000e+00 1.00000e-06
  1.69568e+03]
 [1.00000e+00 0.00000e+00 0.00000e+00 ... 1.00000e+00 1.00000e-06
  1.19807e+03]]


In [None]:
# Names produced by each transformer: the one-hot step expands the nominal
# columns, while the other three groups keep their input column names.
nominal_feature_names = (
    preprocessor.named_transformers_['nominal']
    .named_steps['onehot']
    .get_feature_names_out(nominal_cols)
)
ordinal_feature_names = ordinal_cols
uniform_feature_names = uniform_cols
normal_feature_names = normal_cols

# Everything not claimed by a transformer was forwarded by remainder='passthrough'.
passthrough_cols = [
    col
    for col in df.columns
    if col not in (*nominal_cols, *ordinal_cols, *uniform_cols, *normal_cols)
]

# Same order as the ColumnTransformer output: declared groups, then passthrough.
final_columns = [
    *nominal_feature_names,
    *ordinal_feature_names,
    *uniform_feature_names,
    *normal_feature_names,
    *passthrough_cols,
]

# Wrap the transformed array in a labelled DataFrame.
processed_df = pd.DataFrame(processed_features, columns=final_columns)

In [None]:
# Display the labelled frame built in the previous cell.
processed_df

Unnamed: 0,property_type_Detached,property_type_Flat,property_type_Semi-Detached,property_type_Terraced,broker_name_BrokerA,broker_name_BrokerB,broker_name_BrokerC,ownership_status_Owner-occupied,ownership_status_Rented,ownership_status_Vacant,coverage_level,energy_efficiency_rating,year_built,building_value,contents_value,flood_risk_score,fire_risk_score,crime_rate_score,distance_to_fire_station,long,lat,sale_flag,number_of_bedrooms,number_of_bathrooms,number_of_occupants,has_smoke_alarms,has_burglar_alarm,pets_present,policy_term,previous_claims_count,has_security_cameras,pcd,Premium
0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,3.0,1.0,0.048780,0.593139,0.741966,0.49,0.36,0.06,0.736546,0.955423,-0.327974,0.0,6.0,3.0,4.0,1.0,0.0,0.0,12.0,3.0,0.0,0.000001,1740.49
1,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,3.0,0.0,0.373984,0.055600,0.016903,0.08,0.33,0.20,0.173494,-2.936565,0.217246,1.0,4.0,2.0,3.0,1.0,0.0,1.0,36.0,2.0,0.0,0.000002,677.71
2,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,3.0,0.0,0.211382,0.388759,0.242772,0.32,0.49,0.27,0.840964,0.195749,-0.452149,0.0,5.0,2.0,3.0,1.0,0.0,1.0,36.0,5.0,0.0,0.000002,1441.61
3,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,4.0,0.430894,0.625089,0.691269,0.69,0.31,0.37,0.726104,2.116493,-0.109637,0.0,1.0,3.0,1.0,1.0,0.0,0.0,24.0,5.0,0.0,0.000001,1957.38
4,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,3.0,1.0,0.081301,0.566262,0.431028,0.87,0.74,0.32,0.148996,-0.330532,-0.025734,0.0,4.0,1.0,6.0,0.0,0.0,0.0,24.0,1.0,0.0,0.000001,1543.64
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
999995,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,3.0,0.0,0.918699,0.382361,0.082034,0.99,0.72,0.55,0.551004,0.253003,-0.071472,0.0,4.0,1.0,2.0,1.0,0.0,1.0,36.0,5.0,0.0,0.000001,1578.68
999996,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.796748,0.127495,0.589841,0.29,0.18,0.97,0.285542,1.056331,-0.381962,0.0,2.0,2.0,4.0,1.0,0.0,1.0,24.0,1.0,0.0,0.000002,972.78
999997,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,3.0,3.0,0.138211,0.554779,0.699103,0.75,0.72,0.81,0.868273,0.717826,-0.235771,0.0,4.0,1.0,4.0,0.0,1.0,0.0,12.0,3.0,0.0,0.000001,1768.00
999998,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,3.0,3.0,0.146341,0.578201,0.884614,0.18,0.66,0.40,0.448594,0.451117,-0.135771,0.0,1.0,1.0,3.0,1.0,1.0,0.0,24.0,4.0,1.0,0.000001,1695.68
