In [36]:
import os 
path= os.getcwd()

if path.endswith('notebooks'):
    os.chdir('../')

In [None]:
# import dependencies
import numpy as np 
import pandas as pd 
import seaborn as sns 
from pathlib import Path
import matplotlib.pyplot as plt
from src.Home_Premium_Prediction.utils import create_directories, read_yaml
from src.Home_Premium_Prediction.constants import CONFIG_FILE_PATH
# %matplotlib inline  # <- remove this if running as a script

class DataTransfromationConfig:
    def __init__(self, data_transformation_dir: Path, train_data_path: Path, processed_data_path: Path):
        self.data_transformation_dir = data_transformation_dir
        self.train_data_path = train_data_path
        self.processed_data_path = processed_data_path

class DataTransformationConfigManager:
    def __init__(self, config_file=CONFIG_FILE_PATH):
        self.config_file = read_yaml(config_file)
    
    def get_data_transformation_config(self) -> DataTransfromationConfig:
        create_directories([self.config_file['data_transformation']['data_transformation_dir']])

        return DataTransfromationConfig(
            data_transformation_dir=self.config_file['data_transformation']['data_transformation_dir'],
            train_data_path=self.config_file['data_transformation']['train_data_path'],
            processed_data_path=self.config_file['data_transformation']['processed_data_path']
        )

class DataTransformation:
    def __init__(self, config: DataTransfromationConfig):
        self.config = config
    
    def read_data(self):
        self.df = pd.read_csv(self.config.train_data_path)

    def drop_unwanted_cols(self):
        self.df = self.df.drop(columns=['uuid', 'quote_id'], axis=1)

    def splitting_data_into_features_and_target(self):
        self.features = self.df.drop(columns=['Premium'], axis=1)
        self.target = self.df['Premium']

    def prepare_data_for_catboost(self):
        self.categorical_columns = self.features.select_dtypes(include=['category', 'object']).columns.to_list()
        return self.features, self.target, self.categorical_columns

# ✅ Main runner block
if __name__ == "__main__":
    config = DataTransformationConfigManager().get_data_transformation_config()
    transformer = DataTransformation(config)
    
    transformer.read_data()
    transformer.drop_unwanted_cols()
    transformer.splitting_data_into_features_and_target()
    
    X, y, cat_cols = transformer.prepare_data_for_catboost()
    
    print("✅ Features and target prepared for CatBoost")
    print("Categorical columns:", cat_cols)


created directory at: artifacts/data_transformation
Unexpected exception formatting exception. Falling back to standard exception


Traceback (most recent call last):
  File "c:\Users\Bilal Ahmad\Desktop\Home-Insurance-Price-Predictive-Model\.venv\Lib\site-packages\IPython\core\interactiveshell.py", line 3670, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
    ~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\Bilal Ahmad\AppData\Local\Temp\ipykernel_8712\1285367500.py", line 50, in <module>
    config = DataTransformationConfigManager().get_data_transformation_config()
  File "C:\Users\Bilal Ahmad\AppData\Local\Temp\ipykernel_8712\1285367500.py", line 24, in get_data_transformation_config
    return DataTransfromationConfig(
        data_transformation_dir=self.config_file['data_transformation']['data_transformation_dir'],
        train_data_path=self.config_file['data_transformation']['train_data_path'],
        processed_data_path=self.config_file['data_transformation']['processed_data_path']
    )
TypeError: DataTransfromationConfig.__init__() got an unexpected keyword argumen

In [None]:
df= pd.read_csv('artifacts/data_ingestion/raw_data/home_insurance_train.csv')

In [None]:
df= df.drop(columns= ['uuid', 'quote_id'], axis= 1)

In [None]:
df.columns

Index(['sale_flag', 'property_type', 'year_built', 'number_of_bedrooms',
       'number_of_bathrooms', 'building_value', 'contents_value',
       'coverage_level', 'number_of_occupants', 'has_smoke_alarms',
       'has_burglar_alarm', 'pets_present', 'flood_risk_score',
       'fire_risk_score', 'crime_rate_score', 'broker_name', 'policy_term',
       'previous_claims_count', 'distance_to_fire_station',
       'has_security_cameras', 'ownership_status', 'energy_efficiency_rating',
       'pcd', 'lat', 'long', 'Premium'],
      dtype='object')

In [None]:
df.isnull().sum()

sale_flag                   0
property_type               0
year_built                  0
number_of_bedrooms          0
number_of_bathrooms         0
building_value              0
contents_value              0
coverage_level              0
number_of_occupants         0
has_smoke_alarms            0
has_burglar_alarm           0
pets_present                0
flood_risk_score            0
fire_risk_score             0
crime_rate_score            0
broker_name                 0
policy_term                 0
previous_claims_count       0
distance_to_fire_station    0
has_security_cameras        0
ownership_status            0
energy_efficiency_rating    0
pcd                         0
lat                         0
long                        0
Premium                     0
dtype: int64

* No missing data.

In [None]:
cat_cols= [col for col in df.columns if df[col].dtypes == 'object']

In [None]:
df[cat_cols]

Unnamed: 0,property_type,coverage_level,broker_name,ownership_status,energy_efficiency_rating,pcd
0,Semi-Detached,Bronze,BrokerB,Vacant,B,N4 3BH
1,Terraced,Bronze,BrokerC,Vacant,A,BT342PL
2,Flat,Bronze,BrokerA,Owner-occupied,A,SO506JS
3,Detached,Silver,BrokerA,Owner-occupied,E,NR302SA
4,Terraced,Bronze,BrokerA,Vacant,B,ST4 2QE
...,...,...,...,...,...,...
999995,Detached,Bronze,BrokerB,Vacant,A,LE115YU
999996,Semi-Detached,Silver,BrokerA,Owner-occupied,A,TN163LT
999997,Detached,Bronze,BrokerA,Owner-occupied,D,MK451ZE
999998,Semi-Detached,Bronze,BrokerB,Owner-occupied,D,LE167NN


In [None]:
num_cols= [col for col in df.columns if df[col].dtypes != 'object']

In [None]:
df[num_cols]

Unnamed: 0,sale_flag,year_built,number_of_bedrooms,number_of_bathrooms,building_value,contents_value,number_of_occupants,has_smoke_alarms,has_burglar_alarm,pets_present,flood_risk_score,fire_risk_score,crime_rate_score,policy_term,previous_claims_count,distance_to_fire_station,has_security_cameras,lat,long,Premium
0,0,1906,6,3,475369.0,112585.0,4,1,0,0,0.49,0.36,0.06,12,3,18.44,0,51.566409,-0.119991,1740.49
1,1,1946,4,2,112530.0,7451.0,3,1,0,1,0.08,0.33,0.20,36,2,4.42,0,54.154194,-6.313653,677.71
2,0,1926,5,2,337412.0,40202.0,3,1,0,1,0.32,0.49,0.27,36,5,21.04,0,50.977035,-1.328926,1441.61
3,0,1953,1,3,496935.0,105234.0,1,1,0,0,0.69,0.31,0.37,24,5,18.18,0,52.602705,1.727722,1957.38
4,0,1910,4,1,457227.0,67499.0,6,0,0,0,0.87,0.74,0.32,24,1,3.81,0,53.000934,-2.166444,1543.64
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
999995,0,2013,4,1,333094.0,16895.0,2,1,0,1,0.99,0.72,0.55,36,5,13.82,0,52.783850,-1.237813,1578.68
999996,0,1998,2,2,161059.0,90527.0,4,1,0,1,0.29,0.18,0.97,24,1,7.21,0,51.310164,0.040593,972.78
999997,0,1917,4,1,449476.0,106370.0,4,0,1,0,0.75,0.72,0.81,12,3,21.72,0,52.004036,-0.498100,1768.00
999998,0,1918,1,1,465286.0,133269.0,3,1,1,0,0.18,0.66,0.40,24,4,11.27,1,52.478663,-0.922537,1695.68
