In [19]:
import os 
path= os.getcwd()

if path.endswith('notebooks'):
    os.chdir('../')

In [20]:
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler, MinMaxScaler
from sklearn.impute import SimpleImputer
from src.Home_Premium_Prediction.utils import create_directories, read_yaml
from src.Home_Premium_Prediction.constants import CONFIG_FILE_PATH

class DataTransfromationConfig:
    def __init__(self, data_transformation_dir: Path, train_data_path: Path, processed_data_path: Path):
        self.data_transformation_dir = data_transformation_dir
        self.train_data_path = train_data_path
        self.processed_data_path = processed_data_path

class DataTransformationConfigManager:
    def __init__(self, config_file=CONFIG_FILE_PATH):
        self.config_file = read_yaml(config_file)
        print(f"Config file content: {self.config_file}")  # Debugging line

    def get_data_transformation_config(self) -> DataTransfromationConfig:
        # Print the config paths
        print(f"Data Transformation Dir: {self.config_file['data_transformation']['data_transformation_dir']}")
        create_directories([self.config_file['data_transformation']['data_transformation_dir']])
        return DataTransfromationConfig(
            data_transformation_dir=self.config_file['data_transformation']['data_transformation_dir'],
            train_data_path=self.config_file['data_transformation']['train_data_path'],
            processed_data_path=self.config_file['data_transformation']['processed_data_path']
        )


class DataTransformation:
    def __init__(self, config: DataTransfromationConfig):
        self.config = config

    def run(self):
        df = pd.read_csv(self.config.train_data_path)
        df.drop(columns=['uuid', 'quote_id'], inplace=True)

        target = df['Premium']
        df.drop(columns=['Premium'], inplace=True)

        # Define column types
        nominal_cols = ['property_type', 'broker_name', 'ownership_status']
        ordinal_cols = ['coverage_level', 'energy_efficiency_rating']
        high_cardinality_col = ['pcd']
        uniform_cols = ['year_built', 'building_value', 'contents_value', 'flood_risk_score',
                        'fire_risk_score', 'crime_rate_score', 'distance_to_fire_station']
        normal_cols = ['long', 'lat']

        # Ordinal mappings
        ordinal_mapping = [['Gold', 'Silver', 'Platinum', 'Bronze'],  # coverage_level
                           ['A', 'B', 'C', 'D', 'E']]                  # energy_efficiency_rating

        # High cardinality encoding (frequency)
        df['pcd'] = df['pcd'].map(df['pcd'].value_counts() / len(df))

        # Pipelines
        nominal_pipeline = Pipeline([
            ('imputer', SimpleImputer(strategy='most_frequent')),
            ('onehot', OneHotEncoder(sparse_output=False, handle_unknown='ignore'))
        ])

        ordinal_pipeline = Pipeline([
            ('imputer', SimpleImputer(strategy='most_frequent')),
            ('ordinal', OrdinalEncoder(categories=ordinal_mapping))
        ])

        uniform_pipeline = Pipeline([
            ('imputer', SimpleImputer(strategy='median')),
            ('minmax', MinMaxScaler())
        ])

        normal_pipeline = Pipeline([
            ('imputer', SimpleImputer(strategy='median')),
            ('std', StandardScaler())
        ])

        preprocessor = ColumnTransformer(transformers=[
            ('nominal', nominal_pipeline, nominal_cols),
            ('ordinal', ordinal_pipeline, ordinal_cols),
            ('uniform', uniform_pipeline, uniform_cols),
            ('normal', normal_pipeline, normal_cols)
        ], remainder='passthrough')

        # Fit and transform
        processed_features = preprocessor.fit_transform(df)

        # Column names after transformation
        nominal_encoded = preprocessor.named_transformers_['nominal'].named_steps['onehot'].get_feature_names_out(nominal_cols)
        ordinal_encoded = ordinal_cols  # OrdinalEncoder does not change names
        uniform_encoded = uniform_cols  # MinMaxScaler doesn't change names
        normal_encoded = normal_cols    # StandardScaler doesn't change names
        remainder_encoded = ['pcd']     # manually encoded

        final_columns = list(nominal_encoded) + ordinal_encoded + uniform_encoded + normal_encoded + remainder_encoded

        # Make sure shapes match
        assert processed_features.shape[1] == len(final_columns), f"Shape mismatch: {processed_features.shape[1]} != {len(final_columns)}"

        # Save as DataFrame
        X_df = pd.DataFrame(processed_features, columns=final_columns)
        y_df = pd.DataFrame(target, columns=['Premium'])

        output_dir = Path(self.config.processed_data_path)
        create_directories([output_dir])

        X_df.to_csv(output_dir / 'preprocessed_features.csv', index=False)
        y_df.to_csv(output_dir / 'preprocessed_target.csv', index=False)

        print("✅ Preprocessing complete. Files saved.")


# ✅ Main runner
if __name__ == "__main__":
    try:
        config = DataTransformationConfigManager().get_data_transformation_config()
        transformer = DataTransformation(config)
        transformer.run()
    except Exception as e:
        print(e)


Config file content: {'artifacts_root': 'artifacts', 'data_ingestion': {'data_ingestion_dir': 'artifacts/data_ingestion', 'test_data_url': 'https://drive.google.com/file/d/1Dy1CmtS30nSa2lxgBr-VK_ufMz7AXpV9', 'train_data_url': 'https://drive.google.com/file/d/1dYWh2YlTwtbnPbN6bcOOJgR011W0fWwx', 'train_data_path': 'artifacts/data_ingestion/raw_data', 'test_data_path': 'artifacts/data_ingestion/raw_data'}, 'data_validation': {'data_validation_dir': 'artifacts/data_validation', 'train_data_path': 'artifacts/data_ingestion/raw_data/home_insurance_train.csv', 'status_file': 'artifacts/data_validation/status.txt'}, 'data_transformation': {'data_transformation_dir': 'artifacts/data_transformation', 'train_data_path': 'artifacts/data_ingestion/raw_data/home_insurance_train.csv', 'processed_data_path': 'artifacts/data_transformation/transformed_data'}}
Data Transformation Dir: artifacts/data_transformation
created directory at: artifacts/data_transformation
Shape mismatch: 32 != 22
