In [1]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.preprocessing import OneHotEncoder

In [3]:
COLUMN_DATA_TYPES = {
    "Age": "int64",
    "Gender": "category",
    "Smoking": "category",
    "Alcohol_Consumption": "category",
    "Physical_Activity_Level": "category",
    "BMI": "float64",
    "Diabetes": "category",
    "Hypertension": "category",
    "Cholesterol_Level": "float64",
    "Resting_BP": "int64",
    "Heart_Rate": "int64",
    "Family_History": "category",
    "Stress_Level": "category",
    "Chest_Pain_Type": "category",
    "Thalassemia": "category",
    "Fasting_Blood_Sugar": "category",
    "ECG_Results": "category",
    "Exercise_Induced_Angina": "category",
    "Max_Heart_Rate_Achieved": "int64",
    "Heart_Attack_Risk": "category",
}

TARGET_COLUMN = "Heart_Attack_Risk"

In [2]:
import os

%pwd

'd:\\Projects\\heart-attack risk prediction\\research'

In [3]:
os.chdir('../')

print(os.getcwd())

d:\Projects\heart-attack risk prediction


In [10]:
# Updating the entity file


from dataclasses import dataclass
from pathlib import Path


@dataclass(frozen=True)
class DataTransformationConfig:
    root_dir: Path
    data_path: Path
    all_schema: dict
    target_column: str

In [5]:
from mlProject.constants import *
from mlProject.utils.common import read_yaml, create_directories

In [11]:
class ConfigurationManager:
    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH,
        schema_filepath = SCHEMA_FILE_PATH):

        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        self.schema = read_yaml(schema_filepath)

        create_directories([self.config.artifacts_root])


    
    def get_data_transformation_config(self) -> DataTransformationConfig:
        config = self.config.data_transformation
        schema = self.schema.COLUMNS
        target_column = self.schema.TARGET_COLUMN

        create_directories([config.root_dir])

        data_transformation_config = DataTransformationConfig(
            root_dir=config.root_dir,
            data_path=config.data_path,
            all_schema=schema,
            target_column = target_column,
        )

        return data_transformation_config
    

In [12]:
from mlProject import logger
import pandas as pd
from imblearn.over_sampling import SMOTE

In [13]:
class DataTransformation:
    def __init__(self, config: DataTransformationConfig):
        self.config = config

    def transform_data(self):
        """
        Performs data transformations, including encoding categorical features,
        scaling numerical features, and splitting the data.
        """
        data = pd.read_csv(self.config.data_path)
        colmns = self.config.all_schema

        # Convert columns to specified data types
        for column, dtype in colmns.items():
            try:
                data[column] = data[column].astype(dtype)
            except ValueError as e:
                logger.warning(f"Error converting column '{column}' to type '{dtype}': {e}")

        # Define categorical and numerical columns
        cat_cols = ["Smoking", "Alcohol_Consumption", "Diabetes", "Hypertension", 
                    "Family_History", "Fasting_Blood_Sugar", "Exercise_Induced_Angina",
                    "Physical_Activity_Level", "Gender", "Stress_Level", 
                    "Chest_Pain_Type", "Thalassemia", "ECG_Results"]  # Include all categorical columns
        num_cols = [col for col in data.columns if col not in cat_cols and col != 'Heart_Attack_Risk']

        # One-Hot Encoding for Categorical Columns
        # Create a OneHotEncoder object
        encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore', drop='first') # Set sparse=False for dense output
        encoded_data = encoder.fit_transform(data[cat_cols])
        encoded_df = pd.DataFrame(encoded_data, columns=encoder.get_feature_names_out(cat_cols))
        data = data.drop(columns=cat_cols)
        data = pd.concat([data, encoded_df], axis=1)

        # Label Encoding for Target Variable
        label_encoder = LabelEncoder()
        label_encoder.fit(data['Heart_Attack_Risk'])
        print(label_encoder.classes_)
        print(label_encoder.transform(label_encoder.classes_))
        data['Heart_Attack_Risk_Encoded'] = label_encoder.transform(data['Heart_Attack_Risk'])
        data.drop('Heart_Attack_Risk', axis=1, inplace=True)

        # Scaling Numerical Features
        scaler = StandardScaler()
        data[num_cols] = scaler.fit_transform(data[num_cols])

        
        smote = SMOTE(random_state=42)
        X = data.drop('Heart_Attack_Risk_Encoded', axis=1)
        y = data['Heart_Attack_Risk_Encoded']
        X_resampled, y_resampled = smote.fit_resample(X, y)

        # Combine resampled data
        data_resampled = pd.concat([pd.DataFrame(X_resampled, columns=X.columns), pd.DataFrame(y_resampled, columns=['Heart_Attack_Risk_Encoded'])], axis=1)

        # Split the data into training and test sets
        train, test = train_test_split(data_resampled, test_size=0.25, random_state=42)

        train.to_csv(os.path.join(self.config.root_dir, "train.csv"), index=False)
        test.to_csv(os.path.join(self.config.root_dir, "test.csv"), index=False)

        logger.info("Data transformation, handling imbalanced data, and splitting completed.")
        logger.info(f"Train data shape: {train.shape}")

        print(train.shape)
        print(test.shape)
        print(data) 

In [14]:
try:
    config = ConfigurationManager()
    data_transformation_config = config.get_data_transformation_config()
    data_transformation = DataTransformation(config=data_transformation_config)
    data_transformation.transform_data()
except Exception as e:
    raise e

[2025-02-15 14:05:33,922: INFO: common: yaml file: config\config.yaml loaded successfully]
[2025-02-15 14:05:33,930: INFO: common: yaml file: params.yaml loaded successfully]
[2025-02-15 14:05:33,938: INFO: common: yaml file: schema.yaml loaded successfully]
[2025-02-15 14:05:33,938: INFO: common: created directory at: artifacts]
[2025-02-15 14:05:33,938: INFO: common: created directory at: artifacts/data_transformation]
['High' 'Low' 'Moderate']
[0 1 2]
[2025-02-15 14:05:37,679: INFO: 1852055140: Data transformation, handling imbalanced data, and splitting completed.]
[2025-02-15 14:05:37,679: INFO: 1852055140: Train data shape: (56304, 26)]
(56304, 26)
(18768, 26)
            Age       BMI  Cholesterol_Level  Resting_BP  Heart_Rate  \
0      0.750106  0.981390          -1.683786    1.411391   -0.472112   
1     -1.028843 -0.660109          -1.344427   -0.325544    0.419555   
2      1.711700  1.079658           1.091110   -0.441340    1.608443   
3      1.182823 -1.285706           0