### ETL (Extract, Transform, Load)
1. #### Pipeline Steps
    1. Load the Raw Data
    2. Apply Cleaning: Drop Unwanted Columns and Handle Missing Values
    3. Apply Feature Engineering
        1. Adding New Features
           1. Pulse Pressure (ap_hi - ap_lo)
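           2. Mean Arterial Pressure (ap_lo + (ap_hi - ap_lo) / 3)
           3. Pulse-MAP Interaction (pulse_pressure * map)
           4. Glucose-Cholesterol Interaction (gluc * cholesterol)
           5. BP-Cholesterol Interaction ((ap_hi + ap_lo) * cholesterol)
           6. BMI (weight / (height / 100)^2)
           7. Age-BMI Interaction (age * bmi)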
        2. Normalization and Scaling for Numerical Features
        3. Encoding (if needed) for Categorical Features
    4. Final Pipeline.
2. #### Testing Pipeline

#### 1. Data Pipeline Steps

In [44]:
# Column groups used when building the preprocessing pipeline.

# Numeric features (raw and engineered) that are passed to the scaler.
scaling_num_cols = [
    'age',
    'height',
    'weight',
    'ap_hi',
    'ap_lo',
    'pulse_pressure',
    'map',
    'pulse_map_interaction',
    'gluc_chol',
    'bp_chol',
    'bmi',
    'age_bmi_interaction',
]

# Categorical features that are passed to the encoder.
encoding_cat_cols = [
    'gender',
    'cholesterol',
    'gluc',
    'smoke',
    'alco',
    'active',
]

# Columns removed before any other processing.
unwanted_cols = ['id']

# Echo the groups so the cell output records the configuration in use.
print("Numerical Columns:", scaling_num_cols)
print("Categorical Columns:", encoding_cat_cols)
print("Unwanted Columns:", unwanted_cols)

Numerical Columns: ['age', 'height', 'weight', 'ap_hi', 'ap_lo', 'pulse_pressure', 'map', 'pulse_map_interaction', 'gluc_chol', 'bp_chol', 'bmi', 'age_bmi_interaction']
Categorical Columns: ['gender', 'cholesterol', 'gluc', 'smoke', 'alco', 'active']
Unwanted Columns: ['id']


In [45]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import FunctionTransformer, StandardScaler, OneHotEncoder

# ===============================
# FEATURE TRANSFORMATION STEPS
# ===============================

# Define the drop functionality
def drop_columns(df, columns_to_drop=None):
    """Return a new DataFrame without the given columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is not modified.
    columns_to_drop : list of str, optional
        Column labels to remove. When omitted, the module-level
        ``unwanted_cols`` list is looked up at call time, so re-running the
        configuration cell is picked up without redefining this function.

    Returns
    -------
    pandas.DataFrame
        ``df`` minus the requested columns.

    Raises
    ------
    KeyError
        If a requested column is not present in ``df``.
    """
    if columns_to_drop is None:
        # Late binding: avoid freezing the list object at definition time.
        columns_to_drop = unwanted_cols
    # ``columns=`` already selects the column axis; no ``axis`` argument needed.
    return df.drop(columns=columns_to_drop)

# Define individual feature engineering functions
def add_pulse_pressure(df):
    """Return a copy of ``df`` with a ``pulse_pressure`` column added.

    ``pulse_pressure`` is computed as ``ap_hi - ap_lo``. The input frame is
    left untouched so the function is safe to call repeatedly or outside the
    pipeline.

    Raises
    ------
    KeyError
        If ``ap_hi`` or ``ap_lo`` is missing from ``df``.
    """
    return df.assign(pulse_pressure=df['ap_hi'] - df['ap_lo'])

def add_mean_arterial_pressure(df):
    """Return a copy of ``df`` with a ``map`` column added.

    ``map`` is computed as ``ap_lo + (ap_hi - ap_lo) / 3``. The input frame
    is not modified.

    Raises
    ------
    KeyError
        If ``ap_hi`` or ``ap_lo`` is missing from ``df``.
    """
    pulse = df['ap_hi'] - df['ap_lo']
    return df.assign(map=df['ap_lo'] + pulse / 3)

def add_pulse_map_interaction(df):
    """Return a copy of ``df`` with a ``pulse_map_interaction`` column added.

    The new column is the element-wise product ``pulse_pressure * map``, so
    both of those columns must already exist. The input frame is not
    modified.

    Raises
    ------
    KeyError
        If ``pulse_pressure`` or ``map`` is missing from ``df``.
    """
    return df.assign(pulse_map_interaction=df['pulse_pressure'] * df['map'])

def add_glucose_chol_interaction(df):
    """Return a copy of ``df`` with a ``gluc_chol`` column added.

    ``gluc_chol`` is the element-wise product ``gluc * cholesterol``. The
    input frame is not modified.

    Raises
    ------
    KeyError
        If ``gluc`` or ``cholesterol`` is missing from ``df``.
    """
    return df.assign(gluc_chol=df['gluc'] * df['cholesterol'])

def add_bp_chol_interaction(df):
    """Return a copy of ``df`` with a ``bp_chol`` column added.

    ``bp_chol`` is computed as ``(ap_hi + ap_lo) * cholesterol``. The input
    frame is not modified.

    Raises
    ------
    KeyError
        If ``ap_hi``, ``ap_lo`` or ``cholesterol`` is missing from ``df``.
    """
    pressure_sum = df['ap_hi'] + df['ap_lo']
    return df.assign(bp_chol=pressure_sum * df['cholesterol'])

def add_bmi(df):
    """Return a copy of ``df`` with a ``bmi`` column added.

    ``bmi`` is computed as ``weight / (height / 100) ** 2``. The input frame
    is not modified.

    NOTE(review): the ``/ 100`` suggests ``height`` is in centimetres and
    ``weight`` in kilograms -- confirm against the dataset description.

    Raises
    ------
    KeyError
        If ``weight`` or ``height`` is missing from ``df``.
    """
    height_scaled = df['height'] / 100
    return df.assign(bmi=df['weight'] / (height_scaled ** 2))

def add_age_bmi_interaction(df):
    """Return a copy of ``df`` with an ``age_bmi_interaction`` column added.

    The new column is the element-wise product ``age * bmi``, so ``bmi`` must
    already exist. The input frame is not modified.

    Raises
    ------
    KeyError
        If ``age`` or ``bmi`` is missing from ``df``.
    """
    return df.assign(age_bmi_interaction=df['age'] * df['bmi'])


# Custom transformer to output a DataFrame with updated column names
class DataFrameTransformer(BaseEstimator, TransformerMixin):
    """Wrap a column transformer so ``transform`` returns a labelled DataFrame.

    The wrapped object must expose ``fit``, ``transform`` and a fitted
    ``named_transformers_`` mapping containing the entries ``'num'`` and
    ``'cat'``. Output column labels are built by concatenating the feature
    names of ``'num'`` followed by those of ``'cat'``; this therefore assumes
    the wrapped transformer emits its columns in exactly that order and adds
    no other columns (e.g. no passthrough remainder).

    Parameters
    ----------
    column_transformer : estimator
        The transformer to wrap. It is fitted in place by ``fit``.
    scaling_cols : list of str
        Names of the numeric input columns. Stored only; not used by
        ``fit`` or ``transform``.
    encoding_cols : list of str
        Names of the categorical input columns. Stored only; not used by
        ``fit`` or ``transform``.
    """

    def __init__(self, column_transformer, scaling_cols, encoding_cols):
        self.column_transformer = column_transformer
        self.scaling_cols = scaling_cols
        self.encoding_cols = encoding_cols

    def fit(self, X, y=None):
        """Fit the wrapped transformer on ``X`` and return ``self``.

        ``y`` is accepted for pipeline compatibility and is not used.
        """
        self.column_transformer.fit(X)
        return self

    def transform(self, X):
        """Transform ``X`` and return the result as a DataFrame.

        Parameters
        ----------
        X : pandas.DataFrame
            Frame to transform; its index is reused for the result.

        Returns
        -------
        pandas.DataFrame
            Transformed values labelled with the ``'num'`` feature names
            followed by the ``'cat'`` feature names.
        """
        transformed = self.column_transformer.transform(X)

        # The wrapped transformer may hand back a SciPy sparse matrix (for
        # example when a one-hot encoder dominates the output). pandas cannot
        # build a labelled frame from that directly, so densify first.
        if hasattr(transformed, "toarray"):
            transformed = transformed.toarray()

        fitted = self.column_transformer.named_transformers_
        feature_names = [
            *fitted['num'].get_feature_names_out(),
            *fitted['cat'].get_feature_names_out(),
        ]

        return pd.DataFrame(transformed, columns=feature_names, index=X.index)


# ===============================
# PIPELINE STEPS
# ===============================

def drop_missing_rows(df):
    """Return ``df`` without the rows that contain any missing value.

    Defined as a named top-level function (rather than a ``lambda``) so that
    pipelines embedding it can be serialized with ``pickle``/``joblib``.
    """
    return df.dropna()


# Derived features. Order matters: the interaction steps read columns that
# earlier steps create (pulse_pressure/map before their interaction, bmi
# before the age interaction).
feature_engineering = Pipeline(steps=[
    ('pulse_pressure', FunctionTransformer(add_pulse_pressure)),
    ('mean_arterial_pressure', FunctionTransformer(add_mean_arterial_pressure)),
    ('pulse_map_interaction', FunctionTransformer(add_pulse_map_interaction)),
    ('glucose_chol_interaction', FunctionTransformer(add_glucose_chol_interaction)),
    ('bp_chol_interaction', FunctionTransformer(add_bp_chol_interaction)),
    ('bmi', FunctionTransformer(add_bmi)),
    ('age_bmi_interaction', FunctionTransformer(add_age_bmi_interaction))
])

# Column transformer for scaling and encoding. The 'num' entry is listed
# before 'cat'; DataFrameTransformer relies on these names and this order
# when it labels the output columns.
column_transformer = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), scaling_num_cols),
        ('cat', OneHotEncoder(), encoding_cat_cols)
    ]
)

# Preprocessing: drop unwanted columns, remove incomplete rows, then derive
# the engineered features.
# NOTE(review): the 'cleaning' step removes rows from X only. A target held
# separately is not filtered with it, so X and y can fall out of alignment
# when the data contains missing values -- confirm this is acceptable.
preprocessing_pipeline = Pipeline(steps=[
    ('drop_columns', FunctionTransformer(drop_columns)),
    ('cleaning', FunctionTransformer(drop_missing_rows)),
    ('feature_engineering', feature_engineering),
])

# Final pipeline: preprocessing followed by scaling/encoding, returned as a
# DataFrame with named columns.
final_pipeline = Pipeline(steps=[
    ('preprocessing', preprocessing_pipeline),
    ('dataframe_transformer', DataFrameTransformer(
        column_transformer=column_transformer,
        scaling_cols=scaling_num_cols,
        encoding_cols=encoding_cat_cols
    ))
])

#### 2. Testing Pipeline

In [46]:
# testing pipeline
# Load the dataset through the project's loader and print its column labels
# so the expected input schema is visible before the pipeline is exercised.
from cardiovascular_disease_prediction.dataset import load_dataset
df = load_dataset()
print(df.columns)

Attempting to load dataset from: C:\Users\visha\OneDrive\Documents\GitHub\Data-Science-Stuff\predicting_cardio_disease\data\raw\cardio_train.csv
Dataset loaded successfully from C:\Users\visha\OneDrive\Documents\GitHub\Data-Science-Stuff\predicting_cardio_disease\data\raw\cardio_train.csv
Index(['id', 'age', 'gender', 'height', 'weight', 'ap_hi', 'ap_lo',
       'cholesterol', 'gluc', 'smoke', 'alco', 'active', 'cardio'],
      dtype='object')


In [47]:
# Take the first ten rows as a small fixture for exercising the pipeline,
# then display them.
sample = df.head(10)
sample

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,0,18393,2,168,62.0,110,80,1,1,0,0,1,0
1,1,20228,1,156,85.0,140,90,3,1,0,0,1,1
2,2,18857,1,165,64.0,130,70,3,1,0,0,0,1
3,3,17623,2,169,82.0,150,100,1,1,0,0,1,1
4,4,17474,1,156,56.0,100,60,1,1,0,0,0,0
5,8,21914,1,151,67.0,120,80,2,2,0,0,0,0
6,9,22113,1,157,93.0,130,80,3,1,0,0,1,0
7,12,22584,2,178,95.0,130,90,3,3,0,0,1,1
8,13,17668,1,158,71.0,110,70,1,1,0,0,1,0
9,14,19834,1,164,68.0,110,60,1,1,0,0,0,0


In [48]:
# Split the fixture into features (everything except the target column)
# and the target, which is kept as a single-column DataFrame.
X = sample.drop(columns='cardio')
y = sample.loc[:, ['cardio']]

In [49]:
# Fit every step of the pipeline on the sample features (no target is passed).
final_pipeline.fit(X)

In [52]:
import os

from sklearn import set_config
from sklearn.utils import estimator_html_repr

# Render estimators as diagrams in notebook output.
set_config(display='diagram')
final_pipeline.fit(X)  # Fit the pipeline so the diagram reflects a fitted estimator

# Build the diagram markup; estimator_html_repr returns an HTML string.
html_repr = estimator_html_repr(final_pipeline)

# Target directory: <parent of the current working directory>/reports/figures.
# Assemble it from components instead of a hard-coded '\\' separator so the
# path is valid on every operating system, and create it if it is missing.
figures_dir = os.path.join(
    os.path.abspath(os.path.dirname(os.getcwd())), 'reports', 'figures'
)
os.makedirs(figures_dir, exist_ok=True)

# Kept for backward compatibility: directory path with a trailing separator.
path_to_report = os.path.join(figures_dir, '')

# Save the pipeline diagram with UTF-8 encoding.
# NOTE(review): the payload is HTML even though the file name ends in .svg;
# the name is kept because other files may reference it. Consider renaming
# to pipeline_diagram.html.
with open(os.path.join(figures_dir, "pipeline_diagram.svg"), "w", encoding="utf-8") as f:
    f.write(html_repr)

In [50]:
# fitting transform pipeline
# Fit the pipeline on X and transform it in a single call; the trailing
# expression shows the (rows, columns) shape of the transformed frame.
X_ = final_pipeline.fit_transform(X)
X_.shape

(10, 24)

In [38]:
# Display the transformed feature frame for a visual sanity check.
X_

Unnamed: 0,age,height,weight,ap_hi,ap_lo,pulse_pressure,map,pulse_map_interaction,gluc_chol,bp_chol,...,cholesterol_1,cholesterol_2,cholesterol_3,gluc_1,gluc_2,gluc_3,smoke_0,alco_0,active_0,active_1
0,-0.680528,0.756636,-0.958105,-0.874475,0.160128,-1.860521,-0.234818,-1.452813,-0.717741,-0.936758,...,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0
1,0.298285,-0.808818,0.833474,1.143544,0.960769,0.620174,1.069727,1.086396,0.12666,1.370528,...,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0
2,-0.433025,0.365273,-0.802316,0.470871,-0.640513,1.860521,-0.234818,1.150679,0.12666,0.955216,...,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0
3,-1.091256,0.887091,0.59979,1.816217,1.76141,0.620174,1.852454,1.568524,-0.717741,-0.659884,...,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0
4,-1.170735,-0.808818,-1.425474,-1.547147,-1.441153,-0.620174,-1.539363,-1.22782,-0.717741,-1.075195,...,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0
5,1.197619,-1.461091,-0.568632,-0.201802,0.160128,-0.620174,0.026091,-0.456415,0.54886,0.032302,...,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0
6,1.303768,-0.678363,1.456632,0.470871,0.160128,0.620174,0.287,0.604267,0.12666,1.093653,...,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0
7,1.555005,2.061181,1.612421,0.470871,0.960769,-0.620174,0.808818,-0.070712,2.659862,1.23209,...,0.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0
8,-1.067253,-0.547909,-0.257053,-0.874475,-0.640513,-0.620174,-0.756636,-0.842117,-0.717741,-0.982904,...,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0
9,0.08812,0.234818,-0.490737,-0.874475,-1.441153,0.620174,-1.278454,-0.359989,-0.717741,-1.029049,...,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0
