In [1]:
"etl_pipeline"

import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer

def extract(file_path):
    """Step 1: Extract data from a CSV file.

    Args:
        file_path: Location of the CSV file to read; passed unchanged to
            ``pandas.read_csv``.

    Returns:
        A ``pandas.DataFrame`` with the parsed contents of the file.
    """
    # Use the caller's path as given: a hard-coded file name here would make
    # the parameter meaningless and tie the pipeline to a single dataset.
    return pd.read_csv(file_path)

def preprocess(df):
    """Step 2: Handle missing values.

    Missing entries in ``float64``/``int64`` columns are replaced with the
    column mean, and missing entries in ``object`` columns with the column's
    most frequent value. Columns of any other dtype are left untouched.

    Args:
        df: DataFrame to clean. It is modified in place.

    Returns:
        The same DataFrame object, after imputation.
    """
    numeric_cols = df.select_dtypes(include=['float64', 'int64']).columns
    categorical_cols = df.select_dtypes(include=['object']).columns

    # SimpleImputer refuses input with zero columns, so a group is skipped
    # entirely when the frame has no columns of that kind.
    if len(numeric_cols) > 0:
        imputer_num = SimpleImputer(strategy='mean')
        df[numeric_cols] = imputer_num.fit_transform(df[numeric_cols])

    if len(categorical_cols) > 0:
        imputer_cat = SimpleImputer(strategy='most_frequent')
        df[categorical_cols] = imputer_cat.fit_transform(df[categorical_cols])

    return df

def transform(df):
    """Step 3: Encode categorical and scale numeric data.

    Every ``object`` column is replaced by integer codes from its own
    ``LabelEncoder``; every ``float64``/``int64`` column is standardised with
    a ``StandardScaler``. The numeric column list is taken before encoding,
    so the freshly encoded columns are not scaled.

    NOTE: all ``float64``/``int64`` columns are scaled, including any
    label/target column present in the frame. The fitted encoders and scaler
    are discarded, so the same mapping cannot be re-applied to other data.

    Args:
        df: DataFrame to transform. It is modified in place.

    Returns:
        The same DataFrame object, after encoding and scaling.
    """
    numeric_cols = df.select_dtypes(include=['float64', 'int64']).columns
    categorical_cols = df.select_dtypes(include=['object']).columns

    for col in categorical_cols:
        df[col] = LabelEncoder().fit_transform(df[col])

    # StandardScaler refuses input with zero columns; skip scaling when the
    # frame has no numeric columns instead of raising.
    if len(numeric_cols) > 0:
        df[numeric_cols] = StandardScaler().fit_transform(df[numeric_cols])
    return df

def load(df, output_path):
    """Step 4: Write the processed DataFrame to ``output_path`` as CSV.

    Args:
        df: DataFrame to persist.
        output_path: Destination of the CSV file.

    Returns:
        None. A confirmation line is printed once the file is written.
    """
    # The row index is left out so the file contains only the data columns.
    df.to_csv(output_path, index=False)
    confirmation = f"Data loaded to {output_path}"
    print(confirmation)

if __name__ == "__main__":
    # Run the pipeline end to end only when executed as a script, not on import.
    input_path = 'raw_data.csv'
    output_path = 'cleaned_data.csv'

    # Each stage returns a DataFrame that is handed to the next stage;
    # the final stage writes the result to output_path.
    df = extract(input_path)
    df = preprocess(df)
    df = transform(df)
    load(df, output_path)


Data loaded to cleaned_data.csv
