# In [17]:
# IMPORTS
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
import json # Added json import

# FUNCTIONS
def extract_data() -> pd.DataFrame:
    """Build the in-memory sample employee dataset and return it as a DataFrame.

    The sample has five rows and five columns (Name, Age, Gender, Salary,
    Department); Name, Age, Gender and Salary each contain one ``np.nan``.

    Returns:
        The sample data as a ``pd.DataFrame``.
    """
    names = ['John', 'Anna', 'Peter', 'Linda', np.nan]
    ages = [28, 22, np.nan, 32, 45]
    genders = ['Male', 'Female', 'Male', np.nan, 'Female']
    salaries = [50000, 60000, 52000, np.nan, 58000]
    departments = ['IT', 'HR', 'IT', 'Finance', 'HR']

    # Dict insertion order fixes the column order of the resulting frame.
    sample = pd.DataFrame({
        'Name': names,
        'Age': ages,
        'Gender': genders,
        'Salary': salaries,
        'Department': departments,
    })

    print("✅ Data extracted successfully.")
    return sample
def transform_data(df: pd.DataFrame) -> pd.DataFrame:
    """Clean, preprocess, and transform the dataset using scikit-learn pipelines.

    Numeric columns (``Age``, ``Salary``) are mean-imputed and standardized.
    Categorical columns (``Gender``, ``Department``) are imputed with the most
    frequent value and one-hot encoded. Columns not listed in either group
    (e.g. ``Name``) are not passed to any transformer and therefore do not
    appear in the result.

    Args:
        df: Raw input frame containing at least the four columns named above.

    Returns:
        A dense DataFrame holding the scaled numeric columns followed by the
        one-hot encoded columns, carrying the same index as ``df``.

    Raises:
        ValueError: If any of the required columns is missing from ``df``.
    """
    # Define numeric and categorical columns
    numeric_features = ['Age', 'Salary']
    categorical_features = ['Gender', 'Department']

    # Fail early with a message that names the missing columns.
    missing = [
        col for col in numeric_features + categorical_features
        if col not in df.columns
    ]
    if missing:
        raise ValueError(
            f"Input DataFrame is missing required columns: {missing}"
        )

    # Numeric transformer: impute missing values, scale
    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler())
    ])

    # Categorical transformer: impute missing values, one-hot encode
    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('encoder', OneHotEncoder(handle_unknown='ignore'))
    ])

    # Combine transformations. sparse_threshold=0 forces a dense array: the
    # one-hot encoder emits sparse output, and with many categories the
    # stacked result would otherwise be a sparse matrix that cannot be
    # wrapped in a labelled DataFrame below.
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numeric_features),
            ('cat', categorical_transformer, categorical_features)
        ],
        sparse_threshold=0,
    )

    # Apply transformations
    processed_data = preprocessor.fit_transform(df)

    # Retrieve encoded column names; look the fitted sub-pipeline up by its
    # name rather than by position so reordering transformers cannot break it.
    encoded_columns = (
        preprocessor.named_transformers_['cat']
        .named_steps['encoder']
        .get_feature_names_out(categorical_features)
    )

    # Create processed DataFrame, keeping rows aligned with the input index.
    processed_df = pd.DataFrame(
        processed_data,
        columns=numeric_features + list(encoded_columns),
        index=df.index,
    )

    print("✅ Data transformed successfully.")
    return processed_df


def load_data(processed_df: pd.DataFrame, file_name: str = "processed_data.csv"):
    """Persist the processed frame as a CSV file and report where it went.

    Args:
        processed_df: Frame to write; its index is not written to the file.
        file_name: Destination path of the CSV file.
    """
    # index=False keeps the row labels out of the file.
    processed_df.to_csv(file_name, index=False)

    confirmation = f"✅ Data loaded successfully and saved as '{file_name}'."
    print(confirmation)

# MAIN EXECUTION
def main():
    """Run the pipeline end to end: extract, then transform, then load."""
    # Each stage feeds its result straight into the next one.
    load_data(transform_data(extract_data()))

# Entry-point guard: run the pipeline only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()

# Output:
# ✅ Data extracted successfully.
# ✅ Data transformed successfully.
# ✅ Data loaded successfully and saved as 'processed_data.csv'.
