In [4]:
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline


In [5]:
# Step 1: Extract - Load Data
def load_data(file_path):
    """Read a CSV file into a DataFrame.

    Parameters
    ----------
    file_path
        Location of the CSV file; passed unchanged to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The table parsed with ``read_csv``'s default settings.
    """
    frame = pd.read_csv(file_path)
    return frame

In [6]:
# Step 2: Transform - Data Cleaning and Preprocessing
def preprocess_data(df):
    """Impute, scale and one-hot encode ``df``, returning the transformed data.

    Columns are routed by dtype:

    * ``int64`` / ``float64`` columns: missing values are replaced with the
      column mean, then the column is standardised with ``StandardScaler``.
    * ``object`` columns: missing values are replaced with the most frequent
      value, then the column is one-hot encoded with
      ``handle_unknown='ignore'``.

    Columns of any other dtype are not handed to either transformer.

    Parameters
    ----------
    df : pandas.DataFrame
        The raw table to transform. It is not modified here.

    Returns
    -------
    The result of ``ColumnTransformer.fit_transform(df)``, with the numeric
    block first and the encoded categorical block second. The fitted
    preprocessor itself is not returned.
    """
    # Split the column labels by dtype; each group gets its own pipeline.
    numeric_cols = df.select_dtypes(include=['int64', 'float64']).columns
    categorical_cols = df.select_dtypes(include=['object']).columns

    numeric_steps = [
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler()),
    ]
    categorical_steps = [
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('encoder', OneHotEncoder(handle_unknown='ignore')),
    ]

    # Apply each pipeline to its own column group and concatenate the results.
    column_router = ColumnTransformer(
        transformers=[
            ('num', Pipeline(steps=numeric_steps), numeric_cols),
            ('cat', Pipeline(steps=categorical_steps), categorical_cols),
        ]
    )
    return column_router.fit_transform(df)


In [8]:
def main(input_file="employee_data.csv", output_file="processed_data.csv"):
    """Run the load -> preprocess -> save steps for a single CSV file.

    Parameters
    ----------
    input_file : str, optional
        CSV file read through ``load_data``. Defaults to
        ``"employee_data.csv"``.
    output_file : str, optional
        Destination passed to ``save_data`` and echoed in the completion
        message. Defaults to ``"processed_data.csv"``.

    The defaults are the file names that used to be hard-coded in the body,
    so calling ``main()`` with no arguments behaves as before.
    """
    raw_df = load_data(input_file)
    transformed_data = preprocess_data(raw_df)

    # NOTE(review): save_data is not defined in any cell visible in this
    # notebook. Confirm a cell defining it exists and runs before this one;
    # otherwise Restart Kernel -> Run All fails here with a NameError.
    save_data(transformed_data, output_file)
    print(f"Data processing complete. Processed file saved as {output_file}")


if __name__ == "__main__":
    main()

Data processing complete. Processed file saved as processed_data.csv
