In [11]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer


In [12]:
def extract_data(file_path):
    """Read the CSV at ``file_path`` into a pandas DataFrame.

    Args:
        file_path: Path (or anything ``pd.read_csv`` accepts) of the CSV file.

    Returns:
        The loaded DataFrame, or ``None`` if any exception was raised while
        reading. Failures are reported on stdout rather than propagated, so
        callers must check for ``None``.
    """
    try:
        frame = pd.read_csv(file_path)
        print("✅ Data extracted successfully.")
    except Exception as err:
        # Best-effort by design: report the problem and signal failure with None.
        print(f"❌ Error in extracting data: {err}")
        return None
    return frame


In [13]:
def transform_data(data):
    """Impute, encode, and scale a DataFrame, returning a new frame.

    Columns are split by dtype: ``int64``/``float64`` columns are treated as
    numeric and ``object`` columns as categorical. Columns of any other dtype
    are passed through unchanged.

    * Numeric columns: missing values are filled with
      ``SimpleImputer(strategy="mean")``, then the columns are standardized
      with ``StandardScaler``.
    * Categorical columns: missing values are filled with
      ``SimpleImputer(strategy="most_frequent")``, then each column is
      replaced by its ``LabelEncoder`` integer codes.

    The input frame is not modified; all work happens on a copy. A frame with
    no rows, or with no columns of a given kind, skips the corresponding
    stage instead of failing inside scikit-learn.

    Args:
        data: The DataFrame to preprocess.

    Returns:
        A new, preprocessed DataFrame with the same columns as ``data``.
    """
    # Work on a copy so the caller's frame stays intact and re-running the
    # calling cell always starts from the same raw input.
    data = data.copy()

    numeric_cols = data.select_dtypes(include=["int64", "float64"]).columns
    categorical_cols = data.select_dtypes(include=["object"]).columns

    # The estimators below cannot be fitted on an empty selection, so each
    # stage only runs when there is at least one row and one matching column.
    has_rows = len(data) > 0

    # NOTE(review): columns with no observed values at all are not
    # special-cased here; confirm how SimpleImputer treats them in the
    # installed scikit-learn version.
    if has_rows and len(numeric_cols) > 0:
        num_imputer = SimpleImputer(strategy="mean")
        data[numeric_cols] = num_imputer.fit_transform(data[numeric_cols])

        scaler = StandardScaler()
        data[numeric_cols] = scaler.fit_transform(data[numeric_cols])

    if has_rows and len(categorical_cols) > 0:
        cat_imputer = SimpleImputer(strategy="most_frequent")
        data[categorical_cols] = cat_imputer.fit_transform(data[categorical_cols])

        # One encoder per column. The fitted encoders are not returned, so
        # they are not kept around after encoding.
        for col in categorical_cols:
            data[col] = LabelEncoder().fit_transform(data[col])

    print("✅ Data transformed successfully.")
    return data

In [14]:
# Pipeline configuration: the CSV to read and the CSV to write.
# Defined unconditionally (no ``__name__`` guard) because the cell that runs
# the pipeline reads these names without any guard of its own.
input_file = "raw_data.csv"
output_file = "processed_data.csv"


In [15]:
# Run the pipeline: extract -> transform -> load.
raw_data = extract_data(input_file)

# extract_data returns None when the read failed; skip the remaining stages then.
if raw_data is not None:
    processed_data = transform_data(raw_data)
    # NOTE(review): load_data is not defined in the visible cells — confirm it
    # is defined before this cell so a fresh "Restart & Run All" succeeds.
    load_data(processed_data, output_file)


✅ Data extracted successfully.
✅ Data transformed successfully.
✅ Data loaded successfully into processed_data.csv
