In [1]:
# Notebook: 01_data_preprocessing.ipynb

import sys
import os
import pandas as pd

# Add src/ to the path to import local modules
sys.path.append(os.path.abspath("../src"))

from data_processing import load_data, preprocess_data, get_train_test_data

# ---------------------------------------------------------------------------
# Stage 1 - read the raw employee table via the project's loader
# ---------------------------------------------------------------------------
data_path = "../data/employee_data.csv"
df = load_data(data_path)

# Quick visual sanity check of the first few raw records.
print("First 5 rows of raw data:")
display(df.head())

# ---------------------------------------------------------------------------
# Stage 2 - build features / target and hold out a test set
# ---------------------------------------------------------------------------
# preprocess_data returns the feature frame, the target, and a preprocessor
# object that has not been fitted in this cell yet.
# NOTE(review): whether preprocess_data itself fits anything is not visible
# from this notebook - confirm in src/data_processing.py.
X, y, preprocessor = preprocess_data(df)
X_train, X_test, y_train, y_test = get_train_test_data(X, y)

# ---------------------------------------------------------------------------
# Stage 3 - fit on the training rows only, then reuse that fit for test rows
# ---------------------------------------------------------------------------
# The test split is only ever passed through transform(), so whatever the
# preprocessor learns comes from X_train alone.
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)

# ---------------------------------------------------------------------------
# Stage 4 - report what came out
# ---------------------------------------------------------------------------
# Shapes of both processed matrices, train first and then test.
for _caption, _matrix in (
    ("Processed X_train shape:", X_train_processed),
    ("Processed X_test shape:", X_test_processed),
):
    print(_caption, _matrix.shape)

# Column names produced by the fitted preprocessor (encoded + numeric).
feature_names = preprocessor.get_feature_names_out()
print("\nFeature names after encoding:")
print(feature_names)


First 5 rows of raw data:


Unnamed: 0,employee_id,age,gender,marital_status,salary,employment_type,region,has_dependents,tenure_years,enrolled
0,10001,60,Female,Single,55122.97,Part-time,West,No,1.5,0
1,10002,50,Female,Single,89549.66,Full-time,West,Yes,12.8,1
2,10003,36,Male,Divorced,74145.66,Part-time,Midwest,No,3.8,0
3,10004,64,Female,Married,53877.83,Full-time,Northeast,No,3.3,0
4,10005,29,Male,Single,63404.63,Contract,Midwest,Yes,10.0,0


Processed X_train shape: (8000, 19)
Processed X_test shape: (2000, 19)

Feature names after encoding:
['cat__gender_Female' 'cat__gender_Male' 'cat__gender_Other'
 'cat__marital_status_Divorced' 'cat__marital_status_Married'
 'cat__marital_status_Single' 'cat__marital_status_Widowed'
 'cat__employment_type_Contract' 'cat__employment_type_Full-time'
 'cat__employment_type_Part-time' 'cat__region_Midwest'
 'cat__region_Northeast' 'cat__region_South' 'cat__region_West'
 'cat__has_dependents_No' 'cat__has_dependents_Yes' 'num__age'
 'num__salary' 'num__tenure_years']
