In [None]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split

def _make_dense_one_hot_encoder():
    """Build the one-hot encoder for categorical columns, with dense output.

    The keyword that turns off sparse output was renamed from ``sparse`` to
    ``sparse_output`` in scikit-learn 1.2, and the old spelling was removed in
    1.4. The new spelling is tried first; the old one is used only when the
    installed release rejects the new keyword.
    """
    shared_options = {'drop': 'first', 'handle_unknown': 'ignore'}
    try:
        return OneHotEncoder(sparse_output=False, **shared_options)
    except TypeError:
        # Older scikit-learn releases only accept the original `sparse` keyword.
        return OneHotEncoder(sparse=False, **shared_options)


def preprocess_data(df, target_column):
    """Split ``df`` into train/test sets and return imputed, scaled, encoded features.

    Numeric columns (``int64``/``float64``) are mean-imputed and standardized.
    Categorical columns (``object``/``category``) are imputed with the most
    frequent value and one-hot encoded, dropping the first level of each
    column. Columns of any other dtype are not handed to either pipeline and
    therefore do not appear in the output.

    The preprocessing is fitted on the training split only and then applied to
    the test split, so no test-set statistics leak into the transformation.

    Args:
        df: Input DataFrame containing the features and the target column.
        target_column: Name of the column in ``df`` to use as the target.

    Returns:
        A tuple ``(X_train_df, X_test_df, y_train, y_test)``. The feature
        frames hold the transformed values under the generated feature names,
        and both targets have their index reset so they line up positionally
        with the feature frames.

    Raises:
        KeyError: If ``target_column`` is not a column of ``df``.
        ValueError: If ``df`` has no numeric or categorical feature columns.
    """
    # 1. Separate features and target
    X = df.drop(columns=[target_column])
    y = df[target_column]

    # 2. Identify column types
    numeric_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
    categorical_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()

    # 3. Define pipelines, registering only the groups that actually have
    #    columns: a transformer given an empty column list is never fitted,
    #    so it could not be queried for feature names afterwards.
    transformers = []
    if numeric_cols:
        numeric_pipeline = Pipeline([
            ('imputer', SimpleImputer(strategy='mean')),
            ('scaler', StandardScaler())
        ])
        transformers.append(('num', numeric_pipeline, numeric_cols))

    if categorical_cols:
        categorical_pipeline = Pipeline([
            ('imputer', SimpleImputer(strategy='most_frequent')),
            ('encoder', _make_dense_one_hot_encoder())
        ])
        transformers.append(('cat', categorical_pipeline, categorical_cols))

    if not transformers:
        raise ValueError(
            "No numeric (int64/float64) or categorical (object/category) "
            "feature columns found to preprocess."
        )

    # 4. Combine pipelines
    preprocessor = ColumnTransformer(transformers)

    # 5. Split data with a fixed seed for reproducibility
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # 6. Fit on the training data only, then apply the same fit to the test data
    X_train_processed = preprocessor.fit_transform(X_train)
    X_test_processed = preprocessor.transform(X_test)

    # 7. Retrieve feature names in output order: numeric block first, then
    #    the one-hot columns produced by the fitted encoder.
    final_features = list(numeric_cols)
    if categorical_cols:
        encoder = preprocessor.named_transformers_['cat'].named_steps['encoder']
        final_features.extend(encoder.get_feature_names_out(categorical_cols))

    # 8. Return clean DataFrames
    X_train_df = pd.DataFrame(X_train_processed, columns=final_features)
    X_test_df = pd.DataFrame(X_test_processed, columns=final_features)

    return X_train_df, X_test_df, y_train.reset_index(drop=True), y_test.reset_index(drop=True)

# Example usage: a six-row employee table with gaps in both numeric and
# categorical columns, used to exercise the imputation and encoding steps.
data = {
    'Age': [25, 32, None, 51, 62, 28],
    'Salary': [50000, 60000, 80000, None, 150000, 48000],
    'Gender': ['Male', 'Female', 'Female', 'Male', 'Female', 'Male'],
    'Department': ['HR', 'IT', 'Finance', 'IT', None, 'HR'],
    'Left_Company': [0, 1, 0, 1, 0, 0],
}
df = pd.DataFrame(data)

# 'Left_Company' is the label; every other column is treated as a feature.
(X_train, X_test, y_train, y_test) = preprocess_data(
    df,
    target_column='Left_Company',
)

# Show the processed feature matrices for both splits.
print("X_train:\n", X_train)
print("\nX_test:\n", X_test)