In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Seed NumPy's global RNG so the synthetic dataset is identical on every run.
np.random.seed(42)

# All columns are drawn from the same global random stream, so the order of
# the draws below determines the generated values -- do not reorder them.
n_samples = 100
data = pd.DataFrame(dict(
    feature1=np.random.randint(low=0, high=100, size=n_samples),  # ints in [0, 100)
    feature2=np.random.choice(['A', 'B', 'C'], size=n_samples),   # categorical labels
    feature3=np.random.randn(n_samples),                          # standard-normal floats
    target=np.random.randint(low=0, high=2, size=n_samples),      # binary label (0 or 1)
))

# Show the first rows of the raw data for a quick sanity check.
print("Original Data:", data.head(), sep="\n")

# Separate the label column from the predictor columns.
y = data['target']
X = data.drop(columns='target')

# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Column groups, keyed by the kind of preprocessing each one receives.
numeric_features = ['feature1', 'feature3']
categorical_features = ['feature2']

# Numeric columns: fill missing values with the column mean, then standardize.
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Categorical columns: fill missing values with the most frequent category,
# then one-hot encode. handle_unknown='ignore' keeps transform() from raising
# on categories that were not present when the encoder was fitted.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group through its own pipeline. The transformer names
# ('num', 'cat') and step names are looked up later, so keep them stable.
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

# Fit the preprocessing on the training split only, then apply the already
# fitted transformers to the test split.
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)

# Rebuild column labels for the transformed arrays: the numeric columns come
# first (the 'num' transformer is listed first), followed by the one-hot
# columns reported by the fitted encoder.
fitted_encoder = preprocessor.named_transformers_['cat'].named_steps['onehot']
processed_columns = [*numeric_features, *fitted_encoder.get_feature_names_out()]

X_train_df = pd.DataFrame(X_train_processed, columns=processed_columns)
X_test_df = pd.DataFrame(X_test_processed, columns=processed_columns)

# Preview the first rows of each processed split.
print("\nProcessed Training Data:", X_train_df.head(), sep="\n")

print("\nProcessed Test Data:", X_test_df.head(), sep="\n")


Original Data:
   feature1 feature2  feature3  target
0        51        B  0.127012       1
1        92        A -1.249444       0
2        14        A  1.945116       1
3        71        A -0.153336       0
4        60        A -0.906987       0

Processed Training Data:
   feature1  feature3  x0_A  x0_B  x0_C
0 -1.375882  0.266630   0.0   1.0   0.0
1 -0.065918 -0.233911   0.0   0.0   1.0
2  0.269970 -0.304771   1.0   0.0   0.0
3  0.169204 -0.615359   0.0   0.0   1.0
4 -0.468984  1.029852   1.0   0.0   0.0

Processed Test Data:
   feature1  feature3  x0_A  x0_B  x0_C
0  0.807391  0.043901   0.0   1.0   0.0
1  0.337148 -0.602139   0.0   0.0   1.0
2  0.001260 -1.048881   1.0   0.0   0.0
3  0.034848 -2.481322   0.0   1.0   0.0
4 -1.577415  0.315117   1.0   0.0   0.0
