In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline


# --- Synthetic dataset -------------------------------------------------------
# Size of the generated table and number of entries to blank out.
n_samples = 100000
n_features = 10
n_missing = 5000

# Seed the global NumPy RNG so every draw below is reproducible.
np.random.seed(0)
numerical_data = np.random.randn(n_samples, n_features)

# Three categorical columns, each drawn from the same four labels.
categories = ['A', 'B', 'C', 'D']
categorical_data = np.random.choice(categories, size=(n_samples, 3))

df = pd.DataFrame(numerical_data, columns=[f'num_feature_{i}' for i in range(n_features)])
for i in range(3):
    df[f'cat_feature_{i}'] = categorical_data[:, i]

# Blank out a random subset of rows in a *predictor* column.
# num_feature_0 is split off as the target below, so injecting the NaNs there
# would leave the target with missing values while giving the numeric imputer
# applied to the predictors nothing to fill in.
nan_indices = np.random.choice(df.index, size=n_missing, replace=False)
df.loc[nan_indices, 'num_feature_1'] = np.nan

# Predictors are every column except num_feature_0, which serves as the target.
X = df.drop(columns=['num_feature_0'])
y = df['num_feature_0']

# Hold out 20% of the rows as a test set; the fixed random_state makes the
# partition repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

# Column names routed to each preprocessing branch. The numeric list starts at
# index 1, so num_feature_0 is not part of it.
numerical_features = [f'num_feature_{idx}' for idx in range(1, n_features)]
categorical_features = [f'cat_feature_{idx}' for idx in range(3)]

# Numeric branch: fill gaps with the column mean, then standardize.
numerical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Categorical branch: fill gaps with the most frequent label, then one-hot
# encode. handle_unknown='ignore' keeps transform() from raising on labels
# that were not present when the encoder was fitted.
categorical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

from sklearn.preprocessing import PolynomialFeatures

# Degree-2 expansion of the numeric columns (originals, squares and pairwise
# products); include_bias=False leaves out the constant column.
poly_features = PolynomialFeatures(degree=2, include_bias=False)

# Column-wise preprocessing, defined once:
#   'num' -> mean-impute, standardize, then polynomial expansion
#   'cat' -> the shared categorical pipeline (impute + one-hot)
# NOTE(review): the polynomial terms are computed from already-scaled columns,
# so the expanded terms are not themselves re-standardized -- confirm that this
# step order is intended.
preprocessor = ColumnTransformer([
    ('num', Pipeline([
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler()),
        ('poly', poly_features)
    ]), numerical_features),
    ('cat', categorical_pipeline, categorical_features)
])

# Learn imputation/scaling/encoding parameters from the training rows only and
# reuse them unchanged on the test rows.
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)

# With the 9 numeric and 3 four-level categorical columns set up earlier in
# this file, the processed width is 54 polynomial + 12 one-hot = 66 columns.
print(f'Original shape: {X_train.shape}')
print(f'Processed shape: {X_train_processed.shape}')


Original shape: (80000, 12)
Processed shape: (80000, 66)
