In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Generate a sample dataset
np.random.seed(42)

# Sample data
data = pd.DataFrame({
    'num_feature1': np.random.randn(100),
    'num_feature2': np.random.randn(100) * 10,
    'cat_feature1': np.random.choice(['A', 'B', 'C'], 100),
    'cat_feature2': np.random.choice(['X', 'Y'], 100),
    'target': np.random.choice([0, 1], 100)
})

# Introduce some missing values
data.loc[::10, 'num_feature1'] = np.nan
data.loc[::5, 'cat_feature1'] = np.nan

# Splitting the dataset into features and target variable
X = data.drop('target', axis=1)
y = data['target']

# Handling missing values
# Using SimpleImputer to fill missing values with the mean for numerical features and most frequent value for categorical features

numeric_features = X.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X.select_dtypes(include=['object']).columns

numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Splitting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Applying transformations
X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)

# Feature Engineering - Creating new features
# Example: Adding polynomial features

from sklearn.preprocessing import PolynomialFeatures

poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)
X_train_poly = poly.fit_transform(X_train)
X_test_poly = poly.transform(X_test)

# Converting back to DataFrame for ease of understanding
X_train_poly_df = pd.DataFrame(X_train_poly, columns=poly.get_feature_names_out())
X_test_poly_df = pd.DataFrame(X_test_poly, columns=poly.get_feature_names_out())

# Display the transformed and new features
print(X_train_poly_df.head())
print(X_test_poly_df.head())

# Now you can proceed to use X_train_poly and X_test_poly for training machine learning models


         x0        x1   x2   x3   x4   x5   x6     x0 x1     x0 x2     x0 x3  \
0  1.196369 -0.784395  0.0  1.0  0.0  0.0  1.0 -0.938426  0.000000  1.196369   
1 -0.418010 -0.923715  1.0  0.0  0.0  1.0  0.0  0.386122 -0.418010 -0.000000   
2 -1.104442 -1.074922  0.0  1.0  0.0  0.0  1.0  1.187190 -0.000000 -1.104442   
3  0.039564 -1.723908  0.0  1.0  0.0  1.0  0.0 -0.068205  0.000000  0.039564   
4 -0.545476 -0.825825  1.0  0.0  0.0  1.0  0.0  0.450468 -0.545476 -0.000000   

   ...  x2 x3  x2 x4  x2 x5  x2 x6  x3 x4  x3 x5  x3 x6  x4 x5  x4 x6  x5 x6  
0  ...    0.0    0.0    0.0    0.0    0.0    0.0    1.0    0.0    0.0    0.0  
1  ...    0.0    0.0    1.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0  
2  ...    0.0    0.0    0.0    0.0    0.0    0.0    1.0    0.0    0.0    0.0  
3  ...    0.0    0.0    0.0    0.0    0.0    1.0    0.0    0.0    0.0    0.0  
4  ...    0.0    0.0    1.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0  

[5 rows x 28 columns]
         x0        x1 