In [9]:
import pandas as pd # type: ignore
from sklearn.model_selection import train_test_split # type: ignore
from sklearn.impute import SimpleImputer # type: ignore
from sklearn.preprocessing import StandardScaler, OneHotEncoder # type: ignore
from sklearn.compose import ColumnTransformer # type: ignore
from sklearn.pipeline import Pipeline # type: ignore

In [10]:
# Load sample dataset
data = {
    'age': [25, 30, 35, None, 40],
    'income': [50000, 60000, None, 80000, 100000],
    'gender': ['male', 'female', None, 'female', 'male'],
    'purchased': [0, 1, 0, 1, 1]
}

In [11]:
df = pd.DataFrame(data)

In [12]:
# Separate features and target
X = df.drop('purchased', axis=1)
y = df['purchased']

In [13]:
# Define numerical and categorical columns
numerical_features = ['age', 'income']
categorical_features = ['gender']

In [14]:
# Preprocessing for numerical data: Impute missing values and scale
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

# Preprocessing for categorical data: Impute missing values and one-hot encode
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

In [15]:

# Combine preprocessors in a column transformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ]
)

In [16]:
# Apply the preprocessing steps to the dataset
X_preprocessed = preprocessor.fit_transform(X)

# Split the preprocessed data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_preprocessed, y, test_size=0.2, random_state=42)

print("Preprocessed training features:")
print(X_train)

Preprocessed training features:
[[ 1.5         1.6011119   0.          1.          0.        ]
 [ 0.5         0.          0.          0.          1.        ]
 [-1.5        -1.31000065  0.          1.          0.        ]
 [ 0.          0.43666688  1.          0.          0.        ]]
