In [1]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.datasets import fetch_openml

In [3]:
data = fetch_openml(name="adult", version=1, as_frame=True)
X, y = data.data, data.target
# Separate features into numerical and categorical columns
numerical_cols = X.select_dtypes(include=["int64", "float64"]).columns
categorical_cols = X.select_dtypes(include=["object"]).columns

In [4]:
# Preprocessor to handle numerical and categorical data
preprocessor = ColumnTransformer(
    transformers=[
        ("num", SimpleImputer(strategy="median"), numerical_cols),  # Handle missing values in numerical columns
        ("cat", Pipeline([
            ("imputer", SimpleImputer(strategy="most_frequent")),   # Handle missing values in categorical columns
            ("encoder", OneHotEncoder(handle_unknown="ignore"))    # One-hot encode categorical columns
        ]), categorical_cols)
    ]
)

In [6]:
# Fit and transform the data using ColumnTransformer
X_transformed = preprocessor.fit_transform(X)

# Checking the shape of transformed data
print("Transformed data shape:", X_transformed.shape)

Transformed data shape: (48842, 2)


In [7]:
# Define a pipeline with the preprocessor and a classifier
model_pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("classifier", RandomForestClassifier())
    ]
)

# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the model
model_pipeline.fit(X_train, y_train)

# Evaluate the model
print("Model accuracy on test data:", model_pipeline.score(X_test, y_test))


Model accuracy on test data: 0.7048827925069097
