In [7]:
import sklearn.model_selection
from sklearn.datasets import fetch_openml
import sklearn.metrics
from sklearn.metrics import accuracy_score
from autosklearn.metrics import accuracy
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder

X, y = fetch_openml(data_id=40691, as_frame=True, return_X_y=True)

X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(X, y, random_state=42)

clf = RandomForestClassifier(random_state=42)
clf = clf.fit(X_train, y_train)
y_hat = clf.predict(X_test)
print("RF Accuracy", sklearn.metrics.accuracy_score(y_test, y_hat))

from autosklearn.classification import AutoSklearnClassifier


automl = AutoSklearnClassifier(
    time_left_for_this_task=300, 
    n_jobs=-1, 
    resampling_strategy="cv",
    resampling_strategy_arguments={"folds":5},
    metric=accuracy
)

# Train auto-sklearn
automl.fit(X_train, y_train)

# Make predictions
y_hat = automl.predict(X_test)

# Evaluate accuracy
print("AutoML Accuracy:", accuracy_score(y_test, y_hat))

RF Accuracy 0.67


  "Fitting transformer with a pandas series which"


AutoML Accuracy: 0.69


# This is to check if there need to be done preprocessing to increase accuracy

In [8]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.datasets import fetch_openml
from sklearn.metrics import accuracy_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from autosklearn.classification import AutoSklearnClassifier
from autosklearn.metrics import accuracy

# Split the dataset into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
clf = RandomForestClassifier(random_state=42)
clf = clf.fit(X_train, y_train)
y_hat = clf.predict(X_test)
print("RF Accuracy", sklearn.metrics.accuracy_score(y_test, y_hat))

# Data Preprocessing
imputer = SimpleImputer(strategy="mean")
X_train_imputed = imputer.fit_transform(X_train)
X_test_imputed = imputer.transform(X_test)


categorical_features = X_train.select_dtypes(include=["object"]).columns.tolist()
onehot_encoder = OneHotEncoder(sparse=False, handle_unknown="ignore")
X_train_encoded = onehot_encoder.fit_transform(X_train[categorical_features])
X_test_encoded = onehot_encoder.transform(X_test[categorical_features])


numerical_features = X_train.select_dtypes(include=np.number).columns.tolist()
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train[numerical_features])
X_test_scaled = scaler.transform(X_test[numerical_features])

X_train_processed = np.hstack((X_train_encoded, X_train_scaled))
X_test_processed = np.hstack((X_test_encoded, X_test_scaled))


automl = AutoSklearnClassifier(
    time_left_for_this_task=300,
    n_jobs=-1,  
    ensemble_size=10,  
    initial_configurations_via_metalearning=0,
    resampling_strategy="cv",
    resampling_strategy_arguments={"folds":5},
    metric=accuracy
)

# Train auto-sklearn
automl.fit(X_train_processed, y_train)

# Make predictions
y_hat = automl.predict(X_test_processed)

# Evaluate accuracy
print("AutoML Accuracy:", accuracy_score(y_test, y_hat))


RF Accuracy 0.67


  "Fitting transformer with a pandas series which"


AutoML Accuracy: 0.6625


In [None]:
# Data preprocessing didn't increase the performance of the model.