In [1]:
# Work around for using Autosklearn in Colab
!pip install git+https://github.com/Frankothe196/auto-sklearn.git@python3.10-added-compatibility;

Collecting git+https://github.com/Frankothe196/auto-sklearn.git@python3.10-added-compatibility
  Cloning https://github.com/Frankothe196/auto-sklearn.git (to revision python3.10-added-compatibility) to /tmp/pip-req-build-l8n5gjmp
  Running command git clone --filter=blob:none --quiet https://github.com/Frankothe196/auto-sklearn.git /tmp/pip-req-build-l8n5gjmp
  Running command git checkout -b python3.10-added-compatibility --track origin/python3.10-added-compatibility
  Switched to a new branch 'python3.10-added-compatibility'
  Branch 'python3.10-added-compatibility' set up to track remote branch 'python3.10-added-compatibility' from 'origin'.
  Resolved https://github.com/Frankothe196/auto-sklearn.git to commit 80a575760e99945fa31970b1479edeb759bc645a
  Running command git submodule update --init --recursive -q
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Coll

In [1]:
import sklearn.model_selection
from sklearn.datasets import fetch_openml
import sklearn.metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder
from autosklearn.classification import AutoSklearnClassifier

X, y = fetch_openml(data_id=40691, as_frame=True, return_X_y=True)
enc = OneHotEncoder(handle_unknown='ignore')
X = enc.fit_transform(X)

X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(X, y, random_state=42)

clf = RandomForestClassifier(random_state=41)
clf = clf.fit(X_train, y_train)
y_hat = clf.predict(X_test)
print("RF Accuracy", sklearn.metrics.accuracy_score(y_test, y_hat))

# Original AUTO-SKLEARN run w/out onehot encoding:
automl = AutoSklearnClassifier(time_left_for_this_task=300)
automl.fit(X_train, y_train)
y_hat = automl.predict(X_test)
print("AutoML Original w/1Hot Encoding Accuracy On Test", sklearn.metrics.accuracy_score(y_test, y_hat))
y_h = automl.predict(X_train)
print("AutoML Original w/1Hot Encoding Accuracy on Training", sklearn.metrics.accuracy_score(y_train, y_h))

# Re-get Data
X, y = fetch_openml(data_id=40691, as_frame=True, return_X_y=True)
X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(X, y, random_state=42)

# AUTO-SKLEARN with 10 folds cv
automl = AutoSklearnClassifier(time_left_for_this_task=300,resampling_strategy='cv',resampling_strategy_arguments={"folds": 10})
automl.fit(X_train, y_train)
y_hat = automl.predict(X_test)
print("AutoML Accuracy On Test", sklearn.metrics.accuracy_score(y_test, y_hat))
# automl.refit(X_train,y_train)
y_h = automl.predict(X_train)
print("AutoML Accuracy on Training", sklearn.metrics.accuracy_score(y_train, y_h))
# Leaderboard training
# print(automl.leaderboard())
# automl.sprint_statistics()

# Original AUTO-SKLEARN run w/out onehot encoding:
automl = AutoSklearnClassifier(time_left_for_this_task=300)
automl.fit(X_train, y_train)
y_hat = automl.predict(X_test)
print("AutoML Original w/out 1Hot Encoding Accuracy On Test", sklearn.metrics.accuracy_score(y_test, y_hat))
y_h = automl.predict(X_train)
print("AutoML Original w/out 1Hot Encoding Accuracy on Training", sklearn.metrics.accuracy_score(y_train, y_h))


# Running the new cv-fold model with extra training time - a slight increase;
# but takes time that may not be warrented in some cases
automl3 = AutoSklearnClassifier(time_left_for_this_task=500,resampling_strategy='cv',resampling_strategy_arguments={"folds": 10})
automl3.fit(X_train, y_train)
y_hat3 = automl3.predict(X_test)
print("AutoML CV+Time Accuracy On Test", sklearn.metrics.accuracy_score(y_test, y_hat3))
y_h3 = automl3.predict(X_train)
print("AutoML CV+Time Accuracy on Training", sklearn.metrics.accuracy_score(y_train, y_h3))

# Redo using pure data, instead of one-hot encoding -- One-Hot encoding is meant
# for use on categorical data. However, the values presented as 'observations'
# meant to serve as predictors for the 'quality' of the wine are continous real
# values. As such, preforming one-hot encoding is not the way to use it.
# After checking training and testing data, it was found that while the
# training data accuracy was good, the test accuracy was bad:
#AutoML Original w/1Hot Encoding Accuracy On Test 0.6
#AutoML Original w/1Hot Encoding Accuracy on Training 0.896580483736447

# So to avoid
# overfitting, trying out a different resampling strategy! As such, switched
# testing cv strategy with normal 10 folds.
# As can be seen; while removing one hot encoding helped, the biggest change
# was from changing the resampling approach to 10-fold cv. In addition, increasing
# the time budget helped.

# All together, in order to make this run better, one-hot encoding was removed
# and 10-fold cv resampling was used. This saw values of:
# AutoML Accuracy On Test 0.6725
# AutoML Accuracy on Training 1.0

# In addition; running this with an increased time; so to give it more time to
# explore the search space, saw another slight increase:
#AutoML CV+Time Accuracy On Test 0.675
#AutoML CV+Time Accuracy on Training 1.0

  warn(


RF Accuracy 0.64


Fitting to the training data: 100%|[32m██████████[0m| 300/300 [04:50<00:00,  1.03it/s, The total time budget for this task is 0:05:00]


AutoML Original w/1Hot Encoding Accuracy On Test 0.6
AutoML Original w/1Hot Encoding Accuracy on Training 0.896580483736447


  warn(
Fitting to the training data: 100%|[32m██████████[0m| 300/300 [04:55<00:00,  1.02it/s, The total time budget for this task is 0:05:00]


AutoML Accuracy On Test 0.6725
AutoML Accuracy on Training 1.0


Fitting to the training data: 100%|[32m██████████[0m| 300/300 [04:50<00:00,  1.03it/s, The total time budget for this task is 0:05:00]


AutoML Original w/out 1Hot Encoding Accuracy On Test 0.65
AutoML Original w/out 1Hot Encoding Accuracy on Training 0.8999165971643036


Fitting to the training data: 100%|[32m██████████[0m| 500/500 [08:20<00:00,  1.00s/it, The total time budget for this task is 0:08:20]


AutoML CV+Time Accuracy On Test 0.675
AutoML CV+Time Accuracy on Training 1.0
