In [None]:
# Workaround for using auto-sklearn in Colab: install a fork with Python 3.10 compatibility
!pip install git+https://github.com/Frankothe196/auto-sklearn.git@python3.10-added-compatibility;

In [None]:
import sklearn.model_selection
from sklearn.datasets import fetch_openml
import sklearn.metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder

# Baseline experiment: a random forest trained on one-hot-encoded features
# of OpenML dataset 40691.
X, y = fetch_openml(data_id=40691, return_X_y=True, as_frame=True)

# The encoder is fitted on the full feature table (before the split);
# handle_unknown='ignore' keeps transform from raising on unseen categories.
enc = OneHotEncoder(handle_unknown='ignore')
X = enc.fit(X).transform(X)

X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
    X,
    y,
    random_state=42,
)

# Fixed seed so the forest itself is reproducible between runs.
clf = RandomForestClassifier(random_state=41).fit(X_train, y_train)
y_hat = clf.predict(X_test)
print(
    "RF Accuracy",
    sklearn.metrics.accuracy_score(y_true=y_test, y_pred=y_hat),
)

# Redo using the raw data instead of one-hot encoding. One-hot encoding is meant
# for categorical data, but the 'observations' that serve as predictors of the
# wine 'quality' are continuous real values, so one-hot encoding them is not
# the right way to use them.
from autosklearn.classification import AutoSklearnClassifier
# Second experiment: reload the raw (un-encoded) features and let auto-sklearn
# search for a model, using the same split seed as the baseline.
X, y = fetch_openml(data_id=40691, as_frame=True, return_X_y=True)
X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
    X, y, random_state=42
)

# Training accuracy was good but test accuracy was poor, so to reduce
# overfitting the resampling strategy is switched to plain 10-fold
# cross-validation. A larger budget was also tried: slightly better, but not
# by much.
automl = AutoSklearnClassifier(
    time_left_for_this_task=300,  # overall search budget, in seconds
    resampling_strategy='cv',
    resampling_strategy_arguments={"folds": 10},
)
automl.fit(X_train, y_train)
y_hat = automl.predict(X_test)
print("AutoML Accuracy On Test", sklearn.metrics.accuracy_score(y_test, y_hat))

print(automl.leaderboard())
# sprint_statistics() only returns the summary string; in the middle of a
# cell that value is discarded unless it is printed explicitly.
print(automl.sprint_statistics())

# NOTE(review): the test predictions above are made before refit(), while the
# training predictions below are made after it, so the two scores come from
# differently fitted models -- confirm this ordering is intended.
automl.refit(X_train, y_train)
y_h = automl.predict(X_train)
print("AutoML Accuracy on Training", sklearn.metrics.accuracy_score(y_train, y_h))
# This works, giving ~0.6725 accuracy as opposed to the previous result. With the
# budget extended to 500 (time_left_for_this_task is a time limit in seconds,
# not a number of iterations), the result was ~0.675.