In [1]:
from datasets import load_dataset
import numpy as np

In [2]:
# Open the CIFAR-10 'train' split in streaming mode and shuffle it with a
# fixed seed (101).
train_load_options = dict(
    split='train',
    cache_dir="/scratch/doa240/.cache/huggingface/datasets",
    streaming=True,
    ignore_verifications=True,  # set to True if seeing splits Error
)
trainData = load_dataset('cifar10', **train_load_options).shuffle(seed=101)

In [3]:
# Single pass over the training stream: every image is converted with
# .convert('L') (8-bit grayscale, assuming 'img' is a PIL image) and stored
# as a NumPy array; its label is appended at the same position in y_train.
X_train, y_train = [], []

for sample in trainData:
    gray_pixels = np.array(sample['img'].convert('L'))
    X_train.append(gray_pixels)
    y_train.append(sample['label'])

In [4]:
# Stack the per-image arrays and flatten each image into a single feature
# row, yielding a 2-D (n_samples, n_pixels) matrix.
X_train = np.array(X_train).reshape(len(X_train), -1)
print(X_train.shape)

(50000, 1024)


In [5]:
# Labels as a NumPy array; entry i was appended together with image i in the
# loading loop, so it lines up with row i of X_train.
y_train = np.array(y_train)
print(np.shape(y_train))

(50000,)


In [6]:
import autosklearn.classification
import autosklearn.metrics
from pprint import pprint
import joblib
from smac.tae import StatusType 
from sklearn.metrics import accuracy_score, f1_score 

In [7]:
# Search configuration for auto-sklearn: an overall budget of six hours and
# a one-hour cap per individual run (both expressed in seconds), a fixed
# seed, and n_jobs=-1.
# NOTE(review): the run statistics recorded below report 225 of 631 runs
# exceeding the memory limit and 84 exceeding the time limit -- consider
# setting `memory_limit` explicitly for this machine; confirm against the
# auto-sklearn documentation before changing.
automl_settings = {
    'time_left_for_this_task': 6 * 3600,
    'per_run_time_limit': 3600,
    'seed': 101,
    'n_jobs': -1,
}
automl = autosklearn.classification.AutoSklearnClassifier(**automl_settings)
automl.fit(X_train, y_train)



AutoSklearnClassifier(ensemble_class=<class 'autosklearn.ensembles.ensemble_selection.EnsembleSelection'>,
                      n_jobs=-1, per_run_time_limit=3600, seed=101,
                      time_left_for_this_task=21600)

In [8]:
# sprint_statistics() returns one pre-formatted, multi-line string. Emit it
# with print(): pprint() shows the string's repr instead (quoted fragments
# with literal "\n"), which is what the previously recorded output looks like.
print(automl.sprint_statistics())

('auto-sklearn results:\n'
 '  Dataset name: 8c8d5fe5-cb41-11ed-8d2c-00110a6b78e8\n'
 '  Metric: accuracy\n'
 '  Best validation score: 0.488242\n'
 '  Number of target algorithm runs: 631\n'
 '  Number of successful target algorithm runs: 277\n'
 '  Number of crashed target algorithm runs: 45\n'
 '  Number of target algorithms that exceeded the time limit: 84\n'
 '  Number of target algorithms that exceeded the memory limit: 225\n')


In [9]:
# Plain-text table of the ensemble members (rank, ensemble weight, model
# type, cost, duration), indexed by model id.
leaderboard = automl.leaderboard()
print(leaderboard)

          rank  ensemble_weight               type      cost     duration
model_id                                                                 
411          1             0.02  gradient_boosting  0.511758  2613.969324
556          2             0.02  gradient_boosting  0.513394  3011.279015
521          3             0.02  gradient_boosting  0.515333  1580.523910
572          4             0.02  gradient_boosting  0.516303  2273.191973
562          5             0.02  gradient_boosting  0.516788  2054.284811
565          7             0.04  gradient_boosting  0.517152  2113.286489
570          6             0.02  gradient_boosting  0.517152  2434.458619
542          8             0.04  gradient_boosting  0.517212  1436.065849
559          9             0.02  gradient_boosting  0.518000  1982.797989
489         10             0.02  gradient_boosting  0.518121  2572.393734
245         11             0.02  gradient_boosting  0.518242  3109.165267
505         12             0.02  gradi

In [10]:
# show_models() returns a nested dict keyed by model id. print() dumps it as
# one unbroken line; pprint() lays out one entry per line so the individual
# pipelines are readable. Note that pprint sorts dict keys by default, so
# entries appear in model-id order -- each entry still carries its 'rank'.
pprint(automl.show_models())

{239: {'model_id': 239, 'rank': 1, 'cost': 0.5232727272727273, 'ensemble_weight': 0.04, 'data_preprocessor': <autosklearn.pipeline.components.data_preprocessing.DataPreprocessorChoice object at 0x155524b9d340>, 'balancing': Balancing(random_state=101), 'feature_preprocessor': <autosklearn.pipeline.components.feature_preprocessing.FeaturePreprocessorChoice object at 0x15552542b520>, 'classifier': <autosklearn.pipeline.components.classification.ClassifierChoice object at 0x1554d9f06b50>, 'sklearn_classifier': HistGradientBoostingClassifier(early_stopping=True,
                               l2_regularization=2.1102733168765376e-05,
                               learning_rate=0.12560435518379692, max_iter=512,
                               max_leaf_nodes=22, min_samples_leaf=9,
                               random_state=101, validation_fraction=None,
                               warm_start=True)}, 245: {'model_id': 245, 'rank': 2, 'cost': 0.5182424242424242, 'ensemble_weight': 0.02, 

In [11]:
# Persist the fitted AutoML object to disk. joblib.dump is left as the last
# expression so the cell output shows the list of file names it returns.
model_path = 'autosklearnCifar10.joblib'
joblib.dump(automl, model_path)

['autosklearnCifar10.joblib']

In [12]:
# Open the CIFAR-10 'test' split in streaming mode. Unlike the training
# split it is not shuffled.
test_load_options = dict(
    split='test',
    cache_dir="/scratch/doa240/.cache/huggingface/datasets",
    streaming=True,
    ignore_verifications=True,  # set to True if seeing splits Error
)
testData = load_dataset('cifar10', **test_load_options)

In [13]:
# Same preprocessing as for the training data: convert each test image with
# .convert('L') (8-bit grayscale, assuming 'img' is a PIL image), keep it as
# a NumPy array, and record its label at the same position in y_test.
X_test, y_test = [], []

for sample in testData:
    gray_pixels = np.array(sample['img'].convert('L'))
    X_test.append(gray_pixels)
    y_test.append(sample['label'])

In [14]:
# Stack the per-image arrays and flatten each image into a single feature
# row, yielding a 2-D (n_samples, n_pixels) matrix.
X_test = np.array(X_test).reshape(len(X_test), -1)
print(X_test.shape)

(10000, 1024)


In [15]:
# Labels as a NumPy array; entry i was appended together with image i in the
# loading loop, so it lines up with row i of X_test.
y_test = np.array(y_test)
print(np.shape(y_test))

(10000,)


In [16]:
# Evaluate: predict labels for the test matrix and compare them with the
# true labels to get the overall test accuracy.
pred = automl.predict(X_test)
test_acc = accuracy_score(y_true=y_test, y_pred=pred)
report = "Test Accuracy score {0}".format(test_acc)
print(report)

Test Accuracy score 0.5233


In [17]:
# Per-class accuracy: for each true label, the share of its test samples
# that were predicted correctly.
#   classes    -- number of correct predictions per label
#   classesCnt -- number of test samples per label
# Both are kept as plain Python lists, as before, so later cells that read
# them continue to work unchanged.
NUM_CLASSES = 10  # label ids 0..9

true_labels = np.asarray(y_test, dtype=np.intp)
is_correct = np.asarray(pred) == true_labels

# Count in one vectorised pass instead of a Python loop over every sample.
# For a correct prediction the predicted and the true label coincide, so
# binning the hits by true label matches the original bookkeeping.
classesCnt = np.bincount(true_labels, minlength=NUM_CLASSES).tolist()
classes = np.bincount(true_labels[is_correct], minlength=NUM_CLASSES).tolist()

for label in range(NUM_CLASSES):
    if classesCnt[label] == 0:
        # A label that never occurs in y_test has no defined accuracy;
        # report that instead of raising ZeroDivisionError.
        print("Label:", label, "n/a (no test samples)")
    else:
        print("Label:", label, classes[label] / classesCnt[label])

Label: 0 0.544
Label: 1 0.619
Label: 2 0.399
Label: 3 0.343
Label: 4 0.457
Label: 5 0.451
Label: 6 0.607
Label: 7 0.556
Label: 8 0.659
Label: 9 0.598


In [19]:
# Cross-check: the correct predictions summed over all labels, divided by
# the number of test samples, should reproduce the overall test accuracy
# printed by the evaluation cell.
classes = np.array(classes)
total_correct = classes.sum()
print(total_correct / len(y_test))

0.5233
