In [1]:
from datasets import load_dataset
import numpy as np

In [2]:
trainData = load_dataset(
    'Maysee/tiny-imagenet',
    split='train',
    cache_dir="/scratch/doa240/.cache/huggingface/datasets",
    streaming=True,
    ignore_verifications=True  # set to True if seeing splits Error
)
trainData = trainData.shuffle(seed=101)

Using custom data configuration Maysee--tiny-imagenet-35af7c46a941f08e


In [3]:
X_train = []
y_train = []

for data in trainData:
    img = data['image']
    img = img.convert('L')
    img = np.array(img)
    X_train.append(img)
    y_train.append(data['label'])

In [4]:
X_train = np.array(X_train)
X_train = X_train.reshape(X_train.shape[0], -1)
print(X_train.shape)

(100000, 4096)


In [5]:
y_train = np.array(y_train)
print(y_train.shape)

(100000,)


In [6]:
import autosklearn.classification
import autosklearn.metrics
from pprint import pprint
import joblib
from smac.tae import StatusType 
from sklearn.metrics import accuracy_score, f1_score 

In [9]:
automl = autosklearn.classification.AutoSklearnClassifier(
    time_left_for_this_task=(3600*12), 
    per_run_time_limit = (3600*24)/6,
    seed=101,
    n_jobs=-1,
    memory_limit= 3072*50
)
automl.fit(X_train, y_train)



AutoSklearnClassifier(ensemble_class=<class 'autosklearn.ensembles.ensemble_selection.EnsembleSelection'>,
                      memory_limit=153600, n_jobs=-1,
                      per_run_time_limit=14400.0, seed=101,
                      time_left_for_this_task=43200)

In [10]:
pprint(automl.sprint_statistics())

('auto-sklearn results:\n'
 '  Dataset name: a00f45f6-cca3-11ed-9a8b-00110a6b78e8\n'
 '  Metric: accuracy\n'
 '  Best validation score: 0.055394\n'
 '  Number of target algorithm runs: 66\n'
 '  Number of successful target algorithm runs: 30\n'
 '  Number of crashed target algorithm runs: 1\n'
 '  Number of target algorithms that exceeded the time limit: 32\n'
 '  Number of target algorithms that exceeded the memory limit: 3\n')


In [11]:
print(automl.leaderboard())

          rank  ensemble_weight         type      cost     duration
model_id                                                           
56           1             0.02  extra_trees  0.944606  2085.538455
60           2             0.02  extra_trees  0.945000  1954.992569
6            3             0.02  extra_trees  0.953061  5177.772418
55           4             0.02  extra_trees  0.953788  2492.968124
58           5             0.02          qda  0.969242  8275.014618
32           6             0.04          qda  0.975970  1644.863952
28           7             0.70     adaboost  0.976303  2037.303826
31           8             0.04          sgd  0.980303   287.119997
36           9             0.02          sgd  0.985424  6924.427040
46          10             0.06          sgd  0.986030  2219.296108
21          11             0.04          lda  0.994636   539.779286


In [12]:
print(automl.show_models())

{6: {'model_id': 6, 'rank': 1, 'cost': 0.953060606060606, 'ensemble_weight': 0.02, 'data_preprocessor': <autosklearn.pipeline.components.data_preprocessing.DataPreprocessorChoice object at 0x155499ee33d0>, 'balancing': Balancing(random_state=101, strategy='weighting'), 'feature_preprocessor': <autosklearn.pipeline.components.feature_preprocessing.FeaturePreprocessorChoice object at 0x15541cbcb3d0>, 'classifier': <autosklearn.pipeline.components.classification.ClassifierChoice object at 0x1554791cd550>, 'sklearn_classifier': ExtraTreesClassifier(max_features=9, min_samples_split=6, n_estimators=512,
                     n_jobs=1, random_state=101, warm_start=True)}, 21: {'model_id': 21, 'rank': 2, 'cost': 0.9946363636363637, 'ensemble_weight': 0.04, 'data_preprocessor': <autosklearn.pipeline.components.data_preprocessing.DataPreprocessorChoice object at 0x15541cb13ee0>, 'balancing': Balancing(random_state=101), 'feature_preprocessor': <autosklearn.pipeline.components.feature_preprocessi

In [13]:
joblib.dump(automl, 'imagenet24.joblib')

['imagenet24.joblib']

In [14]:
testData = load_dataset(
    'Maysee/tiny-imagenet',
    split='valid',
    cache_dir="/scratch/doa240/.cache/huggingface/datasets",
    streaming=True,
    ignore_verifications=True  # set to True if seeing splits Error
)



In [15]:
X_test = []
y_test = []

for data in testData:
    img = data['image']
    img = img.convert('L')
    img = np.array(img)
    X_test.append(img)
    y_test.append(data['label'])

In [16]:
X_test = np.array(X_test)
X_test = X_test.reshape(X_test.shape[0], -1)
print(X_test.shape)

(10000, 4096)


In [17]:
y_test = np.array(y_test)
print(y_test.shape)

(10000,)


In [None]:
# evaluate
pred = automl.predict(X_test)
test_acc = accuracy_score(y_test, pred)
print("Test Accuracy score {0}".format(test_acc))

Test Accuracy score 0.0652


In [None]:
classes = [0] * 1000
classesCnt = [0] * 1000

for i in range(len(pred)):
    if pred[i] == y_test[i]:
        classes[pred[i]] += 1
    classesCnt[y_test[i]] += 1
    
for i in range(1000):
    print("Label:", i, classes[i]/classesCnt[i])

In [None]:
classes = np.array(classes)
print(np.sum(classes)/len(y_test))