In [1]:
from datasets import load_dataset
import numpy as np
from skimage.feature import hog

In [2]:
# Open the 'train' split of Maysee/tiny-imagenet in streaming mode and shuffle
# it with a fixed seed so that repeated runs iterate in the same order.
trainData = load_dataset(
    'Maysee/tiny-imagenet',
    split='train',
    streaming=True,
    cache_dir="/scratch/doa240/.cache/huggingface/datasets",
    # Verification is switched off here; the original note associates this
    # flag with working around a "splits Error".
    ignore_verifications=True,
).shuffle(seed=101)

Using custom data configuration Maysee--tiny-imagenet-35af7c46a941f08e


In [5]:
# Build the HOG descriptors (X_train) and labels (y_train) from the streamed
# training split, one sample at a time.
X_train = []
y_train = []

# enumerate() supplies the 1-based sample counter. Previously `i` was set to 1
# and never incremented, so the progress message below could never be printed.
for i, data in enumerate(trainData, start=1):
    img = data['image']
    # hog() is called with channel_axis=-1, so every image needs a channel
    # axis; anything that is not already RGB is converted first.
    if img.mode != 'RGB':
        img = img.convert("RGB")

    img = np.array(img)

    fd = hog(
        img,
        orientations=9,
        pixels_per_cell=(8, 8),
        cells_per_block=(2, 2),
        visualize=False,
        channel_axis=-1,
    )

    # Stop at the first descriptor whose shape differs from the previous one
    # (presumably so the later np.array(X_train) stays rectangular). The stop
    # is reported so a truncated training set does not go unnoticed.
    if X_train and X_train[-1].shape != fd.shape:
        print(
            "Stopping at sample {0}: HOG descriptor shape {1} differs from {2}; "
            "keeping {3} samples".format(i, fd.shape, X_train[-1].shape, len(X_train))
        )
        break

    X_train.append(fd)
    y_train.append(data['label'])

    # Progress indicator: one line every 5000 processed samples.
    if i % 5000 == 0:
        print(i)

In [6]:
# Stack the per-image descriptors into one array, then flatten everything
# after the sample axis so each row is a single feature vector. The shape is
# printed before and after the reshape for comparison.
x_train = np.array(X_train)
print(np.shape(x_train))
x_train = x_train.reshape(len(x_train), -1)
print(np.shape(x_train))

(100000, 1764)
(100000, 1764)


In [7]:
# Turn the collected training labels into a NumPy array and show its shape.
y_train = np.array(y_train)
print(np.shape(y_train))

(100000,)


In [8]:
import autosklearn.classification
import autosklearn.metrics
from pprint import pprint
import joblib
from smac.tae import StatusType 
from sklearn.metrics import accuracy_score, f1_score 

In [9]:
# Configure and run the auto-sklearn search on the HOG features.
automl = autosklearn.classification.AutoSklearnClassifier(
    # Overall search budget: 24 hours, in seconds.
    time_left_for_this_task=3600 * 24,
    # Budget for a single candidate run: one sixth of the overall budget.
    # Integer division keeps this an int (14400), which is the type the
    # auto-sklearn API documents; the previous `/` produced the float 14400.0.
    per_run_time_limit=(3600 * 24) // 6,
    seed=101,
    n_jobs=-1,
    # 70 x 3072; auto-sklearn documents memory_limit in MB -- confirm this
    # matches the memory actually available on the machine.
    memory_limit=3072 * 70,
)
# fit() is left as the last expression so the fitted estimator is displayed.
automl.fit(x_train, y_train)



AutoSklearnClassifier(ensemble_class=<class 'autosklearn.ensembles.ensemble_selection.EnsembleSelection'>,
                      memory_limit=215040, n_jobs=-1,
                      per_run_time_limit=14400.0, seed=101,
                      time_left_for_this_task=86400)

In [10]:
# Pretty-print the textual run statistics reported by the fitted estimator.
search_summary = automl.sprint_statistics()
pprint(search_summary)

('auto-sklearn results:\n'
 '  Dataset name: a1223143-e6b7-11ed-ad79-00110a6b78e8\n'
 '  Metric: accuracy\n'
 '  Best validation score: 0.112152\n'
 '  Number of target algorithm runs: 262\n'
 '  Number of successful target algorithm runs: 193\n'
 '  Number of crashed target algorithm runs: 4\n'
 '  Number of target algorithms that exceeded the time limit: 56\n'
 '  Number of target algorithms that exceeded the memory limit: 9\n')


In [11]:
# Print the leaderboard returned by the fitted estimator.
leaderboard = automl.leaderboard()
print(leaderboard)

          rank  ensemble_weight         type      cost      duration
model_id                                                            
61           1             0.02   libsvm_svc  0.887848  12856.559347
204          2             0.02   libsvm_svc  0.888121  10053.184100
192          3             0.02          lda  0.892848    641.001947
211          4             0.04          lda  0.893182    699.185577
201          5             0.02          lda  0.895879    642.002196
200          6             0.04          lda  0.897788    256.905924
189          7             0.22  extra_trees  0.897970   1072.704594
196          8             0.02          lda  0.898091    633.940253
183          9             0.02          lda  0.898394    270.526815
245         10             0.04          lda  0.898939    274.251649
186         11             0.02  gaussian_nb  0.898970    669.677803
214         12             0.02          lda  0.899182    655.275259
177         13             0.02   

In [12]:
# Print the per-model description returned by show_models().
ensemble_members = automl.show_models()
print(ensemble_members)

{61: {'model_id': 61, 'rank': 1, 'cost': 0.8878484848484849, 'ensemble_weight': 0.02, 'data_preprocessor': <autosklearn.pipeline.components.data_preprocessing.DataPreprocessorChoice object at 0x15544b3bf610>, 'balancing': Balancing(random_state=101), 'feature_preprocessor': <autosklearn.pipeline.components.feature_preprocessing.FeaturePreprocessorChoice object at 0x1554ae338460>, 'classifier': <autosklearn.pipeline.components.classification.ClassifierChoice object at 0x1554ae338850>, 'sklearn_classifier': SVC(C=968.2005893640705, cache_size=139142.08072916666,
    gamma=0.00522738710460193, max_iter=-1.0, random_state=101, shrinking=False,
    tol=0.0023639874792425806)}, 89: {'model_id': 89, 'rank': 2, 'cost': 0.9074848484848484, 'ensemble_weight': 0.34, 'data_preprocessor': <autosklearn.pipeline.components.data_preprocessing.DataPreprocessorChoice object at 0x1554ae762d90>, 'balancing': Balancing(random_state=101, strategy='weighting'), 'feature_preprocessor': <autosklearn.pipeline.c

In [13]:
# Persist the fitted estimator to disk; the call stays the cell's last
# expression so the list of written file names is displayed.
joblib.dump(value=automl, filename="imagenetHOG.joblib")

['imagenetHOG.joblib']

In [14]:
# Open the 'valid' split of Maysee/tiny-imagenet in streaming mode; it is used
# below as the held-out evaluation set. Unlike the training stream, it is not
# shuffled.
testData = load_dataset(
    'Maysee/tiny-imagenet',
    split='valid',
    streaming=True,
    cache_dir="/scratch/doa240/.cache/huggingface/datasets",
    # Verification is switched off here; the original note associates this
    # flag with working around a "splits Error".
    ignore_verifications=True,
)



In [15]:
# Build the HOG descriptors (X_test) and labels (y_test) from the streamed
# evaluation split, using the same HOG parameters as the training features.
X_test = []
y_test = []

# enumerate() supplies the 1-based sample counter. Previously `i` was set to 1
# and never incremented, so the progress message below could never be printed.
for i, data in enumerate(testData, start=1):
    img = data['image']
    # hog() is called with channel_axis=-1, so every image needs a channel
    # axis; anything that is not already RGB is converted first.
    if img.mode != 'RGB':
        img = img.convert("RGB")

    img = np.array(img)

    fd = hog(
        img,
        orientations=9,
        pixels_per_cell=(8, 8),
        cells_per_block=(2, 2),
        visualize=False,
        channel_axis=-1,
    )

    # Stop at the first descriptor whose shape differs from the previous one
    # (presumably so the later np.array(X_test) stays rectangular). The stop
    # is reported so a truncated evaluation set does not go unnoticed.
    if X_test and X_test[-1].shape != fd.shape:
        print(
            "Stopping at sample {0}: HOG descriptor shape {1} differs from {2}; "
            "keeping {3} samples".format(i, fd.shape, X_test[-1].shape, len(X_test))
        )
        break

    X_test.append(fd)
    y_test.append(data['label'])

    # Progress indicator: one line every 5000 processed samples.
    if i % 5000 == 0:
        print(i)

In [16]:
# Stack the per-image descriptors into one array, then flatten everything
# after the sample axis so each row is a single feature vector. The shape is
# printed before and after the reshape for comparison.
x_test = np.array(X_test)
print(np.shape(x_test))
x_test = x_test.reshape(len(x_test), -1)
print(np.shape(x_test))

(10000, 1764)
(10000, 1764)


In [17]:
# Turn the collected evaluation labels into a NumPy array and show its shape.
y_test = np.array(y_test)
print(np.shape(y_test))

(10000,)


In [18]:
# Score the fitted estimator on the held-out features: predict labels for
# x_test, compare them with y_test, and report the resulting accuracy.
pred = automl.predict(x_test)
test_acc = accuracy_score(y_true=y_test, y_pred=pred)
print("Test Accuracy score {0}".format(test_acc))

Test Accuracy score 0.1271
