In [11]:
#hide
%load_ext autoreload
%autoreload 2
%load_ext tensorboard

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


# Tutorial: Using datasets

In [12]:
import glob
import os

from joblib import Parallel, delayed
import numpy as np
from sklearn import svm
from sklearn.dummy import DummyClassifier
from sklearn.metrics import classification_report
from sklearn.svm import SVC
from sklearn.decomposition import PCA
from torchvision import transforms as tfms
# from lightgbm import LGBMClassifier

import pytorch_lightning as pl
from einops.layers.torch import Rearrange
from image_folder_datasets.core import CNNModule, ImageFolderDataModule

data_dirs = sorted(list(glob.glob('datasets/*')))
print(len(data_dirs))

102


In [4]:
import pandas as pd
from contexttimer import Timer
from sklearn import metrics

def multiclass_report(x_train, y_train, x_val, y_val, clf=None, dataset_name=None):
    """Utility function to score classifier
    Pass in the classifier if you want to test train, test times etc.
    """
    n_classes = len(set(y_train))
    labels = sorted(list(set(y_train)))
    
    with Timer() as train_time:
        clf.fit(x_train, y_train)
        
    with Timer() as test_time:
        y_pred_proba = clf.predict_proba(x_val)
        
    y_pred = np.argmax(y_pred_proba, axis=1)
        
    results = {
        'Train time': train_time.elapsed,
        'Test time': test_time.elapsed
    }
    results['clf'] = clf.__class__.__name__
    results['dataset'] = dataset_name
    results['Weighted Fscore'] = metrics.f1_score(y_val, y_pred, average='weighted')
    results['Top-1 score'] = metrics.top_k_accuracy_score(y_val, y_pred_proba, k=1)
    results['Top-5 score'] = metrics.top_k_accuracy_score(y_val, y_pred_proba, k=5) if n_classes > 5 else None
    results['n_classes'] = n_classes
    results['n_train_samples'] = len(x_train)
    results['n_test_samples'] = len(x_val)
        
    return results

In [13]:
results = []

data_dirs = sorted(list(glob.glob('datasets/*')))

transform = tfms.Compose([
    tfms.Grayscale(),
    tfms.Resize(128, interpolation=2),
    tfms.RandomCrop(112),
    tfms.ToTensor(),
    Rearrange('h w c -> (h w c)'), 
])

for i, data_dir in enumerate(data_dirs):
    dataset_name = data_dir
    print(i, dataset_name)
    dm = ImageFolderDataModule(data_dir, 256, transform, num_workers=128)
    dm.setup()
    
    x_train, y_train = zip(*[(np.asarray(x), y) for x, y in dm.trainset])
    x_val, y_val = zip(*[(np.asarray(x), y) for x, y in dm.valset])
    
    # Do dimensionality reduction to 
    print("\tStart PCA")
    pca = PCA(n_components=0.8)
    pca.fit(x_train)
    x_train = pca.transform(x_train)
    x_val = pca.transform(x_val)

    print("\tn_components:", pca.n_components_)
    
    svm = SVC(probability=True)
    results.append(multiclass_report(x_train, y_train, x_val, y_val, clf=svm, dataset_name=dataset_name))

    dummy_clf = DummyClassifier()
    results.append(multiclass_report(x_train, y_train, x_val, y_val, clf=dummy_clf, dataset_name=dataset_name))

    print("\t", pd.DataFrame(results[2*i:2*i+2]))

0 datasets/6000-store-items-images-classified-by-color
	Start PCA
	n_components: 25
	    Train time  Test time              clf  \
0    4.876390   0.301916              SVC   
1    0.000375   0.000038  DummyClassifier   

                                                dataset  Weighted Fscore  \
0  datasets/6000-store-items-images-classified-by-color         0.312564   
1  datasets/6000-store-items-images-classified-by-color         0.022938   

   Top-1 score  Top-5 score  n_classes  n_train_samples  n_test_samples  
0     0.353365     0.807692         12             4991            1248  
1     0.112981     0.555288         12             4991            1248  
1 datasets/8-kinds-of-image-classification
	Start PCA
	n_components: 81
	    Train time  Test time              clf  \
0   66.499557   4.105875              SVC   
1    0.001159   0.000066  DummyClassifier   

                                    dataset  Weighted Fscore  Top-1 score  \
0  datasets/8-kinds-of-image-classificat

ValueError: y should be a 1d array, got an array of shape (51, 2) instead.

In [None]:
pd.DataFrame(results)