## Testing classification between two classes from Caltech-256

In [None]:
#@title Download Dataset
%%capture
!wget https://data.caltech.edu/records/nyy15-4j048/files/256_ObjectCategories.tar
!tar -xvf /content/256_ObjectCategories.tar

In [None]:
#@title Install OpenCLIP
%%capture
!pip install open_clip_torch

In [None]:
#@title Imports
import torch
import torchvision
import os
import open_clip
from torch.utils.data import DataLoader
import numpy as np
from sklearn.linear_model import LogisticRegression
from tqdm import tqdm

In [None]:
#@title Delete Unwanted Folders and Make Only Two Classes
# WARNING: destructive. Everything inside `directory` that is not listed in
# `dir_list` is deleted, so the archive has to be extracted again to get the
# other classes back.
import shutil

directory = '/content/256_ObjectCategories' # Directory that holds one sub-folder per class
dir_list  =     [ '027.calculator' ,'024.butterfly'] # List of all the wanted folders
for folder in os.listdir(directory):
    if folder in dir_list:
        continue
    f = os.path.join(directory, folder)
    # Delete through the standard library instead of interpolating the path
    # into a shell command (safe for names with spaces or shell characters).
    if os.path.isdir(f) and not os.path.islink(f):
        shutil.rmtree(f)
    else:
        # Stray files or symlinks: rmtree would refuse them.
        os.remove(f)


In [None]:
#@title Clip Model
# Pick the device first so the model can be placed on it.
device = "cuda" if torch.cuda.is_available() else "cpu"
# Load the pretrained ViT-B/32 CLIP weights plus the matching image preprocessing.
model, _, preprocess = open_clip.create_model_and_transforms('ViT-B-32',pretrained='openai')
# get_features sends the image batches to `device`, so the weights have to
# live there too; otherwise a CUDA runtime fails with a device mismatch.
# eval() puts the model in inference mode, since it is only used for
# feature extraction here.
model = model.to(device).eval()

100%|███████████████████████████████████████| 354M/354M [00:17<00:00, 20.1MiB/s]


In [None]:
#@title get features function from OpenAI CLIP Github
# https://github.com/openai/CLIP#linear-probe-evaluation
def get_features(dataset, batch_size=20):
    """Encode every image of ``dataset`` with the global CLIP ``model``.

    Args:
        dataset: A dataset yielding ``(image_tensor, label)`` pairs that
            ``DataLoader`` can batch.
        batch_size: Number of images encoded per forward pass.

    Returns:
        A tuple ``(features, labels)`` of NumPy arrays, both in dataset order;
        row ``i`` of ``features`` belongs to ``labels[i]``.

    Relies on the notebook-level globals ``model`` and ``device``.
    """
    all_features = []
    all_labels = []

    # Inference only: no gradients are needed for feature extraction.
    with torch.no_grad():
        # No shuffling: the order is irrelevant for extraction, and a fixed
        # order makes repeated calls return identical arrays.
        for images, labels in DataLoader(dataset, batch_size=batch_size, shuffle=False):
            features = model.encode_image(images.to(device))

            # Move each batch to the CPU right away so encoded features do
            # not pile up in accelerator memory.
            all_features.append(features.cpu())
            all_labels.append(labels.cpu())

    return torch.cat(all_features).numpy(), torch.cat(all_labels).numpy()

In [None]:
#@title Making Dataset Out Of Images' folder
import random
transform = preprocess
# One class per sub-folder of `directory`; every image goes through the CLIP preprocessing.
dataset = torchvision.datasets.ImageFolder(directory, transform)
n = len(dataset)  # total number of examples
n_test = int(0.1 * n)  # take ~10% for test

# Fixed seed so the train/test split is the same on every run.
SPLIT_SEED = 0
split_rng = random.Random(SPLIT_SEED)

# Draw n_test distinct indices from 0..n-1. random.sample never yields n
# itself, whereas randint(0, n) is inclusive of n and can therefore produce
# an index past the end of the dataset.
test_list = split_rng.sample(range(n), n_test)

# Set membership keeps building the complement linear in n.
test_index_set = set(test_list)
train_list = [num for num in range(n) if num not in test_index_set]

# The two index lists must partition the dataset.
assert len(train_list) + len(test_list) == n

test_set = torch.utils.data.Subset(dataset, test_list)  # take 10%
train_set = torch.utils.data.Subset(dataset, train_list)

In [None]:
# Display the class names ImageFolder discovered (presumably listed in
# label-index order, i.e. position 0 is label 0 — confirm against torchvision docs).
dataset.classes

['024.butterfly', '027.calculator']

In [None]:
#@title  Calculating images' features for train/test sets.
# Run each split through get_features, which returns a (features, labels)
# pair of NumPy arrays for that split.
train_features, train_labels = get_features(train_set)
test_features, test_labels = get_features(test_set)

In [None]:
# Display the labels returned for the test split.
test_labels

array([1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0])

In [None]:
# Display the labels returned for the training split.
train_labels

array([0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1,
       0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0,
       0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0,
       1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1,
       0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0,
       0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0,
       1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1])

In [None]:
%%capture
import time
#@title Installing AutoSkLearn
# Install auto-sklearn into the runtime; the %%capture on the first line
# captures the installer output instead of printing it.
!pip install auto-sklearn
# Pause for five seconds after the install (the notebook does not state why).
time.sleep(5)

In [None]:
#@title Importing Autosklearn
# Import the package by name only (no wildcard import), so nothing in the
# notebook namespace is silently added or overwritten.
import autosklearn
# Print the installed version for reproducibility.
print('autosklearn: %s' % autosklearn.__version__)

autosklearn: 0.15.0


In [None]:
#@title Training Classification Model
# Auto-sklearn model search on the CLIP features, scored on the held-out test split.
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from autosklearn.classification import AutoSklearnClassifier

## For only two classes a 10-minute budget is probably too long; 5 minutes would likely do.
automl_settings = dict(
    time_left_for_this_task=10*60,  # total search budget, in seconds
    per_run_time_limit=60,          # budget for a single candidate run, in seconds
    n_jobs=8,
)
classifier_model = AutoSklearnClassifier(**automl_settings)

# Run the search on the training features.
classifier_model.fit(train_features, train_labels)

# Summary of the search (number of runs, crashes, best validation score, ...).
search_summary = classifier_model.sprint_statistics()
print(search_summary)

# Score the fitted model on the test split.
y_hat = classifier_model.predict(test_features)
acc = accuracy_score(y_true=test_labels, y_pred=y_hat)
print("Accuracy: %.3f" % acc)

auto-sklearn results:
  Dataset name: d7c518cc-5607-11ed-8049-0242ac1c0002
  Metric: accuracy
  Best validation score: 1.000000
  Number of target algorithm runs: 104
  Number of successful target algorithm runs: 96
  Number of crashed target algorithm runs: 4
  Number of target algorithms that exceeded the time limit: 1
  Number of target algorithms that exceeded the memory limit: 3

Accuracy: 1.000


# Using LogisticRegression

In [None]:
# Calculate the image features
train_features, train_labels = get_features(train_set)
test_features, test_labels = get_features(test_set)

# Perform logistic regression
## max_iter reduced to 100
classifier = LogisticRegression(random_state=0, max_iter=100, verbose=1)
classifier.fit(train_features, train_labels)

# Evaluate using the logistic regression classifier
predictions = classifier.predict(test_features)
# Use the builtin float: the np.float alias was deprecated in NumPy 1.20 and
# removed in 1.24, where it raises AttributeError.
accuracy = np.mean((test_labels == predictions).astype(float)) * 100.
print(f"Accuracy = {accuracy:.3f}")

Accuracy = 100.000


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  if sys.path[0] == '':


In [None]:
# Display the logistic-regression predictions for the test split.
predictions

array([0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1])

In [None]:
# Display the ground-truth test labels, for comparison with the predictions shown above.
test_labels

array([0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1])