# Baseline Methods for OOD Digit Classification

- Decision Tree
- Random Forest
- Regular MLP
- AdaBoost
- SVM

## Data Preprocessing

Think about the following

- Should data have 3 channels or grayscale (1 channel)
- Should we use a scaler to center mean and scale to unit variance

In [2]:
import sys
import pickle
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import svm
from typing import Union, List
import matplotlib.pyplot as plt
from collections import OrderedDict
from torch.utils.data import DataLoader

from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import label_binarize
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_curve, auc, accuracy_score, precision_score, recall_score, mean_squared_error, classification_report, confusion_matrix, precision_recall_curve, PrecisionRecallDisplay, RocCurveDisplay
from joblib import dump, load

import os
sys.path.insert(0, os.path.abspath(os.path.join(os.getcwd(), '..')))

from DGDataset import DGDataset

In [3]:
sns.set_style('darkgrid')

## Helper Functions

In [4]:
datasets = ['mnist', 'mnist_m', 'svhn', 'syn']
target_domain = 'mnist'
scaler = StandardScaler()

In [5]:
def get_performance_metrics(predictions: np.ndarray, labels: np.ndarray):
    accuracy = accuracy_score(labels, predictions)
    precision = precision_score(labels, predictions, average='weighted')
    recall = recall_score(labels, predictions, average='weighted')
    mse = mean_squared_error(labels, predictions)
    cm = confusion_matrix(labels, predictions)
    classification_rpt = classification_report(labels, predictions, output_dict=True)
    return {
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "mse": mse,
        "cm": cm,
        "classification_rpt": classification_rpt,
        "classification_rpt_df": pd.DataFrame(classification_rpt).transpose()
    }
# predictions = svm_grid.predict(test_data)
# performance = get_performance_metrics(predictions, test_labels)


In [6]:
def load_dataset(datasets: List[str], target_domain: str, mode: str='train'):
    # datasets_ = datasets.copy()
    # datasets_.remove(target_domain)
    dataset = DGDataset(datasets, mode=mode)
    dataloader = DataLoader(dataset, batch_size=100)
    data, labels, domains = [], [], []
    for d, label, domain in dataloader:
        data.extend(d.numpy())
        labels.extend(label.numpy())
        domains.extend(domain.numpy())
    data = np.array(data)
    if len(data.shape) == 4:
        # has a color channel dimension
        data = data.reshape(len(data), np.prod(data.shape[1:])) # flatten each image to a vector
    return data, labels, domains

In [7]:
train_datasets = datasets.copy()
train_datasets.remove(target_domain)
train_data, train_labels, train_domains = load_dataset(train_datasets, target_domain, mode='train')
val_data, val_labels, val_domains = load_dataset(train_datasets, target_domain, mode='val')
test_data, test_labels, test_domains = load_dataset([target_domain], target_domain, mode='test')

## SVM

In [7]:
parameters = {
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'C': (1, 10),
    'gamma': ('scale', 'auto'),
    'decision_function_shape': ('ovo', 'ovr')
}
svm_grid = GridSearchCV(svm.SVC(), parameters).fit(train_data, train_labels)
print("Best SVM Parameters")
for k, v in svm_grid.best_params_.items():
    print(f"\t{k}: {v}")
dump(svm_grid, f'./models/{target_domain}/svm_grid.joblib')
svm_model = SVC(C=svm_grid.best_params_['C'], 
                kernel=svm_grid.best_params_['kernel'], 
                gamma=svm_grid.best_params_['gamma'], 
                decision_function_shape=svm_grid.best_params_['decision_function_shape']).fit(train_data, train_labels)
dump(svm_model, f'./models/{target_domain}/best_svm_model.joblib')


In [None]:
print(f"Accuracy: {round(accuracy_score(svm_model.predict(test_data), test_labels) * 100, 2)}%")

In [None]:
predictions = svm_model.predict(test_data)
svm_performance = get_performance_metrics(predictions, test_labels)
for k in ['accuracy', 'precision', 'recall', 'mse', 'cm']:
    print(f'{k}:', '\n', svm_performance[k], '\n')

svm_performance['classification_rpt_df']

## Decision Tree

In [None]:
dt_clf = DecisionTreeClassifier(random_state=0)
dt_clf.fit(train_data, train_labels)
print(f"Accuracy: {round(accuracy_score(dt_clf.predict(test_data), test_labels) * 100, 2)}%")

In [None]:
predictions = dt_clf.predict(test_data)
dt_performance = get_performance_metrics(predictions, test_labels)
for k in ['accuracy', 'precision', 'recall', 'mse', 'cm']:
    print(f'{k}:', '\n', dt_performance[k], '\n')

dt_performance['classification_rpt_df']

## Random Forest

## Random Forest Grid Search

In [8]:
parameters = {
    'n_estimators': [10, 50, 100],
    'criterion': ('gini', 'entropy', 'log_loss'),
    'max_features': ('sqrt', 'log2')
}
rf_grid = GridSearchCV(RandomForestClassifier(), parameters).fit(train_data, train_labels)
print("Best Random Forest Parameters")
for k, v in rf_grid.best_params_.items():
    print(f"\t{k}: {v}")
dump(rf_grid, f'./models/{target_domain}/rf_grid.joblib')
best_rf_model = RandomForestClassifier(
            n_estimators=rf_grid.best_params_['n_estimators'],
            criterion=rf_grid.best_params_['criterion'])
best_rf_model.fit(train_data, train_labels)
print(f"Accuracy: {round(accuracy_score(best_rf_model.predict(test_data), test_labels) * 100, 2)}%")
dump(best_rf_model, f'./models/{target_domain}/best_rf_model.joblib')

Best Random Forest Parameters
	criterion: entropy
	max_features: sqrt
	n_estimators: 100


FileNotFoundError: [Errno 2] No such file or directory: './models/mnist/rf_grid.joblib'

In [14]:
best_rf_model.fit(train_data, train_labels)
print(f"Accuracy: {round(accuracy_score(best_rf_model.predict(test_data), test_labels) * 100, 2)}%")

Accuracy: 79.43%


In [11]:
rf_clf = RandomForestClassifier(random_state=0)
rf_clf.fit(train_data, train_labels)
print(f"Accuracy: {round(accuracy_score(rf_clf.predict(test_data), test_labels) * 100, 2)}%")

Accuracy: 77.37%


In [None]:
predictions = rf_clf.predict(test_data)
rf_performance = get_performance_metrics(predictions, test_labels)
for k in ['accuracy', 'precision', 'recall', 'mse', 'cm']:
    print(f'{k}:', '\n', rf_performance[k], '\n')

rf_performance['classification_rpt_df']

## MLP

In [None]:
# parameters = {
#     'activation': ('identity', 'logistic', 'tanh', 'relu'),
#     'solver': ('lbfgs', 'sgd', 'adam'),
#     'learning_rate': ('constant', 'invscaling', 'adaptive')
# }
# mlp_grid = GridSearchCV(MLPClassifier(shuffle=True), parameters).fit(train_data, train_labels)
# print(f"Accuracy: {round(accuracy_score(mlp_grid.predict(test_data), test_labels) * 100, 2)}%")
# dump(mlp_grid, f'./models/{target_domain}/mlp_grid.joblib')
# mlp_clf = MLPClassifier(
#     shuffle=True,
#     activation=mlp_grid.best_params_['activation'],
#     solver=mlp_grid.best_params_['solver'],
#     learning_rate=mlp_grid.best_params_['learning_rate']).fit(train_data, train_labels)
# dump(mlp_clf, f'./models/{target_domain}/best_mlp.joblib')

In [None]:
# for k, v in mlp_grid.best_params_.items():
#     print(f"\t{k}: {v}")

In [None]:
# mlp_clf = MLPClassifier(
#     shuffle=True,
#     activation='relu',
#     solver='adam').fit(train_data, train_labels)
# dump(mlp_clf, f'./models/{target_domain}/best_mlp.joblib')
# predictions = mlp_clf.predict(test_data)
# mlp_performance = get_performance_metrics(predictions, test_labels)
# for k in ['accuracy', 'precision', 'recall', 'mse', 'cm']:
#     print(f'{k}:', '\n', mlp_performance[k], '\n')

# mlp_performance['classification_rpt_df']

## AdaBoost

In [None]:
adaboost_dt_base_estimator = DecisionTreeClassifier(max_depth=10)
adaboost_clf = AdaBoostClassifier(n_estimators=100, random_state=0, base_estimator=adaboost_dt_base_estimator)
adaboost_clf.fit(train_data, train_labels)
print(f"Accuracy: {round(accuracy_score(adaboost_clf.predict(test_data), test_labels) * 100, 2)}%")

In [None]:
predictions = adaboost_clf.predict(test_data)
adaboost_performance = get_performance_metrics(predictions, test_labels)
for k in ['accuracy', 'precision', 'recall', 'mse', 'cm']:
    print(f'{k}:', '\n', adaboost_performance[k], '\n')

adaboost_performance['classification_rpt_df']

## AdaBoost Grid Search

In [15]:
parameters = {
    'n_estimators': [10, 50, 100],
    'learning_rate': [0.01, 0.1, 1.0]
}
adaboost_grid = GridSearchCV(AdaBoostClassifier(), parameters).fit(train_data, train_labels)
print("Best Random Forest Parameters")
for k, v in adaboost_grid.best_params_.items():
    print(f"\t{k}: {v}")
dump(adaboost_grid, f'./models/{target_domain}/adaboost_grid.joblib')
best_adaboost_model = AdaBoostClassifier(
            n_estimators=adaboost_grid.best_params_['n_estimators'],
            learning_rate=adaboost_grid.best_params_['learning_rate'])
dump(best_adaboost_model, f'./models/{target_domain}/best_adaboost_model.joblib')

Best Random Forest Parameters
	learning_rate: 1.0
	n_estimators: 100


['./models/mnist/best_adaboost_model.joblib']