# Default project: Model validation


### Load data

Let's first establish where our dataset is located.

In [11]:
# Folder holding the per-class CSV files. The trailing slash matters: the
# loading cell builds each path by plain string concatenation.
training_folder = 'training_data/'

We are told that the dataset has one file for each class *k* = 0, 1, 2, ..., named 'Class*k*.csv'. Loading a file produces a matrix in which each row is a sample, the last column holds the label, and the remaining columns hold the features.

Let's load the class files one-by-one until there are none left.

In [13]:
import numpy as np

# Collect each class's features and labels in lists and concatenate once at the
# end, instead of re-copying the growing arrays on every iteration.
feature_blocks = []
label_blocks = []
k = 0 # index of the next class file to look for

# load data from the relevant files: Class0.csv, Class1.csv, ... until one is missing
while True:
    try:
        # load data file; ndmin=2 keeps a single-sample file as a 1-row matrix
        class_k = np.loadtxt(training_folder + 'Class{:}.csv'.format(k), ndmin=2)
    except OSError:
        # A file that cannot be opened marks the end of the classes. Only I/O
        # errors are caught, so malformed data, typos in this cell, or a
        # KeyboardInterrupt are no longer silently swallowed.
        print('loaded %i classes of training data' %k)
        break
    # rows are samples; the last column is the label, the others are features
    feature_blocks.append(class_k[:,:-1])
    # builtin int replaces the np.int alias, which was removed in NumPy 1.24
    label_blocks.append(class_k[:,-1].astype(int))
    # increment counter
    k += 1

# fail with a clear message rather than an IndexError on X.shape below
if not feature_blocks:
    raise FileNotFoundError(
        "no 'Class*.csv' files could be loaded from %r" % training_folder)

X = np.vstack(feature_blocks)
y = np.hstack(label_blocks)

# examine shape
num_classes = k
num_features = X.shape[1]
num_samples = X.shape[0]

print('unique labels: ', np.unique(y))
print('number of features: ', num_features)
print('number of samples: ', num_samples)

loaded 20 classes of training data
unique labels:  [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19]
number of features:  20
number of samples:  100000


In [14]:
import sklearn.preprocessing

# standardize dataset: the scaling statistics are computed from X itself.
# NOTE(review): X is rebound to the scaled array, so the raw features are no
# longer available to later cells. Presumably the saved models were trained on
# data standardized the same way; confirm against the training notebooks.
X = sklearn.preprocessing.scale(X)

### Load and validate a trained sk-learn model

 Logistic regression

In [15]:
import pickle

sklearn_model_file = 'best_logreg.sav'

# load a trained instance of sklearn.linear_model.LogisticRegression
# NOTE: unpickling executes arbitrary code; only load model files you trust.
# The context manager closes the file handle once the model is loaded.
with open(sklearn_model_file,'rb') as model_fh:
    model = pickle.load(model_fh)

# predict
y_hat = model.predict(X)

# evaluate: fraction of samples whose predicted label matches the true label
acc = np.mean(y_hat == y)
print('accuracy of model: ',acc)

accuracy of model:  0.8701


SVM with RBF kernel

In [16]:
sklearn_model_file = 'best_svm.sav'

# load the trained SVM model (RBF kernel, per the section title)
# NOTE: unpickling executes arbitrary code; only load model files you trust.
# The context manager closes the file handle once the model is loaded.
with open(sklearn_model_file,'rb') as model_fh:
    model = pickle.load(model_fh)

# predict
y_hat = model.predict(X)

# evaluate: fraction of samples whose predicted label matches the true label
acc = np.mean(y_hat == y)
print('accuracy of model: ',acc)

accuracy of model:  0.91016


XGBoost

In [17]:
sklearn_model_file = 'best_xgboost.sav'

# load the trained XGBoost model
# NOTE: unpickling executes arbitrary code; only load model files you trust.
# The context manager closes the file handle once the model is loaded.
with open(sklearn_model_file,'rb') as model_fh:
    model = pickle.load(model_fh)

# predict
y_hat = model.predict(X)

# evaluate: fraction of samples whose predicted label matches the true label
acc = np.mean(y_hat == y)
print('accuracy of model: ',acc)

accuracy of model:  0.936


Random Forest

In [30]:
import bz2
import requests

# download a trained instance of random forest
url = 'https://ml-model-hosting.web.app/best_RF.pbz2'
sklearn_model_file = 'best_RF.pbz2'

# The timeout keeps the cell from hanging on a stalled connection.
r = requests.get(url, timeout=60)
# Stop on an HTTP error instead of saving an error page and unpickling it.
r.raise_for_status()
with open(sklearn_model_file, 'wb') as download_fh:
    download_fh.write(r.content)
print("Downloaded as best_RF.pbz2.")

# SECURITY NOTE: unpickling runs arbitrary code, and this file comes straight
# from the network. Only run this cell if the host above is trusted.
with bz2.BZ2File(sklearn_model_file,'rb') as model_fh:
    model = pickle.load(model_fh)

# predict
y_hat = model.predict(X)

# evaluate: fraction of samples whose predicted label matches the true label
acc = np.mean(y_hat == y)
print('accuracy of model: ',acc)

Downloaded as best_RF.pbz2.
accuracy of model:  0.96731


### Load and validate a trained PyTorch model

In [18]:
import torch

# restore the serialized TorchScript network (see 'pytorch_saving_demo.ipynb')
mlr_torch = torch.jit.load("./best_nn.pth")

# forward pass over the whole dataset with gradient tracking switched off
with torch.no_grad():
    scores = mlr_torch(torch.Tensor(X))
# hand the scores back to NumPy for the rest of the evaluation
scores = scores.detach().numpy()

# predicted class = column with the highest score in each row
y_hat = scores.argmax(axis=1)

# accuracy = share of predictions that equal the true labels
acc = (y_hat == y).mean()
print('accuracy of model: ',acc)

accuracy of model:  0.91358


In [20]:
from torch.utils.data import Dataset, DataLoader
class ProjectDataset(Dataset):
    """Dataset that pairs each row of a sample array with its target.

    Every item is a ``(sample, label)`` tuple, where ``sample`` is the
    indexed row reshaped to ``(1, n_features)`` and optionally passed
    through ``transform``.
    """

    def __init__(self, samples, targets, transform=None):
        """Store the sample array, the targets and the optional transform."""
        self.transform = transform
        self.targets = targets
        self.samples = samples

    def __len__(self):
        """Return the number of targets, taken as the dataset size."""
        return len(self.targets)

    def __getitem__(self, index):
        """Return ``(sample, label)`` for position ``index``."""
        target = self.targets[index]
        # reshape the selected row so it has an explicit leading axis of length 1
        row = self.samples[index, :].reshape(1, -1)
        return (self.transform(row) if self.transform else row, target)
    
# one batch spanning the full dataset, so a single iteration yields every sample
dl = DataLoader(ProjectDataset(X, y), batch_size=len(y))
xx, _ = next(iter(dl))

# restore the serialized TorchScript CNN (see 'pytorch_saving_demo.ipynb')
mlr_torch = torch.jit.load("./best_cnn.pth")

# forward pass with gradient tracking switched off
with torch.no_grad():
    scores = mlr_torch(xx.float())
# hand the scores back to NumPy for the rest of the evaluation
scores = scores.detach().numpy()

# predicted class = column with the highest score in each row
y_hat = scores.argmax(axis=1)

# accuracy = share of predictions that equal the true labels
acc = (y_hat == y).mean()
print('accuracy of model: ',acc)

accuracy of model:  0.91562
