## Using a Stacked Model with a Blender to Improve Wine Quality Estimation ##

In [2]:
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.model_selection import train_test_split

In [3]:
data_path = Path("./winequality-white.csv")
data_columns = ["fixed acidity", "volatile acidity", "citric acid", "residual sugar", "chlorides", "free sulfur dioxide", "total sulfur dioxide", "density", 
"pH", "sulphates", "alcohol", "quality"]
wine_df = pd.read_csv(data_path, header=0, names=data_columns, sep=";")

test_size=0.2
train_df, test_df = train_test_split(wine_df, test_size=test_size)

# data overview:
#print(wine_df.describe())

# which quality classes do we have? :
qualities = wine_df["quality"].unique()
print(f"Number of unique 'qualities': {len(qualities)}")
print(f"Qualities: {sorted(qualities)}")
binc = np.bincount([q for q in wine_df["quality"]])
no_inst = len(wine_df)
print(f"\nClass counts: {binc}")
print(f"\nNumber of instances: {no_inst} ")
print(f"\nClass fractions: {np.round(binc/no_inst,4) * 100}")


Number of unique 'qualities': 7
Qualities: [3, 4, 5, 6, 7, 8, 9]

Class counts: [   0    0    0   20  163 1457 2198  880  175    5]

Number of instances: 4898 

Class fractions: [ 0.    0.    0.    0.41  3.33 29.75 44.88 17.97  3.57  0.1 ]


#### ToDo: !Refactoring: don't use pandas dataframes, where not necessary! ####

In [22]:
from sklearn.preprocessing import StandardScaler

def scale_dataframe(data_df, exempt_last_column=False, column_names_to_scale=None):
    """
        Scales columns of a given data frame with a StandardScaler from Sklearn. 
        Input:
            data_df : dataframe with numerical values to normalize
            exempt_last_column : if true, column_names_to_scale will be ignored and all but the last column will be scaled.
            column_names_to_scale : list of the names of the columns to be scaled

        Output:
            dataframe with columns scaled
    """
    scaler = StandardScaler()
    
    # quickfix (until refactoring out pandas..) for data_df as np.arrays instead of pd.DataFrame
    if(isinstance(data_df, pd.DataFrame)):
        data = data_df.to_numpy()
    else:
        data = data_df

    if(exempt_last_column & (column_names_to_scale != None)):
        raise UserWarning("exempt_last_column=True : your column_names_to_scale will be ignored!")
    if(exempt_last_column):
        data_to_scale = data[:,:-1]
        last_column = np.expand_dims(data[:,-1].astype(np.int_), axis=1)
        data_scaled = np.append(scaler.fit_transform(data_to_scale), last_column, axis=1)
        return pd.DataFrame(data_scaled)
    elif(column_names_to_scale):
        data_to_scale = data[column_names_to_scale]
        data_scaled = scaler.fit_transform(data_to_scale)
        df_temp = pd.DataFrame(data_scaled, columns=column_names_to_scale, index=data_df.index)
        data_df[column_names_to_scale]= df_temp
    else:
        data_scaled = scaler.fit_transform(data)
        data_df = pd.DataFrame(data_scaled)
    
    return data_df


In [16]:
"""
# Model definition from the WineDataset note book -- I don't know how to import this from another Jupyter notebook...
from torch import nn

class WineNetwork(nn.Module):
    def __init__(self):
        super(WineNetwork, self).__init__()
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(11, 64),
            nn.ReLU(),
            nn.BatchNorm1d(64),
            nn.Dropout(p=0.2),
            nn.Linear(64, 128),
            nn.ReLU(),
            nn.BatchNorm1d(128),
            nn.Linear(128, 256),
            nn.ReLU(),
            nn.BatchNorm1d(256),
            nn.Linear(256, 10),
            nn.ReLU()
        )

    def forward(self, x):
        logits = self.linear_relu_stack(x)
        return logits
"""

"\n# Model definition from the WineDataset note book -- I don't know how to import this from another Jupyter notebook...\nfrom torch import nn\n\nclass WineNetwork(nn.Module):\n    def __init__(self):\n        super(WineNetwork, self).__init__()\n        self.linear_relu_stack = nn.Sequential(\n            nn.Linear(11, 64),\n            nn.ReLU(),\n            nn.BatchNorm1d(64),\n            nn.Dropout(p=0.2),\n            nn.Linear(64, 128),\n            nn.ReLU(),\n            nn.BatchNorm1d(128),\n            nn.Linear(128, 256),\n            nn.ReLU(),\n            nn.BatchNorm1d(256),\n            nn.Linear(256, 10),\n            nn.ReLU()\n        )\n\n    def forward(self, x):\n        logits = self.linear_relu_stack(x)\n        return logits\n"

In [5]:
def train_loop(dataloader, model, loss_fn, optimizer):
    losses, nof_correct = 0, 0
    for xx, y_true in dataloader:
        y_pred = model(xx)
        loss= loss_fn(y_pred, y_true)
        losses += loss.item()
        nof_correct += (y_pred.argmax(1) == y_true).sum().item()

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    return losses, nof_correct

def test_loop(dataloader, model, loss_fn):
    losses, no_correct = 0, 0
    with torch.no_grad():
        for (X,y_true) in dataloader:
            pred = model(X)
            losses += loss_fn(pred, y_true).item()
            no_correct += (pred.argmax(1)== y_true).sum().item()
     
    return losses, no_correct
    

### Define Maps to Transform the Training-Data from the Disk - This is the First Layer of Stacking ###:

Decision-Tree Map:<br>
Is used for the AdaBoost-Decision-Tree and for the plain decision-tree.

In [6]:
# transform the training data with the decision-tree:

import pickle

def Wine_tree_map(data_df, tree_name, does_predict_proba=False):
    """
    Maps a data-frame by using a pre-trained decision-tree.
    Input:
        data_df : Pandas data-frame to be mapped, containing (x, y_true)
        tree_name : path / name of the pretrained tree
        does_predict_proba : uses the predict_proba method if true, uses predict otherwize
    Returns:
        Pandas data-frame containing (y_pred, y_true)
    """
    tree_model = pickle.load(open(tree_name, 'rb'))
    X = data_df.to_numpy()
    if(does_predict_proba):
        # "np.int_" is "long" in numpy:
        Z = np.append(tree_model.predict_proba(X[:,:-1]), np.expand_dims(X[:,-1].astype(np.int_), axis=1), axis=1)
    else:
        Z = np.append(np.expand_dims(tree_model.predict(X[:,:-1]).astype(np.int_), axis=1), np.expand_dims(X[:,-1].astype(np.int_), axis=1), axis=1)
    
    return pd.DataFrame(Z)

Neural-Net Map:

In [7]:
# maps the training data with the neural-net:

import numpy as np
import pandas as pd


def WineNetwork_map(data_df, net_name):
    """
    Maps a dataframe by applying a pre-trained neural net of type "class WineNetwork"
    Input:
        data_df : the pandas data-frame to be transformed, containing (x, y_true)
        net_name : the path / name of the net-model of type WineNetwork to be loaded
    Returns:
        pandas data-frame containing (y_pred probatilities, y_true)
    """
    class WineNetwork(nn.Module):
        def __init__(self):
            super(WineNetwork, self).__init__()
            self.linear_relu_stack = nn.Sequential(
                nn.Linear(11, 64),
                nn.ReLU(),
                nn.BatchNorm1d(64),
                nn.Dropout(p=0.2),
                nn.Linear(64, 128),
                nn.ReLU(),
                nn.BatchNorm1d(128),
                nn.Linear(128, 256),
                nn.ReLU(),
                nn.BatchNorm1d(256),
                nn.Linear(256, 10),
                nn.ReLU()
            )

        def forward(self, x):
            logits = self.linear_relu_stack(x)
            return logits

    net_model = WineNetwork()
    net_model.load_state_dict(torch.load(net_name))
    net_model.eval()

    # add softmax for prediction of probabilities:
    # softm = lambda x : np.exp(x)/np.sum(np.exp(x))
    softm = torch.nn.Softmax(dim=1)
    
    X = torch.tensor(data_df.iloc[:,:-1].to_numpy(), dtype=torch.float32).detach()
    Y = torch.tensor(data_df.iloc[:, -1].to_numpy(), dtype=torch.long).detach()

    Z = torch.cat((softm(net_model(X)), Y.unsqueeze(dim=1)), dim=1).detach().numpy()
    
    return pd.DataFrame(Z)
    

Torch DataSet - to be used for batching with torch DataLoader:

In [8]:
# We use the boiler-plate triade: pandas.data-frame -> torch.data-set -> torch.data-loader

# define torch.dataset: __init__(), __len__(), __getitem__()
from torch.utils.data import Dataset

class WineDataSet(Dataset):
    def __init__(self, data_df, transform=None, target_transform=None):
        self.data_df = data_df
        self.transform = transform
        self.target_transform = target_transform
        self.X = torch.tensor(self.data_df.iloc[:,:-1].to_numpy(), dtype=torch.float32)
        self.Y = torch.tensor(self.data_df.iloc[:, -1].to_numpy(), dtype= torch.long)

    def __len__(self):
        return len(self.Y)
        
    def __getitem__(self,idx):
        self.x = self.X[idx,:]
        self.y = self.Y[idx]
        if self.transform != None:
            self.x = self.transform(self.x)
        if self.target_transform != None:
            self.y = self.target_transform(self.y)
        return self.x, self.y

In [None]:
"""
# LOAD AND PREPARE DATA: DATA FRESH LOAD - NOT for stacking:

from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from torch import nn
from torch.utils.data import DataLoader


# DATA FRESH LOAD - NOT for stacking:
data_path = Path("./winequality-white.csv")
column_names = ["fixed acidity", "volatile acidity", "citric acid", "residual sugar", "chlorides", "free sulfur dioxide", "total sulfur dioxide", "density", 
"pH", "sulphates", "alcohol", "quality"]
#column_names_to_normalize = column_names[:-1]
wine_df = pd.read_csv(data_path, header=0, names=column_names, sep=";")

# prepare data:
shuffle(wine_df, random_state=0)
scaled_wine_df = scale_dataframe(wine_df, exempt_last_column=True)

# names/ paths of the pre-trained classifiers:
#WineNetwork_filename = "model_640_0.001_369_64_SGD.pt"
NET_FILENAME="model_635_0.001_255_64_SGD.pt"
TREE_FILENAME= 'AdaBoost_071_model.dct'

# use the scaled data for the net:
net_df = WineNetwork_map(scaled_wine_df, net_name=NET_FILENAME)
net_df = scale_dataframe(net_df)

# use the unscaled data for the tree:
tree_df = Wine_tree_map(wine_df, tree_name=TREE_FILENAME, does_predict_proba=True)
tree_df = scale_dataframe(tree_df)

# combine net_df and tree_df to combined_data_df:
combined_data_df = pd.DataFrame(np.append(net_df.iloc[:,:-1].to_numpy(), tree_df.to_numpy(), axis=1))

# split into train_df, test_df:
TEST_SIZE = 0.2
combined_train_df, combined_test_df = train_test_split(combined_data_df, test_size=TEST_SIZE)

# calculate the input-dimension for the blender - minus one is for the label column:
blender_input_dim = combined_data_df.shape[1]-1

# create dataloader from train_df and test_df:
BATCH_SIZE=64
train_ds = WineDataSet(combined_train_df)
test_ds = WineDataSet(combined_test_df)
train_dl = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True)
test_dl = DataLoader(test_ds, batch_size=BATCH_SIZE, shuffle=True)
"""

#### Load and Prepare Data for Mapping and Blender Training: ###

Technical note to the following code: <br>
we have to pad the tree-output because the first 3 classes of "quality" are not present in the dataset, i.e. the  <br>
tree outputs the probability of the i'th of the present classes, i.e. the i'th in 4,5,6,7,8,9 <br>
Might also be easier for the blender in the end, to have the i'th input meaning the same for all inputs. <br>

In [52]:
# LOAD AND PREPARE DATA FOR MAPPING AND BLENDER TRAINING:

import pandas as pd
import torch
from torch import nn
from torch.utils.data import DataLoader


# DATA LOAD FOR STACKING:
train_df = pd.read_csv("./train.csv")
test_df = pd.read_csv("./test.csv")

# names/ paths of the pre-trained classifiers:
NET_FILENAME = "net_model_660_0.001_299_64_SGD.pt"
TREE_FILENAME = 'AdaBoost_071_model.dct'
TREE_FILENAME2 = "DecisionTree_061_model.dct"


# TRAIN DATA PREPARATION:

# use predict_proba method for trees:
predict_proba = True

# use the scaled data for the net:
scaled_train_df = scale_dataframe(train_df, exempt_last_column=True)
us_net_train_df = WineNetwork_map(scaled_train_df, net_name=NET_FILENAME)
net_train_df = scale_dataframe(us_net_train_df, exempt_last_column=True)

# use the unscaled data for the tree:
us_tree_train_df = Wine_tree_map(train_df, tree_name=TREE_FILENAME, does_predict_proba=predict_proba)
# padding of the tree-output
if(predict_proba):
    us_tree_train_df = np.pad(us_tree_train_df, ((0,0), (3,0)), "constant", constant_values=0)
tree_train_df = scale_dataframe(us_tree_train_df, exempt_last_column=True)

us_tree2_train_df = Wine_tree_map(train_df, tree_name=TREE_FILENAME2, does_predict_proba=predict_proba)
if(predict_proba):
    us_tree2_train_df = np.pad(us_tree2_train_df, ((0,0), (3,0)), "constant", constant_values=0)
tree2_train_df = scale_dataframe(us_tree2_train_df, exempt_last_column=True)


# combine net_df and tree_df to combined_data_df:
combined_train_ = np.append(net_train_df.iloc[:,:-1].to_numpy(), tree_train_df.iloc[:,:-1].to_numpy(), axis=1)
# combine combined_train_ and tree2_train_df -- the true labels are contained in tree2_train_df:
combined_train_df = pd.DataFrame(np.append(combined_train_, tree2_train_df.to_numpy(), axis=1))

# TEST DATA PREPARATION:
# use the scaled data for the net:
scaled_test_df = scale_dataframe(test_df, exempt_last_column=True)
us_net_test_df = WineNetwork_map(scaled_test_df, net_name=NET_FILENAME)
net_test_df = scale_dataframe(us_net_test_df, exempt_last_column=True)

# use the unscaled data for the tree:
us_tree_test_df = Wine_tree_map(test_df, tree_name=TREE_FILENAME, does_predict_proba=predict_proba)
# padding of the tree-output:
if(predict_proba):
    us_tree_test_df = np.pad(us_tree_test_df, ((0,0), (3,0)), "constant", constant_values=0)
tree_test_df = scale_dataframe(us_tree_test_df, exempt_last_column=True)

us_tree2_test_df = Wine_tree_map(test_df, tree_name=TREE_FILENAME2, does_predict_proba=predict_proba)
if(predict_proba):
    us_tree2_test_df = np.pad(us_tree2_test_df, ((0,0), (3,0)), "constant", constant_values=0)
tree2_test_df = scale_dataframe(us_tree2_test_df, exempt_last_column=True)

# combine net_df and tree_df to combined_data_df:
combined_test_ = np.append(net_test_df.iloc[:,:-1].to_numpy(), tree_test_df.iloc[:,:-1].to_numpy(), axis=1)
# combine combined_test_ and tree2_test_df -- the true-labels are contained in tree2_test_df:
combined_test_df = pd.DataFrame(np.append(combined_test_, tree2_test_df.to_numpy(), axis=1))


# PARAMETERS:
# calculate the input-dimension for the blender - minus one is for the label column:
BLENDER_INPUT_DIM = combined_test_df.shape[1]-1

### Compare the Tree- and the Net-Classifier ###
? How many of the instances the net gets right, does the tree get right? <br>
At first - compare on the test-set:

In [53]:
# TEST-SET COMPARISON:
# we have to use the un-scaled data of courses:

# get the class predictions of the net classifier:
X_net_test = us_net_test_df.iloc[:,:-1].to_numpy()
y_pred_net_test = np.argmax(X_net_test, axis=1)

# get the class predictions of the tree classifier:
X_tree_test = us_tree_test_df[:, :-1]
y_pred_tree_test = np.argmax(X_tree_test, axis=1)

# get the true labels:
#y_true_test = us_net_test_df.iloc[:,-1].to_numpy()
y_true_test = us_tree_test_df[:,-1]
nof_instances = len(y_true_test)
accc_net = (y_pred_net_test == y_true_test).sum()
accc_tree = (y_pred_tree_test.squeeze() == y_true_test).sum()
ic_tree_net = (y_pred_net_test == y_pred_tree_test.squeeze()).sum()

print("--- ON TEST DATA ---")
print(f"X_tree_test.shape: {X_tree_test.shape}")
print(f"accuracy count net: {accc_net}, i.e. {accc_net / nof_instances :.3f}")
print(f"accuracy count tree: {accc_tree}, i.e. {accc_tree / nof_instances :.3f}")
print(f"net and tree agree on {ic_tree_net} instances.")

# get the instances where the net is correct:
true_net_preds_test = np.where(np.equal(y_pred_net_test, y_true_test))
true_tree_preds_test = np.where(np.equal(y_pred_tree_test, y_true_test))
# calculate where they are both correct:
N = set(true_net_preds_test[0])
T = set(true_tree_preds_test[0])
print(f"The correct predictions of the net are those of the tree: {N.issubset(T)}")
inters = len(N.intersection(T))
print(f"There are {inters} correct predictions made by both simultaneously.")
print(f"There are {accc_net - inters} correct predictions made by the net only, that is {(accc_net - inters)/len(y_true_test) :.3f} of the data.")



--- ON TEST DATA ---
X_tree_test.shape: (979, 10)
accuracy count net: 349, i.e. 0.356
accuracy count tree: 700, i.e. 0.715
net and tree agree on 428 instances.
The correct predictions of the net are those of the tree: False
There are 292 correct predictions made by both simultaneously.
There are 57 correct predictions made by the net only, that is 0.058 of the data.


So on the test-set there are some true classifications made by the net-classifier, that are done wrong by the tree-classifier. <br>
We would expect from this, that the ensemble resulting from stacking with a blender would be more accurate than each single classifier. <br>
<br>
To get a quick result, we just add the probability-masses of both predictors (soft-voting):

In [48]:
# instead of training a blender, we take simply the sum of the probability-masses as prediction of the ensemble and renormalize:
y_pred_test = X_tree_test + X_net_test
y_pred_class_test = np.argmax(y_pred_test, axis=1)
print(f"Accuracy soft-voting: {(y_pred_class_test == y_true_test).sum()/ len(y_true_test) :.3f}")


Accuracy soft-voting: 0.714


From the soft-voting we have a predictor with 71% accuracy - identical to the AdaBoosted Decision-Tree, i.e. *this soft-voting blender is not improving the overall classification.* <br>
<br>
So let's TRAIN a blender instead of PRESCRIBING one!

### Building a Stacked Ensemble with Blender:
Create the DataLoader from the DataFrames/ DataSets:

In [49]:
# create dataloader from train_df and test_df:
BATCH_SIZE=512
train_ds = WineDataSet(combined_train_df)
test_ds = WineDataSet(combined_test_df)
train_dl = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True)
test_dl = DataLoader(test_ds, batch_size=BATCH_SIZE, shuffle=True)

Define the Blender Model:

In [50]:
import torch
from torch import nn

class BlenderModel(nn.Module):
    
    def __init__(self, input_dim):
        super().__init__()
        self.linear_relu_stack = nn.Sequential(
        nn.Linear(input_dim, 128),
        nn.ReLU(),
        nn.BatchNorm1d(128),
        nn.Dropout(p=0.2),
        nn.Linear(128, 256),
        nn.ReLU(),
        nn.BatchNorm1d(256),
        nn.Dropout(p=0.2),
        nn.Linear(256, 10),
        nn.ReLU(),
        )
    
    def forward(self, x):
        logits = self.linear_relu_stack(x)
        return logits


In [51]:
# Train the blender model:
import os
import copy

from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter


# writer for tensorboard:
writer = SummaryWriter()


# blender model and parameters:

device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

blender_model = BlenderModel(BLENDER_INPUT_DIM)

# parameters:

# loss function:
# cross-entropy:
loss_fn = nn.CrossEntropyLoss()

# optimizer:

# adam:
OPTIMIZER_NAME = "ADAM"
LEARNING_RATE = 1e-4
optimizer = torch.optim.Adam(blender_model.parameters(), lr=LEARNING_RATE, weight_decay=0.5, amsgrad=True)
#optimizer = torch.optim.Adam(blender_model.parameters(), lr=LEARNING_RATE)

# sgd:
#OPTIMIZER_NAME = "SGD"
#LEARNING_RATE = 1e-3
#MOMENTUM=0.9
#optimizer = torch.optim.SGD(blender_model.parameters(), lr= LEARNING_RATE, momentum=MOMENTUM)

# adamax:
#OPTIMIZER_NAME = "ADAMAX"
#LEARNING_RATE = 1e-4
#optimizer = torch.optim.Adamax(blender_model.parameters(), lr=LEARNING_RATE, weight_decay=0.8)

# LBFGS
#OPTIMIZER_NAME = "LBFGFS"
#LEARNING_RATE = 1
#MAX_ITER = 20
#optimizer = torch.optim.LBFGS(params, lr=LEARNING_RATE, max_iter=MAX_ITER, max_eval=None, tolerance_grad=1e-07, tolerance_change=1e-09, history_size=100, line_search_fn=None)


EPOCHS = 700
WRITE_LOG_AFTER_EPOCHS = 100
best_model_name = ""
max_correct = -torch.inf

print(f"Epochs: {EPOCHS}, Learning-rate : {LEARNING_RATE}, Optimizer : {OPTIMIZER_NAME}, Batch-Size : {BATCH_SIZE}")
for ep in range(1, EPOCHS+1):        
       
        # put model in train mode:
        blender_model.train()
        (train_loss, train_no_correct) = train_loop(train_dl, blender_model, loss_fn, optimizer)
              
        # switch model to to evaluation mode:
        blender_model.eval()
        (test_loss, test_no_correct) = test_loop(test_dl, blender_model, loss_fn)

        if(test_no_correct > max_correct):
            max_correct = test_no_correct
            #if(best_model_name):
            #    os.remove(best_model_name)
            best_model_name = "./blender_model_" + str(test_no_correct) + "_" + str(LEARNING_RATE) + "_" + str(ep) + "_" + str(BATCH_SIZE) + "_" + OPTIMIZER_NAME + "_sv.pt"
            # torch.save(blender_model.state_dict(), best_model_name)
            best_dict = copy.deepcopy(blender_model.state_dict())

        writer.add_scalar("Loss/test", test_loss/ len(test_ds), ep)
        writer.add_scalar("Accuracy/test", test_no_correct/ len(test_ds), ep)
        writer.add_scalar("Loss/train", train_loss/ len(train_ds), global_step=ep)
        writer.add_scalar("Accuracy/train", train_no_correct/ len(train_ds), global_step=ep)
       
        if ep % WRITE_LOG_AFTER_EPOCHS == 0:
            print(f"\n----- Epoch: {ep} -----")
            print(f"Epoch loss: {test_loss/ len(test_ds)}")
            print(f"Epoch accuracy: {test_no_correct/ len(test_ds)}")

torch.save(best_dict, best_model_name)
            

Using device: cpu
Epochs: 700, Learning-rate : 0.0001, Optimizer : ADAM, Batch-Size : 512

----- Epoch: 100 -----
Epoch loss: 0.0027197044156301495
Epoch accuracy: 0.6721144024514811

----- Epoch: 200 -----
Epoch loss: 0.0026411966356972997
Epoch accuracy: 0.7037793667007151

----- Epoch: 300 -----
Epoch loss: 0.002624496010886515
Epoch accuracy: 0.7150153217568948

----- Epoch: 400 -----
Epoch loss: 0.0025665196261927107
Epoch accuracy: 0.7180796731358529

----- Epoch: 500 -----
Epoch loss: 0.0025676816427914677
Epoch accuracy: 0.7170582226762002

----- Epoch: 600 -----
Epoch loss: 0.002554128435464144
Epoch accuracy: 0.7170582226762002

----- Epoch: 700 -----
Epoch loss: 0.0025339868144190225
Epoch accuracy: 0.7180796731358529


#### Some test statistics: ####
Names of Models are constructed according to: <br>
best_model_name = "./blender_model_" + str(test_no_correct) + "_" + str(LEARNING_RATE) + "_" + str(ep) + "_" + str(BATCH_SIZE) + "_" + OPTIMIZER_NAME + "_sv.pt <br>
<br>
1. blender_model_693_0.0001_48_256_SGD_sv.pt - 70.7% accuracy <br>
2. blender_model_696_0.0001_15_512_ADAM_sv.pt - 71.0% accuracy <br>
3. blender_model_696_1e-05_219_1024_ADAM_sv.pt - 71.0% accuracy <br>
4. blender_model_697_0.0001_16_512_ADAM_sv.pt - 71.1% accuracy <br>
5. blender_model_697_0.0001_44_256_SGD_sv.pt - 71.1% accuracy <br>
6. blender_model_698_0.0001_11_256_ADAM_sv.pt - 71.2% accuracy <br>
7. blender_model_698_1e-05_100_512_ADAM_sv.pt - 71.2% accuracy <br>
8. blender_model_701_0.0001_36_256_SGD_sv.pt - 71.5% accuracy <br>
9. blender_model_701_1e-05_192_512_ADAM_sv.pt - 71.5% accuracy <br>
10. blender_model_697_0.001_13_256_ADAMAX_sv.pt - 71.1% accuracy <br>
11. blender_model_706_0.0001_346_512_ADAM_sv.pt - 72.0% accuracy <br>
<br>
The blender was feed by the following three models: <br>
<br>
AdaBoost_071_model.dct - 71% accuracy<br>
DecisionTree_061_model.dct - 61% accuracy<br>
net_model_647_0.001_393_64_SGD.pt - 64% accuracy<br>
(except for model number 11. which was feed by net_model_660_0.001_299_64_SGD.pt - 71,8% accuracy instead.) <br>
<br>
Summary: <br>
I would have expected the blender to be considerably better than the best of the in-going predictors (after all the net does 5% of the data correct, that the Adaboost gets wrong), especially since the net and the decision-tree are so much different models (i.e. different by their construction, but maybe not by their prediction behaviour... needs to be investigated...)






## Investigation on the failing stacking-blender setup ##
### Compare the Tree- and the Net-Classifier ###
Above in this notebook, we already compared the net- and the adaboost-tree-classifier on the test data. <br>
Here we do the same on the training data:

In [46]:
# COMPARING ON THE TRAIN SET:

# we have to use the un-scaled (us) data of course:

# get the class predictions of the net classifier:
X_net = us_net_train_df.iloc[:,:-1].to_numpy()
y_pred_net = np.argmax(X_net, axis=1)

# get the class predictions of the tree classifier:
X_tree = us_tree_train_df[:, :-1]
y_pred_tree = np.argmax(X_tree, axis=1) #+ 3 # <--- plus 3 because the first 3 classes do not appear in the set.

# get the true labels:
y_true = us_net_train_df.iloc[:,-1].to_numpy()
nof_instances = len(y_true)
accc_net = (y_pred_net == y_true).sum()
accc_tree = (y_pred_tree.squeeze() == y_true).sum()
ic_tree_net = (y_pred_net == y_pred_tree.squeeze()).sum()


print("--- ON TRAIN DATA ---")
print(f"accuracy count net: {accc_net}, i.e. {accc_net / nof_instances :.3f}")
print(f"accuracy count tree: {accc_tree}, i.e. {accc_tree / nof_instances :.3f}")
print(f"net and tree agree on {ic_tree_net} instances.")

# get the instances where the net is correct:
true_net_preds = np.where(np.equal(y_pred_net, y_true))
true_tree_preds = np.where(np.equal(y_pred_tree, y_true))
# calculate where they are both correct:
N = set(true_net_preds[0])
T = set(true_tree_preds[0])
print(f"The correct predictions of the net are those of the tree: {N.issubset(T)}")
inters = len(N.intersection(T))
print(f"There are {inters} correct predictions made by both simultaneously")
print(f"There are {accc_net - inters} correct predictions made by the net only, that is {(accc_net - inters)/len(y_true) :.3f} of the data.")


--- ON TRAIN DATA ---
accuracy count net: 1357, i.e. 0.346
accuracy count tree: 3917, i.e. 1.000
net and tree agree on 1357 instances.
The correct predictions of the net are those of the tree: True
There are 1357 correct predictions made by both simultaneously
There are 0 correct predictions made by the net only, that is 0.000 of the data.


The results on the training-set show: the tree (in this case the AdaBoost only) is overfitting on the train set. <br>
That means that the input to the blender comming from the Adaboost-Tree is equal to the desired output. <br>
<br>
The blender has nothing to learn, the only thing it has to do, is to learn to ignore the input comming <br> 
from the net-classifier and having done so, the entire ensemble-stacking-blender setup is just the Adaboost-Tree.

### Conclusion: ###
There is no point in training a stacked ensemble with a blender, when one of the in-going classifiers is overfitting badly. <br>
<br>
The question now is: would the ensemble actually become better when using a non-overfitting, perhabs slightly worse Adaboost-Tree?
