In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sea
import os
from tqdm.notebook import tqdm
from sklearn.metrics import accuracy_score, r2_score
import sklearn
import copy
import time

from utils import *

plt.style.use('seaborn')

  plt.style.use('seaborn')


In [6]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

df = pd.read_csv('./data/WineQT.csv').drop(columns=['Id'], axis=0)

scaler = StandardScaler()

df['quality'] = df['quality'].map(lambda x : x - 3)
df.iloc[:, :-1] = scaler.fit_transform(df.iloc[:, :-1])

df_train, df_val = train_test_split(df, test_size=0.2)

X_train_c, y_train_c = df_train.values[:, :-1], pd.get_dummies(df_train.values[:, -1], dtype=np.float32).values
X_val_c, y_val_c = df_val.values[:, :-1], pd.get_dummies(df_val.values[:, -1], dtype=np.float32).values

X_train_c.shape, X_val_c.shape, y_train_c.shape, y_val_c.shape

((914, 11), (229, 11), (914, 6), (229, 6))

In [7]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

df = pd.read_csv('./data/HousingData.csv')
nan_cols = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'AGE', 'LSTAT']
for col in nan_cols:
    df[col] = df[col].fillna(df[col].mean(), inplace=False)
    

scaler = StandardScaler()
scaled_data = scaler.fit_transform(df)
df_scaled = pd.DataFrame(scaled_data, columns=df.columns)


X_train_r, X_val_r, y_train_r, y_val_r = train_test_split(df_scaled.iloc[:, :-1].values, df_scaled.iloc[:, -1].values, test_size=0.1)
y_train_r, y_val_r = y_train_r.reshape(-1, 1), y_val_r.reshape(-1, 1)

X_train_r.shape, y_train_r.shape, X_val_r.shape, y_val_r.shape


((455, 13), (455, 1), (51, 13), (51, 1))

In [8]:
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor

class DecisionTree:
    def __init__(self,
                 criterion,
                 max_depth,
                 task):
        self.criterion = criterion
        self.max_depth = max_depth
        self.task = task
        self.model = DecisionTreeClassifier(criterion=self.criterion, max_depth=self.max_depth)  if task == 'classification' \
                    else DecisionTreeRegressor(criterion=self.criterion, max_depth=self.max_depth)
        
    def __call__(self, x):
        pred = self.model.predict(x) 
        return np.expand_dims(pred, -1) if self.task == 'regression' else pred
        
    def train(self, X, y):
        self.model.fit(X, y)
        print("Accuracy of decision tree:", self.model.score(X, y))
        

In [9]:
from sklearn.model_selection import KFold

class Stacking:
    def __init__(self,
                 level_0    :list,
                 level_1    :Model,
                 method     :str, 
                 X          :np.ndarray, 
                 y          :np.ndarray,
                 frac       :float):

        self.level_0 = level_0
        self.level_1 = level_1
        self.method = method
        self.X = X
        self.y = y
        self.frac = frac
        self.num_samples = len(self.X)
        
    def blending(self):
        X_train, X_val, y_train, y_val = train_test_split(self.X, self.y, test_size=0.2)
        print("Level 0:")
        for model in self.level_0:
            model.train(X_train, y_train)
        
        X_new = np.hstack([model(X_val) for model in self.level_0])
        print("Level 1:")            
        self.level_1.train(X_new, y_val)
        print("=" * 30)
        
    def holdout(self, k=5):
        new_X, new_y = [], []
        
        kf = KFold(n_splits=k, shuffle=False)
        
        for (train_idx, val_idx) in kf.split(self.X, self.y):
            temp_X = []
            for model in self.level_0:
                model.train(self.X[train_idx], self.y[train_idx])
                temp_X.append(model(self.X[val_idx]))
            
            new_X.extend(np.hstack(temp_X))
            new_y.extend(self.y[val_idx])
        
        print("Level 1:")
        # print(np.array(new_X).shape, np.array(new_y).shape)
        self.level_1.train(np.array(new_X), np.array(new_y))

        print("Level 0:")
        for model in self.level_0:
            model.train(self.X, self.y)

    def train(self):
        self.time_taken = time.time()
        
        if self.method == 'blending':
            self.blending()
        else:
            self.holdout()
        
        self.time_taken = time.time() - self.time_taken
        
    def predict(self, x):
        preds_0 = []
        for model in self.level_0:
            preds_0.append(model(x))
    
        preds_0 = np.hstack(preds_0)
        pred_1 = self.level_1(preds_0)
        return pred_1
    

In [10]:
logistic = Model(lr=5e-3, logger=None, loss_fxn=CrossEntropyLoss(), type='classification', epochs=1000)
logistic.add(Layer(11, 6, Softmax()))

mlp = Model(lr=5e-3, logger=None, loss_fxn=CrossEntropyLoss(), type='classification', epochs=1000)
mlp.add(Layer(11, 16, Relu()))
mlp.add(Layer(16, 16, Relu()))
mlp.add(Layer(16, 6, Softmax()))

dt = DecisionTree(criterion='entropy', max_depth=10, task='classification')

stacking = Stacking(
    level_0=[logistic, mlp, dt],
    level_1=DecisionTree(criterion='entropy', max_depth=5, task='classification'),
    frac=0.15,
    method='holdout',
    X=X_train_c,
    y=y_train_c
)

stacking


<__main__.Stacking at 0x7f403a4b13d0>

In [11]:
stacking.train()

epoch: 999 	Train:[loss:1.6188 acc:0.4172]]
epoch: 999 	Train:[loss:1.3217 acc:0.5212]]
Accuracy of decision tree: 0.908344733242134
epoch: 999 	Train:[loss:1.2256 acc:0.5294]]
epoch: 999 	Train:[loss:1.1210 acc:0.5800]]
Accuracy of decision tree: 0.9069767441860465
epoch: 999 	Train:[loss:1.1003 acc:0.5622]]
epoch: 999 	Train:[loss:1.0114 acc:0.5992]]
Accuracy of decision tree: 0.8727770177838577
epoch: 999 	Train:[loss:1.0346 acc:0.6047]]
epoch: 999 	Train:[loss:0.9299 acc:0.6238]]
Accuracy of decision tree: 0.8467852257181943
epoch: 999 	Train:[loss:0.9649 acc:0.6230]]
epoch: 999 	Train:[loss:0.9182 acc:0.6243]]
Accuracy of decision tree: 0.9084699453551912
Level 1:
Accuracy of decision tree: 0.6652078774617067
Level 0:
epoch: 999 	Train:[loss:0.9759 acc:0.6127]]
epoch: 999 	Train:[loss:0.8917 acc:0.6204]]
Accuracy of decision tree: 0.8194748358862144


In [12]:
y_pred_c = stacking.predict(X_val_c)
accuracy_score(np.argmax(y_val_c, axis=-1), np.argmax(y_pred_c, axis=-1))

0.5502183406113537

In [13]:
regression = Model(lr=5e-3, logger=None, loss_fxn=MSELoss(), type='regression', epochs=1000)
regression.add(Layer(13, 1, Linear()))

regression_meta = Model(lr=5e-3, logger=None, loss_fxn=MSELoss(), type='regression', epochs=1000)
regression_meta.add(Layer(3, 1, Linear()))

mlp = Model(lr=5e-3, logger=None, loss_fxn=MSELoss(), type='regression', epochs=1000)
mlp.add(Layer(13, 16, Relu()))
mlp.add(Layer(16, 16, Relu()))
mlp.add(Layer(16, 1, Linear()))

dt = DecisionTree(criterion='squared_error', max_depth=10, task='regression')

stacking = Stacking(
    level_0=[regression, mlp, dt],
    level_1=regression_meta,
    frac=0.15,
    method='holdout',
    X=X_train_r,
    y=y_train_r
)

stacking

<__main__.Stacking at 0x7f403a16db20>

In [14]:
stacking.train()

epoch: 999 	Train:[loss:0.2880 acc:0.7158]]
epoch: 999 	Train:[loss:0.4389 acc:0.5669]]
Accuracy of decision tree: 0.9912390730252666
epoch: 999 	Train:[loss:0.2827 acc:0.7348]]
epoch: 999 	Train:[loss:0.3521 acc:0.6697]]
Accuracy of decision tree: 0.9869240340906972
epoch: 999 	Train:[loss:0.2901 acc:0.7129]]
epoch: 999 	Train:[loss:0.3051 acc:0.6980]]
Accuracy of decision tree: 0.9851970973209988
epoch: 999 	Train:[loss:0.2394 acc:0.7455]]
epoch: 999 	Train:[loss:0.2711 acc:0.7118]]
Accuracy of decision tree: 0.9928021314081754
epoch: 999 	Train:[loss:0.2446 acc:0.7286]]
epoch: 999 	Train:[loss:0.2370 acc:0.7370]]
Accuracy of decision tree: 0.9851439044210383
Level 1:
epoch: 999 	Train:[loss:0.2311 acc:0.7658]]
Level 0:
epoch: 999 	Train:[loss:0.2705 acc:0.7259]]
epoch: 999 	Train:[loss:0.2591 acc:0.7374]]
Accuracy of decision tree: 0.9852998191060519


In [15]:
y_pred_r = stacking.predict(X_val_r)
r2_score(y_val_r, y_pred_r)

0.8505474735905906

In [17]:
def find_best_model(X_train, y_train, X_val, y_val, level_0, level_1, accuracy_fxn, task, y_transform_function=None):
    bar = tqdm(total=4)
    
    df = {
        'level_1': [],
        'method': [],
        'train_acc': [],
        'val_acc': [],
        'time_taken': []
        
    }
    counter = 0
    
    for level1 in level_1:
        for method in ['blending', 'holdout']:
            stacking = Stacking(
                level_0=[copy.deepcopy(model) for model in level_0],
                level_1=copy.deepcopy(level1),
                method=method,
                frac=0.15,
                X=X_train,
                y=y_train,   
            )
            
            stacking.train()
            
            y_pred_train = stacking.predict(X_train)
            y_pred_val = stacking.predict(X_val)
                
            acc_train = accuracy_fxn(y_transform_function(y_train, -1), y_transform_function(y_pred_train, -1))
            acc_val = accuracy_fxn(y_transform_function(y_val, -1), y_transform_function(y_pred_val, -1))
            
            df['train_acc'].append(acc_train)
            df['val_acc'].append(acc_val)
            df['level_1'].append(level1)
            df['method'].append(method)
            df['time_taken'].append(stacking.time_taken)
            
            counter += 1
            bar.update(1)
            
            print(X_train.shape, y_train.shape)
            
                    
    df = pd.DataFrame(df).sort_values(by='val_acc', ascending=False)
    
    return df


In [18]:
def identity(x, axis):
    return x

regression = Model(lr=5e-3, logger=None, loss_fxn=MSELoss(), type='regression', epochs=1000)
regression.add(Layer(13, 1, Linear()))

regression_meta = Model(lr=5e-3, logger=None, loss_fxn=MSELoss(), type='regression', epochs=1000)
regression_meta.add(Layer(3, 1, Linear()))

mlp = Model(lr=5e-3, logger=None, loss_fxn=MSELoss(), type='regression', epochs=1000)
mlp.add(Layer(13, 16, Relu()))
mlp.add(Layer(16, 16, Relu()))
mlp.add(Layer(16, 1, Linear()))

dt = DecisionTree(criterion='squared_error', max_depth=10, task='regression')

df_regression = find_best_model(
    X_train=X_train_r,
    y_train=y_train_r,
    X_val=X_val_r,
    y_val=y_val_r,
    level_0=[regression, mlp, dt],
    level_1=[regression_meta, DecisionTree(criterion='squared_error', max_depth=5, task='regression')],
    accuracy_fxn=r2_score,
    task='regression',
    y_transform_function=identity
)


  0%|          | 0/4 [00:00<?, ?it/s]

Level 0:
epoch: 999 	Train:[loss:0.2977 acc:0.6910]]
epoch: 999 	Train:[loss:0.4432 acc:0.5399]]
Accuracy of decision tree: 0.9834041647627584
Level 1:
epoch: 999 	Train:[loss:0.2075 acc:0.8076]]
(455, 13) (455, 1)
epoch: 999 	Train:[loss:0.2896 acc:0.7142]]
epoch: 999 	Train:[loss:0.4481 acc:0.5579]]
Accuracy of decision tree: 0.9912390730252666
epoch: 999 	Train:[loss:0.2816 acc:0.7358]]
epoch: 999 	Train:[loss:0.3753 acc:0.6479]]
Accuracy of decision tree: 0.9869240340906972
epoch: 999 	Train:[loss:0.2896 acc:0.7133]]
epoch: 999 	Train:[loss:0.3681 acc:0.6357]]
Accuracy of decision tree: 0.9851970973209988
epoch: 999 	Train:[loss:0.2390 acc:0.7459]]
epoch: 999 	Train:[loss:0.2839 acc:0.6982]]
Accuracy of decision tree: 0.9928021314081754
epoch: 999 	Train:[loss:0.2446 acc:0.7286]]
epoch: 999 	Train:[loss:0.2584 acc:0.7133]]
Accuracy of decision tree: 0.9851439044210383
Level 1:
epoch: 999 	Train:[loss:0.2302 acc:0.7666]]
Level 0:
epoch: 999 	Train:[loss:0.2704 acc:0.7259]]
epoch: 99

In [19]:
df_regression

Unnamed: 0,level_1,method,train_acc,val_acc,time_taken
3,<__main__.DecisionTree object at 0x7f408fb26130>,holdout,0.875098,0.850149,12.163668
1,Layer: [in:3] [out:1] [activation:Linear]\n,holdout,0.922403,0.847752,12.768725
2,<__main__.DecisionTree object at 0x7f408fb26130>,blending,0.765768,0.748452,2.088301
0,Layer: [in:3] [out:1] [activation:Linear]\n,blending,0.826081,0.717198,2.435594


In [20]:
logistic_meta = Model(lr=5e-3, logger=None, loss_fxn=CrossEntropyLoss(), type='classification', epochs=1000)
logistic_meta.add(Layer(18, 6, Softmax()))

logistic = Model(lr=5e-3, logger=None, loss_fxn=CrossEntropyLoss(), type='classification', epochs=1000)
logistic.add(Layer(11, 6, Softmax()))

mlp = Model(lr=5e-3, logger=None, loss_fxn=CrossEntropyLoss(), type='classification', epochs=1000)
mlp.add(Layer(11, 16, Relu()))
mlp.add(Layer(16, 16, Relu()))
mlp.add(Layer(16, 6, Softmax()))

dt = DecisionTree(criterion='entropy', max_depth=10, task='classification')

df_classification = find_best_model(
    X_train=X_train_c,
    y_train=y_train_c,
    X_val=X_val_c,
    y_val=y_val_c,
    level_0=[logistic, mlp, dt],
    level_1=[logistic_meta, DecisionTree(criterion='entropy', max_depth=5, task='classification')],
    accuracy_fxn=accuracy_score,
    task='classification',
    y_transform_function=np.argmax
)


  0%|          | 0/4 [00:00<?, ?it/s]

Level 0:
epoch: 999 	Train:[loss:1.6424 acc:0.4391]]
epoch: 999 	Train:[loss:1.2851 acc:0.5759]]
Accuracy of decision tree: 0.9534883720930233
Level 1:
epoch: 999 	Train:[loss:1.2616 acc:0.4863]]
(914, 11) (914, 6)
epoch: 999 	Train:[loss:1.6657 acc:0.4487]]
epoch: 999 	Train:[loss:1.2654 acc:0.5828]]
Accuracy of decision tree: 0.908344733242134
epoch: 999 	Train:[loss:1.2469 acc:0.5527]]
epoch: 999 	Train:[loss:1.0553 acc:0.6252]]
Accuracy of decision tree: 0.920656634746922
epoch: 999 	Train:[loss:1.1777 acc:0.5513]]
epoch: 999 	Train:[loss:0.9689 acc:0.6320]]
Accuracy of decision tree: 0.8727770177838577
epoch: 999 	Train:[loss:1.0985 acc:0.5855]]
epoch: 999 	Train:[loss:0.8989 acc:0.6402]]
Accuracy of decision tree: 0.8467852257181943
epoch: 999 	Train:[loss:1.0369 acc:0.6052]]
epoch: 999 	Train:[loss:0.8673 acc:0.6516]]
Accuracy of decision tree: 0.9084699453551912
Level 1:
epoch: 999 	Train:[loss:1.2545 acc:0.4891]]
Level 0:
epoch: 999 	Train:[loss:1.0315 acc:0.5919]]
epoch: 999 

In [21]:
df_classification

Unnamed: 0,level_1,method,train_acc,val_acc,time_taken
3,<__main__.DecisionTree object at 0x7f403a5d8c40>,holdout,0.603939,0.545852,30.49013
0,Layer: [in:18] [out:6] [activation:Softmax]\n,blending,0.507659,0.458515,5.109774
1,Layer: [in:18] [out:6] [activation:Softmax]\n,holdout,0.507659,0.454148,27.930266
2,<__main__.DecisionTree object at 0x7f403a5d8c40>,blending,0.535011,0.414847,4.047029


### Analysis

The following uses data from the above dataframes and accuracy reported is that of validation accuracy.

1. Regression:
    1. Stacking: accuracy=0.85, time_taken= 12s
    2. Bagging:
        - Linear Regression: accuracy=0.79, time_taken=4.8s
        - MLP: accuracy=0.67, time_taken=16.3s
        - Decision Tree: accuracy=0.82, time_taken=3.41s 
<br><br>

2. Classification:
    1. Stacking: accuracy=0.54, time_taken= 30s
    2. Bagging:
        - Logistic Regression: accuracy=0.22, time_taken=10.65s
        - MLP: accuracy=0.40, time_taken=25.3s
        - Decision Tree: accuracy=0.60, time_taken=6.41s