# How to stack machine learning models for improved performance

# Load dataset

In [1]:
# Load the iris dataset and unpack it into a feature matrix `x` and a
# label vector `y`.
from sklearn.datasets import load_iris

iris = load_iris()
x, y = iris.data, iris.target

In [2]:
# Filter out low-variance features (variance cutoff: 0.1), then display the
# shape of the filtered matrix as the cell output.
from sklearn.feature_selection import VarianceThreshold

selection = VarianceThreshold(threshold=0.1)
selection.fit(x)
X = selection.transform(x)
X.shape

(150, 4)

In [3]:
# Data splitting
# Hold out 20% of the samples as a test set. `stratify=y` keeps the class
# proportions the same in both parts, and `random_state` fixes the shuffle so
# the split is reproducible.
# Split the variance-filtered matrix `X` (not the raw `x`) so the
# feature-selection step above actually feeds the models.
from sklearn.model_selection import train_test_split

train_x, test_x, train_y, test_y = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=42
)

In [4]:
# Shapes of the training and test feature matrices, shown as a pair.
(train_x.shape, test_x.shape)

((120, 4), (30, 4))

# Build Classification models

In [5]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import matthews_corrcoef 
from sklearn.metrics import f1_score

## K-nearest neighbors


In [6]:
from sklearn.neighbors import KNeighborsClassifier

In [7]:
# Fit a 3-nearest-neighbors classifier on the training split.
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(train_x, train_y)

# Make predictions on both splits
train_y_pred = knn.predict(train_x)
y_test_pred = knn.predict(test_x)

# Training set performance
# (F1 is computed with average="weighted" because the target is multi-class.)
knn_train_accuracy = accuracy_score(train_y, train_y_pred)
knn_train_mcc = matthews_corrcoef(train_y, train_y_pred)
knn_train_f1 = f1_score(train_y, train_y_pred, average="weighted")

# Test set performance
knn_test_accuracy = accuracy_score(test_y, y_test_pred)
knn_test_mcc = matthews_corrcoef(test_y, y_test_pred)
knn_test_f1 = f1_score(test_y, y_test_pred, average="weighted")

print('Model performance for Training set')
print(f'-Accuracy: {knn_train_accuracy}')
print(f"-MCC: {knn_train_mcc}")
print(f"F1 Score: {knn_train_f1}")
print("--------------------------------------")
# The values below are the held-out scores, so the header must say "Test set".
print('Model performance for Test set')
print(f'-Accuracy: {knn_test_accuracy}')
print(f"-MCC: {knn_test_mcc}")
print(f"F1 Score: {knn_test_f1}")

Model performance for Training set
-Accuracy: 0.9583333333333334
-MCC: 0.9375976715114386
F1 Score: 0.9583268218992551
--------------------------------------
Model performance for Training set
-Accuracy: 1.0
-MCC: 1.0
F1 Score: 1.0


## Support vector machine (Radial basis function kernel)

In [8]:
from sklearn.svm import SVC

# Support vector classifier; SVC's default kernel is the radial basis function.
svm_rbf = SVC(gamma=2, C=1)
svm_rbf.fit(train_x, train_y)

# Make predictions on both splits
train_y_pred = svm_rbf.predict(train_x)
y_test_pred = svm_rbf.predict(test_x)

# Training set performance
# (F1 is computed with average="weighted" because the target is multi-class.)
svm_train_accuracy = accuracy_score(train_y, train_y_pred)
svm_train_mcc = matthews_corrcoef(train_y, train_y_pred)
svm_train_f1 = f1_score(train_y, train_y_pred, average="weighted")

# Test set performance
svm_test_accuracy = accuracy_score(test_y, y_test_pred)
svm_test_mcc = matthews_corrcoef(test_y, y_test_pred)
svm_test_f1 = f1_score(test_y, y_test_pred, average="weighted")

print('Model performance for Training set')
print(f'-Accuracy: {svm_train_accuracy}')
print(f"-MCC: {svm_train_mcc}")
print(f"F1 Score: {svm_train_f1}")
print("--------------------------------------")
# The values below are the held-out scores, so the header must say "Test set".
print('Model performance for Test set')
print(f'-Accuracy: {svm_test_accuracy}')
print(f"-MCC: {svm_test_mcc}")
print(f"F1 Score: {svm_test_f1}")

Model performance for Training set
-Accuracy: 0.9916666666666667
-MCC: 0.9876028806587153
F1 Score: 0.9916653643798512
--------------------------------------
Model performance for Training set
-Accuracy: 0.9666666666666667
-MCC: 0.9515873026942034
F1 Score: 0.9665831244778613


## Decision tree


In [9]:
from sklearn.tree import DecisionTreeClassifier

# Depth-limited decision tree. `random_state` pins the estimator's internal
# randomness so the reported scores are the same on every run.
dt = DecisionTreeClassifier(max_depth=5, random_state=42)
dt.fit(train_x, train_y)

# Make predictions on both splits
train_y_pred = dt.predict(train_x)
y_test_pred = dt.predict(test_x)

# Training set performance
# (F1 is computed with average="weighted" because the target is multi-class.)
dt_train_accuracy = accuracy_score(train_y, train_y_pred)
dt_train_mcc = matthews_corrcoef(train_y, train_y_pred)
dt_train_f1 = f1_score(train_y, train_y_pred, average="weighted")

# Test set performance
dt_test_accuracy = accuracy_score(test_y, y_test_pred)
dt_test_mcc = matthews_corrcoef(test_y, y_test_pred)
dt_test_f1 = f1_score(test_y, y_test_pred, average="weighted")

print('Model performance for Training set')
print(f'-Accuracy: {dt_train_accuracy}')
print(f"-MCC: {dt_train_mcc}")
print(f"F1 Score: {dt_train_f1}")
print("--------------------------------------")
# The values below are the held-out scores, so the header must say "Test set".
print('Model performance for Test set')
print(f'-Accuracy: {dt_test_accuracy}')
print(f"-MCC: {dt_test_mcc}")
print(f"F1 Score: {dt_test_f1}")

Model performance for Training set
-Accuracy: 1.0
-MCC: 1.0
F1 Score: 1.0
--------------------------------------
Model performance for Training set
-Accuracy: 0.9333333333333333
-MCC: 0.9
F1 Score: 0.9333333333333333


## Random forest 

In [10]:
from sklearn.ensemble import RandomForestClassifier

# Small random forest (10 trees). `random_state` pins the bootstrap sampling
# and feature sub-sampling so the reported scores are the same on every run.
rf = RandomForestClassifier(n_estimators=10, random_state=42)
rf.fit(train_x, train_y)

# Make predictions on both splits
train_y_pred = rf.predict(train_x)
y_test_pred = rf.predict(test_x)

# Training set performance
# (F1 is computed with average="weighted" because the target is multi-class.)
rf_train_accuracy = accuracy_score(train_y, train_y_pred)
rf_train_mcc = matthews_corrcoef(train_y, train_y_pred)
rf_train_f1 = f1_score(train_y, train_y_pred, average="weighted")

# Test set performance
rf_test_accuracy = accuracy_score(test_y, y_test_pred)
rf_test_mcc = matthews_corrcoef(test_y, y_test_pred)
rf_test_f1 = f1_score(test_y, y_test_pred, average="weighted")

print('Model performance for Training set')
print(f'-Accuracy: {rf_train_accuracy}')
print(f"-MCC: {rf_train_mcc}")
print(f"F1 Score: {rf_train_f1}")
print("--------------------------------------")
# The values below are the held-out scores, so the header must say "Test set".
print('Model performance for Test set')
print(f'-Accuracy: {rf_test_accuracy}')
print(f"-MCC: {rf_test_mcc}")
print(f"F1 Score: {rf_test_f1}")

Model performance for Training set
-Accuracy: 1.0
-MCC: 1.0
F1 Score: 1.0
--------------------------------------
Model performance for Training set
-Accuracy: 0.9666666666666667
-MCC: 0.9515873026942034
F1 Score: 0.9665831244778613


## Neural Network 

In [12]:
from sklearn.neural_network import MLPClassifier

# Multi-layer perceptron with L2 penalty alpha=1 and up to 1000 training
# iterations. `random_state` pins the weight initialisation so the reported
# scores are the same on every run.
mlp = MLPClassifier(alpha=1, max_iter=1000, random_state=42)
mlp.fit(train_x, train_y)

# Make predictions on both splits
train_y_pred = mlp.predict(train_x)
y_test_pred = mlp.predict(test_x)

# Training set performance
# (F1 is computed with average="weighted" because the target is multi-class.)
mlp_train_accuracy = accuracy_score(train_y, train_y_pred)
mlp_train_mcc = matthews_corrcoef(train_y, train_y_pred)
mlp_train_f1 = f1_score(train_y, train_y_pred, average="weighted")

# Test set performance
mlp_test_accuracy = accuracy_score(test_y, y_test_pred)
mlp_test_mcc = matthews_corrcoef(test_y, y_test_pred)
mlp_test_f1 = f1_score(test_y, y_test_pred, average="weighted")

print('Model performance for Training set')
print(f'-Accuracy: {mlp_train_accuracy}')
print(f"-MCC: {mlp_train_mcc}")
print(f"F1 Score: {mlp_train_f1}")
print("--------------------------------------")
# The values below are the held-out scores, so the header must say "Test set".
print('Model performance for Test set')
print(f'-Accuracy: {mlp_test_accuracy}')
print(f"-MCC: {mlp_test_mcc}")
print(f"F1 Score: {mlp_test_f1}")

Model performance for Training set
-Accuracy: 0.9833333333333333
-MCC: 0.9754065040827025
F1 Score: 0.9833229101521785
--------------------------------------
Model performance for Training set
-Accuracy: 1.0
-MCC: 1.0
F1 Score: 1.0


# Building the stacked model

In [15]:
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression

# Base (level-0) learners: the five classifiers configured in the cells above,
# each paired with a short name.
estimator_list = [
    ('knn', knn),
    ('svm', svm_rbf),
    ('dt', dt),
    ('rf', rf),
    ('mlp', mlp),
]

# Build the stacked model: a logistic regression (level-1) combines the base
# learners' predictions.
stack_model = StackingClassifier(
    estimators=estimator_list, final_estimator=LogisticRegression()
)

# Train the stacked model.
# NOTE: StackingClassifier fits its own clones of the base estimators, so the
# already-fitted objects above serve as configuration templates only.
stack_model.fit(train_x, train_y)

# Make predictions on both splits
train_y_pred = stack_model.predict(train_x)
y_test_pred = stack_model.predict(test_x)

# Training set performance
# (F1 is computed with average="weighted" because the target is multi-class.)
stack_train_accuracy = accuracy_score(train_y, train_y_pred)
stack_train_mcc = matthews_corrcoef(train_y, train_y_pred)
stack_train_f1 = f1_score(train_y, train_y_pred, average="weighted")

# Test set performance
stack_test_accuracy = accuracy_score(test_y, y_test_pred)
stack_test_mcc = matthews_corrcoef(test_y, y_test_pred)
stack_test_f1 = f1_score(test_y, y_test_pred, average="weighted")

print('Model performance for Training set')
print(f'-Accuracy: {stack_train_accuracy}')
print(f"-MCC: {stack_train_mcc}")
print(f"F1 Score: {stack_train_f1}")
print("--------------------------------------")
# The values below are the held-out scores, so the header must say "Test set".
print('Model performance for Test set')
print(f'-Accuracy: {stack_test_accuracy}')
print(f"-MCC: {stack_test_mcc}")
print(f"F1 Score: {stack_test_f1}")

Model performance for Training set
-Accuracy: 0.9916666666666667
-MCC: 0.9876028806587153
F1 Score: 0.9916653643798512
--------------------------------------
Model performance for Training set
-Accuracy: 1.0
-MCC: 1.0
F1 Score: 1.0
