<a href="https://colab.research.google.com/github/Bulat27/ML_Exercise_Stacking/blob/master/Stacking.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Importing necessary libraries and dataset

In [1]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import LabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

from sklearn.ensemble import StackingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC

from sklearn.linear_model import LogisticRegression

In [2]:
url = 'https://raw.githubusercontent.com/Bulat27/ML_Exercise_Stacking/master/bank.csv'
df_bank = pd.read_csv(url, sep = ";")

In [3]:
df_bank.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,30,unemployed,married,primary,no,1787,no,no,cellular,19,oct,79,1,-1,0,unknown,no
1,33,services,married,secondary,no,4789,yes,yes,cellular,11,may,220,1,339,4,failure,no
2,35,management,single,tertiary,no,1350,yes,no,cellular,16,apr,185,1,330,1,failure,no
3,30,management,married,tertiary,no,1476,yes,yes,unknown,3,jun,199,4,-1,0,unknown,no
4,59,blue-collar,married,secondary,no,0,yes,no,unknown,5,may,226,1,-1,0,unknown,no


# Pre-processing data

In [4]:
saved_y = df_bank[['y']]

df_bank = df_bank.drop('y', axis = 1)

In [5]:
cat_col = 'object'
cat_columns = list(df_bank.select_dtypes(include=cat_col).columns)
cat_data = df_bank[cat_columns]
cat_vars = cat_data.columns

In [6]:
for var in cat_vars:
    cat_list = pd.get_dummies(df_bank[var], prefix=var)
    df_bank=df_bank.join(cat_list)

    
data_vars=df_bank.columns.values.tolist()
to_keep=[i for i in data_vars if i not in cat_vars]

In [7]:
bank_final_df =df_bank[to_keep]

In [8]:
bank_final_df.head()

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous,job_admin.,job_blue-collar,job_entrepreneur,...,month_jun,month_mar,month_may,month_nov,month_oct,month_sep,poutcome_failure,poutcome_other,poutcome_success,poutcome_unknown
0,30,1787,19,79,1,-1,0,0,0,0,...,0,0,0,0,1,0,0,0,0,1
1,33,4789,11,220,1,339,4,0,0,0,...,0,0,1,0,0,0,1,0,0,0
2,35,1350,16,185,1,330,1,0,0,0,...,0,0,0,0,0,0,1,0,0,0
3,30,1476,3,199,4,-1,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1
4,59,0,5,226,1,-1,0,0,1,0,...,0,0,1,0,0,0,0,0,0,1


In [9]:
bank_final_df.columns.values

array(['age', 'balance', 'day', 'duration', 'campaign', 'pdays',
       'previous', 'job_admin.', 'job_blue-collar', 'job_entrepreneur',
       'job_housemaid', 'job_management', 'job_retired',
       'job_self-employed', 'job_services', 'job_student',
       'job_technician', 'job_unemployed', 'job_unknown',
       'marital_divorced', 'marital_married', 'marital_single',
       'education_primary', 'education_secondary', 'education_tertiary',
       'education_unknown', 'default_no', 'default_yes', 'housing_no',
       'housing_yes', 'loan_no', 'loan_yes', 'contact_cellular',
       'contact_telephone', 'contact_unknown', 'month_apr', 'month_aug',
       'month_dec', 'month_feb', 'month_jan', 'month_jul', 'month_jun',
       'month_mar', 'month_may', 'month_nov', 'month_oct', 'month_sep',
       'poutcome_failure', 'poutcome_other', 'poutcome_success',
       'poutcome_unknown'], dtype=object)

In [10]:
df_bank = pd.concat([bank_final_df, saved_y], axis = 1)

In [11]:
df_bank.head()

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous,job_admin.,job_blue-collar,job_entrepreneur,...,month_mar,month_may,month_nov,month_oct,month_sep,poutcome_failure,poutcome_other,poutcome_success,poutcome_unknown,y
0,30,1787,19,79,1,-1,0,0,0,0,...,0,0,0,1,0,0,0,0,1,no
1,33,4789,11,220,1,339,4,0,0,0,...,0,1,0,0,0,1,0,0,0,no
2,35,1350,16,185,1,330,1,0,0,0,...,0,0,0,0,0,1,0,0,0,no
3,30,1476,3,199,4,-1,0,0,0,0,...,0,0,0,0,0,0,0,0,1,no
4,59,0,5,226,1,-1,0,0,1,0,...,0,1,0,0,0,0,0,0,1,no


In [12]:
binarizer = LabelBinarizer()

In [13]:
transformed_y = binarizer.fit_transform(df_bank.y.values.reshape(-1, 1))

In [14]:
transformed_y

array([[0],
       [0],
       [0],
       ...,
       [0],
       [0],
       [0]])

In [15]:
df_bank['y_transformed'] = transformed_y
df_bank['y_transformed'] = df_bank['y_transformed'].astype('int64') 

In [16]:
df_bank['y'].value_counts(normalize = True)

no     0.88476
yes    0.11524
Name: y, dtype: float64

In [17]:
df_bank['y_transformed'].value_counts(normalize = True)

0    0.88476
1    0.11524
Name: y_transformed, dtype: float64

# Split the data

In [18]:
x = df_bank.drop(['y', 'y_transformed'], axis = 1)
y = df_bank['y_transformed']

In [19]:
trainX, testX, trainY, testY = train_test_split(x, y, test_size = 0.2)

#Stacking model


In [20]:
estimators = [
    ('svm', LinearSVC(max_iter = 1000)),
    ('knn', KNeighborsClassifier(n_neighbors = 5))
]

In [21]:
stack_clf = StackingClassifier(estimators = estimators, final_estimator = LogisticRegression())

In [22]:
stack_clf.fit(trainX, trainY)



StackingClassifier(estimators=[('svm', LinearSVC()),
                               ('knn', KNeighborsClassifier())],
                   final_estimator=LogisticRegression())

# Comparison of stacked model with base models




In [23]:
stack_pred = stack_clf.predict(testX)

In [24]:
svm = LinearSVC(max_iter=1000)
svm.fit(trainX, trainY)
svm_pred = svm.predict(testX)



In [25]:
knn = KNeighborsClassifier(n_neighbors=4)
knn.fit(trainX, trainY)
knn_pred = knn.predict(testX)

In [26]:
print(f'SVM accuracy :{accuracy_score(testY, svm_pred)}')
print(f'KNN accuracy :{accuracy_score(testY, knn_pred)}')
print(f'Stacking accuracy :{accuracy_score(testY, stack_pred)}')

SVM accuracy :0.8994475138121547
KNN accuracy :0.8917127071823204
Stacking accuracy :0.901657458563536
