Stacking

In [2]:
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

data = load_breast_cancer()

data_df = pd.DataFrame(data = data.data,
                       columns = data.feature_names)

X_train, X_rem, y_train, y_rem = train_test_split(data.data, data.target, random_state=97, train_size=0.6)

X_valid, X_test, y_valid, y_test = train_test_split(X_rem, y_rem, random_state=97, test_size=0.3)

print(data.data.size)

17070


In [3]:
data = load_breast_cancer()

data_df = pd.DataFrame(data = data.data,
                       columns = data.feature_names)

X_train, X_rem, y_train, y_rem = train_test_split(data.data, data.target, random_state=97, train_size=0.6)

X_valid, X_test, y_valid, y_test = train_test_split(X_rem, y_rem, random_state=97, test_size=0.3)

print(data.data.size)


17070


In [4]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB

models = dict()
# preds = list()
models['lr'] = LogisticRegression(max_iter=100000)
models['cart'] = DecisionTreeClassifier()
models['bayes'] = GaussianNB()



for model in models:
  models[model].fit(X_train,y_train)

In [5]:
pred1 = models['lr'].predict(X_valid)
pred2 = models['cart'].predict(X_valid)
pred3 = models['bayes'].predict(X_valid)

test_preds1 = models['lr'].predict(X_test)
test_preds2 = models['cart'].predict(X_test)
test_preds3 = models['bayes'].predict(X_test)

data_df_new = pd.DataFrame(data = X_valid,
                       columns = data.feature_names)
data_df_new['lr'] = pred1
data_df_new['cart'] = pred2
data_df_new['bayes'] = pred3


print(data_df_new.info())
print(data_df_new.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 159 entries, 0 to 158
Data columns (total 33 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   mean radius              159 non-null    float64
 1   mean texture             159 non-null    float64
 2   mean perimeter           159 non-null    float64
 3   mean area                159 non-null    float64
 4   mean smoothness          159 non-null    float64
 5   mean compactness         159 non-null    float64
 6   mean concavity           159 non-null    float64
 7   mean concave points      159 non-null    float64
 8   mean symmetry            159 non-null    float64
 9   mean fractal dimension   159 non-null    float64
 10  radius error             159 non-null    float64
 11  texture error            159 non-null    float64
 12  perimeter error          159 non-null    float64
 13  area error               159 non-null    float64
 14  smoothness error         1

In [6]:
pred1 = models['lr'].predict(X_valid)
pred2 = models['cart'].predict(X_valid)
pred3 = models['bayes'].predict(X_valid)

test_preds1 = models['lr'].predict(X_test)
test_preds2 = models['cart'].predict(X_test)
test_preds3 = models['bayes'].predict(X_test)

data_df_new = pd.DataFrame(data = X_valid,
                       columns = data.feature_names)
data_df_new['lr'] = pred1
data_df_new['cart'] = pred2
data_df_new['bayes'] = pred3


print(data_df_new.info())
print(data_df_new.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 159 entries, 0 to 158
Data columns (total 33 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   mean radius              159 non-null    float64
 1   mean texture             159 non-null    float64
 2   mean perimeter           159 non-null    float64
 3   mean area                159 non-null    float64
 4   mean smoothness          159 non-null    float64
 5   mean compactness         159 non-null    float64
 6   mean concavity           159 non-null    float64
 7   mean concave points      159 non-null    float64
 8   mean symmetry            159 non-null    float64
 9   mean fractal dimension   159 non-null    float64
 10  radius error             159 non-null    float64
 11  texture error            159 non-null    float64
 12  perimeter error          159 non-null    float64
 13  area error               159 non-null    float64
 14  smoothness error         1

In [7]:
print(data_df.head())

   mean radius  mean texture  mean perimeter  mean area  mean smoothness  \
0        17.99         10.38          122.80     1001.0          0.11840   
1        20.57         17.77          132.90     1326.0          0.08474   
2        19.69         21.25          130.00     1203.0          0.10960   
3        11.42         20.38           77.58      386.1          0.14250   
4        20.29         14.34          135.10     1297.0          0.10030   

   mean compactness  mean concavity  mean concave points  mean symmetry  \
0           0.27760          0.3001              0.14710         0.2419   
1           0.07864          0.0869              0.07017         0.1812   
2           0.15990          0.1974              0.12790         0.2069   
3           0.28390          0.2414              0.10520         0.2597   
4           0.13280          0.1980              0.10430         0.1809   

   mean fractal dimension  ...  worst radius  worst texture  worst perimeter  \
0           

In [8]:
train_stack = np.column_stack((pred1,pred2,pred3))
test_stack = np.column_stack((test_preds1,test_preds2,test_preds3))

In [9]:
final_model = LogisticRegression(max_iter=100000)

final_model.fit(train_stack,y_valid)

LogisticRegression(max_iter=100000)

In [10]:
final_predictions = final_model.predict(test_stack)

In [11]:
from sklearn import metrics

print("Accuracy: ",metrics.accuracy_score(y_test, final_predictions))
print("Precision: ",metrics.precision_score(y_test, final_predictions))
print("Recall: ",metrics.recall_score(y_test, final_predictions))

Accuracy:  0.9855072463768116
Precision:  0.9791666666666666
Recall:  1.0


In [12]:
from sklearn.metrics import confusion_matrix, classification_report
import seaborn as sns
import matplotlib.pyplot as plt

def model_Evaluate(model, y_test, final_predictions):
  print(classification_report(y_test, final_predictions))


model_Evaluate(final_model, y_test, final_predictions)

              precision    recall  f1-score   support

           0       1.00      0.95      0.98        22
           1       0.98      1.00      0.99        47

    accuracy                           0.99        69
   macro avg       0.99      0.98      0.98        69
weighted avg       0.99      0.99      0.99        69



StackingClassifier from sklearn

In [13]:
from sklearn.ensemble import StackingClassifier
from sklearn.tree import DecisionTreeClassifier

from sklearn.model_selection import train_test_split
from sklearn.datasets import load_breast_cancer

X, y = load_breast_cancer(return_X_y=True)

data=load_breast_cancer()
data_df = pd.DataFrame(data = data.data,
                       columns = data.feature_names)

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=97, train_size=0.8)


model1 = LogisticRegression(max_iter=100000)
model2 = DecisionTreeClassifier()
model3 = GaussianNB()

print(y_test)

[1 0 1 1 0 1 1 1 1 0 0 1 1 1 1 0 0 1 1 1 0 0 1 1 0 1 1 1 1 0 0 0 1 0 0 0 0
 1 0 1 1 0 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 0 1 0 1 0 0 0 0 1 0 0 1 0 1 0
 1 1 1 0 1 0 0 1 1 0 1 1 1 0 1 0 1 1 1 0 1 1 1 0 0 1 0 1 0 0 1 1 0 1 0 1 0
 0 0 0]


In [14]:
estimators = [
     ('lr', model1),
     ('cart', model2),
     ('bayes', model3)
]

final_model = LogisticRegression(max_iter=100000)
sclf = StackingClassifier(estimators=estimators,
                            final_estimator=final_model,
                            cv=10)

In [15]:
sclf.fit(X_train, y_train)

StackingClassifier(cv=10,
                   estimators=[('lr', LogisticRegression(max_iter=100000)),
                               ('cart', DecisionTreeClassifier()),
                               ('bayes', GaussianNB())],
                   final_estimator=LogisticRegression(max_iter=100000))

In [16]:
prediction = sclf.predict(X_test)

In [17]:
print("Accuracy: ",sclf.score(X_test, y_test))
print("Precision: ",metrics.precision_score( y_test, prediction))
print("Accuracy: ",metrics.recall_score( y_test, prediction))

Accuracy:  0.956140350877193
Precision:  0.9558823529411765
Accuracy:  0.9701492537313433


AdaBoost

In [18]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

from sklearn.model_selection import train_test_split
from sklearn.datasets import load_breast_cancer

X, y = load_breast_cancer(return_X_y=True)

data=load_breast_cancer()
data_df = pd.DataFrame(data = data.data,
                       columns = data.feature_names)

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=97, train_size=0.8)


model1 = LogisticRegression(max_iter=100000)
model2 = DecisionTreeClassifier()
model3 = GaussianNB()

estimators = [
     ('lr', model1),
     ('cart', model2),
     ('bayes', model3)
]

In [19]:
abc = AdaBoostClassifier(learning_rate=1)
abc.fit(X_train, y_train)

prediction = abc.predict(X_test)

In [20]:
from sklearn import metrics

print("Accuracy: ",abc.score(X_test, y_test))
print("Precision: ",metrics.precision_score(y_test, prediction))
print("Recall: ",metrics.recall_score( y_test, prediction))

Accuracy:  0.9473684210526315
Precision:  0.9552238805970149
Recall:  0.9552238805970149


Adaboost Regression on concrete_data.csv

In [21]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [22]:
datasets = pd.read_csv('/content/drive/MyDrive/datasets/Contrete.csv')

In [23]:
X = datasets.iloc[:, :-1].values

# Only last column, 0 for 1st column and -1 for last colum,-2 for 2nd last column
y = datasets.iloc[:, -1].values
print("\n\nInput : \n", X)
print("\n\nOutput: \n", y)



Input : 
 [[ 540.     0.     0.  ... 1040.   676.    28. ]
 [ 540.     0.     0.  ... 1055.   676.    28. ]
 [ 332.5  142.5    0.  ...  932.   594.   270. ]
 ...
 [ 148.5  139.4  108.6 ...  892.4  780.    28. ]
 [ 159.1  186.7    0.  ...  989.6  788.9   28. ]
 [ 260.9  100.5   78.3 ...  864.5  761.5   28. ]]


Output: 
 [79.99 61.89 40.27 ... 23.7  32.77 32.4 ]


In [24]:
from sklearn.model_selection import train_test_split

#split data set into train and test sets
X_train, X_test, y_train, y_test = train_test_split(datasets, y, test_size = 0.25, random_state = 97)

print(y_test)

[22.72 13.57 51.02 25.22 21.78 35.76 36.35 33.08 55.06 37.96 56.83 39.36
 25.69 66.82 67.11 23.51 14.6  61.09 65.91 76.24 21.95 33.06 55.02 17.22
 50.53 57.03 11.41 39.38 17.95 25.73 51.72 15.61 21.97 24.24 69.3  35.57
 28.94 64.3   6.28 28.02 79.4  77.3  55.6  15.34 59.09 72.99 70.7  42.8
 45.08 66.   40.56 55.51 58.52 33.76 29.41 61.99 31.45 31.35 30.22  8.
 53.69 53.9  53.52 12.47 13.09 32.84 36.45 44.09 32.53 52.83 42.29 35.08
 17.44  9.73 42.22 14.99 40.27 74.36 34.9  74.19 17.6  23.8  10.54 14.2
 18.2  15.42 39.   26.92 41.05 32.24 41.67 26.85 39.42 37.81 24.28 12.55
 42.55 33.4  15.53 31.35 39.64 29.73 37.34 25.1  19.77 54.38 10.39 53.3
 12.45 41.24 52.44 22.9  17.54 12.64 37.68 44.52 52.04 32.01 26.26 40.2
 33.76 31.18 44.4  33.61  7.75 59.   52.12 50.73 46.9  32.63 43.38 56.74
 32.05 40.87 51.73 22.44 64.9  41.68  6.27 21.48 12.05 41.16 43.38 24.43
 17.84 10.35 48.79 66.9  33.73  7.84 66.7  13.46 52.52 11.65 14.4  44.87
 18.42 19.69 64.9  39.29 13.12 55.94 14.59 38.11 49.8  55

In [25]:
from sklearn.ensemble import AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor

 
#Choosing Decision Tree with 1 level as the weak learner
DTR=DecisionTreeRegressor(max_depth=1)
RegModel = AdaBoostRegressor(n_estimators=50, base_estimator=DTR ,learning_rate=1)

In [26]:
AB=RegModel.fit(X_train,y_train)
y_pred=AB.predict(X_test)

In [27]:
#Import scikit-learn metrics module for accuracy calculation
from sklearn import metrics
# model_Evaluate(AB, y_test, predictions) --> doesn't work for continuous values
print("Accuracy: ",RegModel.score(X_test, y_test))

Accuracy:  0.7057686422041147


In [28]:
from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split

X,y = load_diabetes(return_X_y=True)

#split data set into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 97)


from sklearn.ensemble import AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor

 
#Choosing Decision Tree with 1 level as the weak learner
DTR=DecisionTreeRegressor(max_depth=10)
RegModel = AdaBoostRegressor(n_estimators=100, base_estimator=DTR ,learning_rate=1)

AB=RegModel.fit(X_train,y_train)
y_pred=AB.predict(X_test)

from sklearn import metrics
from sklearn.metrics import mean_squared_error
# model_Evaluate(AB, y_test, predictions) --> doesn't work for continuous values
print("Accuracy: ",RegModel.score(X_test, y_test))

print("Mean Square Error: ",mean_squared_error(y_test,y_pred))

Accuracy:  0.38678549468167767
Mean Square Error:  3942.5735170757184
