In [1]:
# Bagging decision trees for classification
from pandas import read_csv
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

In [2]:
filename = 'Company_Data.csv'
names = ['Sales','CompPrice','Income','Advertising','Population','Price','ShelveLoc','Age','Education','Urban','US']
dataframe = read_csv(filename)
dataframe

Unnamed: 0,Sales,CompPrice,Income,Advertising,Population,Price,ShelveLoc,Age,Education,Urban,US
0,9.50,138,73,11,276,120,Bad,42,17,Yes,Yes
1,11.22,111,48,16,260,83,Good,65,10,Yes,Yes
2,10.06,113,35,10,269,80,Medium,59,12,Yes,Yes
3,7.40,117,100,4,466,97,Medium,55,14,Yes,Yes
4,4.15,141,64,3,340,128,Bad,38,13,Yes,No
...,...,...,...,...,...,...,...,...,...,...,...
395,12.57,138,108,17,203,128,Good,33,14,Yes,Yes
396,6.14,139,23,3,37,120,Medium,55,11,No,Yes
397,7.41,162,26,12,368,159,Medium,40,18,Yes,Yes
398,5.94,100,79,7,284,95,Bad,50,12,Yes,Yes


In [3]:
dataframe['Urban'].value_counts()

Yes    282
No     118
Name: Urban, dtype: int64

In [4]:
dataframe['Urban'] = dataframe['Urban'].map({'Yes': 1, 'No': 0})

In [5]:
dataframe['Urban'].value_counts()

1    282
0    118
Name: Urban, dtype: int64

In [6]:
dataframe['US'].value_counts()

Yes    258
No     142
Name: US, dtype: int64

In [7]:
dataframe['US'] = dataframe['US'].map({'Yes': 1, 'No': 0})

In [8]:
dataframe['US'].value_counts()

1    258
0    142
Name: US, dtype: int64

In [9]:
dataframe['ShelveLoc'].value_counts()

Medium    219
Bad        96
Good       85
Name: ShelveLoc, dtype: int64

In [10]:
dataframe['ShelveLoc'] = dataframe['ShelveLoc'].map({'Bad': 0,'Medium': 1, 'Good': 2})

In [11]:
dataframe['ShelveLoc'].value_counts()

1    219
0     96
2     85
Name: ShelveLoc, dtype: int64

In [12]:
dataframe

Unnamed: 0,Sales,CompPrice,Income,Advertising,Population,Price,ShelveLoc,Age,Education,Urban,US
0,9.50,138,73,11,276,120,0,42,17,1,1
1,11.22,111,48,16,260,83,2,65,10,1,1
2,10.06,113,35,10,269,80,1,59,12,1,1
3,7.40,117,100,4,466,97,1,55,14,1,1
4,4.15,141,64,3,340,128,0,38,13,1,0
...,...,...,...,...,...,...,...,...,...,...,...
395,12.57,138,108,17,203,128,2,33,14,1,1
396,6.14,139,23,3,37,120,1,55,11,0,1
397,7.41,162,26,12,368,159,1,40,18,1,1
398,5.94,100,79,7,284,95,0,50,12,1,1


In [13]:
array = dataframe.values
X = array[:,0:10]
Y = array[:,10]

In [14]:
seed = 7
kfold = KFold(n_splits = 10, random_state = seed, shuffle = True)
cart = DecisionTreeClassifier()
num_trees = 100
model = BaggingClassifier(base_estimator = cart, n_estimators = num_trees, random_state = seed)
results = cross_val_score(model, X, Y, cv = kfold)
print(results.mean())

0.885


In [15]:
results

array([0.9  , 0.85 , 0.9  , 0.85 , 0.9  , 0.9  , 0.875, 0.875, 0.875,
       0.925])

In [16]:
# Random Forest Classifier
from sklearn.ensemble import RandomForestClassifier

kfold = KFold(n_splits = 10, random_state = 7, shuffle = True)
cart = DecisionTreeClassifier()
num_trees = 100
max_features = 3
model = RandomForestClassifier(n_estimators = num_trees, max_features = max_features)
results = cross_val_score(model, X, Y, cv = kfold)
print(results.mean())

0.9025000000000001


In [17]:
# AdaBoost classification
from sklearn.ensemble import AdaBoostClassifier

kfold = KFold(n_splits = 10, random_state = seed, shuffle = True)
cart = DecisionTreeClassifier()
num_trees = 10
seed = 7
model = AdaBoostClassifier(n_estimators = num_trees, random_state = seed)
results = cross_val_score(model, X, Y, cv = kfold)
print(results.mean())

0.8950000000000001


In [18]:
# Stacking Ensemble for classification
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier

kfold = KFold(n_splits = 10, random_state = 7, shuffle = True)

# create the sub models
estimators = []
model1 = LogisticRegression(max_iter = 500)
estimators.append(('logistic', model1))
model2 = DecisionTreeClassifier()
estimators.append(('cart', model2))
model3 = SVC()
estimators.append(('svm', model3))

# create the ensemble model
ensemble = VotingClassifier(estimators)
results = cross_val_score(ensemble, X, Y, cv = kfold)
print(results.mean())

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.8525


In [19]:
estimators

[('logistic', LogisticRegression(max_iter=500)),
 ('cart', DecisionTreeClassifier()),
 ('svm', SVC())]

In [20]:
model1

LogisticRegression(max_iter=500)

In [21]:
model2

DecisionTreeClassifier()

In [22]:
model3

SVC()

In [23]:
ensemble

VotingClassifier(estimators=[('logistic', LogisticRegression(max_iter=500)),
                             ('cart', DecisionTreeClassifier()),
                             ('svm', SVC())])