In [1]:
# Bagging decision trees for classification
from pandas import read_csv
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

In [2]:
filename = 'Fraud_check.csv'
names = ['Sales','CompPrice','Income','Advertising','Population','Price','ShelveLoc','Age','Education','Urban','US']
dataframe = read_csv(filename)
dataframe

Unnamed: 0,Undergrad,Marital.Status,Taxable.Income,City.Population,Work.Experience,Urban
0,NO,Single,68833,50047,10,YES
1,YES,Divorced,33700,134075,18,YES
2,NO,Married,36925,160205,30,YES
3,YES,Single,50190,193264,15,YES
4,NO,Married,81002,27533,28,NO
...,...,...,...,...,...,...
595,YES,Divorced,76340,39492,7,YES
596,YES,Divorced,69967,55369,2,YES
597,NO,Divorced,47334,154058,0,YES
598,YES,Married,98592,180083,17,NO


In [3]:
dataframe['Undergrad'].value_counts()

YES    312
NO     288
Name: Undergrad, dtype: int64

In [4]:
dataframe['Undergrad'] = dataframe['Undergrad'].map({'YES': 1, 'NO': 0})

In [5]:
dataframe['Undergrad'].value_counts()

1    312
0    288
Name: Undergrad, dtype: int64

In [6]:
dataframe['Marital.Status'].value_counts()

Single      217
Married     194
Divorced    189
Name: Marital.Status, dtype: int64

In [7]:
dataframe['Marital.Status'] = dataframe['Marital.Status'].map({'Divorced': 0,'Single': 1, 'Married': 2})

In [8]:
dataframe['Marital.Status'].value_counts()

1    217
2    194
0    189
Name: Marital.Status, dtype: int64

In [9]:
dataframe['Urban'].value_counts()

YES    302
NO     298
Name: Urban, dtype: int64

In [10]:
dataframe['Urban'] = dataframe['Urban'].map({'YES': 1, 'NO': 0})

In [11]:
dataframe['Urban'].value_counts()

1    302
0    298
Name: Urban, dtype: int64

In [31]:
##Converting the Taxable income variable to bucketing. 
dataframe['Taxable.Income'] = dataframe['Taxable.Income'].apply(lambda x: 'Risky' if x>30000  else 'Good')

In [34]:
## riskey = 0 , good = 1
dataframe['Taxable.Income'] = dataframe['Taxable.Income'].apply(lambda x: 0 if x== 'Risky'  else 1)

In [46]:
dataframe

Unnamed: 0,Undergrad,Marital.Status,Taxable.Income,City.Population,Work.Experience,Urban
0,0,1,0,50047,10,1
1,1,0,0,134075,18,1
2,0,2,0,160205,30,1
3,1,1,0,193264,15,1
4,0,2,0,27533,28,0
...,...,...,...,...,...,...
595,1,0,0,39492,7,1
596,1,0,0,55369,2,1
597,0,0,0,154058,0,1
598,1,2,0,180083,17,0


In [47]:
array = dataframe.values
X = array[:,0:5]
Y = array[:,5]

In [48]:
seed = 7
kfold = KFold(n_splits = 10, random_state = seed, shuffle = True)
cart = DecisionTreeClassifier()
num_trees = 100
model = BaggingClassifier(base_estimator = cart, n_estimators = num_trees, random_state = seed)
results = cross_val_score(model, X, Y, cv = kfold)
print(results.mean())

0.5066666666666666


In [49]:
results

array([0.46666667, 0.43333333, 0.51666667, 0.6       , 0.51666667,
       0.46666667, 0.45      , 0.55      , 0.48333333, 0.58333333])

In [50]:
# Random Forest Classifier
from sklearn.ensemble import RandomForestClassifier

kfold = KFold(n_splits = 10, random_state = 7, shuffle = True)
cart = DecisionTreeClassifier()
num_trees = 100
max_features = 3
model = RandomForestClassifier(n_estimators = num_trees, max_features = max_features)
results = cross_val_score(model, X, Y, cv = kfold)
print(results.mean())

0.5033333333333335


In [51]:
# AdaBoost classification
from sklearn.ensemble import AdaBoostClassifier

kfold = KFold(n_splits = 10, random_state = seed, shuffle = True)
cart = DecisionTreeClassifier()
num_trees = 10
seed = 7
model = AdaBoostClassifier(n_estimators = num_trees, random_state = seed)
results = cross_val_score(model, X, Y, cv = kfold)
print(results.mean())

0.5183333333333333


In [52]:
# Stacking Ensemble for classification
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier

kfold = KFold(n_splits = 10, random_state = 7, shuffle = True)

# create the sub models
estimators = []
model1 = LogisticRegression(max_iter = 500)
estimators.append(('logistic', model1))
model2 = DecisionTreeClassifier()
estimators.append(('cart', model2))
model3 = SVC()
estimators.append(('svm', model3))

# create the ensemble model
ensemble = VotingClassifier(estimators)
results = cross_val_score(ensemble, X, Y, cv = kfold)
print(results.mean())

0.49000000000000005


In [53]:
estimators

[('logistic', LogisticRegression(max_iter=500)),
 ('cart', DecisionTreeClassifier()),
 ('svm', SVC())]

In [54]:
model1

LogisticRegression(max_iter=500)

In [55]:
model2

DecisionTreeClassifier()

In [56]:
model3

SVC()

In [57]:
ensemble

VotingClassifier(estimators=[('logistic', LogisticRegression(max_iter=500)),
                             ('cart', DecisionTreeClassifier()),
                             ('svm', SVC())])