In [1]:
from __future__ import division
from pandas import Series, DataFrame
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# we will use the bank ira data that we used extensively with R
# the first time  bank-ira/bank-data.xlsx
import tkFileDialog  #for file choose dialog
bm = pd.read_excel(tkFileDialog.askopenfilename())

In [3]:
bm.head()

Unnamed: 0,age,sex,region,income,married,children,car,save_act,ch_act,mortgage,ira
0,48,FEMALE,INNER_CITY,17546.0,NO,1,NO,NO,NO,NO,YES
1,40,MALE,TOWN,30085.1,YES,3,YES,NO,YES,YES,NO
2,51,FEMALE,INNER_CITY,16575.4,YES,0,YES,YES,YES,NO,NO
3,23,FEMALE,TOWN,20375.4,YES,3,NO,NO,YES,NO,NO
4,57,FEMALE,RURAL,50576.3,YES,0,NO,YES,NO,NO,NO


In [4]:
bm.shape

(600, 11)

In [5]:
bm.dtypes

age           int64
sex          object
region       object
income      float64
married      object
children      int64
car          object
save_act     object
ch_act       object
mortgage     object
ira          object
dtype: object

In [6]:
# the tree classfier in scikit only accepts numeric data!
# now we will modify all the object columns to integers using labelencoder
# Lets look at three ways of doing it
#  1) one by one manually
#  2) using a for loop but typing in the names of columns manually
#  3) no manual entry of column names, best for big datasets
# first let us make a deepcopy with which to experiment
from copy import deepcopy
bm1 = deepcopy(bm)

In [7]:
# we will use a function LabelEncoder from sklean.preprocessing module
# it looks at the number of unique values in each column and 
#           gives an integer value between 0:number of unique values
from sklearn.preprocessing import LabelEncoder # for label conversion

In [8]:
# method 1: one by one manually
# consider the sex column
bm1["sex"].head()

0    FEMALE
1      MALE
2    FEMALE
3    FEMALE
4    FEMALE
Name: sex, dtype: object

In [9]:
bm1["sex"] = LabelEncoder().fit_transform(bm1["sex"])

In [10]:
bm1["sex"].head()

0    0
1    1
2    0
3    0
4    0
Name: sex, dtype: int64

In [11]:
# method 2: for loop but type in the names of columns
for x in ["region","married","car","save_act","ch_act","mortgage","ira"]:
    bm1[x] = LabelEncoder().fit_transform(bm1[x])
bm1.head()

Unnamed: 0,age,sex,region,income,married,children,car,save_act,ch_act,mortgage,ira
0,48,0,0,17546.0,0,1,0,0,0,0,1
1,40,1,3,30085.1,1,3,1,0,1,1,0
2,51,0,0,16575.4,1,0,1,1,1,0,0
3,23,0,3,20375.4,1,3,0,0,1,0,0
4,57,0,1,50576.3,1,0,0,1,0,0,0


In [12]:
# method 3: fully automated
# let us make a list of columns that are of type object
#  select_dtypes() method selects columns on the basis of column type
#  output is another dataframe with just the selected columns
bm.select_dtypes(include=['object']).head()   

Unnamed: 0,sex,region,married,car,save_act,ch_act,mortgage,ira
0,FEMALE,INNER_CITY,NO,NO,NO,NO,NO,YES
1,MALE,TOWN,YES,YES,NO,YES,YES,NO
2,FEMALE,INNER_CITY,YES,YES,YES,YES,NO,NO
3,FEMALE,TOWN,YES,NO,NO,YES,NO,NO
4,FEMALE,RURAL,YES,NO,YES,NO,NO,NO


In [13]:
# we want just the names of the columns, 
#    use the dataframe method .columns
bm.select_dtypes(include=['object']).columns    # the output is an index of columns

Index([u'sex', u'region', u'married', u'car', u'save_act', u'ch_act',
       u'mortgage', u'ira'],
      dtype='object')

In [14]:
# now to get the list use the list() function
list(bm.select_dtypes(include=['object']).columns)

[u'sex',
 u'region',
 u'married',
 u'car',
 u'save_act',
 u'ch_act',
 u'mortgage',
 u'ira']

In [15]:
# use labelencoder for each column
objcols = list(bm.select_dtypes(include=['object']).columns)
for x in objcols:
    bm[x] = LabelEncoder().fit_transform(bm[x])

In [16]:
bm.dtypes  # now fixed, all are integer or floats

age           int64
sex           int64
region        int64
income      float64
married       int64
children      int64
car           int64
save_act      int64
ch_act        int64
mortgage      int64
ira           int64
dtype: object

In [17]:
bm.head()

Unnamed: 0,age,sex,region,income,married,children,car,save_act,ch_act,mortgage,ira
0,48,0,0,17546.0,0,1,0,0,0,0,1
1,40,1,3,30085.1,1,3,1,0,1,1,0
2,51,0,0,16575.4,1,0,1,1,1,0,0
3,23,0,3,20375.4,1,3,0,0,1,0,0
4,57,0,1,50576.3,1,0,0,1,0,0,0


In [18]:
X = bm.ix[:,0:-1].as_matrix()
y = bm.ira.as_matrix()

In [19]:
from sklearn.cross_validation import train_test_split 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3333, random_state=42)

In [20]:
X_train.shape

(400, 10)

In [21]:
from sklearn.tree import DecisionTreeClassifier
dtree = DecisionTreeClassifier(criterion = "gini", min_samples_split = 10).fit(X_train, y_train)

In [22]:
dtree

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None, min_samples_leaf=1,
            min_samples_split=10, min_weight_fraction_leaf=0.0,
            random_state=None, splitter='best')

In [23]:
# number of nodes
dtree.tree_.node_count

85

In [24]:
y_pred = dtree.predict(X_test)

In [25]:
pd.crosstab(y_test, y_pred, rownames=['True'], colnames=['Predicted'], margins=True)

Predicted,0,1,All
True,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,90,9,99
1,21,80,101
All,111,89,200


In [26]:
from sklearn.metrics import classification_report, accuracy_score
print "Accuracy Score:", accuracy_score(y_test, y_pred), '\n\n', classification_report(y_test, y_pred)

Accuracy Score: 0.85 

             precision    recall  f1-score   support

          0       0.81      0.91      0.86        99
          1       0.90      0.79      0.84       101

avg / total       0.86      0.85      0.85       200



In [27]:
# now we will try to improve it using grid search
# first set up the pipeline
from sklearn.pipeline import Pipeline 
from sklearn.grid_search import GridSearchCV
pipeline = Pipeline([("dtree", DecisionTreeClassifier())])
# now the parameter space
parameters = { 'dtree__max_depth': (4, 5, 6, 7, 8, 9, 10),\
              'dtree__min_samples_split': (1, 2, 3),\
              'dtree__min_samples_leaf': (1, 2, 3) }
# now for grid search using 10 fold CV for each combination of parameters
# some scoring options:'accuracy', f1','mean_squared_error', 'precision','recall', 'roc_auc'
# n_jobs: number of cpus to use,  -1 means use all
import time
start_time = time.time() # so we can see how long things take
grid_search = GridSearchCV( pipeline, parameters, n_jobs =-1, verbose = 1,\
                           scoring ='precission', cv=10)
grid_search.fit( X_train, y_train) 
print 'Best score: %0.3f' % grid_search.best_score_ 
print 'Best parameters set:' 
best_parameters = grid_search.best_estimator_.get_params() 
for param_name in sorted( parameters.keys()): 
    print '\ t% s: %r' % (param_name, best_parameters[ param_name]) 
predictions = grid_search.predict( X_test) 
print "\n\n Accuracy Score:", accuracy_score(y_test, predictions), '\n\n', classification_report(y_test, predictions)
print "elaplsed time ", time.time() - start_time

Fitting 10 folds for each of 63 candidates, totalling 630 fits


[Parallel(n_jobs=-1)]: Done   1 jobs       | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done  50 jobs       | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done 200 jobs       | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done 450 jobs       | elapsed:    0.6s
[Parallel(n_jobs=-1)]: Done 608 out of 630 | elapsed:    0.8s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done 630 out of 630 | elapsed:    0.8s finished


Best score: 0.887
Best parameters set:
\ tdtree__max_depth: 6
\ tdtree__min_samples_leaf: 2
\ tdtree__min_samples_split: 1


 Accuracy Score: 0.87 

             precision    recall  f1-score   support

          0       0.85      0.90      0.87        99
          1       0.89      0.84      0.87       101

avg / total       0.87      0.87      0.87       200

elaplsed time  1.01709890366
