# Decision Tree Classifier & Random Forest Models     

In [34]:
%load_ext autoreload
%autoreload 2

warnings.filterwarnings('ignore')
 
%matplotlib inline
%matplotlib notebook

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [56]:
import time
import warnings
import itertools
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
from matplotlib.ticker import NullFormatter
import matplotlib.image as mpimg
import pandas as pd
import numpy as np
import pydotplus
from sklearn import preprocessing
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.externals.six import StringIO
from sklearn import tree

In [3]:
df = pd.read_csv('./Datasets/clean_train.csv')
df_test = pd.read_csv('./Datasets/clean_test.csv')

In [4]:
df.head()

Unnamed: 0,id,amount_tsh,gps_height,longitude,latitude,num_private,region_code,district_code,population,construction_year,...,date_recordedIs_month_start_False,date_recordedIs_month_start_True,date_recordedIs_quarter_end_False,date_recordedIs_quarter_end_True,date_recordedIs_quarter_start_False,date_recordedIs_quarter_start_True,date_recordedIs_year_end_False,date_recordedIs_year_start_False,date_recordedIs_year_start_True,status_group
0,69572,6000.0,1390,34.938093,-9.856322,0,11,5,109,1999,...,1,0,1,0,1,0,1,1,0,functional
1,8776,0.0,1399,34.698766,-2.147466,0,20,2,280,2010,...,1,0,1,0,1,0,1,1,0,functional
2,34310,25.0,686,37.460664,-3.821329,0,21,4,250,2009,...,1,0,1,0,1,0,1,1,0,functional
3,67743,0.0,263,38.486161,-11.155298,0,90,63,58,1986,...,1,0,1,0,1,0,1,1,0,non functional
4,19728,0.0,0,31.130847,-1.825359,0,18,1,0,0,...,1,0,1,0,1,0,1,1,0,functional


In [6]:
df['status_group'].unique()

array(['functional', 'non functional', 'functional needs repair'],
      dtype=object)

In [7]:
df_test.head()

Unnamed: 0,id,amount_tsh,gps_height,longitude,latitude,num_private,region_code,district_code,population,construction_year,...,date_recordedIs_month_end_False,date_recordedIs_month_end_True,date_recordedIs_month_start_False,date_recordedIs_month_start_True,date_recordedIs_quarter_end_False,date_recordedIs_quarter_end_True,date_recordedIs_quarter_start_False,date_recordedIs_quarter_start_True,date_recordedIs_year_end_False,date_recordedIs_year_start_False
0,50785,0.0,1996,35.290799,-4.059696,0,21,3,321,2012,...,1,0,1,0,1,0,1,0,1,1
1,51630,0.0,1569,36.656709,-3.309214,0,2,2,300,2000,...,1,0,1,0,1,0,1,0,1,1
2,17168,0.0,1567,34.767863,-5.004344,0,13,2,500,2010,...,1,0,0,1,1,0,1,0,1,1
3,45559,0.0,267,38.058046,-9.418672,0,80,43,250,1987,...,1,0,1,0,1,0,1,0,1,1
4,49871,500.0,1260,35.006123,-10.950412,0,10,3,60,2000,...,1,0,1,0,1,0,1,0,1,1


In [8]:
df.shape, df_test.shape

((59400, 201), (14850, 197))

In [9]:
# train and test are different shapes. Find which columns are different.
df.columns.difference(df_test.columns)

Index(['date_recordedIs_year_start_True',
       'extraction_type_other - mkulima/shinyanga', 'scheme_management_None',
       'status_group'],
      dtype='object')

In [10]:
df_test.columns.difference(df.columns)

Index([], dtype='object')

In [11]:
# Drop columns that are different excluding the target(status_group)
df.drop(columns = ['date_recordedIs_year_start_True',
       'extraction_type_other - mkulima/shinyanga', 'scheme_management_None'], inplace=True)

In [12]:
df.shape, df_test.shape

((59400, 198), (14850, 197))

# Decision Tree Classifier

### Feature Set

In [13]:
X = df.drop(columns = ['id', 'status_group']).values
y = df['status_group'].values

### Normalize the Data

In [14]:
X = preprocessing.StandardScaler().fit(X).transform(X.astype(float))

### Train-Test Split

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=3)

In [16]:
print('Training set: ', X_train.shape, y_train.shape)
print('Testing set: ', X_test.shape, y_test.shape)

Training set:  (44550, 196) (44550,)
Testing set:  (14850, 196) (14850,)


### Training the Model
Starting the algorithm with k=4 neighbors at first.

In [43]:
dtc = DecisionTreeClassifier(criterion='entropy', max_depth=5).fit(X_train, y_train)
dtc

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=5,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

### Predicting

In [44]:
yhat = dtc.predict(X_test)

### Accuracy Evaluation
Using __accuracy classification score__ to compute subset accuracy. This function is equal to the Jaccard similarity score function. This computes how closely the actual labels and the predicted labels matched in the train set.

In [45]:
print('Train set Accuracy: ', metrics.accuracy_score(y_train, dtc.predict(X_train)))
print('Test set Accuracy: ', metrics.accuracy_score(y_test, yhat))

Train set Accuracy:  0.7143658810325477
Test set Accuracy:  0.7122558922558923


# Visualization of Decision Tree

In [48]:
dot_data = StringIO()
filename = 'pumptree.png'
featureNames = df.columns[0:196]
targetNames = df['status_group']
out = tree.export_graphviz(dtc, feature_names=featureNames, 
                           out_file=dot_data, 
                           class_names=np.unique(y_train), 
                           filled=True, 
                           special_characters=True, 
                           rotate=False)
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())  
graph.write_png(filename)
img = mpimg.imread(filename)
plt.figure(figsize=(100, 200))
plt.imshow(img,interpolation='nearest')

<IPython.core.display.Javascript object>

<matplotlib.image.AxesImage at 0x1a596dc18>

<IPython.core.display.Javascript object>

### Confusion Matrix

In [47]:
# Create a confusion matrix for K=1 model.
print(classification_report(y_test, yhat))
pd.DataFrame(confusion_matrix(y_test, yhat), columns=['Pred +', 'Pred Fix', 'Pred -'], index=['Act +', 'Pred Fix', 'Act -'])

                         precision    recall  f1-score   support

             functional       0.67      0.95      0.79      8008
functional needs repair       0.47      0.07      0.12      1062
         non functional       0.87      0.50      0.63      5780

              micro avg       0.71      0.71      0.71     14850
              macro avg       0.67      0.51      0.51     14850
           weighted avg       0.73      0.71      0.68     14850



Unnamed: 0,Pred +,Pred Fix,Pred -
Act +,7618,34,356
Pred Fix,903,72,87
Act -,2845,48,2887


# Random Forest Classifier

### Training the Model
Data was already split and normalized above. No need to do it again.

In [52]:
rfc = RandomForestClassifier(criterion='entropy', max_depth=None).fit(X_train, y_train)

In [53]:
yhatRF = rfc.predict(X_test)

In [54]:
print('Train set Accuracy: ', metrics.accuracy_score(y_train, rfc.predict(X_train)))
print('Test set Accuracy: ', metrics.accuracy_score(y_test, yhatRF))

Train set Accuracy:  0.9809427609427609
Test set Accuracy:  0.8002693602693602


### Let's try a Random Forest with Grid Search cross validation

In [57]:
rfc_params = {'n_estimators':[2,5,10,20,50,75,150],
              'criterion':['gini', 'entropy'],
              'max_depth':[2,5,10,20,50,None],
              'min_samples_split':[2,5,10,20]}


grid_rfc = GridSearchCV(RandomForestClassifier(), 
                        rfc_params, 
                        cv=5, 
                        scoring='accuracy').fit(X_train, y_train)

In [58]:
yhatGS = grid_rfc.predict(X_test)

In [59]:
print('Train set Accuracy: ', metrics.accuracy_score(y_train, grid_rfc.predict(X_train)))
print('Test set Accuracy: ', metrics.accuracy_score(y_test, yhatGS))

Train set Accuracy:  0.8968799102132435
Test set Accuracy:  0.8154882154882155
