# Gradient Boosting  

In [1]:
# Load IPython's autoreload extension and set it to mode 2, so edited
# modules are re-imported before each cell executes.
%load_ext autoreload
%autoreload 2

In [2]:
import time
import warnings
import itertools
import matplotlib.pyplot as plt
from matplotlib.ticker import NullFormatter
import matplotlib.ticker as ticker
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier
from sklearn import metrics
from sklearn.metrics import classification_report, confusion_matrix

# Suppress every warning for the rest of the session.
# NOTE(review): this blanket filter also hides deprecation warnings from
# pandas / scikit-learn — consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

# NOTE(review): two backend magics are issued back to back, so the later one
# (`notebook`) is the one left active — confirm which backend is intended
# and drop the other.
%matplotlib inline
%matplotlib notebook

In [3]:
# Read the train and test CSVs into `df` and `df_test` respectively.
df, df_test = (
    pd.read_csv(csv_path)
    for csv_path in ('./Datasets/clean_train.csv', './Datasets/clean_test.csv')
)

In [4]:
# Preview the first five rows of the training frame.
df.head(n=5)

Unnamed: 0,id,amount_tsh,gps_height,longitude,latitude,num_private,region_code,district_code,population,construction_year,...,date_recordedIs_month_start_False,date_recordedIs_month_start_True,date_recordedIs_quarter_end_False,date_recordedIs_quarter_end_True,date_recordedIs_quarter_start_False,date_recordedIs_quarter_start_True,date_recordedIs_year_end_False,date_recordedIs_year_start_False,date_recordedIs_year_start_True,status_group
0,69572,6000.0,1390,34.938093,-9.856322,0,11,5,109,1999,...,1,0,1,0,1,0,1,1,0,functional
1,8776,0.0,1399,34.698766,-2.147466,0,20,2,280,2010,...,1,0,1,0,1,0,1,1,0,functional
2,34310,25.0,686,37.460664,-3.821329,0,21,4,250,2009,...,1,0,1,0,1,0,1,1,0,functional
3,67743,0.0,263,38.486161,-11.155298,0,90,63,58,1986,...,1,0,1,0,1,0,1,1,0,non functional
4,19728,0.0,0,31.130847,-1.825359,0,18,1,0,0,...,1,0,1,0,1,0,1,1,0,functional


In [5]:
# Distinct values of the target column, in order of first appearance.
pd.unique(df['status_group'])

array(['functional', 'non functional', 'functional needs repair'],
      dtype=object)

In [6]:
# Preview the first five rows of the test frame.
df_test.head(n=5)

Unnamed: 0,id,amount_tsh,gps_height,longitude,latitude,num_private,region_code,district_code,population,construction_year,...,date_recordedIs_month_end_False,date_recordedIs_month_end_True,date_recordedIs_month_start_False,date_recordedIs_month_start_True,date_recordedIs_quarter_end_False,date_recordedIs_quarter_end_True,date_recordedIs_quarter_start_False,date_recordedIs_quarter_start_True,date_recordedIs_year_end_False,date_recordedIs_year_start_False
0,50785,0.0,1996,35.290799,-4.059696,0,21,3,321,2012,...,1,0,1,0,1,0,1,0,1,1
1,51630,0.0,1569,36.656709,-3.309214,0,2,2,300,2000,...,1,0,1,0,1,0,1,0,1,1
2,17168,0.0,1567,34.767863,-5.004344,0,13,2,500,2010,...,1,0,0,1,1,0,1,0,1,1
3,45559,0.0,267,38.058046,-9.418672,0,80,43,250,1987,...,1,0,1,0,1,0,1,0,1,1
4,49871,500.0,1260,35.006123,-10.950412,0,10,3,60,2000,...,1,0,1,0,1,0,1,0,1,1


In [7]:
# (rows, columns) of the train frame followed by the test frame.
tuple(frame.shape for frame in (df, df_test))

((59400, 201), (14850, 197))

In [8]:
# The two frames have different widths: list the columns that exist in the
# training frame but not in the test frame.
train_only = df.columns.difference(df_test.columns)
train_only

Index(['date_recordedIs_year_start_True',
       'extraction_type_other - mkulima/shinyanga', 'scheme_management_None',
       'status_group'],
      dtype='object')

In [9]:
# Reverse check: columns that exist in the test frame but not in train.
test_only = df_test.columns.difference(df.columns)
test_only

Index([], dtype='object')

In [10]:
# Drop the feature columns that exist only in the training frame so train and
# test share the same feature set; the target (status_group) is kept.
# The list is derived from the frames themselves and the result is reassigned
# instead of dropped in place, so re-running this cell is a no-op rather than
# a KeyError on columns that were already removed.
train_only_features = (
    df.columns
    .difference(df_test.columns)
    .difference(['status_group'])
)
df = df.drop(columns=train_only_features)

In [11]:
# Re-check the (rows, columns) of both frames after the column drop.
tuple(frame.shape for frame in (df, df_test))

((59400, 198), (14850, 197))

# Modeling

### Feature Set

In [12]:
# Feature matrix: every column except the record id and the target.
feature_frame = df.drop(columns=['id', 'status_group'])
# 'Unnamed: 0' appears to be the row index that was saved into the CSV
# (0, 1, 2, ... in the preview); it is a row counter, not a measurement, so it
# must not be fed to the model as a feature. errors='ignore' keeps the cell
# working when the CSV was written without that column.
feature_frame = feature_frame.drop(columns=['Unnamed: 0'], errors='ignore')

X = feature_frame.values
y = df['status_group'].values

### Normalize the Data

In [13]:
# Standardize each feature column with StandardScaler. The fitted scaler is
# kept in `scaler` (instead of being discarded) so the same statistics can
# later be applied to other data, e.g. the features of `df_test`.
# Fitting and transforming now use the same float-cast array.
# NOTE(review): the scaler is fit on all rows before the train/test split
# below, so held-out rows contribute to the scaling statistics.
scaler = preprocessing.StandardScaler()
X = scaler.fit_transform(X.astype(float))

### Train-Test Split

In [14]:
# Hold out 25% of the rows for evaluation; the fixed seed keeps the split
# repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=4,
)

In [15]:
# Report the feature-matrix and label-vector shapes of each split.
for caption, split_X, split_y in (
    ('Training set: ', X_train, y_train),
    ('Testing set: ', X_test, y_test),
):
    print(caption, split_X.shape, split_y.shape)

Training set:  (44550, 196) (44550,)
Testing set:  (14850, 196) (14850,)


### Training the Model

In [19]:
%%time
gbm = GradientBoostingClassifier(max_depth=8,
                                 min_samples_split = 500,
                                 min_samples_leaf=50,
                                 max_features=1.0,
                                 subsample=0.8).fit(X_train, y_train)

gbm

CPU times: user 9min 59s, sys: 6.9 s, total: 10min 6s
Wall time: 10min 54s


### Predicting

In [17]:
# Predicted class labels for the held-out split.
yhat = gbm.predict(X=X_test)

### Accuracy Evaluation
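Using the __accuracy classification score__ (`metrics.accuracy_score`): the fraction of samples whose predicted label exactly matches the actual label. For multiclass problems this equals the legacy `jaccard_similarity_score` function (since deprecated); it is not the same as the current `jaccard_score`. The score is computed below on both the train set and the test set, so the two can be compared.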

In [18]:
# Accuracy on the rows the model was fit on vs. the held-out rows; a large
# gap between the two would indicate overfitting.
train_accuracy = metrics.accuracy_score(y_train, gbm.predict(X_train))
test_accuracy = metrics.accuracy_score(y_test, yhat)
print('Train set Accuracy: ', train_accuracy)
print('Test set Accuracy: ', test_accuracy)

Train set Accuracy:  0.7937149270482604
Test set Accuracy:  0.781077441077441


In [None]:
%%time
params = {'learning_rate': [0.075, 0.7],
          'max_depth': [13, 14],
          'min_samples_leaf': [15, 16],
          'max_features': [1.0],
          'n_estimators': [100, 200]} 


grid_grad = GridSearchCV(GradientBoostingClassifier(), params, cv=5, scoring='accuracy')
grid_grad.fit(X_train, y_train)