# Predicting faulty water pumps

## Optimize Model Paramaters¶

## Import Libraries
Import libraries necessary to get the models trained

In [2]:
import datetime
import pickle
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as ss
import seaborn as sns
from IPython.display import display # use of display() for DataFrames

# Import Preprocessing and ML libraries
from sklearn.utils import shuffle
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit, GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix, make_scorer, accuracy_score, f1_score

%matplotlib inline

## Load Water Pump Data Files, Create Data Frames and Clean The Data

#### Read in the data pump files stored in the directory raw-data into Pandas dataframes

In [3]:
!dir clean-data\*.pkl

 Volume in drive E is DATA
 Volume Serial Number is 6E98-6AE7

 Directory of E:\GitHub\Udacity\machine-learning\projects\capstone-project\raw-data

05/31/2018  12:20 PM         3,634,926 pump_test_features_df.pkl
05/31/2018  12:20 PM        14,615,910 pump_train_features_df.pkl
05/31/2018  12:20 PM           594,995 pump_train_label_df.pkl
06/18/2018  08:19 PM        32,059,977 wp_clean_data.pkl
               4 File(s)     50,905,808 bytes
               0 Dir(s)  1,927,563,112,448 bytes free


### Load Cleaned Water Pump Data Set

In [4]:
!dir clean-data

 Volume in drive E is DATA
 Volume Serial Number is 6E98-6AE7

 Directory of E:\GitHub\Udacity\machine-learning\projects\capstone-project\clean-data

06/26/2018  05:53 PM    <DIR>          .
06/26/2018  05:53 PM    <DIR>          ..
06/13/2018  01:14 PM        25,569,696 clean_pump_train_features_df.pkl
06/19/2018  01:26 PM        32,059,792 clean_wp_data_object_18061913.pkl
06/16/2018  03:04 PM         5,988,228 clean_wp_test_features_df.pkl
06/26/2018  05:53 PM         7,512,042 clean_wp_test_features_df_18062617.pkl
06/16/2018  03:04 PM        25,580,661 clean_wp_train_features_df.pkl
06/26/2018  05:53 PM        56,455,561 clean_wp_train_features_df_18062617.pkl
               6 File(s)    153,165,980 bytes
               2 Dir(s)  1,927,563,112,448 bytes free


Floydhub script: $ floyd run "Pump it Up - Optimize Model Parameters.ipynb" --cpu2 --data cmc265/datasets/clean_wp_train_features_df_18062617pkl --mode jupyter 

In [7]:
local_file_path = 'clean-data\\clean_wp_train_features_df_18062617.pkl'
floydhub_file_path = 'cmc265/datasets/clean_wp_train_features_df_18062617pkl'
wp_test_file = local_file_path

test_df = pd.read_pickle(wp_test_file)
test_df.shape

(59400, 873)

### Create a training and test sets to evaluate models 

In [8]:
label_col = 'status_group'
target_df = test_df[label_col]
feature_df = test_df.drop(label_col, axis = 1)

X_train, X_test, y_train, y_test = train_test_split(feature_df, target_df, test_size = 0.3)

# Show the results of the split
print("Training set has {} samples.".format(X_train.shape[0]))
print("Testing set has {} samples.".format(X_test.shape[0]))

Training set has 41580 samples.
Testing set has 17820 samples.


### Tune two ensemble models on the data set

#### Tune the Random Forest Model
Use a gridsearch with key model hyper parameters

In [30]:
rfc= RandomForestClassifier(class_weight = 'balanced', n_jobs=-1)

# n_estimators=10, criterion=’gini’, max_depth=None, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, 
# max_features=’auto’, max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, 
# bootstrap=True, oob_score=False, n_jobs=1, random_state=None, verbose=0, warm_start=False, class_weight=None)

parameters = {'n_estimators': [500, 1000, 2000], 'max_depth': [5, 10, 15,20], 
              'max_features': ['auto', 'sqrt'], 'min_samples_leaf': [1, 2, 4],
              'min_samples_split': [2, 5, 10]}

# to better manage imbalance outcome classes 
sss_cv = StratifiedShuffleSplit(n_splits = 3, test_size=0.3, random_state=0)

# Make an f1_score scoring object using make_scorer()
scorer = make_scorer(f1_score, average='macro')

# Perform grid search on the classifier using 'scorer' as the scoring method using GridSearchCV()
grid_obj =  GridSearchCV(estimator = rfc, param_grid = parameters, scoring = scorer, n_jobs=-1, cv = sss_cv)

# Fit the grid search object to the training data and find the optimal parameters using fit()
grid_fit = grid_obj.fit(X_train, y_train)

In [32]:
model_name = rfc.__class__.__name__
print("Best estimator for model {} => {}".format(model_name, grid_fit.best_estimator_))
print("Best parameters for model{} => {}".format(model_name, grid_fit.best_params_))
print("Best score {} => {}".format(model_name, grid_fit.best_score_))

Best estimator for model RandomForestClassifier => RandomForestClassifier(bootstrap=True, class_weight='balanced',
            criterion='gini', max_depth=20, max_features='sqrt',
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=1000, n_jobs=-1, oob_score=False,
            random_state=None, verbose=0, warm_start=False)
Best parameters for modelRandomForestClassifier => {'max_features': 'sqrt', 'min_samples_split': 2, 'n_estimators': 1000, 'max_depth': 20, 'min_samples_leaf': 1}
Best score RandomForestClassifier => 0.684289729915


#### Tune the Gardient Boosting Model
Use a gridsearch with key model hyper parameters

In [None]:
gbm = GradientBoostingClassifier()

# loss=’deviance’, learning_rate=0.1, n_estimators=100, subsample=1.0, criterion=’friedman_mse’, 
# min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_depth=3, min_impurity_decrease=0.0, 
# min_impurity_split=None, init=None, random_state=None, max_features=None, verbose=0, max_leaf_nodes=None, 
# warm_start=False, presort=’auto’

parameters = {'n_estimators': [100, 500, 1000, 1500], 'learning_rate': [0.01, 0.05, 0.1], 
              'max_depth': [5, 10, 15] , 'min_samples_leaf': [1, 2, 4],
              'min_samples_split': [2, 5, 10]}

# to better manage imbalance outcome classes 
sss_cv = StratifiedShuffleSplit(n_splits = 3, test_size=0.3, random_state=0)

# Make an f1_score scoring object using make_scorer()
scorer = make_scorer(f1_score, average='macro')

# Perform grid search on the classifier using 'scorer' as the scoring method using GridSearchCV()
grid_obj =  GridSearchCV(estimator = gbm, param_grid = parameters, scoring = scorer, n_jobs=-1, cv = sss_cv)

# Fit the grid search object to the training data and find the optimal parameters using fit()
grid_fit = grid_obj.fit(X_train, y_train)

In [None]:
model_name = gbm.__class__.__name__
print("Best estimator for model {} => {}".format(model_name, grid_fit.best_estimator_))
print("Best parameters for model{} => {}".format(model_name, grid_fit.best_params_))
print("Best score {} => {}".format(model_name, grid_fit.best_score_))