In [41]:
import pandas as pd
import numpy as np
import re

import time

from collections import Counter

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import Imputer
from sklearn import datasets, linear_model
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import KFold, cross_val_score, cross_val_predict
from sklearn.metrics import recall_score, accuracy_score, confusion_matrix, roc_curve
from sklearn import metrics
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, BaggingClassifier, GradientBoostingClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
import sklearn.datasets as datasets

import pandas_profiling

from sklearn.linear_model import LogisticRegression, LinearRegression
# import statsmodels.api as sm
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline
%config InlineBackend.figure_format = 'retina'

In [5]:
df_train = pd.read_csv('./datasets/clean_train_data.csv')

In [6]:
df_test = pd.read_csv('./datasets/clean_test_data.csv') 

In [8]:
df_train.shape, df_test.shape

((59400, 27), (14850, 26))

In [10]:
cols = df_train.select_dtypes(exclude=[np.number])

In [11]:
list(cols)

['date_recorded',
 'wpt_name',
 'basin',
 'region',
 'lga',
 'ward',
 'construction_year',
 'extraction_type_group',
 'extraction_type_class',
 'management',
 'management_group',
 'payment',
 'quality_group',
 'quantity',
 'source',
 'source_class',
 'waterpoint_type',
 'status_group']

In [12]:
df_train.dtypes

id                         int64
amount_tsh               float64
date_recorded             object
gps_height                 int64
longitude                float64
latitude                 float64
wpt_name                  object
num_private                int64
basin                     object
region                    object
region_code                int64
district_code              int64
lga                       object
ward                      object
population                 int64
construction_year         object
extraction_type_group     object
extraction_type_class     object
management                object
management_group          object
payment                   object
quality_group             object
quantity                  object
source                    object
source_class              object
waterpoint_type           object
status_group              object
dtype: object

In [13]:
dummy_col = pd.get_dummies(df_train, columns = ['basin',
 'region',
 'construction_year',
 'extraction_type_group',
 'extraction_type_class',
 'management',
 'management_group',
 'payment',
 'quality_group',
 'quantity',
 'source',
 'source_class',
 'waterpoint_type'])

In [14]:
df_train['status_group'].unique()

array(['functional', 'non functional', 'functional needs repair'],
      dtype=object)

In [15]:
dummy_col.head()

Unnamed: 0,id,amount_tsh,date_recorded,gps_height,longitude,latitude,wpt_name,num_private,region_code,district_code,...,source_class_groundwater,source_class_surface,source_class_unknown,waterpoint_type_cattle trough,waterpoint_type_communal standpipe,waterpoint_type_communal standpipe multiple,waterpoint_type_dam,waterpoint_type_hand pump,waterpoint_type_improved spring,waterpoint_type_other
0,69572,6000.0,2011-03-14,1390,34.938093,-9.856322,none,0,11,5,...,1,0,0,0,1,0,0,0,0,0
1,8776,0.0,2013-03-06,1399,34.698766,-2.147466,Zahanati,0,20,2,...,0,1,0,0,1,0,0,0,0,0
2,34310,25.0,2013-02-25,686,37.460664,-3.821329,Kwa Mahundi,0,21,4,...,0,1,0,0,0,1,0,0,0,0
3,67743,0.0,2013-01-28,263,38.486161,-11.155298,Zahanati Ya Nanyumbu,0,90,63,...,1,0,0,0,0,1,0,0,0,0
4,19728,0.0,2011-07-13,0,31.130847,-1.825359,Shuleni,0,18,1,...,0,1,0,0,1,0,0,0,0,0


In [16]:
y = df_train['status_group'].values
X = dummy_col.drop(['status_group', 'id', 'amount_tsh', 'num_private', 'date_recorded', 
                    'wpt_name', 'lga', 'ward'], axis=1)

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

In [18]:
print(X.shape, y.shape)
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(59400, 118) (59400,)
(44550, 118) (44550,)
(14850, 118) (14850,)


In [20]:
ss = StandardScaler()
Xs = ss.fit_transform(X)

In [21]:
# classifier = DecisionTreeClassifier(criterion='gini',
#                                     max_depth=None)

In [22]:
%%time
grad_params = {'learning_rate': [0.075, 0.7],
               'max_depth': [13, 14],
               'min_samples_leaf': [15, 16],
               'max_features': [1.0],
               'n_estimators': [100, 200]} 


grid_grad = GridSearchCV(GradientBoostingClassifier(), grad_params, cv=5, scoring='accuracy')
grid_grad.fit(X_train, y_train)

CPU times: user 15h 5min 57s, sys: 3min 53s, total: 15h 9min 51s
Wall time: 1d 16h 52min 41s


In [23]:
from sklearn.metrics import classification_report, confusion_matrix

y_preds = grid_grad.predict(X_test)
print(classification_report(y_test, y_preds))
pd.DataFrame(confusion_matrix(y_test, y_preds), columns=['Pred +', 'Pred Fix', 'Pred -'],
             index=['Act +', 'Act Fix', 'Act -'])

                         precision    recall  f1-score   support

             functional       0.81      0.89      0.85      8195
functional needs repair       0.59      0.32      0.41      1071
         non functional       0.83      0.78      0.80      5584

            avg / total       0.80      0.81      0.80     14850



Unnamed: 0,Pred +,Pred Fix,Pred -
Act +,7283,182,730
Act Fix,548,339,184
Act -,1166,58,4360


In [24]:
grid_grad.score(X_test, y_test)

0.8068686868686868

In [25]:
preds = pd.DataFrame(y_preds)

In [27]:
n_test = pd.read_csv('./datasets/clean_test_data.csv')

In [28]:
n_test.head()

Unnamed: 0,id,amount_tsh,date_recorded,gps_height,longitude,latitude,wpt_name,num_private,basin,region,...,extraction_type_group,extraction_type_class,management,management_group,payment,quality_group,quantity,source,source_class,waterpoint_type
0,50785,0.0,2013-02-04,1996,35.290799,-4.059696,Dinamu Secondary School,0,Internal,Manyara,...,other,other,parastatal,parastatal,never pay,good,seasonal,rainwater harvesting,surface,other
1,51630,0.0,2013-02-04,1569,36.656709,-3.309214,Kimnyak,0,Pangani,Arusha,...,gravity,gravity,vwc,user-group,never pay,good,insufficient,spring,groundwater,communal standpipe
2,17168,0.0,2013-02-01,1567,34.767863,-5.004344,Puma Secondary,0,Internal,Singida,...,other,other,vwc,user-group,never pay,good,insufficient,rainwater harvesting,surface,other
3,45559,0.0,2013-01-22,267,38.058046,-9.418672,Kwa Mzee Pange,0,Ruvuma / Southern Coast,Lindi,...,other,other,vwc,user-group,unknown,good,dry,shallow well,groundwater,other
4,49871,500.0,2013-03-27,1260,35.006123,-10.950412,Kwa Mzee Turuka,0,Ruvuma / Southern Coast,Ruvuma,...,gravity,gravity,water board,user-group,pay monthly,good,enough,spring,groundwater,communal standpipe


In [26]:
n_test.shape

(14850, 26)

In [29]:
predict = pd.concat((n_test['id'], preds), axis=1)

In [30]:
predict.columns=['id', 'status_group']

In [31]:
predict['status_group'].unique()

array(['functional', 'non functional', 'functional needs repair'],
      dtype=object)

In [32]:
predict.head()

Unnamed: 0,id,status_group
0,50785,functional
1,51630,functional
2,17168,functional
3,45559,functional
4,49871,functional


In [35]:
predict.to_csv('./Submission_8.csv', index=False)

In [37]:
predict.shape

(14850, 2)