In [37]:
import pandas as pd
import numpy as np
import re

import time

from collections import Counter

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import Imputer
from sklearn import datasets, linear_model
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import KFold, cross_val_score, cross_val_predict
from sklearn.metrics import recall_score, accuracy_score, confusion_matrix, roc_curve, auc
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, BaggingClassifier
from sklearn.preprocessing import StandardScaler, label_binarize, OneHotEncoder
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
import sklearn.datasets as datasets

import pandas_profiling

from sklearn.linear_model import LogisticRegression, LinearRegression
# import statsmodels.api as sm
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline
%config InlineBackend.figure_format = 'retina'

In [2]:
%%time
df_train = pd.read_csv('./Datasets/clean_train_data.csv')

CPU times: user 480 ms, sys: 88.1 ms, total: 568 ms
Wall time: 626 ms


In [3]:
df_test = pd.read_csv('./Datasets/clean_test_data.csv') 

In [4]:
# Check shape of each data table.
df_train.shape, df_test.shape

((59400, 27), (14850, 26))

In [5]:
# Find columns with 'object' dtypes.
cols = df_train.select_dtypes(exclude=[np.number])

In [7]:
# Show list of object columns.
list(cols)

['date_recorded',
 'wpt_name',
 'basin',
 'region',
 'lga',
 'ward',
 'construction_year',
 'extraction_type_group',
 'extraction_type_class',
 'management',
 'management_group',
 'payment',
 'quality_group',
 'quantity',
 'source',
 'source_class',
 'waterpoint_type',
 'status_group']

In [7]:
df_train.dtypes

id                         int64
amount_tsh               float64
date_recorded             object
gps_height                 int64
longitude                float64
latitude                 float64
wpt_name                  object
num_private                int64
basin                     object
region                    object
region_code                int64
district_code              int64
lga                       object
ward                      object
population                 int64
construction_year         object
extraction_type_group     object
extraction_type_class     object
management                object
management_group          object
payment                   object
quality_group             object
quantity                  object
source                    object
source_class              object
waterpoint_type           object
status_group              object
dtype: object

In [8]:
# Exclude object columns. Find non-object columns.
cols2 = df_train.select_dtypes(exclude=[np.object])

In [9]:
# Show non-object columns.
list(cols2)

['id',
 'amount_tsh',
 'gps_height',
 'longitude',
 'latitude',
 'num_private',
 'region_code',
 'district_code',
 'population']

In [10]:
# Create dummy values.
dummy_col = pd.get_dummies(df_train, columns = ['basin',
 'region',
 'construction_year',
 'extraction_type_group',
 'extraction_type_class',
 'management',
 'management_group',
 'payment',
 'quality_group',
 'quantity',
 'source',
 'source_class',
 'waterpoint_type'])

In [11]:
# Check independent variable. Make sure values are correct.
df_train['status_group'].unique()

array(['functional', 'non functional', 'functional needs repair'],
      dtype=object)

In [13]:
dummy_col.head()

Unnamed: 0,id,amount_tsh,date_recorded,gps_height,longitude,latitude,wpt_name,num_private,region_code,district_code,...,source_class_groundwater,source_class_surface,source_class_unknown,waterpoint_type_cattle trough,waterpoint_type_communal standpipe,waterpoint_type_communal standpipe multiple,waterpoint_type_dam,waterpoint_type_hand pump,waterpoint_type_improved spring,waterpoint_type_other
0,69572,6000.0,2011-03-14,1390,34.938093,-9.856322,none,0,11,5,...,1,0,0,0,1,0,0,0,0,0
1,8776,0.0,2013-03-06,1399,34.698766,-2.147466,Zahanati,0,20,2,...,0,1,0,0,1,0,0,0,0,0
2,34310,25.0,2013-02-25,686,37.460664,-3.821329,Kwa Mahundi,0,21,4,...,0,1,0,0,0,1,0,0,0,0
3,67743,0.0,2013-01-28,263,38.486161,-11.155298,Zahanati Ya Nanyumbu,0,90,63,...,1,0,0,0,0,1,0,0,0,0
4,19728,0.0,2011-07-13,0,31.130847,-1.825359,Shuleni,0,18,1,...,0,1,0,0,1,0,0,0,0,0


In [14]:
y = df_train['status_group'].values
X = dummy_col.drop(['status_group', 'id', 'amount_tsh', 'num_private', 
                    'date_recorded', 'wpt_name', 'lga', 'ward'], axis=1)

## KNN Classifier

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

In [16]:
print(X.shape, y.shape)
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(59400, 118) (59400,)
(44550, 118) (44550,)
(14850, 118) (14850,)


In [17]:
ss = StandardScaler()
Xs = ss.fit_transform(X)

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


In [18]:
# K- Nearest Neighbors Model
knn5 = KNeighborsClassifier(n_neighbors=5, weights='uniform')

scores = cross_val_score(knn5, Xs, y, cv=5)
np.mean(scores)

0.77210433688644

In [20]:
knn5.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=5, p=2,
           weights='uniform')

In [21]:
knn5.score(X_test, y_test)

0.6643771043771044

In [22]:
from sklearn.metrics import classification_report, confusion_matrix

y_preds = knn5.predict(X_test)
print(classification_report(y_test, y_preds))
pd.DataFrame(confusion_matrix(y_test, y_preds), columns=['Pred +', 'Pred Fix', 'Pred -'], index=['Act +', 'Act Fix', 'Act -'])

                         precision    recall  f1-score   support

             functional       0.68      0.80      0.74      8041
functional needs repair       0.42      0.19      0.26      1081
         non functional       0.66      0.56      0.61      5728

              micro avg       0.66      0.66      0.66     14850
              macro avg       0.59      0.52      0.53     14850
           weighted avg       0.65      0.66      0.65     14850



Unnamed: 0,Pred +,Pred Fix,Pred -
Act +,6470,164,1407
Act Fix,669,203,209
Act -,2415,120,3193


## Random Forest Classifier 

In [17]:
classifier = DecisionTreeClassifier(criterion='gini',
                                    max_depth=None)

In [23]:
%%time
dt = RandomForestClassifier(class_weight = 'balanced')
s = cross_val_score(dt, X, y, n_jobs=-1)
print("{} Score:\t{:0.3} ± {:0.3}".format("Random Forest with Balanced Classes", s.mean().round(3), s.std().round(3)))



Random Forest with Balanced Classes Score:	0.787 ± 0.002
CPU times: user 334 ms, sys: 211 ms, total: 544 ms
Wall time: 8.5 s


In [24]:
%%time
rfc_params = {'n_estimators':[2,5,10,20,50,75,150],
             'criterion':['gini', 'entropy'],
             'max_depth':[2,5,10,20,50,None],
             'min_samples_split':[2,5,10,20]}


grid_rfc = GridSearchCV(RandomForestClassifier(), rfc_params, cv=5, scoring='accuracy')
grid_rfc.fit(X_train, y_train)

CPU times: user 1h 27min 33s, sys: 2min 46s, total: 1h 30min 20s
Wall time: 1h 39min 51s


In [25]:
grid_rfc.score(X_test, y_test)

0.801077441077441

## Confusion Matrix

In [26]:
from sklearn.metrics import classification_report, confusion_matrix

y_preds = grid_rfc.predict(X_test)
print(classification_report(y_test, y_preds))
pd.DataFrame(confusion_matrix(y_test, y_preds), columns=['Pred +', 'Pred Fix', 'Pred -'], 
                                                         index=['Act +', 'Act Fix', 'Act -'])

                         precision    recall  f1-score   support

             functional       0.79      0.90      0.84      8041
functional needs repair       0.59      0.27      0.37      1081
         non functional       0.84      0.76      0.80      5728

              micro avg       0.80      0.80      0.80     14850
              macro avg       0.74      0.64      0.67     14850
           weighted avg       0.80      0.80      0.79     14850



Unnamed: 0,Pred +,Pred Fix,Pred -
Act +,7264,132,645
Act Fix,607,293,181
Act -,1319,70,4339


# Random Forest, Logistic Regression, Onehotencoder

In [59]:
n_estimator = 10
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

In [60]:
rf = RandomForestClassifier(max_depth=3, n_estimators=n_estimator)
rf_enc = OneHotEncoder()
rf_lm = LogisticRegression()
rf.fit(X_train, y_train)
rf_enc.fit(rf.apply(X_train))
rf_lm.fit(rf_enc.transform(rf.apply(X_train)), y_train)

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [62]:
grid_rfc.score(X_test, y_test)

0.8677441077441077

In [63]:
y_preds = grid_rfc.predict(X_test)
print(classification_report(y_test, y_preds))
pd.DataFrame(confusion_matrix(y_test, y_preds), columns=['Pred +', 'Pred Fix', 'Pred -'], index=['Act +', 'Act Fix', 'Act -'])

                         precision    recall  f1-score   support

             functional       0.85      0.95      0.90      8039
functional needs repair       0.75      0.41      0.53      1066
         non functional       0.91      0.84      0.88      5745

              micro avg       0.87      0.87      0.87     14850
              macro avg       0.84      0.73      0.77     14850
           weighted avg       0.87      0.87      0.86     14850



Unnamed: 0,Pred +,Pred Fix,Pred -
Act +,7609,101,329
Act Fix,486,440,140
Act -,862,46,4837


# Create prediction dataframe to be submitted

In [76]:
preds = pd.DataFrame(y_preds)

In [77]:
preds.head()

Unnamed: 0,0
0,functional
1,non functional
2,functional
3,functional
4,functional


In [78]:
n_test = pd.read_csv('./Datasets/Tanzania_Test.csv')

In [79]:
n_test.head()

Unnamed: 0,id,amount_tsh,date_recorded,funder,gps_height,installer,longitude,latitude,wpt_name,num_private,...,payment_type,water_quality,quality_group,quantity,quantity_group,source,source_type,source_class,waterpoint_type,waterpoint_type_group
0,50785,0.0,2013-02-04,Dmdd,1996,DMDD,35.290799,-4.059696,Dinamu Secondary School,0,...,never pay,soft,good,seasonal,seasonal,rainwater harvesting,rainwater harvesting,surface,other,other
1,51630,0.0,2013-02-04,Government Of Tanzania,1569,DWE,36.656709,-3.309214,Kimnyak,0,...,never pay,soft,good,insufficient,insufficient,spring,spring,groundwater,communal standpipe,communal standpipe
2,17168,0.0,2013-02-01,,1567,,34.767863,-5.004344,Puma Secondary,0,...,never pay,soft,good,insufficient,insufficient,rainwater harvesting,rainwater harvesting,surface,other,other
3,45559,0.0,2013-01-22,Finn Water,267,FINN WATER,38.058046,-9.418672,Kwa Mzee Pange,0,...,unknown,soft,good,dry,dry,shallow well,shallow well,groundwater,other,other
4,49871,500.0,2013-03-27,Bruder,1260,BRUDER,35.006123,-10.950412,Kwa Mzee Turuka,0,...,monthly,soft,good,enough,enough,spring,spring,groundwater,communal standpipe,communal standpipe


In [80]:
n_test.shape

(14850, 40)

In [82]:
predict = pd.concat((n_test['id'], preds), axis=1)

In [83]:
predict.columns=['id', 'status_group']

In [84]:
predict['status_group'].unique()

array(['functional', 'non functional', 'functional needs repair'],
      dtype=object)

In [85]:
predict.head()

Unnamed: 0,id,status_group
0,50785,functional
1,51630,non functional
2,17168,functional
3,45559,functional
4,49871,functional


In [86]:
predict.shape

(14850, 2)

In [87]:
predict.to_csv('./Submissions/Submission_1.csv', index=False)