In [67]:
from pathlib import Path
import os
import numpy as np
import pandas as pd

# sci-kit learn is our one-stop shop for machine learning!
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.decomposition import PCA
from sklearn.ensemble import ExtraTreesClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from sklearn import metrics

# to make this notebook's output stable across runs
np.random.seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

In [68]:
path=Path.cwd()

In [69]:
datapath=Path.joinpath(path,'data')

In [70]:
os.listdir(datapath)

['0bf8bc6e-30d0-4c50-956a-603fc693d966.csv',
 '4910797b-ee55-40a7-8668-10efd5c1b960.csv',
 '702ddfc5-68cd-4d1d-a0de-f5f566f76d91.csv',
 'SubmissionFormat (1).csv']

In [71]:
fullpath1=Path.joinpath(datapath,'0bf8bc6e-30d0-4c50-956a-603fc693d966.csv')
fullpath2=Path.joinpath(datapath,'4910797b-ee55-40a7-8668-10efd5c1b960.csv')

In [72]:
df1=pd.read_csv(fullpath1)
df2=pd.read_csv(fullpath2)

In [73]:
df1.head()
df2.head()

Unnamed: 0,id,amount_tsh,date_recorded,funder,gps_height,installer,longitude,latitude,wpt_name,num_private,...,payment_type,water_quality,quality_group,quantity,quantity_group,source,source_type,source_class,waterpoint_type,waterpoint_type_group
0,69572,6000.0,2011-03-14,Roman,1390,Roman,34.938093,-9.856322,none,0,...,annually,soft,good,enough,enough,spring,spring,groundwater,communal standpipe,communal standpipe
1,8776,0.0,2013-03-06,Grumeti,1399,GRUMETI,34.698766,-2.147466,Zahanati,0,...,never pay,soft,good,insufficient,insufficient,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe
2,34310,25.0,2013-02-25,Lottery Club,686,World vision,37.460664,-3.821329,Kwa Mahundi,0,...,per bucket,soft,good,enough,enough,dam,dam,surface,communal standpipe multiple,communal standpipe
3,67743,0.0,2013-01-28,Unicef,263,UNICEF,38.486161,-11.155298,Zahanati Ya Nanyumbu,0,...,never pay,soft,good,dry,dry,machine dbh,borehole,groundwater,communal standpipe multiple,communal standpipe
4,19728,0.0,2011-07-13,Action In A,0,Artisan,31.130847,-1.825359,Shuleni,0,...,never pay,soft,good,seasonal,seasonal,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe


In [74]:
full_df=pd.merge(df1,df2)

In [75]:
full_df.head()

Unnamed: 0,id,status_group,amount_tsh,date_recorded,funder,gps_height,installer,longitude,latitude,wpt_name,...,payment_type,water_quality,quality_group,quantity,quantity_group,source,source_type,source_class,waterpoint_type,waterpoint_type_group
0,69572,functional,6000.0,2011-03-14,Roman,1390,Roman,34.938093,-9.856322,none,...,annually,soft,good,enough,enough,spring,spring,groundwater,communal standpipe,communal standpipe
1,8776,functional,0.0,2013-03-06,Grumeti,1399,GRUMETI,34.698766,-2.147466,Zahanati,...,never pay,soft,good,insufficient,insufficient,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe
2,34310,functional,25.0,2013-02-25,Lottery Club,686,World vision,37.460664,-3.821329,Kwa Mahundi,...,per bucket,soft,good,enough,enough,dam,dam,surface,communal standpipe multiple,communal standpipe
3,67743,non functional,0.0,2013-01-28,Unicef,263,UNICEF,38.486161,-11.155298,Zahanati Ya Nanyumbu,...,never pay,soft,good,dry,dry,machine dbh,borehole,groundwater,communal standpipe multiple,communal standpipe
4,19728,functional,0.0,2011-07-13,Action In A,0,Artisan,31.130847,-1.825359,Shuleni,...,never pay,soft,good,seasonal,seasonal,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe


In [76]:
cols=full_df.columns

In [77]:
ids=full_df['id']

In [78]:
#full_df.set_index('id',inplace=True)

In [79]:
full_df.head()

Unnamed: 0,id,status_group,amount_tsh,date_recorded,funder,gps_height,installer,longitude,latitude,wpt_name,...,payment_type,water_quality,quality_group,quantity,quantity_group,source,source_type,source_class,waterpoint_type,waterpoint_type_group
0,69572,functional,6000.0,2011-03-14,Roman,1390,Roman,34.938093,-9.856322,none,...,annually,soft,good,enough,enough,spring,spring,groundwater,communal standpipe,communal standpipe
1,8776,functional,0.0,2013-03-06,Grumeti,1399,GRUMETI,34.698766,-2.147466,Zahanati,...,never pay,soft,good,insufficient,insufficient,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe
2,34310,functional,25.0,2013-02-25,Lottery Club,686,World vision,37.460664,-3.821329,Kwa Mahundi,...,per bucket,soft,good,enough,enough,dam,dam,surface,communal standpipe multiple,communal standpipe
3,67743,non functional,0.0,2013-01-28,Unicef,263,UNICEF,38.486161,-11.155298,Zahanati Ya Nanyumbu,...,never pay,soft,good,dry,dry,machine dbh,borehole,groundwater,communal standpipe multiple,communal standpipe
4,19728,functional,0.0,2011-07-13,Action In A,0,Artisan,31.130847,-1.825359,Shuleni,...,never pay,soft,good,seasonal,seasonal,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe


In [80]:
full_df.describe()

Unnamed: 0,id,amount_tsh,gps_height,longitude,latitude,num_private,region_code,district_code,population,construction_year
count,59400.0,59400.0,59400.0,59400.0,59400.0,59400.0,59400.0,59400.0,59400.0,59400.0
mean,37115.131768,317.650385,668.297239,34.077427,-5.706033,0.474141,15.297003,5.629747,179.909983,1300.652475
std,21453.128371,2997.574558,693.11635,6.567432,2.946019,12.23623,17.587406,9.633649,471.482176,951.620547
min,0.0,0.0,-90.0,0.0,-11.64944,0.0,1.0,0.0,0.0,0.0
25%,18519.75,0.0,0.0,33.090347,-8.540621,0.0,5.0,2.0,0.0,0.0
50%,37061.5,0.0,369.0,34.908743,-5.021597,0.0,12.0,3.0,25.0,1986.0
75%,55656.5,20.0,1319.25,37.178387,-3.326156,0.0,17.0,5.0,215.0,2004.0
max,74247.0,350000.0,2770.0,40.345193,-2e-08,1776.0,99.0,80.0,30500.0,2013.0


In [81]:
type(cols)

pandas.core.indexes.base.Index

In [82]:


numeric_col_list=['amount_tsh','gps_height','longitude','latitude','num_private','region_code','district_code','population','construction_year']

qual=list(cols)
for i in numeric_col_list:
    if i in qual:
        qual.remove(i)

In [83]:
data_num=pd.DataFrame(full_df[numeric_col_list])
data_num.head()

Unnamed: 0,amount_tsh,gps_height,longitude,latitude,num_private,region_code,district_code,population,construction_year
0,6000.0,1390,34.938093,-9.856322,0,11,5,109,1999
1,0.0,1399,34.698766,-2.147466,0,20,2,280,2010
2,25.0,686,37.460664,-3.821329,0,21,4,250,2009
3,0.0,263,38.486161,-11.155298,0,90,63,58,1986
4,0.0,0,31.130847,-1.825359,0,18,1,0,0


In [84]:
from sklearn.preprocessing import StandardScaler
model=StandardScaler()
scaled_results = model.fit_transform(full_df[numeric_col_list])
# convert those results into a dataframe
scaled_df = pd.DataFrame(scaled_results, columns=numeric_col_list)
scaled_df.head()

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


Unnamed: 0,amount_tsh,gps_height,longitude,latitude,num_private,region_code,district_code,population,construction_year
0,1.895665,1.041252,0.131052,-1.408791,-0.038749,-0.244325,-0.06537,-0.150399,0.733857
1,-0.10597,1.054237,0.09461,1.207934,-0.038749,0.267409,-0.376781,0.21229,0.745416
2,-0.09763,0.025541,0.515158,0.639751,-0.038749,0.324269,-0.169174,0.14866,0.744365
3,-0.10597,-0.584751,0.671308,-1.84972,-0.038749,4.247564,5.955245,-0.25857,0.720196
4,-0.10597,-0.9642,-0.448669,1.317271,-0.038749,0.153691,-0.480585,-0.381587,-1.366788


In [85]:
X = scaled_df
Y = full_df['status_group']
# feature extraction
model = ExtraTreesClassifier(n_estimators=10)
model.fit(X, Y)
print(X.columns)
print(model.feature_importances_)

Index(['amount_tsh', 'gps_height', 'longitude', 'latitude', 'num_private',
       'region_code', 'district_code', 'population', 'construction_year'],
      dtype='object')
[0.05826805 0.11818231 0.28525055 0.27860895 0.00317369 0.03223689
 0.03906581 0.08570181 0.09951193]


In [86]:
X = data_num
Y = full_df['status_group']
# feature extraction
model = ExtraTreesClassifier(n_estimators=10)
model.fit(X, Y)
print(X.columns)
print(model.feature_importances_)

Index(['amount_tsh', 'gps_height', 'longitude', 'latitude', 'num_private',
       'region_code', 'district_code', 'population', 'construction_year'],
      dtype='object')
[0.05773679 0.11597876 0.28571697 0.28573396 0.00296638 0.02766543
 0.03767658 0.08497581 0.10154933]


In [87]:
qual_l=['waterpoint_type_group','source_class','quantity_group','quality_group']

In [88]:
qual2=full_df[qual_l]

In [89]:
qual2.head()

Unnamed: 0,waterpoint_type_group,source_class,quantity_group,quality_group
0,communal standpipe,groundwater,enough,good
1,communal standpipe,surface,insufficient,good
2,communal standpipe,surface,enough,good
3,communal standpipe,groundwater,dry,good
4,communal standpipe,surface,seasonal,good


In [90]:
d=pd.get_dummies(qual2)

In [91]:
d['id']=ids

In [92]:
data_num['id']=ids

In [93]:
n_full_df=pd.merge(data_num,d)

In [94]:
n_full_df.set_index('id')

Unnamed: 0_level_0,amount_tsh,gps_height,longitude,latitude,num_private,region_code,district_code,population,construction_year,waterpoint_type_group_cattle trough,...,quantity_group_enough,quantity_group_insufficient,quantity_group_seasonal,quantity_group_unknown,quality_group_colored,quality_group_fluoride,quality_group_good,quality_group_milky,quality_group_salty,quality_group_unknown
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
69572,6000.0,1390,34.938093,-9.856322e+00,0,11,5,109,1999,0,...,1,0,0,0,0,0,1,0,0,0
8776,0.0,1399,34.698766,-2.147466e+00,0,20,2,280,2010,0,...,0,1,0,0,0,0,1,0,0,0
34310,25.0,686,37.460664,-3.821329e+00,0,21,4,250,2009,0,...,1,0,0,0,0,0,1,0,0,0
67743,0.0,263,38.486161,-1.115530e+01,0,90,63,58,1986,0,...,0,0,0,0,0,0,1,0,0,0
19728,0.0,0,31.130847,-1.825359e+00,0,18,1,0,0,0,...,0,0,1,0,0,0,1,0,0,0
9944,20.0,0,39.172796,-4.765587e+00,0,4,8,1,2009,0,...,1,0,0,0,0,0,0,0,1,0
19816,0.0,0,33.362410,-3.766365e+00,0,17,3,0,0,0,...,1,0,0,0,0,0,1,0,0,0
54551,0.0,0,32.620617,-4.226198e+00,0,17,3,0,0,0,...,1,0,0,0,0,0,0,1,0,0
53934,0.0,0,32.711100,-5.146712e+00,0,14,6,0,0,0,...,0,0,1,0,0,0,0,0,1,0
46144,0.0,0,30.626991,-1.257051e+00,0,18,1,0,0,0,...,1,0,0,0,0,0,1,0,0,0


In [95]:
data_num.shape

(59400, 10)

In [96]:
d.shape

(59400, 21)

In [97]:
len(d.columns)

21

In [98]:
full_df.corr()

Unnamed: 0,id,amount_tsh,gps_height,longitude,latitude,num_private,region_code,district_code,population,construction_year
id,1.0,-0.005321,-0.004692,-0.001348,0.001718,-0.002629,-0.003028,-0.003044,-0.002813,-0.002082
amount_tsh,-0.005321,1.0,0.07665,0.022134,-0.05267,0.002944,-0.026813,-0.023599,0.016288,0.067915
gps_height,-0.004692,0.07665,1.0,0.149155,-0.035751,0.007237,-0.183521,-0.171233,0.135003,0.658727
longitude,-0.001348,0.022134,0.149155,1.0,-0.425802,0.023873,0.034197,0.151398,0.08659,0.396732
latitude,0.001718,-0.05267,-0.035751,-0.425802,1.0,0.006837,-0.221018,-0.20102,-0.022152,-0.245278
num_private,-0.002629,0.002944,0.007237,0.023873,0.006837,1.0,-0.020377,-0.004478,0.003818,0.026056
region_code,-0.003028,-0.026813,-0.183521,0.034197,-0.221018,-0.020377,1.0,0.678602,0.094088,0.031724
district_code,-0.003044,-0.023599,-0.171233,0.151398,-0.20102,-0.004478,0.678602,1.0,0.061831,0.048315
population,-0.002813,0.016288,0.135003,0.08659,-0.022152,0.003818,0.094088,0.061831,1.0,0.26091
construction_year,-0.002082,0.067915,0.658727,0.396732,-0.245278,0.026056,0.031724,0.048315,0.26091,1.0


In [99]:
X = pd.merge(d,data_num)
X=X.drop('id',axis=1)
Y = full_df['status_group']
# feature extraction
model = ExtraTreesClassifier(n_estimators=10)
model.fit(X, Y)
print(X.columns)
print(model.feature_importances_)
p=model.feature_importances_
rank=tuple(_)

Index(['waterpoint_type_group_cattle trough',
       'waterpoint_type_group_communal standpipe', 'waterpoint_type_group_dam',
       'waterpoint_type_group_hand pump',
       'waterpoint_type_group_improved spring', 'waterpoint_type_group_other',
       'source_class_groundwater', 'source_class_surface',
       'source_class_unknown', 'quantity_group_dry', 'quantity_group_enough',
       'quantity_group_insufficient', 'quantity_group_seasonal',
       'quantity_group_unknown', 'quality_group_colored',
       'quality_group_fluoride', 'quality_group_good', 'quality_group_milky',
       'quality_group_salty', 'quality_group_unknown', 'amount_tsh',
       'gps_height', 'longitude', 'latitude', 'num_private', 'region_code',
       'district_code', 'population', 'construction_year'],
      dtype='object')
[3.81582297e-04 1.61904780e-02 2.38787552e-05 1.24343254e-02
 2.51654466e-03 4.79389862e-02 4.93872406e-03 5.20641983e-03
 8.44759681e-04 7.29418198e-02 3.28643497e-02 1.29265062e-02
 9.52

In [100]:
keys=X.columns
values=p
d=dict(zip(values,keys))

In [101]:
s=sorted(d,reverse=True)
for i in s:
    print(d[i])

latitude
longitude
gps_height
construction_year
quantity_group_dry
population
district_code
waterpoint_type_group_other
region_code
amount_tsh
quantity_group_enough
waterpoint_type_group_communal standpipe
quantity_group_insufficient
waterpoint_type_group_hand pump
quality_group_unknown
quantity_group_seasonal
quality_group_good
source_class_surface
source_class_groundwater
quality_group_salty
num_private
waterpoint_type_group_improved spring
quantity_group_unknown
quality_group_milky
quality_group_colored
source_class_unknown
quality_group_fluoride
waterpoint_type_group_cattle trough
waterpoint_type_group_dam


In [102]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [103]:
Y.value_counts()

functional                 32259
non functional             22824
functional needs repair     4317
Name: status_group, dtype: int64

In [104]:
Y

0                     functional
1                     functional
2                     functional
3                 non functional
4                     functional
5                     functional
6                 non functional
7                 non functional
8                 non functional
9                     functional
10                    functional
11                    functional
12                    functional
13                    functional
14                    functional
15                    functional
16                non functional
17                non functional
18       functional needs repair
19                    functional
20                    functional
21                    functional
22       functional needs repair
23                    functional
24                    functional
25       functional needs repair
26                    functional
27                    functional
28                non functional
29                    functional
          

In [105]:
y=Y.map({'functional':0,'non functional':1,'functional needs repair':2})

In [106]:
y.value_counts()

0    32259
1    22824
2     4317
Name: status_group, dtype: int64

In [107]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.3)

In [108]:
from sklearn.linear_model import LogisticRegression
instance_lm=LogisticRegression()
fit_lm=instance_lm.fit(X_train,y_train)
predicts_lm=fit_lm.predict(X_test)



In [109]:
accuracy_score(predicts_lm,y_test)

0.6955106621773288

In [110]:
cross_val_score_lm=cross_val_score(instance_lm,X,y,cv=5)



In [111]:
from sklearn.ensemble import RandomForestClassifier
instance_rf=RandomForestClassifier()
fit_rf=instance_rf.fit(X_train,y_train)
predicts_rf=fit_rf.predict(X_test)



In [112]:
accuracy_score(predicts_rf,y_test)

0.784006734006734

In [113]:
cross_val_score_rf=cross_val_score(instance_rf,X,y,cv=5)

In [114]:
cross_val_score_rf

array([0.78983251, 0.78478243, 0.78425926, 0.775     , 0.78599091])

In [115]:
from xgboost import XGBClassifier
instance_xg = XGBClassifier()
fit_xg=instance_xg.fit(X_train, y_train)
predicts_xg=fit_xg.predict(X_test)

In [116]:
accuracy_score(predicts_xg,y_test)

0.725364758698092

In [117]:
cross_val_score_xg=cross_val_score(instance_xg,X,y,cv=5)

In [118]:
cross_val_score_xg

array([0.72493898, 0.72493898, 0.72626263, 0.72693603, 0.73034181])

In [119]:
from keras.models import Sequential
from keras.layers import Dense

In [120]:
len(X.columns)


29

In [121]:
model = Sequential()
model.add(Dense(12, input_dim=29, activation='relu'))
model.add(Dense(29, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

In [122]:
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [123]:
model.fit(X_train, y_train, epochs=10, batch_size=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x144db43c278>

In [125]:
p=model.predict(X_test)
#accuracy_score(p,y_test)

In [126]:
_, accuracy = model.evaluate(X, y)
print('Accuracy: %.2f' % (accuracy*100))

Accuracy: 66.59
