In [1]:
import pandas as pd
import numpy as np
import requests
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import KFold, cross_val_score, cross_val_predict
from sklearn import metrics
from sklearn.preprocessing import Imputer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, BaggingClassifier

import pandas_profiling
from scipy.cluster.hierarchy import dendrogram, linkage

import seaborn as sns
from matplotlib import pyplot as plt

plt.style.use('fivethirtyeight')

%matplotlib inline

In [2]:
# Read in the data: the waterpoint feature table and the matching status labels.
# NOTE(review): the labels filename is spelled 'Tanzana' (not 'Tanzania') --
# presumably this matches the file on disk; confirm before renaming anything.
df_train = pd.read_csv('./Datasets/Tanzania_Train.csv')
df_labels = pd.read_csv('./Datasets/Tanzana_Train_Label.csv')

In [3]:
# Display the first 5 rows of the training data.
df_train.head()

Unnamed: 0,id,amount_tsh,date_recorded,funder,gps_height,installer,longitude,latitude,wpt_name,num_private,...,payment_type,water_quality,quality_group,quantity,quantity_group,source,source_type,source_class,waterpoint_type,waterpoint_type_group
0,69572,6000.0,2011-03-14,Roman,1390,Roman,34.938093,-9.856322,none,0,...,annually,soft,good,enough,enough,spring,spring,groundwater,communal standpipe,communal standpipe
1,8776,0.0,2013-03-06,Grumeti,1399,GRUMETI,34.698766,-2.147466,Zahanati,0,...,never pay,soft,good,insufficient,insufficient,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe
2,34310,25.0,2013-02-25,Lottery Club,686,World vision,37.460664,-3.821329,Kwa Mahundi,0,...,per bucket,soft,good,enough,enough,dam,dam,surface,communal standpipe multiple,communal standpipe
3,67743,0.0,2013-01-28,Unicef,263,UNICEF,38.486161,-11.155298,Zahanati Ya Nanyumbu,0,...,never pay,soft,good,dry,dry,machine dbh,borehole,groundwater,communal standpipe multiple,communal standpipe
4,19728,0.0,2011-07-13,Action In A,0,Artisan,31.130847,-1.825359,Shuleni,0,...,never pay,soft,good,seasonal,seasonal,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe


In [4]:
# Display the first 5 rows of the labels (the dependent variable, status_group).
df_labels.head()

Unnamed: 0,id,status_group
0,69572,functional
1,8776,functional
2,34310,functional
3,67743,non functional
4,19728,functional


In [6]:
# Attach the target column (status_group) to the feature table. status_group is
# the dependent variable we want to predict, not an independent one.
# Join explicitly on the shared `id` key: without `on=`, pd.merge joins on every
# column name the two frames have in common, so the join would silently change
# if either file ever gained another shared column.
df_merge = pd.merge(df_train, df_labels, on='id', how='inner')

In [7]:
# Check that status_group is now included in the merged table.
df_merge.head()

Unnamed: 0,id,amount_tsh,date_recorded,funder,gps_height,installer,longitude,latitude,wpt_name,num_private,...,water_quality,quality_group,quantity,quantity_group,source,source_type,source_class,waterpoint_type,waterpoint_type_group,status_group
0,69572,6000.0,2011-03-14,Roman,1390,Roman,34.938093,-9.856322,none,0,...,soft,good,enough,enough,spring,spring,groundwater,communal standpipe,communal standpipe,functional
1,8776,0.0,2013-03-06,Grumeti,1399,GRUMETI,34.698766,-2.147466,Zahanati,0,...,soft,good,insufficient,insufficient,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe,functional
2,34310,25.0,2013-02-25,Lottery Club,686,World vision,37.460664,-3.821329,Kwa Mahundi,0,...,soft,good,enough,enough,dam,dam,surface,communal standpipe multiple,communal standpipe,functional
3,67743,0.0,2013-01-28,Unicef,263,UNICEF,38.486161,-11.155298,Zahanati Ya Nanyumbu,0,...,soft,good,dry,dry,machine dbh,borehole,groundwater,communal standpipe multiple,communal standpipe,non functional
4,19728,0.0,2011-07-13,Action In A,0,Artisan,31.130847,-1.825359,Shuleni,0,...,soft,good,seasonal,seasonal,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe,functional


In [8]:
# Generate a profile report of the data to examine. 
pandas_profiling.ProfileReport(df_merge)

0,1
Number of variables,41
Number of observations,59400
Total Missing (%),1.9%
Total size in memory,19.0 MiB
Average record size in memory,336.0 B

0,1
Numeric,10
Categorical,30
Boolean,0
Date,0
Text (Unique),0
Rejected,1
Unsupported,0

0,1
Distinct count,98
Unique (%),0.2%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,317.65
Minimum,0
Maximum,350000
Zeros (%),70.1%

0,1
Minimum,0
5-th percentile,0
Q1,0
Median,0
Q3,20
95-th percentile,1200
Maximum,350000
Range,350000
Interquartile range,20

0,1
Standard deviation,2997.6
Coef of variation,9.4367
Kurtosis,4903.5
Mean,317.65
MAD,522.12
Skewness,57.808
Sum,18868000
Variance,8985500
Memory size,3.4 MiB

Value,Count,Frequency (%),Unnamed: 3
0.0,41639,70.1%,
500.0,3102,5.2%,
50.0,2472,4.2%,
1000.0,1488,2.5%,
20.0,1463,2.5%,
200.0,1220,2.1%,
100.0,816,1.4%,
10.0,806,1.4%,
30.0,743,1.3%,
2000.0,704,1.2%,

Value,Count,Frequency (%),Unnamed: 3
0.0,41639,70.1%,
0.2,3,0.0%,
0.25,1,0.0%,
1.0,3,0.0%,
2.0,13,0.0%,

Value,Count,Frequency (%),Unnamed: 3
138000.0,1,0.0%,
170000.0,1,0.0%,
200000.0,1,0.0%,
250000.0,1,0.0%,
350000.0,1,0.0%,

0,1
Distinct count,9
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0

0,1
Lake Victoria,10248
Pangani,8940
Rufiji,7976
Other values (6),32236

Value,Count,Frequency (%),Unnamed: 3
Lake Victoria,10248,17.3%,
Pangani,8940,15.1%,
Rufiji,7976,13.4%,
Internal,7785,13.1%,
Lake Tanganyika,6432,10.8%,
Wami / Ruvu,5987,10.1%,
Lake Nyasa,5085,8.6%,
Ruvuma / Southern Coast,4493,7.6%,
Lake Rukwa,2454,4.1%,

0,1
Distinct count,55
Unique (%),0.1%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,1300.7
Minimum,0
Maximum,2013
Zeros (%),34.9%

0,1
Minimum,0
5-th percentile,0
Q1,0
Median,1986
Q3,2004
95-th percentile,2010
Maximum,2013
Range,2013
Interquartile range,2004

0,1
Standard deviation,951.62
Coef of variation,0.73165
Kurtosis,-1.5964
Mean,1300.7
MAD,906.91
Skewness,-0.63493
Sum,77258757
Variance,905580
Memory size,928.1 KiB

Value,Count,Frequency (%),Unnamed: 3
0,20709,34.9%,
2010,2645,4.5%,
2008,2613,4.4%,
2009,2533,4.3%,
2000,2091,3.5%,
2007,1587,2.7%,
2006,1471,2.5%,
2003,1286,2.2%,
2011,1256,2.1%,
2004,1123,1.9%,

Value,Count,Frequency (%),Unnamed: 3
0,20709,34.9%,
1960,102,0.2%,
1961,21,0.0%,
1962,30,0.1%,
1963,85,0.1%,

Value,Count,Frequency (%),Unnamed: 3
2009,2533,4.3%,
2010,2645,4.5%,
2011,1256,2.1%,
2012,1084,1.8%,
2013,176,0.3%,

0,1
Distinct count,356
Unique (%),0.6%
Missing (%),0.0%
Missing (n),0

0,1
2011-03-15,572
2011-03-17,558
2013-02-03,546
Other values (353),57724

Value,Count,Frequency (%),Unnamed: 3
2011-03-15,572,1.0%,
2011-03-17,558,0.9%,
2013-02-03,546,0.9%,
2011-03-14,520,0.9%,
2011-03-16,513,0.9%,
2011-03-18,497,0.8%,
2011-03-19,466,0.8%,
2013-02-04,464,0.8%,
2013-01-29,459,0.8%,
2011-03-04,458,0.8%,

0,1
Distinct count,20
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,5.6297
Minimum,0
Maximum,80
Zeros (%),0.0%

0,1
Minimum,0
5-th percentile,1
Q1,2
Median,3
Q3,5
95-th percentile,30
Maximum,80
Range,80
Interquartile range,3

0,1
Standard deviation,9.6336
Coef of variation,1.7112
Kurtosis,16.214
Mean,5.6297
MAD,4.7435
Skewness,3.962
Sum,334407
Variance,92.807
Memory size,928.1 KiB

Value,Count,Frequency (%),Unnamed: 3
1,12203,20.5%,
2,11173,18.8%,
3,9998,16.8%,
4,8999,15.1%,
5,4356,7.3%,
6,4074,6.9%,
7,3343,5.6%,
8,1043,1.8%,
30,995,1.7%,
33,874,1.5%,

Value,Count,Frequency (%),Unnamed: 3
0,23,0.0%,
1,12203,20.5%,
2,11173,18.8%,
3,9998,16.8%,
4,8999,15.1%,

Value,Count,Frequency (%),Unnamed: 3
60,63,0.1%,
62,109,0.2%,
63,195,0.3%,
67,6,0.0%,
80,12,0.0%,

0,1
Distinct count,18
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0

0,1
gravity,26780
nira/tanira,8154
other,6430
Other values (15),18036

Value,Count,Frequency (%),Unnamed: 3
gravity,26780,45.1%,
nira/tanira,8154,13.7%,
other,6430,10.8%,
submersible,4764,8.0%,
swn 80,3670,6.2%,
mono,2865,4.8%,
india mark ii,2400,4.0%,
afridev,1770,3.0%,
ksb,1415,2.4%,
other - rope pump,451,0.8%,

0,1
Distinct count,7
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0

0,1
gravity,26780
handpump,16456
other,6430
Other values (4),9734

Value,Count,Frequency (%),Unnamed: 3
gravity,26780,45.1%,
handpump,16456,27.7%,
other,6430,10.8%,
submersible,6179,10.4%,
motorpump,2987,5.0%,
rope pump,451,0.8%,
wind-powered,117,0.2%,

0,1
Distinct count,13
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0

0,1
gravity,26780
nira/tanira,8154
other,6430
Other values (10),18036

Value,Count,Frequency (%),Unnamed: 3
gravity,26780,45.1%,
nira/tanira,8154,13.7%,
other,6430,10.8%,
submersible,6179,10.4%,
swn 80,3670,6.2%,
mono,2865,4.8%,
india mark ii,2400,4.0%,
afridev,1770,3.0%,
rope pump,451,0.8%,
other handpump,364,0.6%,

0,1
Distinct count,1898
Unique (%),3.2%
Missing (%),6.1%
Missing (n),3635

0,1
Government Of Tanzania,9084
Danida,3114
Hesawa,2202
Other values (1894),41365
(Missing),3635

Value,Count,Frequency (%),Unnamed: 3
Government Of Tanzania,9084,15.3%,
Danida,3114,5.2%,
Hesawa,2202,3.7%,
Rwssp,1374,2.3%,
World Bank,1349,2.3%,
Kkkt,1287,2.2%,
World Vision,1246,2.1%,
Unicef,1057,1.8%,
Tasaf,877,1.5%,
District Council,843,1.4%,

0,1
Distinct count,2428
Unique (%),4.1%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,668.3
Minimum,-90
Maximum,2770
Zeros (%),34.4%

0,1
Minimum,-90.0
5-th percentile,0.0
Q1,0.0
Median,369.0
Q3,1319.2
95-th percentile,1797.0
Maximum,2770.0
Range,2860.0
Interquartile range,1319.2

0,1
Standard deviation,693.12
Coef of variation,1.0371
Kurtosis,-1.2924
Mean,668.3
MAD,637.95
Skewness,0.4624
Sum,39696856
Variance,480410
Memory size,928.1 KiB

Value,Count,Frequency (%),Unnamed: 3
0,20438,34.4%,
-15,60,0.1%,
-16,55,0.1%,
-13,55,0.1%,
-20,52,0.1%,
1290,52,0.1%,
-14,51,0.1%,
303,51,0.1%,
-18,49,0.1%,
-19,47,0.1%,

Value,Count,Frequency (%),Unnamed: 3
-90,1,0.0%,
-63,2,0.0%,
-59,1,0.0%,
-57,1,0.0%,
-55,1,0.0%,

Value,Count,Frequency (%),Unnamed: 3
2623,1,0.0%,
2626,2,0.0%,
2627,1,0.0%,
2628,1,0.0%,
2770,1,0.0%,

0,1
Distinct count,59400
Unique (%),100.0%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,37115
Minimum,0
Maximum,74247
Zeros (%),0.0%

0,1
Minimum,0.0
5-th percentile,3730.9
Q1,18520.0
Median,37062.0
Q3,55656.0
95-th percentile,70564.0
Maximum,74247.0
Range,74247.0
Interquartile range,37137.0

0,1
Standard deviation,21453
Coef of variation,0.57802
Kurtosis,-1.2015
Mean,37115
MAD,18586
Skewness,0.0026225
Sum,2204638827
Variance,460240000
Memory size,3.4 MiB

Value,Count,Frequency (%),Unnamed: 3
2047,1,0.0%,
72310,1,0.0%,
49805,1,0.0%,
51852,1,0.0%,
62091,1,0.0%,
64138,1,0.0%,
57993,1,0.0%,
60040,1,0.0%,
33413,1,0.0%,
35460,1,0.0%,

Value,Count,Frequency (%),Unnamed: 3
0,1,0.0%,
1,1,0.0%,
2,1,0.0%,
3,1,0.0%,
4,1,0.0%,

Value,Count,Frequency (%),Unnamed: 3
74240,1,0.0%,
74242,1,0.0%,
74243,1,0.0%,
74246,1,0.0%,
74247,1,0.0%,

0,1
Distinct count,2146
Unique (%),3.6%
Missing (%),6.2%
Missing (n),3655

0,1
DWE,17402
Government,1825
RWE,1206
Other values (2142),35312
(Missing),3655

Value,Count,Frequency (%),Unnamed: 3
DWE,17402,29.3%,
Government,1825,3.1%,
RWE,1206,2.0%,
Commu,1060,1.8%,
DANIDA,1050,1.8%,
KKKT,898,1.5%,
Hesawa,840,1.4%,
0,777,1.3%,
TCRS,707,1.2%,
Central government,622,1.0%,

0,1
Distinct count,57517
Unique (%),96.8%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,-5.706
Minimum,-11.649
Maximum,-2e-08
Zeros (%),0.0%

0,1
Minimum,-11.649
5-th percentile,-10.586
Q1,-8.5406
Median,-5.0216
Q3,-3.3262
95-th percentile,-1.4089
Maximum,-2e-08
Range,11.649
Interquartile range,5.2145

0,1
Standard deviation,2.946
Coef of variation,-0.5163
Kurtosis,-1.0576
Mean,-5.706
MAD,2.5678
Skewness,-0.15204
Sum,-338940
Variance,8.679
Memory size,928.1 KiB

Value,Count,Frequency (%),Unnamed: 3
-2e-08,1812,3.1%,
-6.98584173,2,0.0%,
-3.79757861,2,0.0%,
-6.98188419,2,0.0%,
-7.10462503,2,0.0%,
-7.05692253,2,0.0%,
-7.17517443,2,0.0%,
-6.99073094,2,0.0%,
-6.978755499999999,2,0.0%,
-6.99470401,2,0.0%,

Value,Count,Frequency (%),Unnamed: 3
-11.64944018,1,0.0%,
-11.64837759,1,0.0%,
-11.58629656,1,0.0%,
-11.56857679,1,0.0%,
-11.56680457,1,0.0%,

Value,Count,Frequency (%),Unnamed: 3
-0.99911702,1,0.0%,
-0.99901209,1,0.0%,
-0.998916,1,0.0%,
-0.99846435,1,0.0%,
-2e-08,1812,3.1%,

0,1
Distinct count,125
Unique (%),0.2%
Missing (%),0.0%
Missing (n),0

0,1
Njombe,2503
Arusha Rural,1252
Moshi Rural,1251
Other values (122),54394

Value,Count,Frequency (%),Unnamed: 3
Njombe,2503,4.2%,
Arusha Rural,1252,2.1%,
Moshi Rural,1251,2.1%,
Bariadi,1177,2.0%,
Rungwe,1106,1.9%,
Kilosa,1094,1.8%,
Kasulu,1047,1.8%,
Mbozi,1034,1.7%,
Meru,1009,1.7%,
Bagamoyo,997,1.7%,

0,1
Distinct count,57516
Unique (%),96.8%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,34.077
Minimum,0
Maximum,40.345
Zeros (%),3.1%

0,1
Minimum,0.0
5-th percentile,30.041
Q1,33.09
Median,34.909
Q3,37.178
95-th percentile,39.133
Maximum,40.345
Range,40.345
Interquartile range,4.088

0,1
Standard deviation,6.5674
Coef of variation,0.19272
Kurtosis,19.187
Mean,34.077
MAD,3.3023
Skewness,-4.191
Sum,2024200
Variance,43.131
Memory size,928.1 KiB

Value,Count,Frequency (%),Unnamed: 3
0.0,1812,3.1%,
37.54090064,2,0.0%,
33.01050977,2,0.0%,
39.09348389,2,0.0%,
32.972718699999994,2,0.0%,
33.00627548,2,0.0%,
39.10395018,2,0.0%,
37.54278497,2,0.0%,
36.80248988,2,0.0%,
39.09837398,2,0.0%,

Value,Count,Frequency (%),Unnamed: 3
0.0,1812,3.1%,
29.6071219,1,0.0%,
29.60720109,1,0.0%,
29.61032056,1,0.0%,
29.61096482,1,0.0%,

Value,Count,Frequency (%),Unnamed: 3
40.32340181,1,0.0%,
40.32522643,1,0.0%,
40.32523996,1,0.0%,
40.34430089,1,0.0%,
40.34519307,1,0.0%,

0,1
Distinct count,12
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0

0,1
vwc,40507
wug,6515
water board,2933
Other values (9),9445

Value,Count,Frequency (%),Unnamed: 3
vwc,40507,68.2%,
wug,6515,11.0%,
water board,2933,4.9%,
wua,2535,4.3%,
private operator,1971,3.3%,
parastatal,1768,3.0%,
water authority,904,1.5%,
other,844,1.4%,
company,685,1.2%,
unknown,561,0.9%,

0,1
Distinct count,5
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0

0,1
user-group,52490
commercial,3638
parastatal,1768
Other values (2),1504

Value,Count,Frequency (%),Unnamed: 3
user-group,52490,88.4%,
commercial,3638,6.1%,
parastatal,1768,3.0%,
other,943,1.6%,
unknown,561,0.9%,

0,1
Distinct count,65
Unique (%),0.1%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,0.47414
Minimum,0
Maximum,1776
Zeros (%),98.7%

0,1
Minimum,0
5-th percentile,0
Q1,0
Median,0
Q3,0
95-th percentile,0
Maximum,1776
Range,1776
Interquartile range,0

0,1
Standard deviation,12.236
Coef of variation,25.807
Kurtosis,11137
Mean,0.47414
MAD,0.9362
Skewness,91.934
Sum,28164
Variance,149.73
Memory size,928.1 KiB

Value,Count,Frequency (%),Unnamed: 3
0,58643,98.7%,
6,81,0.1%,
1,73,0.1%,
5,46,0.1%,
8,46,0.1%,
32,40,0.1%,
45,36,0.1%,
15,35,0.1%,
39,30,0.1%,
93,28,0.0%,

Value,Count,Frequency (%),Unnamed: 3
0,58643,98.7%,
1,73,0.1%,
2,23,0.0%,
3,27,0.0%,
4,20,0.0%,

Value,Count,Frequency (%),Unnamed: 3
672,1,0.0%,
698,1,0.0%,
755,1,0.0%,
1402,1,0.0%,
1776,1,0.0%,

0,1
Distinct count,7
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0

0,1
never pay,25348
pay per bucket,8985
pay monthly,8300
Other values (4),16767

Value,Count,Frequency (%),Unnamed: 3
never pay,25348,42.7%,
pay per bucket,8985,15.1%,
pay monthly,8300,14.0%,
unknown,8157,13.7%,
pay when scheme fails,3914,6.6%,
pay annually,3642,6.1%,
other,1054,1.8%,

0,1
Distinct count,7
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0

0,1
never pay,25348
per bucket,8985
monthly,8300
Other values (4),16767

Value,Count,Frequency (%),Unnamed: 3
never pay,25348,42.7%,
per bucket,8985,15.1%,
monthly,8300,14.0%,
unknown,8157,13.7%,
on failure,3914,6.6%,
annually,3642,6.1%,
other,1054,1.8%,

0,1
Distinct count,3
Unique (%),0.0%
Missing (%),5.1%
Missing (n),3056

0,1
True,38852
(Missing),20548

Value,Count,Frequency (%),Unnamed: 3
True,38852,65.4%,
(Missing),20548,34.6%,

0,1
Distinct count,1049
Unique (%),1.8%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,179.91
Minimum,0
Maximum,30500
Zeros (%),36.0%

0,1
Minimum,0
5-th percentile,0
Q1,0
Median,25
Q3,215
95-th percentile,680
Maximum,30500
Range,30500
Interquartile range,215

0,1
Standard deviation,471.48
Coef of variation,2.6207
Kurtosis,402.28
Mean,179.91
MAD,214.7
Skewness,12.661
Sum,10686653
Variance,222300
Memory size,928.1 KiB

Value,Count,Frequency (%),Unnamed: 3
0,21381,36.0%,
1,7025,11.8%,
200,1940,3.3%,
150,1892,3.2%,
250,1681,2.8%,
300,1476,2.5%,
100,1146,1.9%,
50,1139,1.9%,
500,1009,1.7%,
350,986,1.7%,

Value,Count,Frequency (%),Unnamed: 3
0,21381,36.0%,
1,7025,11.8%,
2,4,0.0%,
3,4,0.0%,
4,13,0.0%,

Value,Count,Frequency (%),Unnamed: 3
9865,1,0.0%,
10000,3,0.0%,
11463,1,0.0%,
15300,1,0.0%,
30500,1,0.0%,

0,1
Distinct count,3
Unique (%),0.0%
Missing (%),5.6%
Missing (n),3334

0,1
True,51011
(Missing),8389

Value,Count,Frequency (%),Unnamed: 3
True,51011,85.9%,
(Missing),8389,14.1%,

0,1
Distinct count,6
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0

0,1
good,50818
salty,5195
unknown,1876
Other values (3),1511

Value,Count,Frequency (%),Unnamed: 3
good,50818,85.6%,
salty,5195,8.7%,
unknown,1876,3.2%,
milky,804,1.4%,
colored,490,0.8%,
fluoride,217,0.4%,

0,1
Distinct count,5
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0

0,1
enough,33186
insufficient,15129
dry,6246
Other values (2),4839

Value,Count,Frequency (%),Unnamed: 3
enough,33186,55.9%,
insufficient,15129,25.5%,
dry,6246,10.5%,
seasonal,4050,6.8%,
unknown,789,1.3%,

0,1
Distinct count,5
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0

0,1
enough,33186
insufficient,15129
dry,6246
Other values (2),4839

Value,Count,Frequency (%),Unnamed: 3
enough,33186,55.9%,
insufficient,15129,25.5%,
dry,6246,10.5%,
seasonal,4050,6.8%,
unknown,789,1.3%,

0,1
Constant value,GeoData Consultants Ltd

0,1
Distinct count,21
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0

0,1
Iringa,5294
Shinyanga,4982
Mbeya,4639
Other values (18),44485

Value,Count,Frequency (%),Unnamed: 3
Iringa,5294,8.9%,
Shinyanga,4982,8.4%,
Mbeya,4639,7.8%,
Kilimanjaro,4379,7.4%,
Morogoro,4006,6.7%,
Arusha,3350,5.6%,
Kagera,3316,5.6%,
Mwanza,3102,5.2%,
Kigoma,2816,4.7%,
Ruvuma,2640,4.4%,

0,1
Distinct count,27
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,15.297
Minimum,1
Maximum,99
Zeros (%),0.0%

0,1
Minimum,1
5-th percentile,2
Q1,5
Median,12
Q3,17
95-th percentile,60
Maximum,99
Range,98
Interquartile range,12

0,1
Standard deviation,17.587
Coef of variation,1.1497
Kurtosis,10.288
Mean,15.297
MAD,9.487
Skewness,3.1738
Sum,908642
Variance,309.32
Memory size,928.1 KiB

Value,Count,Frequency (%),Unnamed: 3
11,5300,8.9%,
17,5011,8.4%,
12,4639,7.8%,
3,4379,7.4%,
5,4040,6.8%,
18,3324,5.6%,
19,3047,5.1%,
2,3024,5.1%,
16,2816,4.7%,
10,2640,4.4%,

Value,Count,Frequency (%),Unnamed: 3
1,2201,3.7%,
2,3024,5.1%,
3,4379,7.4%,
4,2513,4.2%,
5,4040,6.8%,

Value,Count,Frequency (%),Unnamed: 3
40,1,0.0%,
60,1025,1.7%,
80,1238,2.1%,
90,917,1.5%,
99,423,0.7%,

0,1
Distinct count,13
Unique (%),0.0%
Missing (%),6.5%
Missing (n),3877

0,1
VWC,36793
WUG,5206
Water authority,3153
Other values (9),10371
(Missing),3877

Value,Count,Frequency (%),Unnamed: 3
VWC,36793,61.9%,
WUG,5206,8.8%,
Water authority,3153,5.3%,
WUA,2883,4.9%,
Water Board,2748,4.6%,
Parastatal,1680,2.8%,
Private operator,1063,1.8%,
Company,1061,1.8%,
Other,766,1.3%,
SWC,97,0.2%,

0,1
Distinct count,2697
Unique (%),4.5%
Missing (%),47.4%
Missing (n),28166

0,1
K,682
,644
Borehole,546
Other values (2693),29362
(Missing),28166

Value,Count,Frequency (%),Unnamed: 3
K,682,1.1%,
,644,1.1%,
Borehole,546,0.9%,
Chalinze wate,405,0.7%,
M,400,0.7%,
DANIDA,379,0.6%,
Government,320,0.5%,
Ngana water supplied scheme,270,0.5%,
wanging'ombe water supply s,261,0.4%,
wanging'ombe supply scheme,234,0.4%,

0,1
Distinct count,10
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0

0,1
spring,17021
shallow well,16824
machine dbh,11075
Other values (7),14480

Value,Count,Frequency (%),Unnamed: 3
spring,17021,28.7%,
shallow well,16824,28.3%,
machine dbh,11075,18.6%,
river,9612,16.2%,
rainwater harvesting,2295,3.9%,
hand dtw,874,1.5%,
lake,765,1.3%,
dam,656,1.1%,
other,212,0.4%,
unknown,66,0.1%,

0,1
Distinct count,3
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0

0,1
groundwater,45794
surface,13328
unknown,278

Value,Count,Frequency (%),Unnamed: 3
groundwater,45794,77.1%,
surface,13328,22.4%,
unknown,278,0.5%,

0,1
Distinct count,7
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0

0,1
spring,17021
shallow well,16824
borehole,11949
Other values (4),13606

Value,Count,Frequency (%),Unnamed: 3
spring,17021,28.7%,
shallow well,16824,28.3%,
borehole,11949,20.1%,
river/lake,10377,17.5%,
rainwater harvesting,2295,3.9%,
dam,656,1.1%,
other,278,0.5%,

0,1
Distinct count,3
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0

0,1
functional,32259
non functional,22824
functional needs repair,4317

Value,Count,Frequency (%),Unnamed: 3
functional,32259,54.3%,
non functional,22824,38.4%,
functional needs repair,4317,7.3%,

0,1
Distinct count,19288
Unique (%),32.5%
Missing (%),0.6%
Missing (n),371

0,1
Madukani,508
Shuleni,506
Majengo,502
Other values (19284),57513

Value,Count,Frequency (%),Unnamed: 3
Madukani,508,0.9%,
Shuleni,506,0.9%,
Majengo,502,0.8%,
Kati,373,0.6%,
Mtakuja,262,0.4%,
Sokoni,232,0.4%,
M,187,0.3%,
Muungano,172,0.3%,
Mbuyuni,164,0.3%,
Mlimani,152,0.3%,

0,1
Distinct count,2092
Unique (%),3.5%
Missing (%),0.0%
Missing (n),0

0,1
Igosi,307
Imalinyi,252
Siha Kati,232
Other values (2089),58609

Value,Count,Frequency (%),Unnamed: 3
Igosi,307,0.5%,
Imalinyi,252,0.4%,
Siha Kati,232,0.4%,
Mdandu,231,0.4%,
Nduruma,217,0.4%,
Mishamo,203,0.3%,
Kitunda,203,0.3%,
Msindo,201,0.3%,
Chalinze,196,0.3%,
Maji ya Chai,190,0.3%,

0,1
Distinct count,8
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0

0,1
soft,50818
salty,4856
unknown,1876
Other values (5),1850

Value,Count,Frequency (%),Unnamed: 3
soft,50818,85.6%,
salty,4856,8.2%,
unknown,1876,3.2%,
milky,804,1.4%,
coloured,490,0.8%,
salty abandoned,339,0.6%,
fluoride,200,0.3%,
fluoride abandoned,17,0.0%,

0,1
Distinct count,7
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0

0,1
communal standpipe,28522
hand pump,17488
other,6380
Other values (4),7010

Value,Count,Frequency (%),Unnamed: 3
communal standpipe,28522,48.0%,
hand pump,17488,29.4%,
other,6380,10.7%,
communal standpipe multiple,6103,10.3%,
improved spring,784,1.3%,
cattle trough,116,0.2%,
dam,7,0.0%,

0,1
Distinct count,6
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0

0,1
communal standpipe,34625
hand pump,17488
other,6380
Other values (3),907

Value,Count,Frequency (%),Unnamed: 3
communal standpipe,34625,58.3%,
hand pump,17488,29.4%,
other,6380,10.7%,
improved spring,784,1.3%,
cattle trough,116,0.2%,
dam,7,0.0%,

0,1
Distinct count,37400
Unique (%),63.0%
Missing (%),0.0%
Missing (n),0

0,1
none,3563
Shuleni,1748
Zahanati,830
Other values (37397),53259

Value,Count,Frequency (%),Unnamed: 3
none,3563,6.0%,
Shuleni,1748,2.9%,
Zahanati,830,1.4%,
Msikitini,535,0.9%,
Kanisani,323,0.5%,
Bombani,271,0.5%,
Sokoni,260,0.4%,
Ofisini,254,0.4%,
School,208,0.4%,
Shule Ya Msingi,199,0.3%,

Unnamed: 0,id,amount_tsh,date_recorded,funder,gps_height,installer,longitude,latitude,wpt_name,num_private,basin,subvillage,region,region_code,district_code,lga,ward,population,public_meeting,recorded_by,scheme_management,scheme_name,permit,construction_year,extraction_type,extraction_type_group,extraction_type_class,management,management_group,payment,payment_type,water_quality,quality_group,quantity,quantity_group,source,source_type,source_class,waterpoint_type,waterpoint_type_group,status_group
0,69572,6000.0,2011-03-14,Roman,1390,Roman,34.938093,-9.856322,none,0,Lake Nyasa,Mnyusi B,Iringa,11,5,Ludewa,Mundindi,109,True,GeoData Consultants Ltd,VWC,Roman,False,1999,gravity,gravity,gravity,vwc,user-group,pay annually,annually,soft,good,enough,enough,spring,spring,groundwater,communal standpipe,communal standpipe,functional
1,8776,0.0,2013-03-06,Grumeti,1399,GRUMETI,34.698766,-2.147466,Zahanati,0,Lake Victoria,Nyamara,Mara,20,2,Serengeti,Natta,280,,GeoData Consultants Ltd,Other,,True,2010,gravity,gravity,gravity,wug,user-group,never pay,never pay,soft,good,insufficient,insufficient,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe,functional
2,34310,25.0,2013-02-25,Lottery Club,686,World vision,37.460664,-3.821329,Kwa Mahundi,0,Pangani,Majengo,Manyara,21,4,Simanjiro,Ngorika,250,True,GeoData Consultants Ltd,VWC,Nyumba ya mungu pipe scheme,True,2009,gravity,gravity,gravity,vwc,user-group,pay per bucket,per bucket,soft,good,enough,enough,dam,dam,surface,communal standpipe multiple,communal standpipe,functional
3,67743,0.0,2013-01-28,Unicef,263,UNICEF,38.486161,-11.155298,Zahanati Ya Nanyumbu,0,Ruvuma / Southern Coast,Mahakamani,Mtwara,90,63,Nanyumbu,Nanyumbu,58,True,GeoData Consultants Ltd,VWC,,True,1986,submersible,submersible,submersible,vwc,user-group,never pay,never pay,soft,good,dry,dry,machine dbh,borehole,groundwater,communal standpipe multiple,communal standpipe,non functional
4,19728,0.0,2011-07-13,Action In A,0,Artisan,31.130847,-1.825359,Shuleni,0,Lake Victoria,Kyanyamisa,Kagera,18,1,Karagwe,Nyakasimbi,0,True,GeoData Consultants Ltd,,,True,0,gravity,gravity,gravity,other,other,never pay,never pay,soft,good,seasonal,seasonal,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe,functional


In [9]:
# Check size of table.
df_merge.shape

(59400, 41)

In [10]:
df_merge.dtypes

id                         int64
amount_tsh               float64
date_recorded             object
funder                    object
gps_height                 int64
installer                 object
longitude                float64
latitude                 float64
wpt_name                  object
num_private                int64
basin                     object
subvillage                object
region                    object
region_code                int64
district_code              int64
lga                       object
ward                      object
population                 int64
public_meeting            object
recorded_by               object
scheme_management         object
scheme_name               object
permit                    object
construction_year          int64
extraction_type           object
extraction_type_group     object
extraction_type_class     object
management                object
management_group          object
payment                   object
payment_ty

In [11]:
# Drop any column with more than 10% missing values.
# `thresh` is the minimum number of non-null values a column needs in order to
# be kept. pandas documents it as an int, so use integer arithmetic instead of
# the float that `len(df_merge) / 10` produces; the resulting cut-off is the same.
min_non_null = len(df_merge) - len(df_merge) // 10
df_merge.dropna(thresh=min_non_null, axis=1, inplace=True)

In [12]:
df_merge.shape

(59400, 40)

In [13]:
# Helper that counts the missing values in a single column (or row).
def num_missing(x):
    """Return the number of null entries in the pandas Series ``x``.

    Parameters
    ----------
    x : pandas.Series
        One column or one row of a DataFrame, as passed in by ``DataFrame.apply``.

    Returns
    -------
    int
        How many entries of ``x`` are null according to ``x.isnull()``.
    """
    # Series.sum() counts the True flags in vectorised code; the builtin sum()
    # would iterate over the Series one element at a time in Python.
    return int(x.isnull().sum())

# Missing values per column. isnull().sum() is vectorised and yields the same
# counts as applying num_missing to each column.
print("Missing values per column:")
print(df_merge.isnull().sum())

# Missing values per row (first five rows only). Summing across axis=1 avoids
# the Python-level call per row that DataFrame.apply(..., axis=1) makes.
print("\nMissing values per row:")
print(df_merge.isnull().sum(axis=1).head())

Missing values per column:
id                          0
amount_tsh                  0
date_recorded               0
funder                   3635
gps_height                  0
installer                3655
longitude                   0
latitude                    0
wpt_name                    0
num_private                 0
basin                       0
subvillage                371
region                      0
region_code                 0
district_code               0
lga                         0
ward                        0
population                  0
public_meeting           3334
recorded_by                 0
scheme_management        3877
permit                   3056
construction_year           0
extraction_type             0
extraction_type_group       0
extraction_type_class       0
management                  0
management_group            0
payment                     0
payment_type                0
water_quality               0
quality_group               0
quantity     

In [14]:
# Remove the five remaining columns that each still have thousands of missing values.
df_merge.drop(
    labels=['funder', 'installer', 'public_meeting', 'scheme_management', 'permit'],
    axis='columns',
    inplace=True,
)

In [15]:
df_merge.shape

(59400, 35)

In [16]:
# Re-check the missing values after dropping columns. num_missing is already
# defined in an earlier cell, so it is not redefined here; a second identical
# `def` would only shadow the first one.

# Missing values per column (vectorised equivalent of applying num_missing
# to each column).
print("Missing values per column:")
print(df_merge.isnull().sum())

# Missing values per row (first five rows only), summed across axis=1 instead
# of making one Python-level call per row.
print("\nMissing values per row:")
print(df_merge.isnull().sum(axis=1).head())

Missing values per column:
id                         0
amount_tsh                 0
date_recorded              0
gps_height                 0
longitude                  0
latitude                   0
wpt_name                   0
num_private                0
basin                      0
subvillage               371
region                     0
region_code                0
district_code              0
lga                        0
ward                       0
population                 0
recorded_by                0
construction_year          0
extraction_type            0
extraction_type_group      0
extraction_type_class      0
management                 0
management_group           0
payment                    0
payment_type               0
water_quality              0
quality_group              0
quantity                   0
quantity_group             0
source                     0
source_type                0
source_class               0
waterpoint_type            0
waterpoint_type_

In [17]:
# subvillage has too many unique values; drop it so that it does not
# generate too many dummy variables later on.
df_merge.drop(labels='subvillage', axis='columns', inplace=True)

In [18]:
# Missing values per column. isnull().sum() is vectorised and yields the same
# counts as applying num_missing to each column.
print("Missing values per column:")
print(df_merge.isnull().sum())

# Missing values per row (first five rows only). Summing across axis=1 avoids
# the Python-level call per row that DataFrame.apply(..., axis=1) makes.
print("\nMissing values per row:")
print(df_merge.isnull().sum(axis=1).head())

Missing values per column:
id                       0
amount_tsh               0
date_recorded            0
gps_height               0
longitude                0
latitude                 0
wpt_name                 0
num_private              0
basin                    0
region                   0
region_code              0
district_code            0
lga                      0
ward                     0
population               0
recorded_by              0
construction_year        0
extraction_type          0
extraction_type_group    0
extraction_type_class    0
management               0
management_group         0
payment                  0
payment_type             0
water_quality            0
quality_group            0
quantity                 0
quantity_group           0
source                   0
source_type              0
source_class             0
waterpoint_type          0
waterpoint_type_group    0
status_group             0
dtype: int64

Missing values per row:
0    0
1    0
2  

In [19]:
list(df_merge.columns)

['id',
 'amount_tsh',
 'date_recorded',
 'gps_height',
 'longitude',
 'latitude',
 'wpt_name',
 'num_private',
 'basin',
 'region',
 'region_code',
 'district_code',
 'lga',
 'ward',
 'population',
 'recorded_by',
 'construction_year',
 'extraction_type',
 'extraction_type_group',
 'extraction_type_class',
 'management',
 'management_group',
 'payment',
 'payment_type',
 'water_quality',
 'quality_group',
 'quantity',
 'quantity_group',
 'source',
 'source_type',
 'source_class',
 'waterpoint_type',
 'waterpoint_type_group',
 'status_group']

In [21]:
df_merge['construction_year'].value_counts()

0       20709
2010     2645
2008     2613
2009     2533
2000     2091
2007     1587
2006     1471
2003     1286
2011     1256
2004     1123
2012     1084
2002     1075
1978     1037
1995     1014
2005     1011
1999      979
1998      966
1990      954
1985      945
1980      811
1996      811
1984      779
1982      744
1994      738
1972      708
1974      676
1997      644
1992      640
1993      608
2001      540
1988      521
1983      488
1975      437
1986      434
1976      414
1970      411
1991      324
1989      316
1987      302
1981      238
1977      202
1979      192
1973      184
2013      176
1971      145
1960      102
1967       88
1963       85
1968       77
1969       59
1964       40
1962       30
1961       21
1965       19
1966       17
Name: construction_year, dtype: int64

In [22]:
# Change construction year into decades in order to reduce the number of Dummy columns later. 

def construction_wrangler(row):
    """Map a row's ``construction_year`` to a decade label.

    Parameters
    ----------
    row : mapping or pandas.Series
        Anything indexable with ``'construction_year'``.

    Returns
    -------
    str
        ``'1960s'`` ... ``'2000s'`` for years in 1960-2009, ``'2010s'`` for any
        year >= 2010, and ``'unknown'`` for everything else (e.g. 0, years
        before 1960, NaN).

    Notes
    -----
    A value that is already one of these labels is returned unchanged. The
    calling cell overwrites ``construction_year`` with the labels, so without
    this guard a second run of that cell would compare strings with integers.
    """
    decade_labels = ('1960s', '1970s', '1980s', '1990s', '2000s', '2010s', 'unknown')
    year = row['construction_year']

    # Already bucketed (cell was re-run): keep the existing label.
    if isinstance(year, str) and year in decade_labels:
        return year

    # Closed decades: [floor, floor + 10).
    for floor in (1960, 1970, 1980, 1990, 2000):
        if floor <= year < floor + 10:
            return '{}s'.format(floor)

    # Open-ended last bucket: everything from 2010 onwards.
    if year >= 2010:
        return '2010s'

    return 'unknown'
    
# Overwrite construction_year with its decade label. axis=1 hands
# construction_wrangler one row at a time.
df_merge['construction_year'] = df_merge.apply(construction_wrangler, axis=1)

In [23]:
# Check the decade bucketing: 'unknown' has 20709 rows, matching the rows that had year 0.
df_merge['construction_year'].value_counts()

unknown    20709
2000s      15330
1990s       7678
1980s       5578
2010s       5161
1970s       4406
1960s        538
Name: construction_year, dtype: int64

In [24]:
# Category counts for quantity (5 levels), to compare against quantity_group.
df_merge['quantity'].value_counts()

enough          33186
insufficient    15129
dry              6246
seasonal         4050
unknown           789
Name: quantity, dtype: int64

In [25]:
# Category counts for quantity_group, to compare against quantity.
df_merge['quantity_group'].value_counts()

enough          33186
insufficient    15129
dry              6246
seasonal         4050
unknown           789
Name: quantity_group, dtype: int64

In [26]:
# quantity and quantity_group show identical labels and counts, so
# quantity_group adds no information. Dropping quantity_group.
df_merge.drop(columns=['quantity_group'], inplace=True)

In [27]:
# Category counts for water_quality (8 levels), to compare against quality_group.
df_merge['water_quality'].value_counts()

soft                  50818
salty                  4856
unknown                1876
milky                   804
coloured                490
salty abandoned         339
fluoride                200
fluoride abandoned       17
Name: water_quality, dtype: int64

In [28]:
# Category counts for quality_group (6 levels), to compare against water_quality.
df_merge['quality_group'].value_counts()

good        50818
salty        5195
unknown      1876
milky         804
colored       490
fluoride      217
Name: quality_group, dtype: int64

In [29]:
# Dropping water_quality: close enough to quality_group. Judging by the counts,
# quality_group folds the "abandoned" variants into their parent category
# (salty 4856 + 339 = 5195, fluoride 200 + 17 = 217) and labels 'soft' as 'good'.
df_merge.drop(columns=['water_quality'], inplace=True)

In [30]:
# Category counts for waterpoint_type (7 levels), to compare against waterpoint_type_group.
df_merge['waterpoint_type'].value_counts()

communal standpipe             28522
hand pump                      17488
other                           6380
communal standpipe multiple     6103
improved spring                  784
cattle trough                    116
dam                                7
Name: waterpoint_type, dtype: int64

In [31]:
# Category counts for waterpoint_type_group (6 levels), to compare against waterpoint_type.
df_merge['waterpoint_type_group'].value_counts()

communal standpipe    34625
hand pump             17488
other                  6380
improved spring         784
cattle trough           116
dam                       7
Name: waterpoint_type_group, dtype: int64

In [32]:
# By the counts, waterpoint_type_group only differs from waterpoint_type by merging
# 'communal standpipe multiple' into 'communal standpipe' (28522 + 6103 = 34625).
# Keep the finer-grained waterpoint_type and drop the group column.
df_merge.drop(columns=['waterpoint_type_group'], inplace=True)

In [33]:
# Category counts for source (10 levels), to compare against source_type and source_class.
df_merge['source'].value_counts()

spring                  17021
shallow well            16824
machine dbh             11075
river                    9612
rainwater harvesting     2295
hand dtw                  874
lake                      765
dam                       656
other                     212
unknown                    66
Name: source, dtype: int64

In [34]:
# Category counts for source_type (7 levels), to compare against source.
df_merge['source_type'].value_counts()

spring                  17021
shallow well            16824
borehole                11949
river/lake              10377
rainwater harvesting     2295
dam                       656
other                     278
Name: source_type, dtype: int64

In [35]:
# Category counts for source_class (3 levels: groundwater / surface / unknown).
df_merge['source_class'].value_counts()

groundwater    45794
surface        13328
unknown          278
Name: source_class, dtype: int64

In [36]:
# source_type looks like a coarser version of source. The counts line up with
#   'machine dbh' + 'hand dtw' -> 'borehole'   (11075 + 874 = 11949)
#   'river' + 'lake'           -> 'river/lake' (9612 + 765 = 10377)
#   'other' + 'unknown'        -> 'other'      (212 + 66 = 278)
# Drop the middle level of detail.
df_merge.drop(columns=['source_type'], inplace=True)

In [37]:
# Category counts for payment (7 levels), to compare against payment_type.
df_merge['payment'].value_counts()

never pay                25348
pay per bucket            8985
pay monthly               8300
unknown                   8157
pay when scheme fails     3914
pay annually              3642
other                     1054
Name: payment, dtype: int64

In [38]:
# Category counts for payment_type (7 levels), to compare against payment.
df_merge['payment_type'].value_counts()

never pay     25348
per bucket     8985
monthly        8300
unknown        8157
on failure     3914
annually       3642
other          1054
Name: payment_type, dtype: int64

In [39]:
# payment_type has exactly the same counts as payment, only with shorter labels
# (e.g. 'per bucket' vs 'pay per bucket'). Dropping payment_type.
df_merge.drop(columns=['payment_type'], inplace=True)

In [40]:
# Category counts for management (12 levels); 'vwc' dominates with 40507 rows.
df_merge['management'].value_counts()

vwc                 40507
wug                  6515
water board          2933
wua                  2535
private operator     1971
parastatal           1768
water authority       904
other                 844
company               685
unknown               561
other - school         99
trust                  78
Name: management, dtype: int64

In [41]:
# Category counts for management_group (5 levels), the coarser companion of management.
df_merge['management_group'].value_counts()

user-group    52490
commercial     3638
parastatal     1768
other           943
unknown         561
Name: management_group, dtype: int64

In [42]:
# Category counts for extraction_type (18 levels, several with fewer than 100 rows).
df_merge['extraction_type'].value_counts()

gravity                      26780
nira/tanira                   8154
other                         6430
submersible                   4764
swn 80                        3670
mono                          2865
india mark ii                 2400
afridev                       1770
ksb                           1415
other - rope pump              451
other - swn 81                 229
windmill                       117
india mark iii                  98
cemo                            90
other - play pump               85
walimi                          48
climax                          32
other - mkulima/shinyanga        2
Name: extraction_type, dtype: int64

In [43]:
# Category counts for extraction_type_group (13 levels), to compare against extraction_type.
df_merge['extraction_type_group'].value_counts()

gravity            26780
nira/tanira         8154
other               6430
submersible         6179
swn 80              3670
mono                2865
india mark ii       2400
afridev             1770
rope pump            451
other handpump       364
other motorpump      122
wind-powered         117
india mark iii        98
Name: extraction_type_group, dtype: int64

In [44]:
# Category counts for extraction_type_class (7 levels), the coarsest extraction column.
df_merge['extraction_type_class'].value_counts()

gravity         26780
handpump        16456
other            6430
submersible      6179
motorpump        2987
rope pump         451
wind-powered      117
Name: extraction_type_class, dtype: int64

In [45]:
# extraction_type is the most detailed of the three extraction columns (18 levels).
# Its rare categories already appear rolled up in extraction_type_group
# (e.g. submersible 4764 + ksb 1415 = 6179), so drop the detailed column.
df_merge.drop(columns=['extraction_type'], inplace=True)

In [46]:
# Category counts for region (21 region names).
df_merge['region'].value_counts()

Iringa           5294
Shinyanga        4982
Mbeya            4639
Kilimanjaro      4379
Morogoro         4006
Arusha           3350
Kagera           3316
Mwanza           3102
Kigoma           2816
Ruvuma           2640
Pwani            2635
Tanga            2547
Dodoma           2201
Singida          2093
Mara             1969
Tabora           1959
Rukwa            1808
Mtwara           1730
Manyara          1583
Lindi            1546
Dar es Salaam     805
Name: region, dtype: int64

In [47]:
# Category counts for region_code: 27 distinct codes versus 21 region names,
# so the two columns are not a one-to-one mapping.
df_merge['region_code'].value_counts()

11    5300
17    5011
12    4639
3     4379
5     4040
18    3324
19    3047
2     3024
16    2816
10    2640
4     2513
1     2201
13    2093
14    1979
20    1969
15    1808
6     1609
21    1583
80    1238
60    1025
90     917
7      805
99     423
9      390
24     326
8      300
40       1
Name: region_code, dtype: int64

In [48]:
# Category counts for district_code (20 distinct codes, including 23 rows with code 0).
df_merge['district_code'].value_counts()

1     12203
2     11173
3      9998
4      8999
5      4356
6      4074
7      3343
8      1043
30      995
33      874
53      745
43      505
13      391
23      293
63      195
62      109
60       63
0        23
80       12
67        6
Name: district_code, dtype: int64

In [49]:
# Re-list the columns to confirm which ones remain after the drops so far.
list(df_merge.columns)

['id',
 'amount_tsh',
 'date_recorded',
 'gps_height',
 'longitude',
 'latitude',
 'wpt_name',
 'num_private',
 'basin',
 'region',
 'region_code',
 'district_code',
 'lga',
 'ward',
 'population',
 'recorded_by',
 'construction_year',
 'extraction_type_group',
 'extraction_type_class',
 'management',
 'management_group',
 'payment',
 'quality_group',
 'quantity',
 'source',
 'source_class',
 'waterpoint_type',
 'status_group']

In [50]:
# Counts for wpt_name. The most frequent value is the literal 'none' (3563 rows)
# and the tail is made of names that occur only once (output is truncated by pandas).
df_merge['wpt_name'].value_counts()

none                  3563
Shuleni               1748
Zahanati               830
Msikitini              535
Kanisani               323
Bombani                271
Sokoni                 260
Ofisini                254
School                 208
Shule Ya Msingi        199
Shule                  152
Sekondari              146
Muungano               133
Mkombozi               111
Madukani               104
Mbugani                 94
Hospital                94
Upendo                  93
Kituo Cha Afya          90
Mkuyuni                 88
Umoja                   84
Center                  83
Ccm                     81
Kisimani                78
Mtakuja                 76
Ofisi Ya Kijiji         76
Tankini                 73
Songambele              66
Bwawani                 66
Maendeleo               64
                      ... 
Kwa Ikobya               1
Kwa Olendikimo           1
Old Mbuyu                1
Kanjilu                  1
Kwa Kanali Mstaafu       1
Shuleni Sese             1
W

In [51]:
# Category counts for basin (9 levels).
df_merge['basin'].value_counts()

Lake Victoria              10248
Pangani                     8940
Rufiji                      7976
Internal                    7785
Lake Tanganyika             6432
Wami / Ruvu                 5987
Lake Nyasa                  5085
Ruvuma / Southern Coast     4493
Lake Rukwa                  2454
Name: basin, dtype: int64

In [52]:
# NOTE(review): same expression and same output as the earlier region cell;
# this cell looks redundant and could be removed.
df_merge['region'].value_counts()

Iringa           5294
Shinyanga        4982
Mbeya            4639
Kilimanjaro      4379
Morogoro         4006
Arusha           3350
Kagera           3316
Mwanza           3102
Kigoma           2816
Ruvuma           2640
Pwani            2635
Tanga            2547
Dodoma           2201
Singida          2093
Mara             1969
Tabora           1959
Rukwa            1808
Mtwara           1730
Manyara          1583
Lindi            1546
Dar es Salaam     805
Name: region, dtype: int64

In [53]:
# Counts for lga (output truncated by pandas); Njombe is the most frequent with 2503 rows.
df_merge['lga'].value_counts()

Njombe              2503
Arusha Rural        1252
Moshi Rural         1251
Bariadi             1177
Rungwe              1106
Kilosa              1094
Kasulu              1047
Mbozi               1034
Meru                1009
Bagamoyo             997
Singida Rural        995
Kilombero            959
Same                 877
Kibondo              874
Kyela                859
Kahama               836
Magu                 824
Kigoma Rural         824
Maswa                809
Karagwe              771
Mbinga               750
Iringa Rural         728
Serengeti            716
Lushoto              694
Namtumbo             694
Songea Rural         693
Mpanda               679
Mvomero              671
Ngara                669
Ulanga               665
                    ... 
Ileje                231
Bahi                 224
Kisarawe             223
Temeke               215
Rorya                210
Tarime               209
Ngorongoro           201
Kiteto               193
Shinyanga Urban      191


In [54]:
# Counts for ward (output truncated by pandas). The largest ward has only 307 rows
# and the tail consists of wards that appear once.
df_merge['ward'].value_counts()

Igosi               307
Imalinyi            252
Siha Kati           232
Mdandu              231
Nduruma             217
Mishamo             203
Kitunda             203
Msindo              201
Chalinze            196
Maji ya Chai        190
Usuka               187
Ngarenanyuki        172
Chanika             171
Vikindu             162
Mtwango             153
Matola              145
Zinga/Ikerege       141
Wanging'ombe        139
Maramba             139
Itete               137
Magomeni            135
Kikatiti            134
Ifakara             134
Olkokola            133
Maposeni            130
Igongolo            129
Mvomero             129
Mlangali            125
Nkoma               122
Nkungulu            121
                   ... 
Kirongo               1
Korongoni             1
Uchindile             1
Sungwisi              1
Igogo                 1
Mitole                1
Nsemulwa              1
Kitete                1
Mawenzi               1
Linda                 1
Themi           

In [55]:
# Check how many distinct values recorded_by takes.
df_merge['recorded_by'].value_counts()

GeoData Consultants Ltd    59400
Name: recorded_by, dtype: int64

In [56]:
# recorded_by holds one value ('GeoData Consultants Ltd') for all 59400 rows,
# so it cannot distinguish any row from another. Dropping!
df_merge.drop(columns=['recorded_by'], inplace=True)

In [57]:
# Save the cleaned frame to disk; index=False leaves the row index out of the CSV.
df_merge.to_csv('./Datasets/clean_train_data.csv', index=False)