# Cleaning the Test Data
- Cleaning the data exactly the same way I cleaned the Training Data.

In [80]:
%load_ext autoreload
%autoreload 2
 
%matplotlib inline

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [81]:
import pandas as pd
from pandas.api.types import is_string_dtype, is_numeric_dtype
import pandas_profiling
import numpy as np
import re
import requests
from IPython.display import display
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import norm
from scipy import stats
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

import warnings
import types

# NOTE(review): with no category argument this silences every warning,
# including pandas FutureWarning/DeprecationWarning messages — consider
# narrowing the filter so API-change notices stay visible.
warnings.filterwarnings('ignore')

In [82]:
# Import the test data and parse the date column.
df = pd.read_csv('./Datasets/Tan_Test.csv', low_memory=False, parse_dates=['date_recorded'])

In [83]:
df.head()

Unnamed: 0,id,amount_tsh,date_recorded,funder,gps_height,installer,longitude,latitude,wpt_name,num_private,...,payment_type,water_quality,quality_group,quantity,quantity_group,source,source_type,source_class,waterpoint_type,waterpoint_type_group
0,50785,0.0,2013-02-04,Dmdd,1996,DMDD,35.290799,-4.059696,Dinamu Secondary School,0,...,never pay,soft,good,seasonal,seasonal,rainwater harvesting,rainwater harvesting,surface,other,other
1,51630,0.0,2013-02-04,Government Of Tanzania,1569,DWE,36.656709,-3.309214,Kimnyak,0,...,never pay,soft,good,insufficient,insufficient,spring,spring,groundwater,communal standpipe,communal standpipe
2,17168,0.0,2013-02-01,,1567,,34.767863,-5.004344,Puma Secondary,0,...,never pay,soft,good,insufficient,insufficient,rainwater harvesting,rainwater harvesting,surface,other,other
3,45559,0.0,2013-01-22,Finn Water,267,FINN WATER,38.058046,-9.418672,Kwa Mzee Pange,0,...,unknown,soft,good,dry,dry,shallow well,shallow well,groundwater,other,other
4,49871,500.0,2013-03-27,Bruder,1260,BRUDER,35.006123,-10.950412,Kwa Mzee Turuka,0,...,monthly,soft,good,enough,enough,spring,spring,groundwater,communal standpipe,communal standpipe


In [84]:
# Function that displays all columns and rows when you use commands like df.head() and df.tail().

def display_all(df):
    """Render ``df`` with pandas' row and column display limits raised to 1000.

    The raised limits apply only while ``display`` runs; the previous option
    values are restored when the call returns.
    """
    wide_view = pd.option_context(
        'display.max_rows', 1000,
        'display.max_columns', 1000,
    )
    with wide_view:
        display(df)

In [85]:
display_all(df.head())

Unnamed: 0,id,amount_tsh,date_recorded,funder,gps_height,installer,longitude,latitude,wpt_name,num_private,basin,subvillage,region,region_code,district_code,lga,ward,population,public_meeting,recorded_by,scheme_management,scheme_name,permit,construction_year,extraction_type,extraction_type_group,extraction_type_class,management,management_group,payment,payment_type,water_quality,quality_group,quantity,quantity_group,source,source_type,source_class,waterpoint_type,waterpoint_type_group
0,50785,0.0,2013-02-04,Dmdd,1996,DMDD,35.290799,-4.059696,Dinamu Secondary School,0,Internal,Magoma,Manyara,21,3,Mbulu,Bashay,321,True,GeoData Consultants Ltd,Parastatal,,True,2012,other,other,other,parastatal,parastatal,never pay,never pay,soft,good,seasonal,seasonal,rainwater harvesting,rainwater harvesting,surface,other,other
1,51630,0.0,2013-02-04,Government Of Tanzania,1569,DWE,36.656709,-3.309214,Kimnyak,0,Pangani,Kimnyak,Arusha,2,2,Arusha Rural,Kimnyaki,300,True,GeoData Consultants Ltd,VWC,TPRI pipe line,True,2000,gravity,gravity,gravity,vwc,user-group,never pay,never pay,soft,good,insufficient,insufficient,spring,spring,groundwater,communal standpipe,communal standpipe
2,17168,0.0,2013-02-01,,1567,,34.767863,-5.004344,Puma Secondary,0,Internal,Msatu,Singida,13,2,Singida Rural,Puma,500,True,GeoData Consultants Ltd,VWC,P,,2010,other,other,other,vwc,user-group,never pay,never pay,soft,good,insufficient,insufficient,rainwater harvesting,rainwater harvesting,surface,other,other
3,45559,0.0,2013-01-22,Finn Water,267,FINN WATER,38.058046,-9.418672,Kwa Mzee Pange,0,Ruvuma / Southern Coast,Kipindimbi,Lindi,80,43,Liwale,Mkutano,250,,GeoData Consultants Ltd,VWC,,True,1987,other,other,other,vwc,user-group,unknown,unknown,soft,good,dry,dry,shallow well,shallow well,groundwater,other,other
4,49871,500.0,2013-03-27,Bruder,1260,BRUDER,35.006123,-10.950412,Kwa Mzee Turuka,0,Ruvuma / Southern Coast,Losonga,Ruvuma,10,3,Mbinga,Mbinga Urban,60,,GeoData Consultants Ltd,Water Board,BRUDER,True,2000,gravity,gravity,gravity,water board,user-group,pay monthly,monthly,soft,good,enough,enough,spring,spring,groundwater,communal standpipe,communal standpipe


In [86]:
# Check the size of the dataframe
df.shape

(14850, 40)

In [87]:
# View data types of all columns.
df.dtypes

id                                int64
amount_tsh                      float64
date_recorded            datetime64[ns]
funder                           object
gps_height                        int64
installer                        object
longitude                       float64
latitude                        float64
wpt_name                         object
num_private                       int64
basin                            object
subvillage                       object
region                           object
region_code                       int64
district_code                     int64
lga                              object
ward                             object
population                        int64
public_meeting                   object
recorded_by                      object
scheme_management                object
scheme_name                      object
permit                           object
construction_year                 int64
extraction_type                  object


In [9]:
# Generate a profile report of the data to examine. 
pandas_profiling.ProfileReport(df)

0,1
Number of variables,40
Number of observations,14850
Total Missing (%),1.9%
Total size in memory,4.5 MiB
Average record size in memory,320.0 B

0,1
Numeric,10
Categorical,28
Boolean,0
Date,1
Text (Unique),0
Rejected,1
Unsupported,0

0,1
Distinct count,68
Unique (%),0.5%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,322.83
Minimum,0
Maximum,200000
Zeros (%),70.1%

0,1
Minimum,0
5-th percentile,0
Q1,0
Median,0
Q3,25
95-th percentile,1200
Maximum,200000
Range,200000
Interquartile range,25

0,1
Standard deviation,2511
Coef of variation,7.7781
Kurtosis,2965.6
Mean,322.83
MAD,530.66
Skewness,43.858
Sum,4794000
Variance,6305000
Memory size,116.1 KiB

Value,Count,Frequency (%),Unnamed: 3
0.0,10410,70.1%,
500.0,772,5.2%,
50.0,631,4.2%,
1000.0,370,2.5%,
20.0,349,2.4%,
200.0,296,2.0%,
100.0,218,1.5%,
10.0,189,1.3%,
30.0,186,1.3%,
2000.0,178,1.2%,

Value,Count,Frequency (%),Unnamed: 3
0.0,10410,70.1%,
0.2,1,0.0%,
0.5,1,0.0%,
2.0,5,0.0%,
3.0,1,0.0%,

Value,Count,Frequency (%),Unnamed: 3
50000.0,2,0.0%,
60000.0,1,0.0%,
70000.0,1,0.0%,
100000.0,1,0.0%,
200000.0,1,0.0%,

0,1
Distinct count,9
Unique (%),0.1%
Missing (%),0.0%
Missing (n),0

0,1
Lake Victoria,2623
Pangani,2203
Rufiji,2011
Other values (6),8013

Value,Count,Frequency (%),Unnamed: 3
Lake Victoria,2623,17.7%,
Pangani,2203,14.8%,
Rufiji,2011,13.5%,
Internal,1857,12.5%,
Lake Tanganyika,1620,10.9%,
Wami / Ruvu,1590,10.7%,
Lake Nyasa,1247,8.4%,
Ruvuma / Southern Coast,1094,7.4%,
Lake Rukwa,605,4.1%,

0,1
Distinct count,55
Unique (%),0.4%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,1289.7
Minimum,0
Maximum,2013
Zeros (%),35.4%

0,1
Minimum,0
5-th percentile,0
Q1,0
Median,1986
Q3,2004
95-th percentile,2010
Maximum,2013
Range,2013
Interquartile range,2004

0,1
Standard deviation,955.24
Coef of variation,0.74066
Kurtosis,-1.6284
Mean,1289.7
MAD,913.65
Skewness,-0.60938
Sum,19152169
Variance,912490
Memory size,116.1 KiB

Value,Count,Frequency (%),Unnamed: 3
0,5260,35.4%,
2010,669,4.5%,
2009,663,4.5%,
2008,630,4.2%,
2000,487,3.3%,
2006,421,2.8%,
2007,373,2.5%,
2011,335,2.3%,
2004,294,2.0%,
2003,293,2.0%,

Value,Count,Frequency (%),Unnamed: 3
0,5260,35.4%,
1960,22,0.1%,
1961,7,0.0%,
1962,6,0.0%,
1963,22,0.1%,

Value,Count,Frequency (%),Unnamed: 3
2009,663,4.5%,
2010,669,4.5%,
2011,335,2.3%,
2012,263,1.8%,
2013,33,0.2%,

0,1
Distinct count,331
Unique (%),2.2%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Minimum,2001-03-26 00:00:00
Maximum,2013-12-03 00:00:00

0,1
Distinct count,20
Unique (%),0.1%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,5.6264
Minimum,0
Maximum,80
Zeros (%),0.0%

0,1
Minimum,0
5-th percentile,1
Q1,2
Median,3
Q3,5
95-th percentile,30
Maximum,80
Range,80
Interquartile range,3

0,1
Standard deviation,9.6738
Coef of variation,1.7194
Kurtosis,16.108
Mean,5.6264
MAD,4.7547
Skewness,3.9593
Sum,83552
Variance,93.583
Memory size,116.1 KiB

Value,Count,Frequency (%),Unnamed: 3
1,3096,20.8%,
2,2756,18.6%,
3,2523,17.0%,
4,2254,15.2%,
5,1072,7.2%,
6,1034,7.0%,
7,823,5.5%,
30,261,1.8%,
8,239,1.6%,
33,189,1.3%,

Value,Count,Frequency (%),Unnamed: 3
0,4,0.0%,
1,3096,20.8%,
2,2756,18.6%,
3,2523,17.0%,
4,2254,15.2%,

Value,Count,Frequency (%),Unnamed: 3
60,13,0.1%,
62,18,0.1%,
63,69,0.5%,
67,2,0.0%,
80,1,0.0%,

0,1
Distinct count,17
Unique (%),0.1%
Missing (%),0.0%
Missing (n),0

0,1
gravity,6483
nira/tanira,2051
other,1672
Other values (14),4644

Value,Count,Frequency (%),Unnamed: 3
gravity,6483,43.7%,
nira/tanira,2051,13.8%,
other,1672,11.3%,
submersible,1218,8.2%,
swn 80,918,6.2%,
mono,763,5.1%,
india mark ii,629,4.2%,
afridev,438,2.9%,
ksb,375,2.5%,
other - rope pump,121,0.8%,

0,1
Distinct count,7
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0

0,1
gravity,6483
handpump,4156
other,1672
Other values (4),2539

Value,Count,Frequency (%),Unnamed: 3
gravity,6483,43.7%,
handpump,4156,28.0%,
other,1672,11.3%,
submersible,1593,10.7%,
motorpump,790,5.3%,
rope pump,121,0.8%,
wind-powered,35,0.2%,

0,1
Distinct count,13
Unique (%),0.1%
Missing (%),0.0%
Missing (n),0

0,1
gravity,6483
nira/tanira,2051
other,1672
Other values (10),4644

Value,Count,Frequency (%),Unnamed: 3
gravity,6483,43.7%,
nira/tanira,2051,13.8%,
other,1672,11.3%,
submersible,1593,10.7%,
swn 80,918,6.2%,
mono,763,5.1%,
india mark ii,629,4.2%,
afridev,438,2.9%,
rope pump,121,0.8%,
other handpump,83,0.6%,

0,1
Distinct count,981
Unique (%),6.6%
Missing (%),5.9%
Missing (n),869

0,1
Government Of Tanzania,2215
Danida,793
Hesawa,580
Other values (977),10393
(Missing),869

Value,Count,Frequency (%),Unnamed: 3
Government Of Tanzania,2215,14.9%,
Danida,793,5.3%,
Hesawa,580,3.9%,
World Bank,352,2.4%,
Kkkt,336,2.3%,
Rwssp,329,2.2%,
World Vision,316,2.1%,
Unicef,267,1.8%,
Tasaf,259,1.7%,
Dhv,236,1.6%,

0,1
Distinct count,2157
Unique (%),14.5%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,655.15
Minimum,-57
Maximum,2777
Zeros (%),35.1%

0,1
Minimum,-57
5-th percentile,0
Q1,0
Median,344
Q3,1308
95-th percentile,1794
Maximum,2777
Range,2834
Interquartile range,1308

0,1
Standard deviation,691.26
Coef of variation,1.0551
Kurtosis,-1.2591
Mean,655.15
MAD,635.06
Skewness,0.49709
Sum,9728942
Variance,477840
Memory size,116.1 KiB

Value,Count,Frequency (%),Unnamed: 3
0,5211,35.1%,
-19,18,0.1%,
1294,18,0.1%,
1343,18,0.1%,
1283,17,0.1%,
-16,16,0.1%,
1285,16,0.1%,
1373,16,0.1%,
1301,15,0.1%,
1293,15,0.1%,

Value,Count,Frequency (%),Unnamed: 3
-57,1,0.0%,
-56,1,0.0%,
-51,1,0.0%,
-50,1,0.0%,
-48,3,0.0%,

Value,Count,Frequency (%),Unnamed: 3
2554,1,0.0%,
2558,1,0.0%,
2566,1,0.0%,
2576,1,0.0%,
2777,1,0.0%,

0,1
Distinct count,14850
Unique (%),100.0%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,37162
Minimum,10
Maximum,74249
Zeros (%),0.0%

0,1
Minimum,10.0
5-th percentile,3608.2
Q1,18727.0
Median,37362.0
Q3,55800.0
95-th percentile,70400.0
Maximum,74249.0
Range,74239.0
Interquartile range,37073.0

0,1
Standard deviation,21359
Coef of variation,0.57476
Kurtosis,-1.1939
Mean,37162
MAD,18468
Skewness,-0.010584
Sum,551855298
Variance,456220000
Memory size,116.1 KiB

Value,Count,Frequency (%),Unnamed: 3
4094,1,0.0%,
39654,1,0.0%,
35588,1,0.0%,
58105,1,0.0%,
27384,1,0.0%,
70391,1,0.0%,
33525,1,0.0%,
68340,1,0.0%,
15090,1,0.0%,
41786,1,0.0%,

Value,Count,Frequency (%),Unnamed: 3
10,1,0.0%,
13,1,0.0%,
14,1,0.0%,
29,1,0.0%,
32,1,0.0%,

Value,Count,Frequency (%),Unnamed: 3
74241,1,0.0%,
74244,1,0.0%,
74245,1,0.0%,
74248,1,0.0%,
74249,1,0.0%,

0,1
Distinct count,1092
Unique (%),7.4%
Missing (%),5.9%
Missing (n),877

0,1
DWE,4349
Government,457
RWE,292
Other values (1088),8875
(Missing),877

Value,Count,Frequency (%),Unnamed: 3
DWE,4349,29.3%,
Government,457,3.1%,
RWE,292,2.0%,
Commu,287,1.9%,
DANIDA,255,1.7%,
Hesawa,230,1.5%,
KKKT,222,1.5%,
0,203,1.4%,
TCRS,180,1.2%,
CES,155,1.0%,

0,1
Distinct count,14390
Unique (%),96.9%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,-5.6847
Minimum,-11.565
Maximum,-2e-08
Zeros (%),0.0%

0,1
Minimum,-11.565
5-th percentile,-10.593
Q1,-8.444
Median,-5.0498
Q3,-3.3206
95-th percentile,-1.405
Maximum,-2e-08
Range,11.565
Interquartile range,5.1234

0,1
Standard deviation,2.9408
Coef of variation,-0.51732
Kurtosis,-1.0405
Mean,-5.6847
MAD,2.5581
Skewness,-0.15624
Sum,-84418
Variance,8.6483
Memory size,116.1 KiB

Value,Count,Frequency (%),Unnamed: 3
-2e-08,457,3.1%,
-7.10591905,2,0.0%,
-2.47456022,2,0.0%,
-7.17066586,2,0.0%,
-6.99004232,2,0.0%,
-9.07748129,1,0.0%,
-10.13392144,1,0.0%,
-7.41849811,1,0.0%,
-6.59705723,1,0.0%,
-3.57451478,1,0.0%,

Value,Count,Frequency (%),Unnamed: 3
-11.56459195,1,0.0%,
-11.55640391,1,0.0%,
-11.55631115,1,0.0%,
-11.54032988,1,0.0%,
-11.53981671,1,0.0%,

Value,Count,Frequency (%),Unnamed: 3
-1.00617876,1,0.0%,
-1.00400371,1,0.0%,
-1.00277089,1,0.0%,
-0.99875229,1,0.0%,
-2e-08,457,3.1%,

0,1
Distinct count,125
Unique (%),0.8%
Missing (%),0.0%
Missing (n),0

0,1
Njombe,625
Moshi Rural,315
Bariadi,308
Other values (122),13602

Value,Count,Frequency (%),Unnamed: 3
Njombe,625,4.2%,
Moshi Rural,315,2.1%,
Bariadi,308,2.1%,
Rungwe,275,1.9%,
Kasulu,275,1.9%,
Kilosa,274,1.8%,
Arusha Rural,269,1.8%,
Bagamoyo,266,1.8%,
Mbozi,252,1.7%,
Kilombero,248,1.7%,

0,1
Distinct count,14390
Unique (%),96.9%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,34.062
Minimum,0
Maximum,40.325
Zeros (%),3.1%

0,1
Minimum,0.0
5-th percentile,30.051
Q1,33.069
Median,34.901
Q3,37.197
95-th percentile,39.12
Maximum,40.325
Range,40.325
Interquartile range,4.1271

0,1
Standard deviation,6.593
Coef of variation,0.19356
Kurtosis,19.004
Mean,34.062
MAD,3.3223
Skewness,-4.1731
Sum,505810
Variance,43.468
Memory size,116.1 KiB

Value,Count,Frequency (%),Unnamed: 3
0.0,457,3.1%,
37.30228071,2,0.0%,
32.92057868,2,0.0%,
37.26006945,2,0.0%,
39.08057272,2,0.0%,
36.86248135,1,0.0%,
39.2200884,1,0.0%,
32.58318754,1,0.0%,
31.12985465,1,0.0%,
34.86569662,1,0.0%,

Value,Count,Frequency (%),Unnamed: 3
0.0,457,3.1%,
29.61277618,1,0.0%,
29.61441634,1,0.0%,
29.62943461,1,0.0%,
29.636467100000004,1,0.0%,

Value,Count,Frequency (%),Unnamed: 3
40.29797082,1,0.0%,
40.31606505,1,0.0%,
40.32094769,1,0.0%,
40.32131492,1,0.0%,
40.32501564,1,0.0%,

0,1
Distinct count,12
Unique (%),0.1%
Missing (%),0.0%
Missing (n),0

0,1
vwc,10117
wug,1593
water board,755
Other values (9),2385

Value,Count,Frequency (%),Unnamed: 3
vwc,10117,68.1%,
wug,1593,10.7%,
water board,755,5.1%,
wua,583,3.9%,
private operator,533,3.6%,
parastatal,461,3.1%,
other,239,1.6%,
water authority,219,1.5%,
company,174,1.2%,
unknown,122,0.8%,

0,1
Distinct count,5
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0

0,1
user-group,13048
commercial,953
parastatal,461
Other values (2),388

Value,Count,Frequency (%),Unnamed: 3
user-group,13048,87.9%,
commercial,953,6.4%,
parastatal,461,3.1%,
other,266,1.8%,
unknown,122,0.8%,

0,1
Distinct count,36
Unique (%),0.2%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,0.41508
Minimum,0
Maximum,669
Zeros (%),98.7%

0,1
Minimum,0
5-th percentile,0
Q1,0
Median,0
Q3,0
95-th percentile,0
Maximum,669
Range,669
Interquartile range,0

0,1
Standard deviation,8.1679
Coef of variation,19.678
Kurtosis,3561.1
Mean,0.41508
MAD,0.81932
Skewness,50.745
Sum,6164
Variance,66.715
Memory size,116.1 KiB

Value,Count,Frequency (%),Unnamed: 3
0,14656,98.7%,
1,21,0.1%,
5,14,0.1%,
8,12,0.1%,
15,12,0.1%,
6,11,0.1%,
3,11,0.1%,
4,10,0.1%,
93,9,0.1%,
102,8,0.1%,

Value,Count,Frequency (%),Unnamed: 3
0,14656,98.7%,
1,21,0.1%,
2,8,0.1%,
3,11,0.1%,
4,10,0.1%,

Value,Count,Frequency (%),Unnamed: 3
120,2,0.0%,
141,1,0.0%,
180,2,0.0%,
420,1,0.0%,
669,1,0.0%,

0,1
Distinct count,7
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0

0,1
never pay,6364
pay per bucket,2281
pay monthly,2097
Other values (4),4108

Value,Count,Frequency (%),Unnamed: 3
never pay,6364,42.9%,
pay per bucket,2281,15.4%,
pay monthly,2097,14.1%,
unknown,1992,13.4%,
pay annually,928,6.2%,
pay when scheme fails,928,6.2%,
other,260,1.8%,

0,1
Distinct count,7
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0

0,1
never pay,6364
per bucket,2281
monthly,2097
Other values (4),4108

Value,Count,Frequency (%),Unnamed: 3
never pay,6364,42.9%,
per bucket,2281,15.4%,
monthly,2097,14.1%,
unknown,1992,13.4%,
annually,928,6.2%,
on failure,928,6.2%,
other,260,1.8%,

0,1
Distinct count,3
Unique (%),0.0%
Missing (%),5.0%
Missing (n),737

0,1
True,9754
(Missing),5096

Value,Count,Frequency (%),Unnamed: 3
True,9754,65.7%,
(Missing),5096,34.3%,

0,1
Distinct count,637
Unique (%),4.3%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,184.11
Minimum,0
Maximum,11469
Zeros (%),36.7%

0,1
Minimum,0
5-th percentile,0
Q1,0
Median,20
Q3,220
95-th percentile,700
Maximum,11469
Range,11469
Interquartile range,220

0,1
Standard deviation,469.5
Coef of variation,2.55
Kurtosis,103.89
Mean,184.11
MAD,222.6
Skewness,8.2171
Sum,2734096
Variance,220430
Memory size,116.1 KiB

Value,Count,Frequency (%),Unnamed: 3
0,5453,36.7%,
1,1757,11.8%,
150,436,2.9%,
200,430,2.9%,
250,406,2.7%,
300,366,2.5%,
50,298,2.0%,
100,273,1.8%,
350,266,1.8%,
500,265,1.8%,

Value,Count,Frequency (%),Unnamed: 3
0,5453,36.7%,
1,1757,11.8%,
2,5,0.0%,
3,2,0.0%,
4,2,0.0%,

Value,Count,Frequency (%),Unnamed: 3
7200,1,0.0%,
7600,1,0.0%,
9000,1,0.0%,
9800,1,0.0%,
11469,1,0.0%,

0,1
Distinct count,3
Unique (%),0.0%
Missing (%),5.5%
Missing (n),821

0,1
True,12738
(Missing),2112

Value,Count,Frequency (%),Unnamed: 3
True,12738,85.8%,
(Missing),2112,14.2%,

0,1
Distinct count,6
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0

0,1
good,12687
salty,1310
unknown,469
Other values (3),384

Value,Count,Frequency (%),Unnamed: 3
good,12687,85.4%,
salty,1310,8.8%,
unknown,469,3.2%,
milky,201,1.4%,
colored,133,0.9%,
fluoride,50,0.3%,

0,1
Distinct count,5
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0

0,1
enough,8336
insufficient,3767
dry,1536
Other values (2),1211

Value,Count,Frequency (%),Unnamed: 3
enough,8336,56.1%,
insufficient,3767,25.4%,
dry,1536,10.3%,
seasonal,1025,6.9%,
unknown,186,1.3%,

0,1
Distinct count,5
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0

0,1
enough,8336
insufficient,3767
dry,1536
Other values (2),1211

Value,Count,Frequency (%),Unnamed: 3
enough,8336,56.1%,
insufficient,3767,25.4%,
dry,1536,10.3%,
seasonal,1025,6.9%,
unknown,186,1.3%,

0,1
Constant value,GeoData Consultants Ltd

0,1
Distinct count,21
Unique (%),0.1%
Missing (%),0.0%
Missing (n),0

0,1
Shinyanga,1311
Iringa,1305
Mbeya,1119
Other values (18),11115

Value,Count,Frequency (%),Unnamed: 3
Shinyanga,1311,8.8%,
Iringa,1305,8.8%,
Mbeya,1119,7.5%,
Kilimanjaro,1115,7.5%,
Morogoro,1032,6.9%,
Kagera,858,5.8%,
Mwanza,795,5.4%,
Arusha,761,5.1%,
Kigoma,717,4.8%,
Pwani,696,4.7%,

0,1
Distinct count,26
Unique (%),0.2%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,15.139
Minimum,1
Maximum,99
Zeros (%),0.0%

0,1
Minimum,1
5-th percentile,2
Q1,5
Median,12
Q3,17
95-th percentile,60
Maximum,99
Range,98
Interquartile range,12

0,1
Standard deviation,17.191
Coef of variation,1.1356
Kurtosis,10.621
Mean,15.139
MAD,9.2986
Skewness,3.2012
Sum,224815
Variance,295.54
Memory size,116.1 KiB

Value,Count,Frequency (%),Unnamed: 3
17,1323,8.9%,
11,1308,8.8%,
12,1120,7.5%,
3,1115,7.5%,
5,1039,7.0%,
18,859,5.8%,
19,777,5.2%,
16,717,4.8%,
2,685,4.6%,
10,666,4.5%,

Value,Count,Frequency (%),Unnamed: 3
1,578,3.9%,
2,685,4.6%,
3,1115,7.5%,
4,632,4.3%,
5,1039,7.0%,

Value,Count,Frequency (%),Unnamed: 3
24,76,0.5%,
60,273,1.8%,
80,298,2.0%,
90,216,1.5%,
99,89,0.6%,

0,1
Distinct count,12
Unique (%),0.1%
Missing (%),6.5%
Missing (n),969

0,1
VWC,9124
WUG,1290
Water authority,822
Other values (8),2645
(Missing),969

Value,Count,Frequency (%),Unnamed: 3
VWC,9124,61.4%,
WUG,1290,8.7%,
Water authority,822,5.5%,
Water Board,714,4.8%,
WUA,668,4.5%,
Parastatal,444,3.0%,
Company,280,1.9%,
Private operator,263,1.8%,
Other,230,1.5%,
SWC,26,0.2%,

0,1
Distinct count,1790
Unique (%),12.1%
Missing (%),47.8%
Missing (n),7092

0,1
K,176
Borehole,158
,150
Other values (1786),7274
(Missing),7092

Value,Count,Frequency (%),Unnamed: 3
K,176,1.2%,
Borehole,158,1.1%,
,150,1.0%,
DANIDA,104,0.7%,
Chalinze wate,96,0.6%,
M,90,0.6%,
Government,75,0.5%,
Bagamoyo wate,67,0.5%,
Ngana water supplied scheme,65,0.4%,
wanging'ombe water supply s,62,0.4%,

0,1
Distinct count,10
Unique (%),0.1%
Missing (%),0.0%
Missing (n),0

0,1
shallow well,4316
spring,4195
machine dbh,2747
Other values (7),3592

Value,Count,Frequency (%),Unnamed: 3
shallow well,4316,29.1%,
spring,4195,28.2%,
machine dbh,2747,18.5%,
river,2352,15.8%,
rainwater harvesting,568,3.8%,
hand dtw,234,1.6%,
lake,185,1.2%,
dam,184,1.2%,
other,49,0.3%,
unknown,20,0.1%,

0,1
Distinct count,3
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0

0,1
groundwater,11492
surface,3289
unknown,69

Value,Count,Frequency (%),Unnamed: 3
groundwater,11492,77.4%,
surface,3289,22.1%,
unknown,69,0.5%,

0,1
Distinct count,7
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0

0,1
shallow well,4316
spring,4195
borehole,2981
Other values (4),3358

Value,Count,Frequency (%),Unnamed: 3
shallow well,4316,29.1%,
spring,4195,28.2%,
borehole,2981,20.1%,
river/lake,2537,17.1%,
rainwater harvesting,568,3.8%,
dam,184,1.2%,
other,69,0.5%,

0,1
Distinct count,8444
Unique (%),56.9%
Missing (%),0.7%
Missing (n),99

0,1
Shuleni,140
Majengo,129
Madukani,121
Other values (8440),14361
(Missing),99

Value,Count,Frequency (%),Unnamed: 3
Shuleni,140,0.9%,
Majengo,129,0.9%,
Madukani,121,0.8%,
Kati,94,0.6%,
Sokoni,62,0.4%,
Mtakuja,60,0.4%,
M,56,0.4%,
Songambele,47,0.3%,
Mbuyuni,46,0.3%,
Muungano,43,0.3%,

0,1
Distinct count,1959
Unique (%),13.2%
Missing (%),0.0%
Missing (n),0

0,1
Igosi,79
Imalinyi,66
Siha Kati,65
Other values (1956),14640

Value,Count,Frequency (%),Unnamed: 3
Igosi,79,0.5%,
Imalinyi,66,0.4%,
Siha Kati,65,0.4%,
Mdandu,61,0.4%,
Kitunda,57,0.4%,
Chanika,50,0.3%,
Mishamo,48,0.3%,
Nduruma,44,0.3%,
Zinga/Ikerege,44,0.3%,
Msindo,42,0.3%,

0,1
Distinct count,8
Unique (%),0.1%
Missing (%),0.0%
Missing (n),0

0,1
soft,12687
salty,1226
unknown,469
Other values (5),468

Value,Count,Frequency (%),Unnamed: 3
soft,12687,85.4%,
salty,1226,8.3%,
unknown,469,3.2%,
milky,201,1.4%,
coloured,133,0.9%,
salty abandoned,84,0.6%,
fluoride,44,0.3%,
fluoride abandoned,6,0.0%,

0,1
Distinct count,7
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0

0,1
communal standpipe,7106
hand pump,4396
other,1630
Other values (4),1718

Value,Count,Frequency (%),Unnamed: 3
communal standpipe,7106,47.9%,
hand pump,4396,29.6%,
other,1630,11.0%,
communal standpipe multiple,1508,10.2%,
improved spring,175,1.2%,
cattle trough,34,0.2%,
dam,1,0.0%,

0,1
Distinct count,6
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0

0,1
communal standpipe,8614
hand pump,4396
other,1630
Other values (3),210

Value,Count,Frequency (%),Unnamed: 3
communal standpipe,8614,58.0%,
hand pump,4396,29.6%,
other,1630,11.0%,
improved spring,175,1.2%,
cattle trough,34,0.2%,
dam,1,0.0%,

0,1
Distinct count,10840
Unique (%),73.0%
Missing (%),0.0%
Missing (n),0

0,1
none,877
Shuleni,435
Zahanati,204
Other values (10837),13334

Value,Count,Frequency (%),Unnamed: 3
none,877,5.9%,
Shuleni,435,2.9%,
Zahanati,204,1.4%,
Msikitini,112,0.8%,
Sokoni,68,0.5%,
Ofisini,67,0.5%,
Kanisani,67,0.5%,
School,52,0.4%,
Bombani,52,0.4%,
Shule Ya Msingi,48,0.3%,

Unnamed: 0,id,amount_tsh,date_recorded,funder,gps_height,installer,longitude,latitude,wpt_name,num_private,basin,subvillage,region,region_code,district_code,lga,ward,population,public_meeting,recorded_by,scheme_management,scheme_name,permit,construction_year,extraction_type,extraction_type_group,extraction_type_class,management,management_group,payment,payment_type,water_quality,quality_group,quantity,quantity_group,source,source_type,source_class,waterpoint_type,waterpoint_type_group
0,50785,0.0,2013-02-04,Dmdd,1996,DMDD,35.290799,-4.059696,Dinamu Secondary School,0,Internal,Magoma,Manyara,21,3,Mbulu,Bashay,321,True,GeoData Consultants Ltd,Parastatal,,True,2012,other,other,other,parastatal,parastatal,never pay,never pay,soft,good,seasonal,seasonal,rainwater harvesting,rainwater harvesting,surface,other,other
1,51630,0.0,2013-02-04,Government Of Tanzania,1569,DWE,36.656709,-3.309214,Kimnyak,0,Pangani,Kimnyak,Arusha,2,2,Arusha Rural,Kimnyaki,300,True,GeoData Consultants Ltd,VWC,TPRI pipe line,True,2000,gravity,gravity,gravity,vwc,user-group,never pay,never pay,soft,good,insufficient,insufficient,spring,spring,groundwater,communal standpipe,communal standpipe
2,17168,0.0,2013-02-01,,1567,,34.767863,-5.004344,Puma Secondary,0,Internal,Msatu,Singida,13,2,Singida Rural,Puma,500,True,GeoData Consultants Ltd,VWC,P,,2010,other,other,other,vwc,user-group,never pay,never pay,soft,good,insufficient,insufficient,rainwater harvesting,rainwater harvesting,surface,other,other
3,45559,0.0,2013-01-22,Finn Water,267,FINN WATER,38.058046,-9.418672,Kwa Mzee Pange,0,Ruvuma / Southern Coast,Kipindimbi,Lindi,80,43,Liwale,Mkutano,250,,GeoData Consultants Ltd,VWC,,True,1987,other,other,other,vwc,user-group,unknown,unknown,soft,good,dry,dry,shallow well,shallow well,groundwater,other,other
4,49871,500.0,2013-03-27,Bruder,1260,BRUDER,35.006123,-10.950412,Kwa Mzee Turuka,0,Ruvuma / Southern Coast,Losonga,Ruvuma,10,3,Mbinga,Mbinga Urban,60,,GeoData Consultants Ltd,Water Board,BRUDER,True,2000,gravity,gravity,gravity,water board,user-group,pay monthly,monthly,soft,good,enough,enough,spring,spring,groundwater,communal standpipe,communal standpipe


In [88]:
# recorded_by holds a single constant value, so it carries no information; drop it.
df.drop(columns=['recorded_by'], inplace=True)

In [89]:
# Drop the columns that have a very large number of distinct values.
df.drop(
    columns=[
        'funder', 'installer', 'lga', 'scheme_name',
        'subvillage', 'ward', 'wpt_name',
    ],
    inplace=True,
)

In [90]:
# Check the columns
df.columns

Index(['id', 'amount_tsh', 'date_recorded', 'gps_height', 'longitude',
       'latitude', 'num_private', 'basin', 'region', 'region_code',
       'district_code', 'population', 'public_meeting', 'scheme_management',
       'permit', 'construction_year', 'extraction_type',
       'extraction_type_group', 'extraction_type_class', 'management',
       'management_group', 'payment', 'payment_type', 'water_quality',
       'quality_group', 'quantity', 'quantity_group', 'source', 'source_type',
       'source_class', 'waterpoint_type', 'waterpoint_type_group'],
      dtype='object')

In [91]:
# Function that replaces a datetime column with new date-part columns (year, month, week, ...) in the DataFrame.

def _iso_week(fld):
    """Return the ISO-8601 week number of each date in the datetime Series ``fld``."""
    if not hasattr(fld.dt, 'isocalendar'):
        # Older pandas releases only offer the legacy accessor.
        return fld.dt.week
    week = fld.dt.isocalendar().week
    # isocalendar() yields a nullable UInt32 column; cast back to the plain
    # dtype the legacy ``.dt.week`` accessor produced (float when dates are missing).
    return week.astype('float64' if week.isna().any() else 'int64')


def add_datepart(df, fldname, ref_date=None):
    """Expand a datetime column into date-part feature columns, in place.

    Adds ``<prefix>Year``, ``Month``, ``Week``, ``Day``, ``Dayofweek``,
    ``Dayofyear``, the six ``Is_*`` month/quarter/year boundary flags and
    ``<prefix>Elapsed`` (whole days since a reference date) to ``df``, then
    drops the original ``fldname`` column.  ``<prefix>`` is ``fldname`` with a
    trailing ``date``/``Date`` removed, so a name such as ``'date_recorded'``
    is used unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify in place.
    fldname : str
        Name of a datetime column of ``df``.
    ref_date : datetime-like, optional
        Origin of the ``Elapsed`` column.  Defaults to the earliest date in
        the column.  Pass the same origin for every frame (for example the
        training set's earliest date when processing a test set) so that the
        ``Elapsed`` values are on a common scale.

    Returns
    -------
    None
    """
    fld = df[fldname]
    targ_pre = re.sub(r'[Dd]ate$', '', fldname)
    for n in ('Year', 'Month', 'Week', 'Day', 'Dayofweek', 'Dayofyear',
              'Is_month_end', 'Is_month_start', 'Is_quarter_end', 'Is_quarter_start',
              'Is_year_end', 'Is_year_start'):
        if n == 'Week':
            # Series.dt.week was removed in pandas 2.0; go through isocalendar().
            df[targ_pre + n] = _iso_week(fld)
        else:
            df[targ_pre + n] = getattr(fld.dt, n.lower())
    origin = fld.min() if ref_date is None else pd.Timestamp(ref_date)
    df[targ_pre + 'Elapsed'] = (fld - origin).dt.days
    df.drop(fldname, axis=1, inplace=True)

In [92]:
# NOTE(review): `fld` is not referenced by any later cell visible here —
# add_datepart() reads the column from `df` itself, so this cell looks like
# a leftover; confirm before removing.
fld = df['date_recorded']

In [93]:
# Apply the function to the data.
add_datepart(df, 'date_recorded')

In [94]:
df.columns

Index(['id', 'amount_tsh', 'gps_height', 'longitude', 'latitude',
       'num_private', 'basin', 'region', 'region_code', 'district_code',
       'population', 'public_meeting', 'scheme_management', 'permit',
       'construction_year', 'extraction_type', 'extraction_type_group',
       'extraction_type_class', 'management', 'management_group', 'payment',
       'payment_type', 'water_quality', 'quality_group', 'quantity',
       'quantity_group', 'source', 'source_type', 'source_class',
       'waterpoint_type', 'waterpoint_type_group', 'date_recordedYear',
       'date_recordedMonth', 'date_recordedWeek', 'date_recordedDay',
       'date_recordedDayofweek', 'date_recordedDayofyear',
       'date_recordedIs_month_end', 'date_recordedIs_month_start',
       'date_recordedIs_quarter_end', 'date_recordedIs_quarter_start',
       'date_recordedIs_year_end', 'date_recordedIs_year_start',
       'date_recordedElapsed'],
      dtype='object')

In [95]:
# Check one of the newly created date columns.
df['date_recordedMonth'].head()

0    2
1    2
2    2
3    1
4    3
Name: date_recordedMonth, dtype: int64

In [96]:
df['date_recordedMonth'].unique()

array([ 2,  1,  3, 10,  7,  6,  4, 12, 11,  8,  9,  5])

In [97]:
# Turn all string columns into ordered categorical data types.
def train_cats(df):
     for n,c in df.items():
        if is_string_dtype(c): df[n] = c.astype('category').cat.as_ordered()

In [98]:
# Apply the conversion to df (the frame is modified in place).
train_cats(df)

In [99]:
# Fraction of missing values per column (0.0-1.0, not a percentage), sorted by column name.
display_all(df.isnull().sum().sort_index()/len(df))

amount_tsh                       0.000000
basin                            0.000000
construction_year                0.000000
date_recordedDay                 0.000000
date_recordedDayofweek           0.000000
date_recordedDayofyear           0.000000
date_recordedElapsed             0.000000
date_recordedIs_month_end        0.000000
date_recordedIs_month_start      0.000000
date_recordedIs_quarter_end      0.000000
date_recordedIs_quarter_start    0.000000
date_recordedIs_year_end         0.000000
date_recordedIs_year_start       0.000000
date_recordedMonth               0.000000
date_recordedWeek                0.000000
date_recordedYear                0.000000
district_code                    0.000000
extraction_type                  0.000000
extraction_type_class            0.000000
extraction_type_group            0.000000
gps_height                       0.000000
id                               0.000000
latitude                         0.000000
longitude                        0

#### Three columns still have missing values: `permit`, `public_meeting`, and `scheme_management`.

In [100]:
# Check the unique values of 'permit', including NaN.
df['permit'].unique()

[True, NaN, False]
Categories (2, object): [False < True]

In [101]:
# Fill NaN values with the mode (most frequent value).
# Assign the result back rather than calling fillna(inplace=True) on the
# df['permit'] selection: under pandas Copy-on-Write that chained in-place
# call updates a temporary object and leaves df unchanged.
# NOTE(review): the mode is computed from this file — confirm whether the
# training set's mode should be reused instead.
df['permit'] = df['permit'].fillna(df['permit'].mode()[0])

In [102]:
# Confirm that no NaN values remain in 'permit'.
df['permit'].unique()

[True, False]
Categories (2, object): [False < True]

In [103]:
# Check the unique values of 'public_meeting', including NaN.
df['public_meeting'].unique()

[True, NaN, False]
Categories (2, object): [False < True]

In [104]:
# Fill NaN values with the mode (most frequent value).
# Assign the result back rather than calling fillna(inplace=True) on the
# df['public_meeting'] selection: under pandas Copy-on-Write that chained
# in-place call updates a temporary object and leaves df unchanged.
df['public_meeting'] = df['public_meeting'].fillna(df['public_meeting'].mode()[0])

In [105]:
# Confirm that no NaN values remain in 'public_meeting'.
df['public_meeting'].unique()

[True, False]
Categories (2, object): [False < True]

In [106]:
# Check the unique values of 'scheme_management', including NaN.
df['scheme_management'].unique()

[Parastatal, VWC, Water Board, NaN, Other, ..., WUA, Water authority, Company, Private operator, Trust]
Length: 12
Categories (11, object): [Company < Other < Parastatal < Private operator ... WUA < WUG < Water Board < Water authority]

In [107]:
# Fill NaN values with the mode (most frequent value).
# Assign the result back rather than calling fillna(inplace=True) on the
# df['scheme_management'] selection: under pandas Copy-on-Write that chained
# in-place call updates a temporary object and leaves df unchanged.
df['scheme_management'] = df['scheme_management'].fillna(df['scheme_management'].mode()[0])

In [108]:
# Confirm that no NaN values remain in 'scheme_management'.
df['scheme_management'].unique()

[Parastatal, VWC, Water Board, Other, SWC, ..., WUA, Water authority, Company, Private operator, Trust]
Length: 11
Categories (11, object): [Company < Other < Parastatal < Private operator ... WUA < WUG < Water Board < Water authority]

In [109]:
# Re-check the fraction of missing values per column after filling the NaNs.
display_all(df.isnull().sum().sort_index()/len(df))

amount_tsh                       0.0
basin                            0.0
construction_year                0.0
date_recordedDay                 0.0
date_recordedDayofweek           0.0
date_recordedDayofyear           0.0
date_recordedElapsed             0.0
date_recordedIs_month_end        0.0
date_recordedIs_month_start      0.0
date_recordedIs_quarter_end      0.0
date_recordedIs_quarter_start    0.0
date_recordedIs_year_end         0.0
date_recordedIs_year_start       0.0
date_recordedMonth               0.0
date_recordedWeek                0.0
date_recordedYear                0.0
district_code                    0.0
extraction_type                  0.0
extraction_type_class            0.0
extraction_type_group            0.0
gps_height                       0.0
id                               0.0
latitude                         0.0
longitude                        0.0
management                       0.0
management_group                 0.0
num_private                      0.0
p

#### All NaN and missing values are now gone. 

In [110]:
# Transpose the first rows so column names run down the side instead of across the top.

display_all(df.head().transpose())

Unnamed: 0,0,1,2,3,4
id,50785,51630,17168,45559,49871
amount_tsh,0,0,0,0,500
gps_height,1996,1569,1567,267,1260
longitude,35.2908,36.6567,34.7679,38.058,35.0061
latitude,-4.0597,-3.30921,-5.00434,-9.41867,-10.9504
num_private,0,0,0,0,0
basin,Internal,Pangani,Internal,Ruvuma / Southern Coast,Ruvuma / Southern Coast
region,Manyara,Arusha,Singida,Lindi,Ruvuma
region_code,21,2,13,80,10
district_code,3,2,2,43,3


In [111]:
# Select every non-numeric column (not just 'object' dtype).
# `cols` is a DataFrame holding those columns, not a list of names.
cols = df.select_dtypes(exclude=[np.number])

In [112]:
# Iterating a DataFrame yields its column labels, so this lists the non-numeric column names.
list(cols)

['basin',
 'region',
 'public_meeting',
 'scheme_management',
 'permit',
 'extraction_type',
 'extraction_type_group',
 'extraction_type_class',
 'management',
 'management_group',
 'payment',
 'payment_type',
 'water_quality',
 'quality_group',
 'quantity',
 'quantity_group',
 'source',
 'source_type',
 'source_class',
 'waterpoint_type',
 'waterpoint_type_group',
 'date_recordedIs_month_end',
 'date_recordedIs_month_start',
 'date_recordedIs_quarter_end',
 'date_recordedIs_quarter_start',
 'date_recordedIs_year_end',
 'date_recordedIs_year_start']

In [113]:
# One-hot encode the categorical and boolean features listed below; columns
# not named here are carried over unchanged.
# NOTE(review): the recorded column listing for this frame shows only the
# `_False` column for date_recordedIs_year_end / date_recordedIs_year_start.
# A dataset that does contain year-boundary dates would also get `_True`
# columns — confirm the encoded columns are aligned with the training frame.
dummy_df = pd.get_dummies(
    df,
    columns=[
        'basin', 'region',
        'public_meeting', 'scheme_management', 'permit',
        'extraction_type', 'extraction_type_group', 'extraction_type_class',
        'management', 'management_group',
        'payment', 'payment_type',
        'water_quality', 'quality_group',
        'quantity', 'quantity_group',
        'source', 'source_type', 'source_class',
        'waterpoint_type', 'waterpoint_type_group',
        'date_recordedIs_month_end', 'date_recordedIs_month_start',
        'date_recordedIs_quarter_end', 'date_recordedIs_quarter_start',
        'date_recordedIs_year_end', 'date_recordedIs_year_start',
    ],
)

In [114]:
# Inspect the column index after one-hot encoding.
dummy_df.columns

Index(['id', 'amount_tsh', 'gps_height', 'longitude', 'latitude',
       'num_private', 'region_code', 'district_code', 'population',
       'construction_year',
       ...
       'date_recordedIs_month_end_False', 'date_recordedIs_month_end_True',
       'date_recordedIs_month_start_False', 'date_recordedIs_month_start_True',
       'date_recordedIs_quarter_end_False', 'date_recordedIs_quarter_end_True',
       'date_recordedIs_quarter_start_False',
       'date_recordedIs_quarter_start_True', 'date_recordedIs_year_end_False',
       'date_recordedIs_year_start_False'],
      dtype='object', length=197)

In [115]:
# Shape of the encoded frame as (rows, columns).
dummy_df.shape

(14850, 197)

In [116]:
# Write the cleaned, encoded test set to CSV without the row index.
dummy_df.to_csv('./Datasets/clean_test.csv', index=False)