In [1]:
import pandas as pd
import matplotlib.pyplot as plot
import numpy as np

In [2]:
# Read the three CSV files from ../Data: the training labels, the training
# features, and the test features (read in that order).
training_labels, training_values, test_values = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../Data/training_set_labels.csv',
        '../Data/training_set_values.csv',
        '../Data/test_set_values.csv',
    )
)

In [3]:
# Collapse the three original labels into two classes: 'functional' and
# 'functional needs repair' both become 'operational', 'non functional'
# becomes 'non operational'.
# `replace` is used instead of `map` so that values which are not keys of the
# mapping are left untouched. With `map`, re-running this cell on the already
# relabelled column would turn every label into NaN.
status_relabel = {
    'non functional': 'non operational',
    'functional': 'operational',
    'functional needs repair': 'operational',
}
training_labels['status_group'] = training_labels['status_group'].replace(status_relabel)

In [4]:
# Attach the label of each record to its feature row; rows are matched on 'id'.
df = pd.merge(training_values, training_labels, on='id')

In [5]:
# Display the column labels of the merged frame (features plus 'status_group').
df.columns

Index(['id', 'amount_tsh', 'date_recorded', 'funder', 'gps_height',
       'installer', 'longitude', 'latitude', 'wpt_name', 'num_private',
       'basin', 'subvillage', 'region', 'region_code', 'district_code', 'lga',
       'ward', 'population', 'public_meeting', 'recorded_by',
       'scheme_management', 'scheme_name', 'permit', 'construction_year',
       'extraction_type', 'extraction_type_group', 'extraction_type_class',
       'management', 'management_group', 'payment', 'payment_type',
       'water_quality', 'quality_group', 'quantity', 'quantity_group',
       'source', 'source_type', 'source_class', 'waterpoint_type',
       'waterpoint_type_group', 'status_group'],
      dtype='object')

In [6]:
# 'id' is the key used to join values and labels; remove it from the frame.
# Rebinding `df` (instead of inplace=True) together with errors="ignore"
# keeps this cell safe to re-run: a second run no longer raises KeyError
# because the column is already gone.
df = df.drop(columns="id", errors="ignore")

In [7]:
def full_value_counts(df):
    """Print the value counts of every column in ``df``.

    For each column, in column order, three things are written to stdout:
    the column label, the result of ``value_counts()`` for that column, and
    an empty line as a separator. Nothing is returned.
    """
    for name in df.columns:
        counts = df[name].value_counts()
        # One print call emits label, counts and the trailing blank line.
        print(name, counts, "", sep="\n")

In [19]:
def transform_columns(dataframe, columns):
    """Encode the given columns as integer frequency ranks.

    Within each column the most frequent value becomes 0, the next most
    frequent 1, and so on, following the order of ``value_counts()``.
    Missing values are not counted by ``value_counts()`` and therefore stay
    missing in the result.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame holding the columns to encode. It is not modified.
    columns : iterable of column labels
        Columns of ``dataframe`` to encode.

    Returns
    -------
    pandas.DataFrame
        One encoded column per requested label, in the requested order and
        on the same index as ``dataframe``. Columns without missing values
        hold integers; columns with missing values hold floats with NaN.

    Raises
    ------
    KeyError
        If a requested label is not a column of ``dataframe``.
    """
    encoded = {}
    for column in columns:
        values = dataframe[column]
        ranked_values = values.value_counts().index
        # Map each value to its rank with one lookup per row. The previous
        # list-to-list ``replace`` scanned the whole column once per unique
        # value and depended on ``replace`` downcasting the result to numbers.
        codes = dict(zip(ranked_values, range(len(ranked_values))))
        encoded[column] = values.map(codes)

    # Build the frame in one step instead of inserting column by column.
    return pd.DataFrame(encoded)

In [8]:
# Print the value counts of every column of the merged frame.
full_value_counts(df)

amount_tsh
0.0         41639
500.0        3102
50.0         2472
1000.0       1488
20.0         1463
            ...  
8500.0          1
6300.0          1
220.0           1
138000.0        1
12.0            1
Name: amount_tsh, Length: 98, dtype: int64

date_recorded
2011-03-15    572
2011-03-17    558
2013-02-03    546
2011-03-14    520
2011-03-16    513
             ... 
2011-08-31      1
2011-09-16      1
2011-09-08      1
2012-01-21      1
2011-09-12      1
Name: date_recorded, Length: 356, dtype: int64

funder
Government Of Tanzania    9084
Danida                    3114
Hesawa                    2202
Rwssp                     1374
World Bank                1349
                          ... 
Nazalet Church               1
Kizenga                      1
Hdv                          1
Wanakijiji                   1
Ripati                       1
Name: funder, Length: 1897, dtype: int64

gps_height
 0       20438
-15         60
-16         55
-13         55
-20         52
         ..

soft                  50818
salty                  4856
unknown                1876
milky                   804
coloured                490
salty abandoned         339
fluoride                200
fluoride abandoned       17
Name: water_quality, dtype: int64

quality_group
good        50818
salty        5195
unknown      1876
milky         804
colored       490
fluoride      217
Name: quality_group, dtype: int64

quantity
enough          33186
insufficient    15129
dry              6246
seasonal         4050
unknown           789
Name: quantity, dtype: int64

quantity_group
enough          33186
insufficient    15129
dry              6246
seasonal         4050
unknown           789
Name: quantity_group, dtype: int64

source
spring                  17021
shallow well            16824
machine dbh             11075
river                    9612
rainwater harvesting     2295
hand dtw                  874
lake                      765
dam                       656
other                     2

In [9]:
# Columns that are left out of the working frame `df_small`.
columns_to_drop = [
    "amount_tsh",
    "num_private",
    "recorded_by",
    "payment_type",
    "extraction_type",
    "extraction_type_group",
    "water_quality",
    "quantity_group",
]
df_small = df.drop(columns=columns_to_drop)

In [16]:
# Turn placeholder values into real missing values so isna()/dropna() see them.
# np.nan is used as the replacement (rather than None) so numeric columns keep
# a numeric dtype instead of being converted to object.
placeholder_values = {'none': np.nan, 'unknown': np.nan, -2.00E-08: np.nan, "0": np.nan}
df_small_small = df_small.replace(placeholder_values)

# In these columns a 0 is treated as a missing value. The result is assigned
# back to the frame: calling replace(..., inplace=True) on a selected column
# acts on a temporary object and does not update the frame when pandas
# copy-on-write is active (pandas 2.x warns about this pattern).
zero_means_missing = ["district_code", "population", "construction_year"]
df_small_small[zero_means_missing] = df_small_small[zero_means_missing].replace(0, np.nan)

In [17]:
# Count the missing values per column after the placeholder replacement.
df_small_small.isna().sum()

date_recorded                0
funder                    4412
gps_height                   0
installer                 4433
longitude                    0
latitude                  1812
wpt_name                  3563
basin                        0
subvillage                 371
region                       0
region_code                  0
district_code               23
lga                          0
ward                         0
population               21381
public_meeting            3334
scheme_management         3877
scheme_name              28191
permit                    3056
construction_year        20709
extraction_type_class        0
management                 561
management_group           561
payment                   8157
quality_group             1876
quantity                   789
source                      66
source_type                  0
source_class               278
waterpoint_type              0
waterpoint_type_group        0
status_group                 0
dtype: i

In [18]:
# Keep only the rows whose latitude is present.
df_small_small = df_small_small.dropna(subset=['latitude'])

In [20]:
# Encode every column of the cleaned frame with transform_columns.
# NOTE(review): the column list is the whole frame, so numeric columns such as
# longitude/latitude and the target 'status_group' are encoded as well —
# confirm that this is intended.
df_transformed = transform_columns(df_small_small,df_small_small.columns)

Unnamed: 0,date_recorded,funder,gps_height,installer,longitude,latitude,wpt_name,basin,subvillage,region,...,management_group,payment,quality_group,quantity,source,source_type,source_class,waterpoint_type,waterpoint_type_group,status_group
0,3,33.0,519,87.0,43397.0,26320.0,,6,1624.0,0,...,0.0,4.0,0.0,0.0,0.0,0,0.0,0,0,0
1,183,131.0,93,160.0,36313.0,5327.0,1.0,1,2428.0,14,...,0.0,0.0,0.0,1.0,4.0,4,1.0,0,0,0
2,88,430.0,1531,15.0,13633.0,51772.0,1549.0,0,0.0,18,...,0.0,1.0,0.0,0.0,6.0,5,1.0,3,0,0
3,30,7.0,508,32.0,12515.0,32945.0,26096.0,7,265.0,17,...,0.0,0.0,0.0,2.0,2.0,2,0.0,3,0,1
4,112,1759.0,0,59.0,25239.0,20338.0,0.0,1,7712.0,6,...,3.0,0.0,0.0,3.0,4.0,4,1.0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59395,145,13.0,451,9.0,46061.0,31775.0,11701.0,0,3575.0,2,...,0.0,1.0,0.0,0.0,0.0,0,0.0,0,0,0
59396,309,246.0,405,308.0,14693.0,5273.0,24758.0,2,343.0,0,...,0.0,4.0,0.0,0.0,3.0,3,1.0,0,0,0
59397,163,,0,,53921.0,15666.0,133.0,2,8653.0,1,...,0.0,2.0,4.0,0.0,2.0,2,0.0,1,1,0
59398,52,690.0,0,562.0,25922.0,35422.0,27782.0,2,156.0,12,...,0.0,0.0,0.0,1.0,1.0,1,0.0,1,1,0


In [29]:
# Inspect the distinct installer labels and how often each one occurs.
df_small_small['installer'].value_counts()

DWE                 16255
Government           1670
RWE                  1181
Commu                1060
DANIDA               1050
                    ...  
Kuwaiti                 1
church                  1
Kagulo                  1
Ta                      1
Rashid Seng'ombe        1
Name: installer, Length: 2111, dtype: int64

In [None]:
plt.