In [25]:
import pandas as pd
import matplotlib.pyplot as plot
import numpy as np
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split

In [2]:
training_labels = pd.read_csv('../Data/training_set_labels.csv')
training_values = pd.read_csv('../Data/training_set_values.csv')
test_values = pd.read_csv('../Data/test_set_values.csv')

In [3]:
training_labels['status_group'] = training_labels['status_group'].map({'non functional': 'non operational', 'functional': 'operational', 
                                     'functional needs repair': 'operational'})

In [4]:
df = training_values.merge(training_labels, on='id')

In [5]:
df.columns

Index(['id', 'amount_tsh', 'date_recorded', 'funder', 'gps_height',
       'installer', 'longitude', 'latitude', 'wpt_name', 'num_private',
       'basin', 'subvillage', 'region', 'region_code', 'district_code', 'lga',
       'ward', 'population', 'public_meeting', 'recorded_by',
       'scheme_management', 'scheme_name', 'permit', 'construction_year',
       'extraction_type', 'extraction_type_group', 'extraction_type_class',
       'management', 'management_group', 'payment', 'payment_type',
       'water_quality', 'quality_group', 'quantity', 'quantity_group',
       'source', 'source_type', 'source_class', 'waterpoint_type',
       'waterpoint_type_group', 'status_group'],
      dtype='object')

In [6]:
df.drop("id", axis = 1,inplace=True)

In [7]:
def full_value_counts(df):
    for column in df.columns:
        print(column)
        print(df[column].value_counts())
        print()

In [8]:
def transform_columns(dataframe, columns):
    transformed_df = pd.DataFrame()
    
    for column in columns:
        unique_vals = dataframe[column].value_counts().index
        string_to_numbers = dataframe[column].replace(to_replace=unique_vals, value=list(range(len(unique_vals))))
        transformed_df[column] = string_to_numbers
        
    return transformed_df

In [9]:
full_value_counts(df)

amount_tsh
0.0         41639
500.0        3102
50.0         2472
1000.0       1488
20.0         1463
            ...  
8500.0          1
6300.0          1
220.0           1
138000.0        1
12.0            1
Name: amount_tsh, Length: 98, dtype: int64

date_recorded
2011-03-15    572
2011-03-17    558
2013-02-03    546
2011-03-14    520
2011-03-16    513
             ... 
2013-01-01      1
2011-09-17      1
2011-09-15      1
2011-09-06      1
2011-09-18      1
Name: date_recorded, Length: 356, dtype: int64

funder
Government Of Tanzania       9084
Danida                       3114
Hesawa                       2202
Rwssp                        1374
World Bank                   1349
                             ... 
Maseka Community                1
Hotels And Lodge Tanzania       1
Umoja                           1
Pentecost                       1
Handeni Trunk Maini             1
Name: funder, Length: 1897, dtype: int64

gps_height
 0       20438
-15         60
-16         55
-13     

Name: source, dtype: int64

source_type
spring                  17021
shallow well            16824
borehole                11949
river/lake              10377
rainwater harvesting     2295
dam                       656
other                     278
Name: source_type, dtype: int64

source_class
groundwater    45794
surface        13328
unknown          278
Name: source_class, dtype: int64

waterpoint_type
communal standpipe             28522
hand pump                      17488
other                           6380
communal standpipe multiple     6103
improved spring                  784
cattle trough                    116
dam                                7
Name: waterpoint_type, dtype: int64

waterpoint_type_group
communal standpipe    34625
hand pump             17488
other                  6380
improved spring         784
cattle trough           116
dam                       7
Name: waterpoint_type_group, dtype: int64

status_group
operational        36576
non operational    22824

In [10]:
columns_to_drop = ["amount_tsh", "num_private", "recorded_by", "payment_type", "extraction_type", "extraction_type_group", 
                   "water_quality", "quantity_group"]
df_small = df.drop(columns_to_drop, axis = 1)

In [11]:
df_small_small = df_small.replace({'none': None,'unknown' : None, -2.00E-08: None, "0": None})
df_small_small["district_code"].replace({0: None}, inplace=True)
df_small_small["population"].replace({0: None}, inplace=True)
df_small_small["construction_year"].replace({0: None}, inplace=True)

In [12]:
df_small_small.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 59400 entries, 0 to 59399
Data columns (total 32 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   date_recorded          59400 non-null  object 
 1   funder                 54988 non-null  object 
 2   gps_height             59400 non-null  int64  
 3   installer              54967 non-null  object 
 4   longitude              59400 non-null  float64
 5   latitude               57588 non-null  float64
 6   wpt_name               55837 non-null  object 
 7   basin                  59400 non-null  object 
 8   subvillage             59029 non-null  object 
 9   region                 59400 non-null  object 
 10  region_code            59400 non-null  int64  
 11  district_code          59377 non-null  object 
 12  lga                    59400 non-null  object 
 13  ward                   59400 non-null  object 
 14  population             38019 non-null  object 
 15  pu

In [13]:
df_small_small.dropna(subset=['latitude'],inplace=True)

In [57]:
index = df_small_small['installer'].value_counts().reset_index().drop('installer', axis = 1).to_dict()['index']
index

{0: 'DWE',
 1: 'Government',
 2: 'RWE',
 3: 'Commu',
 4: 'DANIDA',
 5: 'KKKT',
 6: 'Hesawa',
 7: 'TCRS',
 8: 'Central government',
 9: 'CES',
 10: 'DANID',
 11: 'District Council',
 12: 'Community',
 13: 'HESAWA',
 14: 'LGA',
 15: 'World vision',
 16: 'WEDECO',
 17: 'District council',
 18: 'Gover',
 19: 'TASAF',
 20: 'AMREF',
 21: 'TWESA',
 22: 'WU',
 23: 'Dmdd',
 24: 'ACRA',
 25: 'SEMA',
 26: 'World Vision',
 27: 'DW',
 28: 'OXFAM',
 29: 'Da',
 30: 'UNICEF',
 31: 'Idara ya maji',
 32: 'Gove',
 33: 'Sengerema Water Department',
 34: 'Kiliwater',
 35: 'FinW',
 36: 'NORAD',
 37: 'DH',
 38: 'Villagers',
 39: 'DWSP',
 40: 'Distri',
 41: 'Lawatefuka water sup',
 42: 'Magadini-Makiwaru wa',
 43: 'RC',
 44: 'FW',
 45: 'KKKT _ Konde and DWE',
 46: 'Centr',
 47: 'MWE',
 48: 'Handeni Trunk Main(',
 49: 'Is',
 50: 'Norad',
 51: 'Fini Water',
 52: 'RWSSP',
 53: 'SHIPO',
 54: 'Kuwait',
 55: 'Private',
 56: 'JAICA',
 57: 'Central govt',
 58: 'Artisan',
 59: 'ISF',
 60: 'Fini water',
 61: 'GOVER',
 

In [59]:
imputed_df = pd.read_csv("../Data/X_train_imputed_df.csv")
imputed_df.drop('Unnamed: 0', axis = 1, inplace = True)

In [16]:
# key = {}
# key_entry {}
# for index in column.index:
#     key_entry[index] = 
# df_small_small['installer'].value_counts().to_dict()
#     string_to_numbers = dataframe[column].replace(to_replace=unique_vals, value=list(range(len(unique_vals))))

In [17]:
def transform_columns(dataframe, columns):
    transformed_df = pd.DataFrame()
    key = {}
    for column in columns:
        unique_vals = dataframe[column].value_counts().index
        key[column] = dataframe[column].value_counts().reset_index().drop(column, axis = 1).to_dict()['index']
        string_to_numbers = dataframe[column].replace(to_replace=unique_vals, value=list(range(len(unique_vals))))
        transformed_df[column] = string_to_numbers
        
    return transformed_df, key

In [49]:
def revert_back_to_strings(df, key):
    for column in df.columns:
        if column not in ['date_recorded', 'gps_height', 'longitude', 'latitude', 'population', 'construction_year'] :
            df[column].replace(key[column], inplace = True)
    
    return df

In [61]:
transformed_df, key = transform_columns(df_small_small, ['funder', 'installer', 'wpt_name', 'basin', 'subvillage', 'region', 
                                                    'region_code', 'district_code', 'lga', 'ward', 'public_meeting', 
                                                    'scheme_management', 'scheme_name', 'permit', 'extraction_type_class',
                                                    'management', 'management_group', 'payment', 'quality_group', 
                                                    'quantity', 'source', 'source_type', 'source_class','waterpoint_type',
                                                    'waterpoint_type_group', 'status_group'])

KeyboardInterrupt: 

In [64]:
key["funder"]

{0: 'Government Of Tanzania',
 1: 'Danida',
 2: 'Hesawa',
 3: 'World Bank',
 4: 'Kkkt',
 5: 'World Vision',
 6: 'Rwssp',
 7: 'Unicef',
 8: 'District Council',
 9: 'Tasaf',
 10: 'Dhv',
 11: 'Private Individual',
 12: 'Norad',
 13: 'Germany Republi',
 14: 'Tcrs',
 15: 'Ministry Of Water',
 16: 'Water',
 17: 'Dwe',
 18: 'Netherlands',
 19: 'Hifab',
 20: 'Adb',
 21: 'Lga',
 22: 'Amref',
 23: 'Fini Water',
 24: 'Oxfam',
 25: 'Dwsp',
 26: 'Wateraid',
 27: 'Rc Church',
 28: 'Isf',
 29: 'Rudep',
 30: 'Mission',
 31: 'Private',
 32: 'Jaica',
 33: 'Roman',
 34: 'Rural Water Supply And Sanitat',
 35: 'Adra',
 36: 'Ces(gmbh)',
 37: 'Shipo',
 38: 'Rc',
 39: 'Jica',
 40: 'Finw',
 41: 'Dh',
 42: 'Ded',
 43: 'Plan Int',
 44: 'Wsdp',
 45: 'Kiliwater',
 46: 'Dmdd',
 47: 'Go',
 48: 'Lawatefuka Water Supply',
 49: 'Oxfarm',
 50: 'Magadini-makiwaru Water',
 51: 'Fw',
 52: 'W.B',
 53: 'Kkkt_makwale',
 54: 'Ces (gmbh)',
 55: 'Oikos E.Afrika',
 56: 'Nethalan',
 57: 'Mkinga Distric Coun',
 58: 'Lvia',
 59: 'Co

In [34]:
X = transformed_df.drop('status_group', axis=1)
y = transformed_df['status_group']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=33, test_size=0.3)

In [62]:
imputed_df.rename(columns={'0': 'funder', '1': 'installer', '2': 'wpt_name', '3': 'basin', '4': 'subvillage',
                                  '5': 'region', '6': 'region_code', '7': 'district_code', '8': 'lga', '9': 'ward',
                                  '10': 'public_meeting', '11': 'scheme_management', '12': 'scheme_name', '13': 'permit',
                                  '14': 'extraction_type_class', '15': 'management', '16': 'management_group', '17': 
                                  'payment', '18': 'quality_group', '19': 'quantity', '20': 'source', '21': 'source_type',
                                  '22': 'source_class', '23': 'waterpoint_type', '24': 'waterpoint_type_group', '25': 
                                  'date_recorded', '26': 'gps_height', '27': 'longitude', '28': 'latitude', '29': 'population',
                                  '30': 'construction_year'}, inplace=True)

In [54]:
df = revert_back_to_strings(imputed_df[['date_recorded']], key)


AttributeError: 'Series' object has no attribute 'columns'

In [63]:
imputed_df

Unnamed: 0,funder,installer,wpt_name,basin,subvillage,region,region_code,district_code,lga,ward,...,source_type,source_class,waterpoint_type,waterpoint_type_group,date_recorded,gps_height,longitude,latitude,population,construction_year
0,16.0,159.0,25199.0,5.0,25.0,12.0,11.0,4.0,68.0,323.0,...,2.0,0.0,3.0,0.0,2011.0,0.0,35.891855,-6.153545,212.337527,1998.387233
1,12.0,2.0,26832.0,4.0,12814.0,7.0,7.0,0.0,12.0,66.0,...,0.0,0.0,0.0,0.0,2013.0,1260.0,30.914468,-3.326810,530.000000,1993.000000
2,12.0,50.0,3854.0,8.0,9256.0,16.0,15.0,3.0,103.0,990.0,...,3.0,1.0,0.0,0.0,2013.0,2137.0,31.631254,-7.863417,750.000000,1984.000000
3,1.0,4.0,36153.0,6.0,16800.0,8.0,8.0,2.0,18.0,424.0,...,0.0,0.0,0.0,0.0,2013.0,462.0,34.831606,-11.319762,96.000000,1992.000000
4,0.0,0.0,21887.0,2.0,8230.0,3.0,3.0,3.0,27.0,472.0,...,1.0,0.0,1.0,1.0,2011.0,295.0,36.624641,-8.410004,400.000000,1976.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40306,1172.0,0.0,7193.0,0.0,153.0,10.0,9.0,4.0,80.0,802.0,...,2.0,0.0,0.0,0.0,2011.0,52.0,38.973581,-5.375739,12.000000,1995.000000
40307,28.0,59.0,2.0,0.0,8.0,2.0,2.0,2.0,11.0,520.0,...,3.0,1.0,0.0,0.0,2013.0,500.0,38.078320,-4.480761,140.000000,2013.000000
40308,387.0,2.0,18712.0,5.0,2579.0,3.0,3.0,5.0,25.0,540.0,...,3.0,1.0,0.0,0.0,2011.0,520.0,37.560400,-6.917776,1.000000,1985.000000
40309,1.0,4.0,11833.0,2.0,215.0,8.0,8.0,4.0,22.0,679.0,...,3.0,1.0,3.0,0.0,2013.0,844.0,36.122400,-10.463274,250.000000,1982.000000


In [23]:
transformed_df['date_recorded'] = df_small_small['date_recorded']
transformed_df['gps_height'] = df_small_small['gps_height']
transformed_df['longitude'] = df_small_small['longitude']
transformed_df['latitude'] = df_small_small['latitude']
transformed_df['population'] = df_small_small['population']
transformed_df['construction_year'] = df_small_small['construction_year']

In [24]:
X = transformed_df.drop('status_group', axis=1)
y = transformed_df['status_group']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=33, test_size=0.3)

NameError: name 'train_test_split' is not defined

In [None]:
imputed_df.rename(columns={'0': 'funder', '1': 'installer', '2': 'wpt_name', '3': 'basin', '4': 'subvillage',
                                  '5': 'region', '6': 'region_code', '7': 'district_code', '8': 'lga', '9': 'ward',
                                  '10': 'public_meeting', '11': 'scheme_management', '12': 'scheme_name', '13': 'permit',
                                  '14': 'extraction_type_class', '15': 'management', '16': 'management_group', '17': 
                                  'payment', '18': 'quality_group', '19': 'quantity', '20': 'source', '21': 'source_type',
                                  '22': 'source_class', '23': 'waterpoint_type', '24': 'waterpoint_type_group', '25': 
                                  'date_recorded', '26': 'gps_height', '27': 'longitude', '28': 'latitude', '29': 'population',
                                  '30': 'construction_year'}, inplace=True)

In [27]:
tree = DecisionTreeClassifier(random_state=42)
tree.fit(imputed_df, y_train)

DecisionTreeClassifier(random_state=42)

In [29]:
tree.score(imputed_df, y_train)

1.0

In [56]:
tree.score(X_test, y_test)

ValueError: Input contains NaN, infinity or a value too large for dtype('float32').