In [76]:
import pandas as pd
import matplotlib.pyplot as plot
import numpy as np
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.model_selection import cross_validate, cross_val_score, train_test_split, GridSearchCV
from sklearn.metrics import precision_score

In [2]:
# Load the raw CSVs (paths are relative to the notebook's working directory).
training_labels = pd.read_csv('../Data/training_set_labels.csv')
training_values = pd.read_csv('../Data/training_set_values.csv')
# NOTE(review): test_values is not referenced again in the visible part of this notebook.
test_values = pd.read_csv('../Data/test_set_values.csv')

In [3]:
# Collapse the three original labels into a binary target.
# ``replace`` (rather than ``map``) leaves labels that are not keys of the table
# untouched, so re-running this cell is a no-op instead of turning every
# already-collapsed label into NaN.
training_labels['status_group'] = training_labels['status_group'].replace({
    'non functional': 'non operational',
    'functional': 'operational',
    'functional needs repair': 'operational',
})

In [4]:
# Join features and labels into one frame on the shared 'id' column.
df = pd.merge(training_values, training_labels, on='id')

In [5]:
# Inspect the column names of the merged frame.
df.columns

Index(['id', 'amount_tsh', 'date_recorded', 'funder', 'gps_height',
       'installer', 'longitude', 'latitude', 'wpt_name', 'num_private',
       'basin', 'subvillage', 'region', 'region_code', 'district_code', 'lga',
       'ward', 'population', 'public_meeting', 'recorded_by',
       'scheme_management', 'scheme_name', 'permit', 'construction_year',
       'extraction_type', 'extraction_type_group', 'extraction_type_class',
       'management', 'management_group', 'payment', 'payment_type',
       'water_quality', 'quality_group', 'quantity', 'quantity_group',
       'source', 'source_type', 'source_class', 'waterpoint_type',
       'waterpoint_type_group', 'status_group'],
      dtype='object')

In [6]:
# Remove the 'id' column now that the merge on it is done.
# Rebinding with errors='ignore' (instead of an in-place drop) makes the cell
# idempotent: running it a second time no longer raises KeyError.
df = df.drop(columns="id", errors="ignore")

In [7]:
def full_value_counts(df):
    """Print, for every column of ``df``, its name and its value counts.

    Each column is followed by one blank line. Nothing is returned.
    """
    for name in df.columns:
        print(name)
        # end="\n\n" terminates the counts and emits the blank separator line
        print(df[name].value_counts(), end="\n\n")

In [9]:
# Print the value counts of every column of the merged frame.
full_value_counts(df)

amount_tsh
0.0         41639
500.0        3102
50.0         2472
1000.0       1488
20.0         1463
            ...  
8500.0          1
6300.0          1
220.0           1
138000.0        1
12.0            1
Name: amount_tsh, Length: 98, dtype: int64

date_recorded
2011-03-15    572
2011-03-17    558
2013-02-03    546
2011-03-14    520
2011-03-16    513
             ... 
2012-01-25      1
2011-09-20      1
2011-09-05      1
2011-09-23      1
2011-09-27      1
Name: date_recorded, Length: 356, dtype: int64

funder
Government Of Tanzania    9084
Danida                    3114
Hesawa                    2202
Rwssp                     1374
World Bank                1349
                          ... 
Wug And Ded                  1
Msiki                        1
Cc Motor Day 2010            1
Tanga Cement                 1
Rudep /dwe                   1
Name: funder, Length: 1897, dtype: int64

gps_height
 0       20438
-15         60
-16         55
-13         55
-20         52
         ..

Name: source_type, dtype: int64

source_class
groundwater    45794
surface        13328
unknown          278
Name: source_class, dtype: int64

waterpoint_type
communal standpipe             28522
hand pump                      17488
other                           6380
communal standpipe multiple     6103
improved spring                  784
cattle trough                    116
dam                                7
Name: waterpoint_type, dtype: int64

waterpoint_type_group
communal standpipe    34625
hand pump             17488
other                  6380
improved spring         784
cattle trough           116
dam                       7
Name: waterpoint_type_group, dtype: int64

status_group
operational        36576
non operational    22824
Name: status_group, dtype: int64



In [31]:
# Columns removed from the frame before any further cleaning.
columns_to_drop = [
    "amount_tsh",
    "num_private",
    "recorded_by",
    "payment_type",
    "extraction_type",
    "extraction_type_group",
    "water_quality",
    "quantity_group",
    "scheme_name",
]
df_small = df.drop(columns=columns_to_drop)

In [11]:
# Turn placeholder values into missing values across the whole frame.
df_small_small = df_small.replace({'none': None,'unknown' : None, -2.00E-08: None, "0": None})

# In these columns a literal 0 is treated as missing as well. The result is assigned
# back to the frame: calling replace(..., inplace=True) on a column selection is a
# chained assignment, which no longer updates the parent frame under pandas
# Copy-on-Write.
for zero_as_missing in ("district_code", "population", "construction_year"):
    df_small_small[zero_as_missing] = df_small_small[zero_as_missing].replace({0: None})

In [12]:
# Summarise non-null counts and dtypes after the placeholder replacement.
df_small_small.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 59400 entries, 0 to 59399
Data columns (total 32 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   date_recorded          59400 non-null  object 
 1   funder                 54988 non-null  object 
 2   gps_height             59400 non-null  int64  
 3   installer              54967 non-null  object 
 4   longitude              59400 non-null  float64
 5   latitude               57588 non-null  float64
 6   wpt_name               55837 non-null  object 
 7   basin                  59400 non-null  object 
 8   subvillage             59029 non-null  object 
 9   region                 59400 non-null  object 
 10  region_code            59400 non-null  int64  
 11  district_code          59377 non-null  object 
 12  lga                    59400 non-null  object 
 13  ward                   59400 non-null  object 
 14  population             38019 non-null  object 
 15  pu

In [13]:
# Keep only the rows that have a latitude.
df_small_small = df_small_small.dropna(subset=['latitude'])

In [78]:
# Lookup of frequency rank -> installer name (0 = most frequent installer).
# Built directly from the value_counts() index, so it does not rely on the column
# names produced by reset_index(), which changed in pandas 2.0 (the old
# ``.to_dict()['index']`` lookup raises KeyError there).
index_test = dict(enumerate(df_small_small['installer'].value_counts().index.tolist()))

{0: 'DWE',
 1: 'Government',
 2: 'RWE',
 3: 'Commu',
 4: 'DANIDA',
 5: 'KKKT',
 6: 'Hesawa',
 7: 'TCRS',
 8: 'Central government',
 9: 'CES',
 10: 'DANID',
 11: 'District Council',
 12: 'Community',
 13: 'HESAWA',
 14: 'LGA',
 15: 'World vision',
 16: 'WEDECO',
 17: 'District council',
 18: 'Gover',
 19: 'TASAF',
 20: 'AMREF',
 21: 'TWESA',
 22: 'WU',
 23: 'Dmdd',
 24: 'ACRA',
 25: 'SEMA',
 26: 'World Vision',
 27: 'DW',
 28: 'OXFAM',
 29: 'Da',
 30: 'Gove',
 31: 'UNICEF',
 32: 'Idara ya maji',
 33: 'Sengerema Water Department',
 34: 'Kiliwater',
 35: 'NORAD',
 36: 'FinW',
 37: 'DH',
 38: 'Villagers',
 39: 'DWSP',
 40: 'Distri',
 41: 'Lawatefuka water sup',
 42: 'Magadini-Makiwaru wa',
 43: 'RC',
 44: 'FW',
 45: 'KKKT _ Konde and DWE',
 46: 'Centr',
 47: 'MWE',
 48: 'Handeni Trunk Main(',
 49: 'Is',
 50: 'Norad',
 51: 'Fini Water',
 52: 'RWSSP',
 53: 'SHIPO',
 54: 'Kuwait',
 55: 'Private',
 56: 'JAICA',
 57: 'Central govt',
 58: 'Artisan',
 59: 'ISF',
 60: 'Fini water',
 61: 'GOVER',
 

In [15]:
def transform_columns(dataframe, columns):
    """Encode the given columns as integer frequency ranks.

    Within each column, every distinct non-null value is replaced by its position
    in ``value_counts()`` order (0 = most frequent value). Missing values stay
    missing, so an encoded column that contains them comes back as float.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame holding the columns to encode. It is not modified.
    columns : iterable of str
        Names of the columns to encode.

    Returns
    -------
    transformed_df : pandas.DataFrame
        One encoded column per requested column.
    key : dict
        ``{column: {code: original_value}}`` -- the lookup needed to undo the
        encoding.
    """
    transformed_df = pd.DataFrame()
    key = {}
    for column in columns:
        # value_counts() is evaluated once; its index is ordered by frequency.
        categories = dataframe[column].value_counts().index.tolist()
        # Enumerating the index avoids reset_index(), whose column names differ
        # between pandas 1.x and 2.x.
        key[column] = dict(enumerate(categories))
        value_to_code = {value: code for code, value in key[column].items()}
        # A dict lookup per row instead of one full-column comparison per distinct value.
        transformed_df[column] = dataframe[column].map(value_to_code)

    return transformed_df, key

In [16]:
def revert_back_to_strings(df, key, passthrough_columns=None):
    """Replace integer codes in ``df`` with their original values, in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose encoded columns are decoded. It is modified and returned.
    key : dict
        ``{column: {code: original_value}}`` lookup, one entry per encoded column.
    passthrough_columns : collection of str, optional
        Columns that are left untouched. Defaults to the columns this notebook
        never encodes.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the encoded columns decoded.

    Raises
    ------
    KeyError
        If a column that is not a passthrough column has no entry in ``key``.
    """
    if passthrough_columns is None:
        passthrough_columns = ('date_recorded', 'gps_height', 'longitude', 'latitude',
                               'population', 'construction_year')

    for column in df.columns:
        if column in passthrough_columns:
            continue
        # Assign back instead of df[column].replace(..., inplace=True): the chained
        # in-place form does not update ``df`` under pandas Copy-on-Write.
        df[column] = df[column].replace(key[column])

    return df

In [30]:
# NOTE(review): this encoding step is disabled; `transformed_df` is loaded from
# 'transformed_df.csv' in a later cell instead. With this call commented out, `key`
# is not defined anywhere in the visible part of the notebook.
# transformed_df, key = transform_columns(df_small_small, ['funder', 'installer', 'wpt_name', 'basin', 'subvillage', 'region', 
#                                                      'region_code', 'district_code', 'lga', 'ward', 'public_meeting', 
#                                                      'scheme_management', 'permit', 'extraction_type_class',
#                                                      'management', 'management_group', 'payment', 'quality_group', 
#                                                      'quantity', 'source', 'source_type', 'source_class','waterpoint_type',
#                                                      'waterpoint_type_group', 'status_group'])

In [40]:
# Disabled: would copy the columns that are not encoded from df_small_small
# into transformed_df unchanged.
# transformed_df['date_recorded'] = df_small_small['date_recorded']
# transformed_df['gps_height'] = df_small_small['gps_height']
# transformed_df['longitude'] = df_small_small['longitude']
# transformed_df['latitude'] = df_small_small['latitude']
# transformed_df['population'] = df_small_small['population']
# transformed_df['construction_year'] = df_small_small['construction_year']

In [48]:
# Cache the encoded frame on disk.
# NOTE(review): the cells that build `transformed_df` are commented out, so on a fresh
# kernel (Restart & Run All) this line raises NameError -- `transformed_df` is only
# defined by the read_csv cell that follows.
transformed_df.to_csv('transformed_df.csv')

In [79]:
# Reload the cached encoded frame; index_col=0 uses the first CSV column as the index.
transformed_df = pd.read_csv('transformed_df.csv', index_col=0)
transformed_df.columns

Index(['funder', 'installer', 'wpt_name', 'basin', 'subvillage', 'region',
       'region_code', 'district_code', 'lga', 'ward', 'public_meeting',
       'scheme_management', 'permit', 'extraction_type_class', 'management',
       'management_group', 'payment', 'quality_group', 'quantity', 'source',
       'source_type', 'source_class', 'waterpoint_type',
       'waterpoint_type_group', 'status_group', 'date_recorded', 'gps_height',
       'longitude', 'latitude', 'population', 'construction_year'],
      dtype='object')

In [34]:
# Target is the encoded status_group; every other column is a feature.
y = transformed_df['status_group']
X = transformed_df.drop(columns='status_group')

# 70/30 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=33
)

In [19]:
# Restore column names on the imputed frame, whose columns are the stringified
# positions '0', '1', ... The names are derived from X instead of a hand-written
# table: the previous table still listed 'scheme_name', which is in columns_to_drop,
# so relative to X every label from position 12 onwards was shifted by one.
# NOTE(review): assumes imputed_df's columns follow X's column order; imputed_df is
# not created in the visible part of this notebook -- confirm both.
imputed_df.rename(columns={str(position): name for position, name in enumerate(X.columns)},
                  inplace=True)

In [21]:
# df = revert_back_to_strings(imputed_df[['date_recorded']], key)

In [53]:
# Load the imputed feature matrices from disk.
# NOTE(review): the code that writes these CSVs is not in the visible part of this
# notebook; presumably their rows line up with y_train / y_test from the split -- confirm.
X_train_imputed = pd.read_csv('X_train_imputed.csv', index_col=0)
X_test_imputed = pd.read_csv('X_test_imputed.csv', index_col=0)

# Decision Tree Model

In [54]:
# Baseline: a decision tree with default hyperparameters and a fixed seed.
tree = DecisionTreeClassifier(random_state=42)
tree.fit(X_train_imputed, y_train)

DecisionTreeClassifier(random_state=42)

In [None]:
# Predictions of the baseline tree on the hold-out features.
tree.predict(X_test_imputed)

In [58]:
# Hold-out precision of the untuned tree.
# NOTE(review): precision_score scores the class encoded as 1 by default; presumably
# that is 'non operational' (the less frequent label) -- confirm against the encoding.
precision_score(y_test, tree.predict(X_test_imputed))

0.7240023303233324

In [75]:
# Candidate values for each hyperparameter
parameters = {
    'max_depth': [2, 4, 6, 8],
    'min_samples_split': [2, 4, 6, 8],
    'min_samples_leaf': [1, 2, 3, 4],
}

# Exhaustive 5-fold search over the grid, ranked by precision
search = GridSearchCV(estimator=tree, param_grid=parameters, scoring='precision', cv=5)
search.fit(X_train_imputed, y_train)

# Winning combination and its mean cross-validated score
print("Best hyperparameters:", search.best_params_)
print("Best score:", search.best_score_)

Best hyperparameters: {'max_depth': 2, 'min_samples_leaf': 1, 'min_samples_split': 2}
Best score: 0.9662694535803361


In [70]:
# Tree with tuned parameters.
# The values come from the grid search result instead of being re-typed by hand, so
# this cell cannot drift from what the search actually selected.
tuned_tree = DecisionTreeClassifier(random_state=42, **search.best_params_)

In [73]:
# 5-fold cross-validated precision of the tuned tree on the training data.
scores = cross_validate(tuned_tree, X_train_imputed, y_train, cv=5, scoring='precision')
scores

{'fit_time': array([0.06499267, 0.06041336, 0.05896425, 0.05898142, 0.05799127]),
 'score_time': array([0.00398946, 0.00349569, 0.004987  , 0.0040381 , 0.00398922]),
 'test_score': array([0.97222222, 0.95726496, 0.96466431, 0.96840826, 0.96878752])}