In [3]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.metrics import accuracy_score,precision_score,precision_score,recall_score,precision_recall_curve,auc,classification_report
from sklearn.linear_model import LogisticRegressionCV,LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.preprocessing import LabelEncoder,OrdinalEncoder,OneHotEncoder
from collections import Counter
from imblearn.over_sampling import SMOTENC

# Business Understanding: Predicting Water Pump Functionality in Tanzania

## Problem Statement
Tanzania faces significant challenges in maintaining its water infrastructure. Many pumps fail unexpectedly, causing water shortages that impact community health and development. Current maintenance approaches are reactive, leading to inefficient resource allocation and preventable service disruptions. 

This project addresses this problem by developing a predictive model that classifies water pumps into three categories:
- **Functional**: Operating normally
- **Needs repairs**: Requiring maintenance
- **Non functional**: Not operational

## Stakeholders and Applications
### Tanzanian Water Authorities
- Prioritize maintenance resources for high-risk pumps
- Optimize repair schedules and budget allocation

### NGOs and Aid Organizations
- Target interventions to prevent failures in vulnerable regions
- Evaluate infrastructure investment effectiveness

### Local Maintenance Teams
- Receive actionable alerts for pumps needing immediate attention
- Reduce response time for critical repairs

## Project Impact
Successful implementation would:
- Improve water access reliability for communities
- Reduce maintenance costs through proactive interventions
- Enable data-driven infrastructure planning
- Support sustainable development goals for clean water access

This solution directly addresses a critical infrastructure challenge affecting millions in Tanzania, transforming maintenance from reactive to predictive.

In [4]:
train_features = pd.read_csv('training_set_values.csv')
train_labels = pd.read_csv('training_set_labels.csv')
test_features = pd.read_csv('test_set_values.csv')
df = pd.merge(train_features,train_labels,on='id')


In [5]:
df.head()

Unnamed: 0,id,amount_tsh,date_recorded,funder,gps_height,installer,longitude,latitude,wpt_name,num_private,...,water_quality,quality_group,quantity,quantity_group,source,source_type,source_class,waterpoint_type,waterpoint_type_group,status_group
0,69572,6000.0,2011-03-14,Roman,1390,Roman,34.938093,-9.856322,none,0,...,soft,good,enough,enough,spring,spring,groundwater,communal standpipe,communal standpipe,functional
1,8776,0.0,2013-03-06,Grumeti,1399,GRUMETI,34.698766,-2.147466,Zahanati,0,...,soft,good,insufficient,insufficient,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe,functional
2,34310,25.0,2013-02-25,Lottery Club,686,World vision,37.460664,-3.821329,Kwa Mahundi,0,...,soft,good,enough,enough,dam,dam,surface,communal standpipe multiple,communal standpipe,functional
3,67743,0.0,2013-01-28,Unicef,263,UNICEF,38.486161,-11.155298,Zahanati Ya Nanyumbu,0,...,soft,good,dry,dry,machine dbh,borehole,groundwater,communal standpipe multiple,communal standpipe,non functional
4,19728,0.0,2011-07-13,Action In A,0,Artisan,31.130847,-1.825359,Shuleni,0,...,soft,good,seasonal,seasonal,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe,functional


In [6]:
df.columns

Index(['id', 'amount_tsh', 'date_recorded', 'funder', 'gps_height',
       'installer', 'longitude', 'latitude', 'wpt_name', 'num_private',
       'basin', 'subvillage', 'region', 'region_code', 'district_code', 'lga',
       'ward', 'population', 'public_meeting', 'recorded_by',
       'scheme_management', 'scheme_name', 'permit', 'construction_year',
       'extraction_type', 'extraction_type_group', 'extraction_type_class',
       'management', 'management_group', 'payment', 'payment_type',
       'water_quality', 'quality_group', 'quantity', 'quantity_group',
       'source', 'source_type', 'source_class', 'waterpoint_type',
       'waterpoint_type_group', 'status_group'],
      dtype='object')

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 59400 entries, 0 to 59399
Data columns (total 41 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   id                     59400 non-null  int64  
 1   amount_tsh             59400 non-null  float64
 2   date_recorded          59400 non-null  object 
 3   funder                 55765 non-null  object 
 4   gps_height             59400 non-null  int64  
 5   installer              55745 non-null  object 
 6   longitude              59400 non-null  float64
 7   latitude               59400 non-null  float64
 8   wpt_name               59400 non-null  object 
 9   num_private            59400 non-null  int64  
 10  basin                  59400 non-null  object 
 11  subvillage             59029 non-null  object 
 12  region                 59400 non-null  object 
 13  region_code            59400 non-null  int64  
 14  district_code          59400 non-null  int64  
 15  lg

In [8]:
df.describe()

Unnamed: 0,id,amount_tsh,gps_height,longitude,latitude,num_private,region_code,district_code,population,construction_year
count,59400.0,59400.0,59400.0,59400.0,59400.0,59400.0,59400.0,59400.0,59400.0,59400.0
mean,37115.131768,317.650385,668.297239,34.077427,-5.706033,0.474141,15.297003,5.629747,179.909983,1300.652475
std,21453.128371,2997.574558,693.11635,6.567432,2.946019,12.23623,17.587406,9.633649,471.482176,951.620547
min,0.0,0.0,-90.0,0.0,-11.64944,0.0,1.0,0.0,0.0,0.0
25%,18519.75,0.0,0.0,33.090347,-8.540621,0.0,5.0,2.0,0.0,0.0
50%,37061.5,0.0,369.0,34.908743,-5.021597,0.0,12.0,3.0,25.0,1986.0
75%,55656.5,20.0,1319.25,37.178387,-3.326156,0.0,17.0,5.0,215.0,2004.0
max,74247.0,350000.0,2770.0,40.345193,-2e-08,1776.0,99.0,80.0,30500.0,2013.0


In [9]:
df.isna().any()[df.isna().any() == True]

funder               True
installer            True
subvillage           True
public_meeting       True
scheme_management    True
scheme_name          True
permit               True
dtype: bool

In [10]:
df.isna().mean()[df.isna().mean() > 0]

funder               0.061195
installer            0.061532
subvillage           0.006246
public_meeting       0.056128
scheme_management    0.065269
scheme_name          0.474175
permit               0.051448
dtype: float64

The Scheme_Name columns has about 47% missing data. We will drop this column.

We will also drop the null rows of the other columns since they consist of a small percentage of the data (<7%).

In [11]:
df.drop(axis=1,columns=['scheme_name'],inplace=True)

In [12]:
df.dropna(inplace=True)

We will confirm that the null values have been dropped

In [13]:
df.isna().mean()[df.isna().mean() > 0]

Series([], dtype: float64)

In [14]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 48288 entries, 0 to 59399
Data columns (total 40 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   id                     48288 non-null  int64  
 1   amount_tsh             48288 non-null  float64
 2   date_recorded          48288 non-null  object 
 3   funder                 48288 non-null  object 
 4   gps_height             48288 non-null  int64  
 5   installer              48288 non-null  object 
 6   longitude              48288 non-null  float64
 7   latitude               48288 non-null  float64
 8   wpt_name               48288 non-null  object 
 9   num_private            48288 non-null  int64  
 10  basin                  48288 non-null  object 
 11  subvillage             48288 non-null  object 
 12  region                 48288 non-null  object 
 13  region_code            48288 non-null  int64  
 14  district_code          48288 non-null  int64  
 15  lg

In [15]:
df.select_dtypes('number')

Unnamed: 0,id,amount_tsh,gps_height,longitude,latitude,num_private,region_code,district_code,population,construction_year
0,69572,6000.0,1390,34.938093,-9.856322,0,11,5,109,1999
2,34310,25.0,686,37.460664,-3.821329,0,21,4,250,2009
3,67743,0.0,263,38.486161,-11.155298,0,90,63,58,1986
5,9944,20.0,0,39.172796,-4.765587,0,4,8,1,2009
6,19816,0.0,0,33.362410,-3.766365,0,17,3,0,0
...,...,...,...,...,...,...,...,...,...,...
59394,11164,500.0,351,37.634053,-6.124830,0,5,6,89,2007
59395,60739,10.0,1210,37.169807,-3.253847,0,3,5,125,1999
59396,27263,4700.0,1212,35.249991,-9.070629,0,11,4,56,1996
59398,31282,0.0,0,35.861315,-6.378573,0,1,4,0,0


In [16]:
df.select_dtypes('object')

Unnamed: 0,date_recorded,funder,installer,wpt_name,basin,subvillage,region,lga,ward,public_meeting,...,water_quality,quality_group,quantity,quantity_group,source,source_type,source_class,waterpoint_type,waterpoint_type_group,status_group
0,2011-03-14,Roman,Roman,none,Lake Nyasa,Mnyusi B,Iringa,Ludewa,Mundindi,True,...,soft,good,enough,enough,spring,spring,groundwater,communal standpipe,communal standpipe,functional
2,2013-02-25,Lottery Club,World vision,Kwa Mahundi,Pangani,Majengo,Manyara,Simanjiro,Ngorika,True,...,soft,good,enough,enough,dam,dam,surface,communal standpipe multiple,communal standpipe,functional
3,2013-01-28,Unicef,UNICEF,Zahanati Ya Nanyumbu,Ruvuma / Southern Coast,Mahakamani,Mtwara,Nanyumbu,Nanyumbu,True,...,soft,good,dry,dry,machine dbh,borehole,groundwater,communal standpipe multiple,communal standpipe,non functional
5,2011-03-13,Mkinga Distric Coun,DWE,Tajiri,Pangani,Moa/Mwereme,Tanga,Mkinga,Moa,True,...,salty,salty,enough,enough,other,other,unknown,communal standpipe multiple,communal standpipe,functional
6,2012-10-01,Dwsp,DWSP,Kwa Ngomho,Internal,Ishinabulandi,Shinyanga,Shinyanga Rural,Samuye,True,...,soft,good,enough,enough,machine dbh,borehole,groundwater,hand pump,hand pump,non functional
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59394,2011-03-09,World Bank,ML appro,Chimeredya,Wami / Ruvu,Komstari,Morogoro,Mvomero,Diongoya,True,...,soft,good,enough,enough,machine dbh,borehole,groundwater,communal standpipe,communal standpipe,non functional
59395,2013-05-03,Germany Republi,CES,Area Three Namba 27,Pangani,Kiduruni,Kilimanjaro,Hai,Masama Magharibi,True,...,soft,good,enough,enough,spring,spring,groundwater,communal standpipe,communal standpipe,functional
59396,2011-05-07,Cefa-njombe,Cefa,Kwa Yahona Kuvala,Rufiji,Igumbilo,Iringa,Njombe,Ikondo,True,...,soft,good,enough,enough,river,river/lake,surface,communal standpipe,communal standpipe,functional
59398,2011-03-08,Malec,Musa,Mshoro,Rufiji,Mwinyi,Dodoma,Chamwino,Mvumi Makulu,True,...,soft,good,insufficient,insufficient,shallow well,shallow well,groundwater,hand pump,hand pump,functional


In [17]:
df.select_dtypes('object').columns

Index(['date_recorded', 'funder', 'installer', 'wpt_name', 'basin',
       'subvillage', 'region', 'lga', 'ward', 'public_meeting', 'recorded_by',
       'scheme_management', 'permit', 'extraction_type',
       'extraction_type_group', 'extraction_type_class', 'management',
       'management_group', 'payment', 'payment_type', 'water_quality',
       'quality_group', 'quantity', 'quantity_group', 'source', 'source_type',
       'source_class', 'waterpoint_type', 'waterpoint_type_group',
       'status_group'],
      dtype='object')

In [18]:
df['num_private'].unique()

array([   0,   39,    5,   45,    6,    3,  698,   32,   15,    7,   25,
        102,    1,   93,   14,   34,  120,   17,  213,   47,    8,   41,
         80,  141,   20,   35,  131,    4,   22,   11,   87,   65,    2,
        180,   38,   62,    9,   16,   23,   42,   24,   12,  668,  672,
         58,  150,  280,   50, 1776,   27,   10,   94,   26,  240,  755,
         60,   55, 1402], dtype=int64)

We will add a column called Year Recorded.

We will then drop the Date_Recorded column

In [19]:
df['Year_Recorded'] = df['date_recorded'].str.slice(0,4)

In [20]:
df[['date_recorded','Year_Recorded']]

Unnamed: 0,date_recorded,Year_Recorded
0,2011-03-14,2011
2,2013-02-25,2013
3,2013-01-28,2013
5,2011-03-13,2011
6,2012-10-01,2012
...,...,...
59394,2011-03-09,2011
59395,2013-05-03,2013
59396,2011-05-07,2011
59398,2011-03-08,2011


In [21]:
df['Year_Recorded'] = df['Year_Recorded'].astype('int64')

In [22]:
df['Year_Recorded'].unique()

array([2011, 2013, 2012, 2004, 2002], dtype=int64)

We will now drop the date_recorded column

In [23]:
df.drop(axis=1,columns=['date_recorded'],inplace=True)

In [24]:
df.funder.value_counts()

Government Of Tanzania            8080
Danida                            2920
Hesawa                            1388
Kkkt                              1260
World Bank                        1086
                                  ... 
Care Int                             1
Ddca                                 1
Kipo Potry                           1
Rotary Club Of Chico And Moshi       1
Tacri                                1
Name: funder, Length: 1586, dtype: int64

In [25]:
df.installer.value_counts()

DWE                     14637
Government               1438
RWE                      1031
Commu                     893
KKKT                      885
                        ...  
GACHUMA CONSTRUCTION        1
Bonite Bottles Ltd          1
CG/RC                       1
TCRS/TWESA                  1
ISSAA KANYANGE              1
Name: installer, Length: 1787, dtype: int64

In [26]:
df['source'].unique()

array(['spring', 'dam', 'machine dbh', 'other', 'shallow well', 'river',
       'hand dtw', 'rainwater harvesting', 'lake', 'unknown'],
      dtype=object)

In [27]:
df['source_type'].unique()

array(['spring', 'dam', 'borehole', 'other', 'shallow well', 'river/lake',
       'rainwater harvesting'], dtype=object)

In [28]:
df[['source','source_type']].value_counts()

source                source_type         
spring                spring                  14852
shallow well          shallow well            12415
machine dbh           borehole                 8730
river                 river/lake               8506
rainwater harvesting  rainwater harvesting     1607
hand dtw              borehole                  779
lake                  river/lake                604
dam                   dam                       581
other                 other                     182
unknown               other                      32
dtype: int64

The source and source_type columns give almost the exact same information.

'Machine dbh' and 'Hand dtw' both are borehole source types.

Source has a few more unique values so we will drop the source_type column.

In [29]:
df.drop(axis=1,columns='source',inplace=True)

In [30]:
df['waterpoint_type'].unique()

array(['communal standpipe', 'communal standpipe multiple', 'hand pump',
       'other', 'improved spring', 'cattle trough', 'dam'], dtype=object)

In [31]:
df['waterpoint_type_group'].unique()

array(['communal standpipe', 'hand pump', 'other', 'improved spring',
       'cattle trough', 'dam'], dtype=object)

In [32]:
df[['waterpoint_type_group','waterpoint_type']].value_counts()

waterpoint_type_group  waterpoint_type            
communal standpipe     communal standpipe             23837
hand pump              hand pump                      13602
communal standpipe     communal standpipe multiple     5459
other                  other                           4651
improved spring        improved spring                  651
cattle trough          cattle trough                     82
dam                    dam                                6
dtype: int64

The waterpoint_type and the water_point_type_group columns give us the same information.

The waterpoint_type column has a few more unique values.

We will drop the waterpoint_type_group column

In [33]:
df.drop(axis=1,columns='waterpoint_type_group',inplace=True)

In [34]:
df['quantity'].unique()

array(['enough', 'dry', 'seasonal', 'insufficient', 'unknown'],
      dtype=object)

In [35]:
df['quantity_group'].unique()

array(['enough', 'dry', 'seasonal', 'insufficient', 'unknown'],
      dtype=object)

In [36]:
df[['quantity','quantity_group']].value_counts()

quantity      quantity_group
enough        enough            28355
insufficient  insufficient      11799
dry           dry                4945
seasonal      seasonal           2899
unknown       unknown             290
dtype: int64

The two columns give you the same information. 

So we will drop the quantity column

In [37]:
df.drop(axis=1,columns='quantity',inplace=True)

We will ordinally encode the quantity_group column.

We might have to drop the unknown rows.

We will have to check how many of the rows are part of the unknown category.

In [38]:
df[['extraction_type','extraction_type_group','extraction_type_class']].value_counts()

extraction_type            extraction_type_group  extraction_type_class
gravity                    gravity                gravity                  23036
nira/tanira                nira/tanira            handpump                  6260
other                      other                  other                     4538
submersible                submersible            submersible               3645
swn 80                     swn 80                 handpump                  2858
mono                       mono                   motorpump                 2382
india mark ii              india mark ii          handpump                  2047
afridev                    afridev                handpump                  1346
ksb                        submersible            submersible               1330
other - rope pump          rope pump              rope pump                  212
other - swn 81             other handpump         handpump                   199
windmill                   wind-power

I will keep the extraction_type and extraction_type_class columns. 

We will get all the information from these two columns.

They do not have an order so we will use OneHotEncoding

In [39]:
df.drop(axis=1,columns='extraction_type_group',inplace=True)

In [40]:
df['management'].unique()

array(['vwc', 'private operator', 'wug', 'water board', 'wua', 'company',
       'other', 'water authority', 'parastatal', 'other - school',
       'unknown', 'trust'], dtype=object)

In [41]:
df['management_group'].unique()

array(['user-group', 'commercial', 'other', 'parastatal', 'unknown'],
      dtype=object)

In [42]:
df[['management','management_group']].value_counts()

management        management_group
vwc               user-group          33577
wug               user-group           4807
water board       user-group           2665
wua               user-group           2288
private operator  commercial           1479
parastatal        parastatal           1331
water authority   commercial            790
company           commercial            654
other             other                 449
other - school    other                  99
trust             commercial             75
unknown           unknown                74
dtype: int64

Management and management_group columns are giving us different information.

The management_group column seems to tell us the category that the entry in management is in.

I will keep both the columns.

In [43]:
df['payment'].unique()

array(['pay annually', 'pay per bucket', 'never pay',
       'pay when scheme fails', 'other', 'pay monthly', 'unknown'],
      dtype=object)

In [44]:
df['payment_type'].unique()

array(['annually', 'per bucket', 'never pay', 'on failure', 'other',
       'monthly', 'unknown'], dtype=object)

In [45]:
df['water_quality'].unique()

array(['soft', 'salty', 'unknown', 'milky', 'fluoride', 'coloured',
       'salty abandoned', 'fluoride abandoned'], dtype=object)

In [46]:
df['quality_group'].unique()

array(['good', 'salty', 'unknown', 'milky', 'fluoride', 'colored'],
      dtype=object)

In [47]:
df[['water_quality','quality_group']].value_counts()

water_quality       quality_group
soft                good             42667
salty               salty             3718
unknown             unknown            880
coloured            colored            320
milky               milky              299
salty abandoned     salty              231
fluoride            fluoride           160
fluoride abandoned  fluoride            13
dtype: int64

The water_quality and quality group are lamost giving the exact same information.

I am going to drop the water_quality column

In [48]:
df.drop(axis=1,inplace=True,columns='water_quality')

In [49]:
df['management'].unique()

array(['vwc', 'private operator', 'wug', 'water board', 'wua', 'company',
       'other', 'water authority', 'parastatal', 'other - school',
       'unknown', 'trust'], dtype=object)

In [50]:
df['management_group'].unique()

array(['user-group', 'commercial', 'other', 'parastatal', 'unknown'],
      dtype=object)

In [51]:
df['public_meeting'].unique()

array([True, False], dtype=object)

In [52]:
df.head()

Unnamed: 0,id,amount_tsh,funder,gps_height,installer,longitude,latitude,wpt_name,num_private,basin,...,management_group,payment,payment_type,quality_group,quantity_group,source_type,source_class,waterpoint_type,status_group,Year_Recorded
0,69572,6000.0,Roman,1390,Roman,34.938093,-9.856322,none,0,Lake Nyasa,...,user-group,pay annually,annually,good,enough,spring,groundwater,communal standpipe,functional,2011
2,34310,25.0,Lottery Club,686,World vision,37.460664,-3.821329,Kwa Mahundi,0,Pangani,...,user-group,pay per bucket,per bucket,good,enough,dam,surface,communal standpipe multiple,functional,2013
3,67743,0.0,Unicef,263,UNICEF,38.486161,-11.155298,Zahanati Ya Nanyumbu,0,Ruvuma / Southern Coast,...,user-group,never pay,never pay,good,dry,borehole,groundwater,communal standpipe multiple,non functional,2013
5,9944,20.0,Mkinga Distric Coun,0,DWE,39.172796,-4.765587,Tajiri,0,Pangani,...,user-group,pay per bucket,per bucket,salty,enough,other,unknown,communal standpipe multiple,functional,2011
6,19816,0.0,Dwsp,0,DWSP,33.36241,-3.766365,Kwa Ngomho,0,Internal,...,user-group,never pay,never pay,good,enough,borehole,groundwater,hand pump,non functional,2012


In [53]:
df.columns

Index(['id', 'amount_tsh', 'funder', 'gps_height', 'installer', 'longitude',
       'latitude', 'wpt_name', 'num_private', 'basin', 'subvillage', 'region',
       'region_code', 'district_code', 'lga', 'ward', 'population',
       'public_meeting', 'recorded_by', 'scheme_management', 'permit',
       'construction_year', 'extraction_type', 'extraction_type_class',
       'management', 'management_group', 'payment', 'payment_type',
       'quality_group', 'quantity_group', 'source_type', 'source_class',
       'waterpoint_type', 'status_group', 'Year_Recorded'],
      dtype='object')

I will now drop the columns that are not useful in determinining if the pump is in need of fixing.

I am dropping them because they will not help the model determine the condition of the pump, they will also increase the amount of compute needed to train the model. In order to ensure the model is as accurate and simple as possible I am dropping these columns.

These are columns with information about the location of the pump and funder that had the pump installed.

I am dropping the following location columns since I already have the longitude and latitude columns which tell me about the location.

The funder is not important since they are hiring other third parties to install the pumps, knowing who funded the installation will not help us know the quality of the installation since we don't know exactly who installed the pump.


In [54]:
#gonna drop the columns not useful for training then i will onehotencode and ordinal encode

df.drop(axis=1,inplace=True, columns=['id','funder','gps_height','subvillage','region','region_code','district_code','lga','ward'
                                      ,'recorded_by','scheme_management','wpt_name','payment','extraction_type','installer','public_meeting'
                                      ,'management','basin'])

In [55]:
df.columns

Index(['amount_tsh', 'longitude', 'latitude', 'num_private', 'population',
       'permit', 'construction_year', 'extraction_type_class',
       'management_group', 'payment_type', 'quality_group', 'quantity_group',
       'source_type', 'source_class', 'waterpoint_type', 'status_group',
       'Year_Recorded'],
      dtype='object')

In [56]:
df.select_dtypes('object').head()

Unnamed: 0,permit,extraction_type_class,management_group,payment_type,quality_group,quantity_group,source_type,source_class,waterpoint_type,status_group
0,False,gravity,user-group,annually,good,enough,spring,groundwater,communal standpipe,functional
2,True,gravity,user-group,per bucket,good,enough,dam,surface,communal standpipe multiple,functional
3,True,submersible,user-group,never pay,good,dry,borehole,groundwater,communal standpipe multiple,non functional
5,True,submersible,user-group,per bucket,salty,enough,other,unknown,communal standpipe multiple,functional
6,True,handpump,user-group,never pay,good,enough,borehole,groundwater,hand pump,non functional


Now that I am finished dropping columns I want to ensure the data is balanced.

In order to know if the data is balanced I will be looking at the number of instances of each category in the target variable.

The target variable is status_group.

I will do a value counts on this target variable.


In [57]:
df.status_group.value_counts()

functional                 26517
non functional             18273
functional needs repair     3498
Name: status_group, dtype: int64

As one can see the data is very unbalanced.

The functional category has 26,517 entries.

While the non-functional category has 18,273 columns.

Lastly, the functional-needs-repair category has 3498 entries.

I will use random undersampling on the functional and non-functional entries to make sure they have 15,000 entries.

I am undersampling so that the majority classes are closer to the minority class. I will use SMOTE on X_train so that the minority class can match the two majority classes during the training. That will be later after the data is split.

In [58]:
#creating seperate dataframes that only have 1 one each category
df_functional = df[df['status_group'] == 'functional']
df_non_functional = df[df['status_group'] == 'non functional']
df_functional_needs_repair = df[df['status_group'] == 'functional needs repair']

#picking 15,000 random samples from the two majority classes
functional_samples = df_functional.sample(n=15000,random_state=42)
non_functional_samples = df_non_functional.sample(n=15000,random_state=42)



In [59]:
#combining the undersampled functional and non-functional categories
df_undersampled = pd.concat([functional_samples,non_functional_samples])

#combining the df_undersampled with the functional-needs-repair category
#we are not undersampling functional-needs-repair category because it is still the minority sample
final_df_undersampled = pd.concat([df_undersampled,df_functional_needs_repair])
final_df_undersampled.head()

Unnamed: 0,amount_tsh,longitude,latitude,num_private,population,permit,construction_year,extraction_type_class,management_group,payment_type,quality_group,quantity_group,source_type,source_class,waterpoint_type,status_group,Year_Recorded
59233,0.0,33.61373,-4.259876,0,0,False,0,other,user-group,never pay,unknown,unknown,rainwater harvesting,surface,communal standpipe,functional,2012
57858,0.0,36.928466,-3.342058,0,130,True,2003,gravity,user-group,unknown,good,insufficient,river/lake,surface,communal standpipe,functional,2013
57685,5000.0,38.865788,-6.829784,102,20,True,2010,motorpump,commercial,monthly,good,insufficient,river/lake,surface,communal standpipe,functional,2011
2932,0.0,0.0,-2e-08,0,0,True,0,handpump,user-group,per bucket,good,insufficient,shallow well,groundwater,hand pump,functional,2011
40406,2000.0,34.777955,-4.357275,0,140,True,1997,handpump,user-group,on failure,good,enough,shallow well,groundwater,hand pump,functional,2013


In [60]:
#checking we have 15,000 counts of the majority classes
final_df_undersampled.status_group.value_counts()

functional                 15000
non functional             15000
functional needs repair     3498
Name: status_group, dtype: int64

I am going to be using OneHotEncoding to encode categorical columns that have no order to them.

OneHotEncoding will turn the string data into a numerical format the model will be able to use for training.

These columns are:

basin,permit,extraction_type_class,management_group,payment_type,source_type,source_class,waterpoint_type

In [61]:
# list = []
# for id,name in enumerate(X_train.columns.tolist()):
#     if name not in num_cols:
#         list.append((id,name))
# # print(list)
# print(list)

# categorical_id = []
# for i in list:
#     categorical_id.append(i[0])

# print(categorical_id)

I will now use SMOTE to create synthetic data of the minority category.

The process for SMOTE will go as such:

1. Split the data using train_test_split ensure stratify=y argument to maintain the original split of the data.

2. Use Counter(y_train) to find out the split of the data.

3. Create two lists one for the categorical features and one for the numerical features.

4. Create a copy of X_train and X_test

5. Do ordinal encoding on the copies.

6. Call an instance of SMOTENC.

7. Fit_resample on X_train_ord,y_train

8. Perform OneHotEncoding on the resampled dataframe. (remember the output will be a numpy array)

9. Extract the numerical features from the resampled dataframe.

10. Recreate the X_train and X_test dataframes

11. Perform Label Encoding on y_train and y_test

I can see that category 1 is under represented in the y_train data.

I will use SMOTENC to generate synthetic entries of category 1 to make sure that it has 12,000 entries so the dataset is  balanced.

In [62]:
# 1. FIRST: Split Data into Train/Test Sets
# =====================================================================
X = final_df_undersampled.drop('status_group', axis=1)
y = final_df_undersampled['status_group']

# Stratify to preserve imbalance ratio in splits
X_train, X_test, y_train, y_test = train_test_split(
    X, y, 
    test_size=0.2, 
    stratify=y,
    random_state=42
)

print("\nTraining set class distribution:")
print(Counter(y_train))  # e.g., A:560, B:160, C:80


Training set class distribution:
Counter({'non functional': 12000, 'functional': 12000, 'functional needs repair': 2798})


In [63]:
final_df_undersampled.head()

Unnamed: 0,amount_tsh,longitude,latitude,num_private,population,permit,construction_year,extraction_type_class,management_group,payment_type,quality_group,quantity_group,source_type,source_class,waterpoint_type,status_group,Year_Recorded
59233,0.0,33.61373,-4.259876,0,0,False,0,other,user-group,never pay,unknown,unknown,rainwater harvesting,surface,communal standpipe,functional,2012
57858,0.0,36.928466,-3.342058,0,130,True,2003,gravity,user-group,unknown,good,insufficient,river/lake,surface,communal standpipe,functional,2013
57685,5000.0,38.865788,-6.829784,102,20,True,2010,motorpump,commercial,monthly,good,insufficient,river/lake,surface,communal standpipe,functional,2011
2932,0.0,0.0,-2e-08,0,0,True,0,handpump,user-group,per bucket,good,insufficient,shallow well,groundwater,hand pump,functional,2011
40406,2000.0,34.777955,-4.357275,0,140,True,1997,handpump,user-group,on failure,good,enough,shallow well,groundwater,hand pump,functional,2013


We will need to make sure the 'functional needs repair' category reaches 12,000 entries using smote

In [64]:
final_df_undersampled.columns

Index(['amount_tsh', 'longitude', 'latitude', 'num_private', 'population',
       'permit', 'construction_year', 'extraction_type_class',
       'management_group', 'payment_type', 'quality_group', 'quantity_group',
       'source_type', 'source_class', 'waterpoint_type', 'status_group',
       'Year_Recorded'],
      dtype='object')

In [65]:
# 2. Preprocess Categorical Features with ORDINAL ENCODING (For SMOTENC compatibility)
# Identify categorical columns
cat_cols = ['permit','extraction_type_class','management_group', 'payment_type', 'quality_group', 'quantity_group',
            'source_type', 'source_class', 'waterpoint_type']

#Identify numerical columns
num_cols = ['amount_tsh', 'longitude', 'latitude', 'num_private', 'population','construction_year','Year_Recorded']

# Initialize and fit ordinal encoder on TRAINING DATA ONLY
ordinal_encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
X_train_ord = X_train.copy()
X_train_ord[cat_cols] = ordinal_encoder.fit_transform(X_train[cat_cols])

# Apply to test data using same encoder
X_test_ord = X_test.copy()
X_test_ord[cat_cols] = ordinal_encoder.transform(X_test[cat_cols])

In [66]:
# 3. Apply SMOTENC to Training Data
# Get categorical feature indices (positions after encoding)
cat_idx = [X_train_ord.columns.get_loc(col) for col in cat_cols]

# Define resampling strategy
sampling_strategy = {'functional needs repair': 12000}  # Upsample to match majority

sm = SMOTENC(
    categorical_features=cat_idx,
    sampling_strategy=sampling_strategy,
    k_neighbors=5,
    random_state=42
)

# Apply SMOTENC to ordinal-encoded training data
X_train_res, y_train_res = sm.fit_resample(X_train_ord, y_train)

print("\nAfter SMOTENC resampling:")
print(Counter(y_train_res))  # A:12000, B:12000, C:12000




After SMOTENC resampling:
Counter({'non functional': 12000, 'functional needs repair': 12000, 'functional': 12000})


In [67]:
# 4. Apply ONE-HOT ENCODING (After SMOTENC)
# =====================================================================
# Initialize and fit one-hot encoder on RESAMPLED training data
onehot_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
X_train_ohe = onehot_encoder.fit_transform(X_train_res[cat_cols])

# Apply to test data using same encoder
X_test_ohe = onehot_encoder.transform(X_test_ord[cat_cols])

In [68]:
# 5. Combine Features for Final Dataset
# =====================================================================
# Extract numerical features (unchanged)
X_train_num = X_train_res[num_cols].reset_index(drop=True)
X_test_num = X_test_ord[num_cols].reset_index(drop=True)

# Create feature names for one-hot encoded columns
ohe_columns = onehot_encoder.get_feature_names_out(cat_cols)

# Combine numerical and one-hot categorical features
X_train_final = pd.concat([X_train_num, pd.DataFrame(X_train_ohe, columns=ohe_columns)], axis=1)

X_test_final = pd.concat([X_test_num, pd.DataFrame(X_test_ohe, columns=ohe_columns)], axis=1)

In [69]:
# Encoding target variable 
label_encoder = LabelEncoder()
y_train_final = label_encoder.fit_transform(y_train_res)
y_test_final = label_encoder.transform(y_test)

In [70]:
y_train_final

array([2, 1, 2, ..., 1, 1, 1])

In [71]:
y_train_res.head()

0             non functional
1    functional needs repair
2             non functional
3                 functional
4                 functional
Name: status_group, dtype: object

Lets remember the encoding of the target

'non_functional' : 2

'functional needs repair' : 1

'functional' : 0

In [72]:
print("\nFinal training set shape:", X_train_final.shape)
print("Final test set shape:", X_test_final.shape)
print("Sample of final training data:")
X_train_final.head()


Final training set shape: (36000, 56)
Final test set shape: (6700, 56)
Sample of final training data:


Unnamed: 0,amount_tsh,longitude,latitude,num_private,population,construction_year,Year_Recorded,permit_0.0,permit_1.0,extraction_type_class_0.0,...,source_class_0.0,source_class_1.0,source_class_2.0,waterpoint_type_0.0,waterpoint_type_1.0,waterpoint_type_2.0,waterpoint_type_3.0,waterpoint_type_4.0,waterpoint_type_5.0,waterpoint_type_6.0
0,0.0,32.277249,-4.390906,0,0,0,2012,0.0,1.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,0.0,33.811988,-3.831992,0,0,0,2012,1.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,0.0,33.976453,-1.477176,0,2500,1983,2012,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,0.0,30.489613,-3.650591,0,130,2009,2013,1.0,0.0,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0.0,37.754119,-6.052232,0,1,2011,2011,0.0,1.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [73]:
model = DecisionTreeClassifier()
model.fit(X_train_final,y_train_final)
y_pred = model.predict(X_test_final)
print(classification_report(y_test_final,y_pred))

              precision    recall  f1-score   support

           0       0.75      0.71      0.73      3000
           1       0.39      0.50      0.44       700
           2       0.78      0.78      0.78      3000

    accuracy                           0.72      6700
   macro avg       0.64      0.66      0.65      6700
weighted avg       0.73      0.72      0.72      6700



In [74]:
model = LogisticRegression()
model.fit(X_train_final,y_train_final)
y_pred = model.predict(X_test_final)
print(classification_report(y_test_final,y_pred))

              precision    recall  f1-score   support

           0       0.66      0.48      0.56      3000
           1       0.23      0.60      0.34       700
           2       0.64      0.58      0.61      3000

    accuracy                           0.54      6700
   macro avg       0.51      0.55      0.50      6700
weighted avg       0.61      0.54      0.56      6700



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [78]:
model2 = DecisionTreeClassifier()
model2.fit(X_train_final,y_train_final)
y_pred2 = model2.predict(X_test_final)
print(classification_report(y_test_final,y_pred2))

              precision    recall  f1-score   support

           0       0.75      0.70      0.73      3000
           1       0.39      0.49      0.44       700
           2       0.78      0.78      0.78      3000

    accuracy                           0.72      6700
   macro avg       0.64      0.66      0.65      6700
weighted avg       0.73      0.72      0.72      6700



In [75]:
def gridsearch_results(X, y, models_param_grids, cv=5, scoring=None):
    """
    Perform GridSearchCV for multiple models and return results in a DataFrame.
    
    Parameters:
    X (array-like): Feature matrix
    y (array-like): Target vector
    models_param_grids (dict): Dictionary of model names and (estimator, param_grid) tuples
    cv (int): Cross-validation folds
    scoring (dict): Scoring metrics (default: {'accuracy', 'precision', 'recall'})
    
    Returns:
    pd.DataFrame: Results with model names, parameters, and metrics
    """
    from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score
    
    # Default scoring metrics (binary classification)
    if scoring is None:
        scoring = {
            'accuracy': make_scorer(accuracy_score),
            'precision': make_scorer(precision_score, average='weighted', zero_division=0),
            'recall': make_scorer(recall_score, average='weighted', zero_division=0)
        }
    
    results = []
    
    for model_name, (estimator, param_grid) in models_param_grids.items():
        # Set up GridSearchCV with multiple metrics
        gs = GridSearchCV(
            estimator=estimator,
            param_grid=param_grid,
            cv=cv,
            scoring=scoring,
            refit=False,  # We'll collect all results, not just the best
            return_train_score=False,
            error_score='raise'
        )
        gs.fit(X, y)
        
        # Collect results for each parameter combination
        for i in range(len(gs.cv_results_['params'])):
            result = {
                'model': model_name,
                'params': gs.cv_results_['params'][i],
                'accuracy': gs.cv_results_['mean_test_accuracy'][i],
                'precision': gs.cv_results_['mean_test_precision'][i],
                'recall': gs.cv_results_['mean_test_recall'][i]
            }
            results.append(result)
    
    return pd.DataFrame(results)

In [76]:
from sklearn.linear_model import LogisticRegression
models_param_grids = {
    'DecisionTree': (
        DecisionTreeClassifier(random_state=42),
        {'criterion': ['gini', 'entropy', 'log_loss'],
    'splitter': ['best', 'random'],
    'max_depth': [None, 5, 10, 20, 30, 50],
    'min_samples_split': [2, 5, 10, 20],
    'min_samples_leaf': [1, 2, 4, 8],
    'max_features': ['sqrt', 'log2', None, 0.8, 0.9],
    'max_leaf_nodes': [None, 10, 20, 50, 100],
    'min_impurity_decrease': [0.0, 0.01, 0.05, 0.1],
    'class_weight': [None, 'balanced']} )
    ,'LogisticRegressionL2':(
        LogisticRegression(random_state=42),
        {'penalty':['l2'],'C':[2,5,10,15,30],'solver':['newton-cg','lbfgs']}
    )
    ,'LogisticRegressionL1':(
        LogisticRegression(random_state=42),
        {'penalty':['l1'],'C':[2,5,10,15,30],'solver':['liblinear']
         ,'intercept_scaling':[2,5,10,15,30]} #intercept scaling only works for liblinear
    )
}



In [77]:
# # Get results
results_df = gridsearch_results(X_train_final, y_train_final, models_param_grids, cv=5).sort_values('accuracy',ascending=False)
results_df.head(50)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Unnamed: 0,model,params,accuracy,precision,recall
40963,DecisionTree,"{'class_weight': None, 'criterion': 'log_loss'...",0.757528,0.76216,0.757528
98563,DecisionTree,"{'class_weight': 'balanced', 'criterion': 'log...",0.757528,0.76216,0.757528
37763,DecisionTree,"{'class_weight': None, 'criterion': 'entropy',...",0.757528,0.76216,0.757528
21763,DecisionTree,"{'class_weight': None, 'criterion': 'entropy',...",0.757528,0.76216,0.757528
114563,DecisionTree,"{'class_weight': 'balanced', 'criterion': 'log...",0.757528,0.76216,0.757528
79363,DecisionTree,"{'class_weight': 'balanced', 'criterion': 'ent...",0.757528,0.76216,0.757528
56963,DecisionTree,"{'class_weight': None, 'criterion': 'log_loss'...",0.757528,0.76216,0.757528
95363,DecisionTree,"{'class_weight': 'balanced', 'criterion': 'ent...",0.757528,0.76216,0.757528
71685,DecisionTree,"{'class_weight': 'balanced', 'criterion': 'gin...",0.756056,0.75974,0.756056
14085,DecisionTree,"{'class_weight': None, 'criterion': 'gini', 'm...",0.756056,0.75974,0.756056


In [83]:
results_df.loc[40963,'params']

{'class_weight': None,
 'criterion': 'log_loss',
 'max_depth': None,
 'max_features': 0.9,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 5,
 'splitter': 'random'}

In [85]:
top_model = DecisionTreeClassifier(class_weight=None,criterion='log_loss',max_depth=None,max_features=0.9,
                                   max_leaf_nodes=None,min_impurity_decrease=0.0,min_samples_leaf=1,min_samples_split=5)
top_model.fit(X_train_final,y_train_final)
y_pred_top = top_model.predict(X_test_final)
print(classification_report(y_test_final,y_pred_top))

              precision    recall  f1-score   support

           0       0.74      0.73      0.73      3000
           1       0.39      0.52      0.45       700
           2       0.80      0.76      0.78      3000

    accuracy                           0.72      6700
   macro avg       0.64      0.67      0.65      6700
weighted avg       0.73      0.72      0.72      6700



In [None]:
results_df.head(10)

Unnamed: 0,model,params,accuracy,precision,recall
40963,DecisionTree,"{'class_weight': None, 'criterion': 'log_loss'...",0.757528,0.76216,0.757528
98563,DecisionTree,"{'class_weight': 'balanced', 'criterion': 'log...",0.757528,0.76216,0.757528
37763,DecisionTree,"{'class_weight': None, 'criterion': 'entropy',...",0.757528,0.76216,0.757528
21763,DecisionTree,"{'class_weight': None, 'criterion': 'entropy',...",0.757528,0.76216,0.757528
114563,DecisionTree,"{'class_weight': 'balanced', 'criterion': 'log...",0.757528,0.76216,0.757528
79363,DecisionTree,"{'class_weight': 'balanced', 'criterion': 'ent...",0.757528,0.76216,0.757528
56963,DecisionTree,"{'class_weight': None, 'criterion': 'log_loss'...",0.757528,0.76216,0.757528
95363,DecisionTree,"{'class_weight': 'balanced', 'criterion': 'ent...",0.757528,0.76216,0.757528
71685,DecisionTree,"{'class_weight': 'balanced', 'criterion': 'gin...",0.756056,0.75974,0.756056
14085,DecisionTree,"{'class_weight': None, 'criterion': 'gini', 'm...",0.756056,0.75974,0.756056


In [89]:
results_df.sort_values('recall',ascending=False).head(10)

Unnamed: 0,model,params,accuracy,precision,recall
40963,DecisionTree,"{'class_weight': None, 'criterion': 'log_loss'...",0.757528,0.76216,0.757528
79363,DecisionTree,"{'class_weight': 'balanced', 'criterion': 'ent...",0.757528,0.76216,0.757528
95363,DecisionTree,"{'class_weight': 'balanced', 'criterion': 'ent...",0.757528,0.76216,0.757528
56963,DecisionTree,"{'class_weight': None, 'criterion': 'log_loss'...",0.757528,0.76216,0.757528
98563,DecisionTree,"{'class_weight': 'balanced', 'criterion': 'log...",0.757528,0.76216,0.757528
114563,DecisionTree,"{'class_weight': 'balanced', 'criterion': 'log...",0.757528,0.76216,0.757528
37763,DecisionTree,"{'class_weight': None, 'criterion': 'entropy',...",0.757528,0.76216,0.757528
21763,DecisionTree,"{'class_weight': None, 'criterion': 'entropy',...",0.757528,0.76216,0.757528
71685,DecisionTree,"{'class_weight': 'balanced', 'criterion': 'gin...",0.756056,0.75974,0.756056
14085,DecisionTree,"{'class_weight': None, 'criterion': 'gini', 'm...",0.756056,0.75974,0.756056


In [98]:
results_df[results_df['model'] == 'LogisticRegressionL2'].sort_values('accuracy',ascending=False)

Unnamed: 0,model,params,accuracy,precision,recall
115202,LogisticRegressionL2,"{'C': 5, 'penalty': 'l2', 'solver': 'newton-cg'}",0.601028,0.615976,0.601028
115208,LogisticRegressionL2,"{'C': 30, 'penalty': 'l2', 'solver': 'newton-cg'}",0.600972,0.615938,0.600972
115206,LogisticRegressionL2,"{'C': 15, 'penalty': 'l2', 'solver': 'newton-cg'}",0.600833,0.615741,0.600833
115200,LogisticRegressionL2,"{'C': 2, 'penalty': 'l2', 'solver': 'newton-cg'}",0.600833,0.615788,0.600833
115204,LogisticRegressionL2,"{'C': 10, 'penalty': 'l2', 'solver': 'newton-cg'}",0.600806,0.615763,0.600806
115203,LogisticRegressionL2,"{'C': 5, 'penalty': 'l2', 'solver': 'lbfgs'}",0.516944,0.517175,0.516944
115201,LogisticRegressionL2,"{'C': 2, 'penalty': 'l2', 'solver': 'lbfgs'}",0.515944,0.516447,0.515944
115209,LogisticRegressionL2,"{'C': 30, 'penalty': 'l2', 'solver': 'lbfgs'}",0.510472,0.510965,0.510472
115207,LogisticRegressionL2,"{'C': 15, 'penalty': 'l2', 'solver': 'lbfgs'}",0.510167,0.510997,0.510167
115205,LogisticRegressionL2,"{'C': 10, 'penalty': 'l2', 'solver': 'lbfgs'}",0.508333,0.509673,0.508333
