In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)


In [3]:
df_numeric = pd.read_csv('numerical_only_cleaned.csv')
df_numeric.head()

Unnamed: 0,id,amount_tsh,gps_height,longitude,latitude,region_code,district_code,population,construction_year,year_recorded,age,target_encoded,population_logged,amount_tsh_encoded
0,69572,6000.0,1390,34.938093,-9.856322,11,5,109,1999,2011,12,1,4.691348,1
1,8776,0.0,1399,34.698766,-2.147466,20,2,280,2010,2013,3,1,5.63479,0
2,34310,25.0,686,37.460664,-3.821329,21,4,250,2009,2013,4,1,5.521461,1
3,67743,0.0,263,38.486161,-11.155298,90,63,58,1986,2013,27,0,4.060443,0
4,19728,0.0,0,31.130847,-1.825359,18,1,1,1986,2011,25,1,0.0,0


In [4]:
df_numeric.shape

(59400, 14)

In [5]:
target_encoded = df_numeric.pop('target_encoded')

In [6]:
labels = pd.read_csv('train_labels.csv')
labels.head()

Unnamed: 0,id,status_group
0,69572,functional
1,8776,functional
2,34310,functional
3,67743,non functional
4,19728,functional


In [7]:
labels.shape

(59400, 2)

In [8]:
df_categorical = pd.read_csv('categorical_only_cleaned.csv')
df_categorical.head()

Unnamed: 0,id,target,funder_cleaned,installer_cleaned,scheme_management_cleaned,extraction_type_cleaned,management_cleaned,management_group_cleaned,payment_cleaned,water_quality_cleaned,quality_group_cleaned,quantity_cleaned,source_cleaned,source_class_cleaned,waterpoint_type_cleaned,target_cleaned
0,69572,functional,Other,Other,VWC,gravity,vwc,user-group,other,soft,good,enough,spring,groundwater,standpipe,functional
1,8776,functional,Other,Other,Other,gravity,other,user-group,never pay,soft,good,other,other,surface,standpipe,functional
2,34310,functional,Other,Other,VWC,gravity,vwc,user-group,other,soft,good,enough,other,surface,standpipe,functional
3,67743,non functional,Other,Other,VWC,other,vwc,user-group,never pay,soft,good,other,machine dbh,groundwater,standpipe,non functional
4,19728,functional,Other,Other,Other,gravity,other,other,never pay,soft,good,other,other,surface,standpipe,functional


In [9]:
df_combo = df_categorical.merge(df_numeric, on='id', how='right')
df_combo.head()

Unnamed: 0,id,target,funder_cleaned,installer_cleaned,scheme_management_cleaned,extraction_type_cleaned,management_cleaned,management_group_cleaned,payment_cleaned,water_quality_cleaned,quality_group_cleaned,quantity_cleaned,source_cleaned,source_class_cleaned,waterpoint_type_cleaned,target_cleaned,amount_tsh,gps_height,longitude,latitude,region_code,district_code,population,construction_year,year_recorded,age,population_logged,amount_tsh_encoded
0,69572,functional,Other,Other,VWC,gravity,vwc,user-group,other,soft,good,enough,spring,groundwater,standpipe,functional,6000.0,1390,34.938093,-9.856322,11,5,109,1999,2011,12,4.691348,1
1,8776,functional,Other,Other,Other,gravity,other,user-group,never pay,soft,good,other,other,surface,standpipe,functional,0.0,1399,34.698766,-2.147466,20,2,280,2010,2013,3,5.63479,0
2,34310,functional,Other,Other,VWC,gravity,vwc,user-group,other,soft,good,enough,other,surface,standpipe,functional,25.0,686,37.460664,-3.821329,21,4,250,2009,2013,4,5.521461,1
3,67743,non functional,Other,Other,VWC,other,vwc,user-group,never pay,soft,good,other,machine dbh,groundwater,standpipe,non functional,0.0,263,38.486161,-11.155298,90,63,58,1986,2013,27,4.060443,0
4,19728,functional,Other,Other,Other,gravity,other,other,never pay,soft,good,other,other,surface,standpipe,functional,0.0,0,31.130847,-1.825359,18,1,1,1986,2011,25,0.0,0


In [10]:
df_combo.shape

(59400, 28)

In [11]:
df_categorical.columns

Index(['id', 'target', 'funder_cleaned', 'installer_cleaned', 'scheme_management_cleaned', 'extraction_type_cleaned', 'management_cleaned', 'management_group_cleaned', 'payment_cleaned', 'water_quality_cleaned', 'quality_group_cleaned', 'quantity_cleaned', 'source_cleaned', 'source_class_cleaned', 'waterpoint_type_cleaned', 'target_cleaned'], dtype='object')

In [12]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import category_encoders as ce

X = df_combo.drop(['id', 'construction_year', 'year_recorded', 'population', 'amount_tsh', 'target', 'target_cleaned'],  axis=1)
y = df_combo['target_cleaned']

cat_list_to_encode = ['funder_cleaned', 'installer_cleaned', 'scheme_management_cleaned', 'extraction_type_cleaned',
                      'management_cleaned', 'management_group_cleaned', 'payment_cleaned', 'water_quality_cleaned',
                      'quality_group_cleaned', 'quantity_cleaned', 'source_cleaned', 'source_class_cleaned', 'waterpoint_type_cleaned']

encoder = ce.OneHotEncoder(cols=cat_list_to_encode)
encoder.fit(X)
X_encoded = encoder.transform(X)

X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, random_state=42)

lr = LogisticRegression(C=100).fit(X_train, y_train)

y_pred = lr.predict(X_test)

print("Training score: {:.2f}".format(lr.score(X_train, y_train)))
print()
print("Testing score: {:.2f}".format(lr.score(X_test, y_test)))



Training score: 0.69

Testing score: 0.69


In [13]:
y_pred

array(['non functional', 'non functional', 'functional', ...,
       'functional', 'functional', 'non functional'], dtype=object)

In [14]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import category_encoders as ce

X = df_combo.drop(['id', 'construction_year', 'year_recorded', 'population', 'amount_tsh', 'target', 'target_cleaned'],  axis=1)
y = df_combo['target_cleaned']

cat_list_to_encode = ['funder_cleaned', 'installer_cleaned', 'scheme_management_cleaned', 'extraction_type_cleaned',
                      'management_cleaned', 'management_group_cleaned', 'payment_cleaned', 'water_quality_cleaned',
                      'quality_group_cleaned', 'quantity_cleaned', 'source_cleaned', 'source_class_cleaned', 'waterpoint_type_cleaned']

encoder = ce.OneHotEncoder(cols=cat_list_to_encode)
encoder.fit(X)
X_encoded = encoder.transform(X)

X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, random_state=42)

rf = RandomForestClassifier(n_estimators=100, min_samples_leaf=3).fit(X_train, y_train)

y_pred = rf.predict(X_test)

print("Training score: {:.2f}".format(rf.score(X_train, y_train)))
print()
print("Testing score: {:.2f}".format(rf.score(X_test, y_test)))

Training score: 0.89

Testing score: 0.80


In [15]:
df_combo.head()

Unnamed: 0,id,target,funder_cleaned,installer_cleaned,scheme_management_cleaned,extraction_type_cleaned,management_cleaned,management_group_cleaned,payment_cleaned,water_quality_cleaned,quality_group_cleaned,quantity_cleaned,source_cleaned,source_class_cleaned,waterpoint_type_cleaned,target_cleaned,amount_tsh,gps_height,longitude,latitude,region_code,district_code,population,construction_year,year_recorded,age,population_logged,amount_tsh_encoded
0,69572,functional,Other,Other,VWC,gravity,vwc,user-group,other,soft,good,enough,spring,groundwater,standpipe,functional,6000.0,1390,34.938093,-9.856322,11,5,109,1999,2011,12,4.691348,1
1,8776,functional,Other,Other,Other,gravity,other,user-group,never pay,soft,good,other,other,surface,standpipe,functional,0.0,1399,34.698766,-2.147466,20,2,280,2010,2013,3,5.63479,0
2,34310,functional,Other,Other,VWC,gravity,vwc,user-group,other,soft,good,enough,other,surface,standpipe,functional,25.0,686,37.460664,-3.821329,21,4,250,2009,2013,4,5.521461,1
3,67743,non functional,Other,Other,VWC,other,vwc,user-group,never pay,soft,good,other,machine dbh,groundwater,standpipe,non functional,0.0,263,38.486161,-11.155298,90,63,58,1986,2013,27,4.060443,0
4,19728,functional,Other,Other,Other,gravity,other,other,never pay,soft,good,other,other,surface,standpipe,functional,0.0,0,31.130847,-1.825359,18,1,1,1986,2011,25,0.0,0


In [16]:
target = df_combo.pop('target')
target_cleaned = df_combo.pop('target_cleaned')

In [17]:
df_combo.head()

Unnamed: 0,id,funder_cleaned,installer_cleaned,scheme_management_cleaned,extraction_type_cleaned,management_cleaned,management_group_cleaned,payment_cleaned,water_quality_cleaned,quality_group_cleaned,quantity_cleaned,source_cleaned,source_class_cleaned,waterpoint_type_cleaned,amount_tsh,gps_height,longitude,latitude,region_code,district_code,population,construction_year,year_recorded,age,population_logged,amount_tsh_encoded
0,69572,Other,Other,VWC,gravity,vwc,user-group,other,soft,good,enough,spring,groundwater,standpipe,6000.0,1390,34.938093,-9.856322,11,5,109,1999,2011,12,4.691348,1
1,8776,Other,Other,Other,gravity,other,user-group,never pay,soft,good,other,other,surface,standpipe,0.0,1399,34.698766,-2.147466,20,2,280,2010,2013,3,5.63479,0
2,34310,Other,Other,VWC,gravity,vwc,user-group,other,soft,good,enough,other,surface,standpipe,25.0,686,37.460664,-3.821329,21,4,250,2009,2013,4,5.521461,1
3,67743,Other,Other,VWC,other,vwc,user-group,never pay,soft,good,other,machine dbh,groundwater,standpipe,0.0,263,38.486161,-11.155298,90,63,58,1986,2013,27,4.060443,0
4,19728,Other,Other,Other,gravity,other,other,never pay,soft,good,other,other,surface,standpipe,0.0,0,31.130847,-1.825359,18,1,1,1986,2011,25,0.0,0


In [18]:
df_combo_dummies = pd.get_dummies(df_combo)
df_combo_dummies.head()

Unnamed: 0,id,amount_tsh,gps_height,longitude,latitude,region_code,district_code,population,construction_year,year_recorded,age,population_logged,amount_tsh_encoded,funder_cleaned_Other,funder_cleaned_Tanzania,installer_cleaned_DWE,installer_cleaned_Other,scheme_management_cleaned_Other,scheme_management_cleaned_VWC,extraction_type_cleaned_gravity,extraction_type_cleaned_other,management_cleaned_other,management_cleaned_vwc,management_group_cleaned_other,management_group_cleaned_user-group,payment_cleaned_never pay,payment_cleaned_other,water_quality_cleaned_other,water_quality_cleaned_soft,quality_group_cleaned_good,quality_group_cleaned_other,quantity_cleaned_enough,quantity_cleaned_other,source_cleaned_machine dbh,source_cleaned_other,source_cleaned_shallow well,source_cleaned_spring,source_class_cleaned_groundwater,source_class_cleaned_surface,waterpoint_type_cleaned_handpump,waterpoint_type_cleaned_other,waterpoint_type_cleaned_standpipe
0,69572,6000.0,1390,34.938093,-9.856322,11,5,109,1999,2011,12,4.691348,1,1,0,0,1,0,1,1,0,0,1,0,1,0,1,0,1,1,0,1,0,0,0,0,1,1,0,0,0,1
1,8776,0.0,1399,34.698766,-2.147466,20,2,280,2010,2013,3,5.63479,0,1,0,0,1,1,0,1,0,1,0,0,1,1,0,0,1,1,0,0,1,0,1,0,0,0,1,0,0,1
2,34310,25.0,686,37.460664,-3.821329,21,4,250,2009,2013,4,5.521461,1,1,0,0,1,0,1,1,0,0,1,0,1,0,1,0,1,1,0,1,0,0,1,0,0,0,1,0,0,1
3,67743,0.0,263,38.486161,-11.155298,90,63,58,1986,2013,27,4.060443,0,1,0,0,1,0,1,0,1,0,1,0,1,1,0,0,1,1,0,0,1,1,0,0,0,1,0,0,0,1
4,19728,0.0,0,31.130847,-1.825359,18,1,1,1986,2011,25,0.0,0,1,0,0,1,1,0,1,0,1,0,1,0,1,0,0,1,1,0,0,1,0,1,0,0,0,1,0,0,1


In [19]:
df_combo_dummies['target_cleaned'] = target_cleaned
df_combo_dummies.head()

Unnamed: 0,id,amount_tsh,gps_height,longitude,latitude,region_code,district_code,population,construction_year,year_recorded,age,population_logged,amount_tsh_encoded,funder_cleaned_Other,funder_cleaned_Tanzania,installer_cleaned_DWE,installer_cleaned_Other,scheme_management_cleaned_Other,scheme_management_cleaned_VWC,extraction_type_cleaned_gravity,extraction_type_cleaned_other,management_cleaned_other,management_cleaned_vwc,management_group_cleaned_other,management_group_cleaned_user-group,payment_cleaned_never pay,payment_cleaned_other,water_quality_cleaned_other,water_quality_cleaned_soft,quality_group_cleaned_good,quality_group_cleaned_other,quantity_cleaned_enough,quantity_cleaned_other,source_cleaned_machine dbh,source_cleaned_other,source_cleaned_shallow well,source_cleaned_spring,source_class_cleaned_groundwater,source_class_cleaned_surface,waterpoint_type_cleaned_handpump,waterpoint_type_cleaned_other,waterpoint_type_cleaned_standpipe,target_cleaned
0,69572,6000.0,1390,34.938093,-9.856322,11,5,109,1999,2011,12,4.691348,1,1,0,0,1,0,1,1,0,0,1,0,1,0,1,0,1,1,0,1,0,0,0,0,1,1,0,0,0,1,functional
1,8776,0.0,1399,34.698766,-2.147466,20,2,280,2010,2013,3,5.63479,0,1,0,0,1,1,0,1,0,1,0,0,1,1,0,0,1,1,0,0,1,0,1,0,0,0,1,0,0,1,functional
2,34310,25.0,686,37.460664,-3.821329,21,4,250,2009,2013,4,5.521461,1,1,0,0,1,0,1,1,0,0,1,0,1,0,1,0,1,1,0,1,0,0,1,0,0,0,1,0,0,1,functional
3,67743,0.0,263,38.486161,-11.155298,90,63,58,1986,2013,27,4.060443,0,1,0,0,1,0,1,0,1,0,1,0,1,1,0,0,1,1,0,0,1,1,0,0,0,1,0,0,0,1,non functional
4,19728,0.0,0,31.130847,-1.825359,18,1,1,1986,2011,25,0.0,0,1,0,0,1,1,0,1,0,1,0,1,0,1,0,0,1,1,0,0,1,0,1,0,0,0,1,0,0,1,functional


In [20]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

X = df_combo_dummies.drop(['id', 'construction_year', 'year_recorded', 'population', 'amount_tsh', 'target_cleaned'],  axis=1)
y = df_combo_dummies['target_cleaned']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

lr = LogisticRegression(C=100).fit(X_train, y_train)

y_pred = lr.predict(X_test)

print("Training score: {:.2f}".format(lr.score(X_train, y_train)))
print()
print("Testing score: {:.2f}".format(lr.score(X_test, y_test)))



Training score: 0.69

Testing score: 0.69


In [21]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

X = df_combo_dummies.drop(['id', 'construction_year', 'year_recorded', 'population', 'amount_tsh', 'target_cleaned'],  axis=1)
y = df_combo_dummies['target_cleaned']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

rf = RandomForestClassifier(n_estimators=100, min_samples_leaf=3).fit(X_train, y_train)

y_pred = rf.predict(X_test)

print("Training score: {:.2f}".format(rf.score(X_train, y_train)))
print()
print("Testing score: {:.2f}".format(rf.score(X_test, y_test)))

Training score: 0.90

Testing score: 0.80


In [22]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

X = df_combo_dummies.drop(['id', 'construction_year', 'year_recorded', 'population', 'amount_tsh', 'target_cleaned'],  axis=1)
y = df_combo_dummies['target_cleaned']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

lr = LogisticRegression(C=100).fit(X_train, y_train)
scores = cross_val_score(lr, X, y, cv=5)

y_pred = lr.predict(X_test)

print("CV scores: {}".format(scores))
print("CV scores mean: {}".format(scores.mean()))



CV scores: [0.69000926 0.68114478 0.67954545 0.69267677 0.68448523]
CV scores mean: 0.6855722975752112


In [23]:
df_combo_dummies.shape

(59400, 43)

In [24]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

X = df_combo_dummies.drop(['id', 'construction_year', 'year_recorded', 'population', 'amount_tsh', 'target_cleaned'],  axis=1)
y = df_combo_dummies['target_cleaned']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

rf = RandomForestClassifier(n_estimators=100, min_samples_leaf=3).fit(X_train, y_train)
scores = cross_val_score(rf, X, y, cv=5)

y_pred = rf.predict(X_test)

print("CV scores: {}".format(scores))
print("CV scores mean: {}".format(scores.mean()))

CV scores: [0.80498275 0.80025253 0.80563973 0.80042088 0.8012459 ]
CV scores mean: 0.8025083545984943


In [25]:
y_pred

array(['non functional', 'functional', 'functional', ...,
       'non functional', 'functional', 'non functional'], dtype=object)