In [8]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [10]:
df_train = pd.read_csv('train_features.csv', parse_dates=['date_recorded'])
df_train.head()

Unnamed: 0,id,amount_tsh,date_recorded,funder,gps_height,installer,longitude,latitude,wpt_name,num_private,...,payment_type,water_quality,quality_group,quantity,quantity_group,source,source_type,source_class,waterpoint_type,waterpoint_type_group
0,69572,6000.0,2011-03-14,Roman,1390,Roman,34.938093,-9.856322,none,0,...,annually,soft,good,enough,enough,spring,spring,groundwater,communal standpipe,communal standpipe
1,8776,0.0,2013-03-06,Grumeti,1399,GRUMETI,34.698766,-2.147466,Zahanati,0,...,never pay,soft,good,insufficient,insufficient,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe
2,34310,25.0,2013-02-25,Lottery Club,686,World vision,37.460664,-3.821329,Kwa Mahundi,0,...,per bucket,soft,good,enough,enough,dam,dam,surface,communal standpipe multiple,communal standpipe
3,67743,0.0,2013-01-28,Unicef,263,UNICEF,38.486161,-11.155298,Zahanati Ya Nanyumbu,0,...,never pay,soft,good,dry,dry,machine dbh,borehole,groundwater,communal standpipe multiple,communal standpipe
4,19728,0.0,2011-07-13,Action In A,0,Artisan,31.130847,-1.825359,Shuleni,0,...,never pay,soft,good,seasonal,seasonal,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe


In [11]:
df_train_labels = pd.read_csv('train_labels.csv')
df_train_labels.head()

Unnamed: 0,id,status_group
0,69572,functional
1,8776,functional
2,34310,functional
3,67743,non functional
4,19728,functional


In [12]:
# Combine the DataFrames
# Remember, the target variable is 'status_group'

df_train_combined = df.merge(df_train_labels, on='id', how='left')
df_train_combined.head()

Unnamed: 0,id,amount_tsh,date_recorded,funder,gps_height,installer,longitude,latitude,wpt_name,num_private,...,water_quality,quality_group,quantity,quantity_group,source,source_type,source_class,waterpoint_type,waterpoint_type_group,status_group
0,69572,6000.0,2011-03-14,Roman,1390,Roman,34.938093,-9.856322,none,0,...,soft,good,enough,enough,spring,spring,groundwater,communal standpipe,communal standpipe,functional
1,8776,0.0,2013-03-06,Grumeti,1399,GRUMETI,34.698766,-2.147466,Zahanati,0,...,soft,good,insufficient,insufficient,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe,functional
2,34310,25.0,2013-02-25,Lottery Club,686,World vision,37.460664,-3.821329,Kwa Mahundi,0,...,soft,good,enough,enough,dam,dam,surface,communal standpipe multiple,communal standpipe,functional
3,67743,0.0,2013-01-28,Unicef,263,UNICEF,38.486161,-11.155298,Zahanati Ya Nanyumbu,0,...,soft,good,dry,dry,machine dbh,borehole,groundwater,communal standpipe multiple,communal standpipe,non functional
4,19728,0.0,2011-07-13,Action In A,0,Artisan,31.130847,-1.825359,Shuleni,0,...,soft,good,seasonal,seasonal,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe,functional


In [13]:
df_train_combined.dtypes

id                                int64
amount_tsh                      float64
date_recorded            datetime64[ns]
funder                           object
gps_height                        int64
installer                        object
longitude                       float64
latitude                        float64
wpt_name                         object
num_private                       int64
basin                            object
subvillage                       object
region                           object
region_code                       int64
district_code                     int64
lga                              object
ward                             object
population                        int64
public_meeting                   object
recorded_by                      object
scheme_management                object
scheme_name                      object
permit                           object
construction_year                 int64
extraction_type                  object


In [14]:
df_train_combined_numeric = df_train_combined.select_dtypes(include=['number']).copy()
df_train_combined_numeric.head()

Unnamed: 0,id,amount_tsh,gps_height,longitude,latitude,num_private,region_code,district_code,population,construction_year
0,69572,6000.0,1390,34.938093,-9.856322,0,11,5,109,1999
1,8776,0.0,1399,34.698766,-2.147466,0,20,2,280,2010
2,34310,25.0,686,37.460664,-3.821329,0,21,4,250,2009
3,67743,0.0,263,38.486161,-11.155298,0,90,63,58,1986
4,19728,0.0,0,31.130847,-1.825359,0,18,1,0,0


In [15]:
df_train_combined_numeric['status_group'] = df_train_combined['status_group']
df_train_combined_numeric.head()

Unnamed: 0,id,amount_tsh,gps_height,longitude,latitude,num_private,region_code,district_code,population,construction_year,status_group
0,69572,6000.0,1390,34.938093,-9.856322,0,11,5,109,1999,functional
1,8776,0.0,1399,34.698766,-2.147466,0,20,2,280,2010,functional
2,34310,25.0,686,37.460664,-3.821329,0,21,4,250,2009,functional
3,67743,0.0,263,38.486161,-11.155298,0,90,63,58,1986,non functional
4,19728,0.0,0,31.130847,-1.825359,0,18,1,0,0,functional


In [27]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

X = df_train_combined_numeric.loc[:, 'id':'construction_year']
y = df_train_combined_numeric['status_group']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

rf = RandomForestClassifier(n_estimators=100, min_samples_leaf=5).fit(X_train, y_train)

print("Training score: {:.2f}".format(rf.score(X_train, y_train)))
print("Test score: {:.2f}".format(rf.score(X_test, y_test)))


Training score: 0.84
Test score: 0.72


In [29]:
df_test_num = pd.read_csv('test_numeric_only.csv')
df_test_num.head()

Unnamed: 0,id,amount_tsh,gps_height,longitude,latitude,num_private,region_code,district_code,population,construction_year
0,50785,0.0,1996,35.290799,-4.059696,0,21,3,321,2012
1,51630,0.0,1569,36.656709,-3.309214,0,2,2,300,2000
2,17168,0.0,1567,34.767863,-5.004344,0,13,2,500,2010
3,45559,0.0,267,38.058046,-9.418672,0,80,43,250,1987
4,49871,500.0,1260,35.006123,-10.950412,0,10,3,60,2000


In [30]:
test_features_preds = rf.predict(df_test_num)
test_features_preds[:5]

array(['functional', 'functional', 'functional', 'non functional',
       'functional'], dtype=object)

In [31]:
kaggle_baseline_submission_3 = pd.DataFrame({'id': df_test_num.id, 'status_group': test_features_preds})

kaggle_baseline_submission_3.head()

Unnamed: 0,id,status_group
0,50785,functional
1,51630,functional
2,17168,functional
3,45559,non functional
4,49871,functional


In [32]:
kaggle_baseline_submission_3.to_csv('kaggle_baseline_submision_3.csv', index=False)

In [33]:
# Tried adding dates to the numeric only predictions using RandomForestClassifier to no effect

date_recorded = pd.read_csv('date_recorded.csv')
date_recorded.head()

Unnamed: 0,id,date_recorded
0,69572,2011-03-14
1,8776,2013-03-06
2,34310,2013-02-25
3,67743,2013-01-28
4,19728,2011-07-13


In [34]:
df_train_combined_numeric.head()

Unnamed: 0,id,amount_tsh,gps_height,longitude,latitude,num_private,region_code,district_code,population,construction_year,status_group
0,69572,6000.0,1390,34.938093,-9.856322,0,11,5,109,1999,functional
1,8776,0.0,1399,34.698766,-2.147466,0,20,2,280,2010,functional
2,34310,25.0,686,37.460664,-3.821329,0,21,4,250,2009,functional
3,67743,0.0,263,38.486161,-11.155298,0,90,63,58,1986,non functional
4,19728,0.0,0,31.130847,-1.825359,0,18,1,0,0,functional


In [35]:
df_train_combined_numeric_2 = df_train_combined_numeric.merge(date_recorded, on='id', how='left')
df_train_combined_numeric_2.head()

Unnamed: 0,id,amount_tsh,gps_height,longitude,latitude,num_private,region_code,district_code,population,construction_year,status_group,date_recorded
0,69572,6000.0,1390,34.938093,-9.856322,0,11,5,109,1999,functional,2011-03-14
1,8776,0.0,1399,34.698766,-2.147466,0,20,2,280,2010,functional,2013-03-06
2,34310,25.0,686,37.460664,-3.821329,0,21,4,250,2009,functional,2013-02-25
3,67743,0.0,263,38.486161,-11.155298,0,90,63,58,1986,non functional,2013-01-28
4,19728,0.0,0,31.130847,-1.825359,0,18,1,0,0,functional,2011-07-13


In [36]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

X = df_train_combined_numeric.drop('status_group', axis=1)
y = df_train_combined_numeric['status_group']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

rf = RandomForestClassifier(n_estimators=100, min_samples_leaf=5).fit(X_train, y_train)

print("Training score: {:.2f}".format(rf.score(X_train, y_train)))
print("Test score: {:.2f}".format(rf.score(X_test, y_test)))

Training score: 0.84
Test score: 0.71


In [75]:
# We can actually run RandomForest on the entire dataset so:

df_train = pd.read_csv('train_features.csv', parse_dates=['date_recorded'])
df_train.head()

Unnamed: 0,id,amount_tsh,date_recorded,funder,gps_height,installer,longitude,latitude,wpt_name,num_private,...,payment_type,water_quality,quality_group,quantity,quantity_group,source,source_type,source_class,waterpoint_type,waterpoint_type_group
0,69572,6000.0,2011-03-14,Roman,1390,Roman,34.938093,-9.856322,none,0,...,annually,soft,good,enough,enough,spring,spring,groundwater,communal standpipe,communal standpipe
1,8776,0.0,2013-03-06,Grumeti,1399,GRUMETI,34.698766,-2.147466,Zahanati,0,...,never pay,soft,good,insufficient,insufficient,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe
2,34310,25.0,2013-02-25,Lottery Club,686,World vision,37.460664,-3.821329,Kwa Mahundi,0,...,per bucket,soft,good,enough,enough,dam,dam,surface,communal standpipe multiple,communal standpipe
3,67743,0.0,2013-01-28,Unicef,263,UNICEF,38.486161,-11.155298,Zahanati Ya Nanyumbu,0,...,never pay,soft,good,dry,dry,machine dbh,borehole,groundwater,communal standpipe multiple,communal standpipe
4,19728,0.0,2011-07-13,Action In A,0,Artisan,31.130847,-1.825359,Shuleni,0,...,never pay,soft,good,seasonal,seasonal,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe


In [76]:
df_train_labels = pd.read_csv('train_labels.csv')
df_train_labels.head()

Unnamed: 0,id,status_group
0,69572,functional
1,8776,functional
2,34310,functional
3,67743,non functional
4,19728,functional


In [77]:
# Combine the DataFrames
# Remember, the target variable is 'status_group'

df_train_combined = df.merge(df_train_labels, on='id', how='left')
df_train_combined.head()

Unnamed: 0,id,amount_tsh,date_recorded,funder,gps_height,installer,longitude,latitude,wpt_name,num_private,...,water_quality,quality_group,quantity,quantity_group,source,source_type,source_class,waterpoint_type,waterpoint_type_group,status_group
0,69572,6000.0,2011-03-14,Roman,1390,Roman,34.938093,-9.856322,none,0,...,soft,good,enough,enough,spring,spring,groundwater,communal standpipe,communal standpipe,functional
1,8776,0.0,2013-03-06,Grumeti,1399,GRUMETI,34.698766,-2.147466,Zahanati,0,...,soft,good,insufficient,insufficient,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe,functional
2,34310,25.0,2013-02-25,Lottery Club,686,World vision,37.460664,-3.821329,Kwa Mahundi,0,...,soft,good,enough,enough,dam,dam,surface,communal standpipe multiple,communal standpipe,functional
3,67743,0.0,2013-01-28,Unicef,263,UNICEF,38.486161,-11.155298,Zahanati Ya Nanyumbu,0,...,soft,good,dry,dry,machine dbh,borehole,groundwater,communal standpipe multiple,communal standpipe,non functional
4,19728,0.0,2011-07-13,Action In A,0,Artisan,31.130847,-1.825359,Shuleni,0,...,soft,good,seasonal,seasonal,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe,functional


In [78]:
df_train_combined = df_train_combined.dropna()

In [79]:
df_train_combined.drop('date_recorded', inplace=True, axis=1)

In [80]:
status_group = df_train_combined.pop('status_group')

In [81]:
df_train_combined_dummies = pd.get_dummies(df_train_combined)
df_train_combined_dummies.head()

Unnamed: 0,id,amount_tsh,gps_height,longitude,latitude,num_private,region_code,district_code,population,construction_year,...,waterpoint_type_dam,waterpoint_type_hand pump,waterpoint_type_improved spring,waterpoint_type_other,waterpoint_type_group_cattle trough,waterpoint_type_group_communal standpipe,waterpoint_type_group_dam,waterpoint_type_group_hand pump,waterpoint_type_group_improved spring,waterpoint_type_group_other
0,69572,6000.0,1390,34.938093,-9.856322,0,11,5,109,1999,...,0,0,0,0,0,1,0,0,0,0
2,34310,25.0,686,37.460664,-3.821329,0,21,4,250,2009,...,0,0,0,0,0,1,0,0,0,0
5,9944,20.0,0,39.172796,-4.765587,0,4,8,1,2009,...,0,0,0,0,0,1,0,0,0,0
13,50495,0.0,1368,37.092574,-3.181783,0,3,7,1,2009,...,0,0,0,0,0,1,0,0,0,0
14,53752,0.0,0,34.364073,-3.629333,0,17,6,0,0,...,0,1,0,0,0,0,0,1,0,0


In [82]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

X = df_train_combined_dummies
y = status_group

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

rf = RandomForestClassifier(n_estimators=100, min_samples_leaf=5).fit(X_train, y_train)

print("Training score: {:.2f}".format(rf.score(X_train, y_train)))
print("Test score: {:.2f}".format(rf.score(X_test, y_test)))

Training score: 0.76
Test score: 0.74


In [83]:
df_test = pd.read_csv('test_features.csv')
df_test.head()

Unnamed: 0,id,amount_tsh,date_recorded,funder,gps_height,installer,longitude,latitude,wpt_name,num_private,...,payment_type,water_quality,quality_group,quantity,quantity_group,source,source_type,source_class,waterpoint_type,waterpoint_type_group
0,50785,0.0,2013-02-04,Dmdd,1996,DMDD,35.290799,-4.059696,Dinamu Secondary School,0,...,never pay,soft,good,seasonal,seasonal,rainwater harvesting,rainwater harvesting,surface,other,other
1,51630,0.0,2013-02-04,Government Of Tanzania,1569,DWE,36.656709,-3.309214,Kimnyak,0,...,never pay,soft,good,insufficient,insufficient,spring,spring,groundwater,communal standpipe,communal standpipe
2,17168,0.0,2013-02-01,,1567,,34.767863,-5.004344,Puma Secondary,0,...,never pay,soft,good,insufficient,insufficient,rainwater harvesting,rainwater harvesting,surface,other,other
3,45559,0.0,2013-01-22,Finn Water,267,FINN WATER,38.058046,-9.418672,Kwa Mzee Pange,0,...,unknown,soft,good,dry,dry,shallow well,shallow well,groundwater,other,other
4,49871,500.0,2013-03-27,Bruder,1260,BRUDER,35.006123,-10.950412,Kwa Mzee Turuka,0,...,monthly,soft,good,enough,enough,spring,spring,groundwater,communal standpipe,communal standpipe


In [84]:
df_test.shape

(14358, 40)

In [85]:
df_test = df_test.dropna()

In [86]:
df_test.drop('date_recorded', inplace=True, axis=1)

In [73]:
df_test_dummies = pd.get_dummies(df_test)
df_test_dummies.head()

Unnamed: 0,id,amount_tsh,gps_height,longitude,latitude,num_private,region_code,district_code,population,construction_year,...,waterpoint_type_dam,waterpoint_type_hand pump,waterpoint_type_improved spring,waterpoint_type_other,waterpoint_type_group_cattle trough,waterpoint_type_group_communal standpipe,waterpoint_type_group_dam,waterpoint_type_group_hand pump,waterpoint_type_group_improved spring,waterpoint_type_group_other
1,51630,0.0,1569,36.656709,-3.309214,0,2,2,300,2000,...,0,0,0,0,0,1,0,0,0,0
5,52449,0.0,1685,36.685279,-3.30242,0,2,2,200,1990,...,0,0,0,0,0,1,0,0,0,0
7,28965,0.0,234,39.60742,-10.893786,0,9,4,1,1982,...,0,0,0,0,0,1,0,0,0,0
8,36301,30.0,584,39.262951,-10.823588,0,90,33,40,1997,...,0,0,0,0,0,1,0,0,0,0
9,54122,0.0,1083,37.096108,-3.251754,0,3,7,1,2003,...,0,0,0,0,0,1,0,0,0,0


In [74]:
test_features_preds = rf.predict(df_test_dummies)

test_features_preds[:5]

ValueError: Number of features of the model must match the input. Model n_features is 33232 and input n_features is 13217 

In [66]:
kaggle_baseline_submission_4 = pd.DataFrame({
    'id': df_test_dummies.id, 'status_group': test_features_preds
})

kaggle_baseline_submission_4.head()

Unnamed: 0,id,status_group
0,69572,functional
2,34310,functional
5,9944,functional
13,50495,functional
14,53752,functional


In [67]:
kaggle_baseline_submission_4.to_csv('kaggle_baseline_submission_4.csv', index=False)