# Pump it Up: Data Mining the Water Table

By: [Ville Heilala](https://heila.la), 2017

Datasource: http://taarifa.org/, http://maji.go.tz/, https://www.drivendata.org

Goal is to predict the operating condition of a waterpoint for each record in the dataset.

## Model the data

In [1]:
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import sys
%matplotlib inline

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

print('Python version ' + sys.version)
print('Pandas version ' + pd.__version__)
print('Matplotlib version ' + matplotlib.__version__)

Python version 3.6.0 |Anaconda 4.3.1 (64-bit)| (default, Dec 23 2016, 12:22:00) 
[GCC 4.4.7 20120313 (Red Hat 4.4.7-1)]
Pandas version 0.19.2
Matplotlib version 2.0.0


In [2]:
######################
# Read data
##################################################################

train_values = pd.read_csv("/train_values_processed.csv")
test_values = pd.read_csv("/test_values_processed.csv")

In [3]:
exclude = ("id", "test", "train", "status_group", "num_private", "scheme_name",
           'waterpoint_type_group',
           'quality_group',
           'payment_type',
           'extraction_type_group',
           'extraction_type_class',
           'management_group',
           'source_type',
           'source_class')

In [4]:
from sklearn.ensemble import RandomForestClassifier

######################
# Select features
##################################################################

# Select all features except which start with a string in exclude
features = [col for col in list(train_values) if not col.startswith(exclude)]

######################
# Train model
##################################################################

# Build a forest of trees from the training set
clf = RandomForestClassifier(n_estimators=500, 
                             min_samples_leaf=2, 
                             max_features=0.20, 
                             oob_score=True, 
                             random_state=20).fit(train_values[features],
                                                  train_values["status_group"])

######################
# Evaluate model
##################################################################

print("OOB Error Score: " + str(round(1 - clf.oob_score_, 4)))

OOB Error Score: 0.183


## Evaluate model

In [6]:
from sklearn.model_selection import cross_val_score

# Make prediction
predicted = clf.predict(train_values[features])

# Confusion matrix
pd.crosstab(train_values.status_group, predicted, rownames=['Actual'], colnames=['Predicted'])

# Sorted feature importance
fi = sorted(list(zip(train_values[features], clf.feature_importances_)), key=lambda x: str(x[1]), reverse=True)
fi

# Evaluate a score by cross-validation
#scores = cross_val_score(clf, train_values[features], train_values["status_group"], cv=10)
#print("Accuracy: %0.4f (+/- %0.4f)" % (scores.mean(), scores.std() * 2))

Predicted,functional,functional needs repair,non functional
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
functional,31541,177,541
functional needs repair,1273,2713,331
non functional,1660,103,21061


[('funder_lam', 9.9847791249815839e-05),
 ('funder_coc', 9.9442787509889603e-05),
 ('construction_year_1969', 9.8523406598226515e-05),
 ('funder_the', 9.8277219660590533e-05),
 ('funder_isl', 9.676052706341612e-05),
 ('funder_twe', 9.4010752390927572e-05),
 ('funder_nat', 9.3324312704433842e-05),
 ('funder_ta', 9.231920554152226e-05),
 ('funder_fat', 9.188937733179473e-05),
 ('funder_co', 9.1856472413304055e-05),
 ('funder_dmd', 8.6293744039828184e-05),
 ('extraction_type_other - play pump', 8.4860888936473839e-05),
 ('lga_Kilindi', 8.3568312347856547e-05),
 ('lga_Makete', 8.2992827366966585e-05),
 ('lga_Sikonge', 8.2558650757924901e-05),
 ('funder_tz ', 8.1642084648540556e-06),
 ('installer_ces', 7.9985864573313279e-05),
 ('funder_chu', 7.9344766730652552e-05),
 ('source_unknown', 7.7737447803173928e-05),
 ('funder_caf', 7.7495509298127938e-06),
 ('funder_dad', 7.691730529000074e-05),
 ('funder_is', 7.6374265853008754e-05),
 ('funder_ser', 7.5493862854838039e-05),
 ('lga_Bukoba Urban'

## Predict competition data set

In [7]:
######################
# Prediction for competition
##################################################################

prediction = clf.predict(test_values[features])
prediction_df = pd.DataFrame(prediction, columns=["status_group"])
submission = pd.merge(test_values.id.reset_index()["id"].to_frame(), prediction_df, right_index = True, left_index = True)
submission.head()
submission.to_csv("sub.csv", index = False)

Unnamed: 0,id,status_group
0,50785,functional
1,51630,functional
2,17168,functional
3,45559,non functional
4,49871,functional
