# Pump it Up: Data Mining the Water Table

By: [Ville Heilala](https://heila.la), 2017

Datasource: http://taarifa.org/, http://maji.go.tz/, https://www.drivendata.org

Goal is to predict the operating condition of a waterpoint for each record in the dataset.

## Model the data

In [1]:
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import sys
%matplotlib inline

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

print('Python version ' + sys.version)
print('Pandas version ' + pd.__version__)
print('Matplotlib version ' + matplotlib.__version__)

Python version 3.6.0 |Anaconda 4.3.1 (64-bit)| (default, Dec 23 2016, 12:22:00) 
[GCC 4.4.7 20120313 (Red Hat 4.4.7-1)]
Pandas version 0.19.2
Matplotlib version 2.0.0


In [2]:
######################
# Read data
##################################################################

train_values = pd.read_csv("/train_values_processed.csv")
test_values = pd.read_csv("/test_values_processed.csv")

In [3]:
exclude = ("id", "test", "train", "status_group", "num_private", "scheme_name",
           'waterpoint_type_group',
           'quality_group',
           'payment_type',
           'extraction_type_group',
           'extraction_type_class',
           'management_group',
           'source_type',
           'source_class')

In [None]:
from sklearn.ensemble import RandomForestClassifier

######################
# Select features
##################################################################

features = [col for col in list(train_values) if not col.startswith(exclude)]

######################
# Train model
##################################################################

# Build a forest of trees from the training set
clf = RandomForestClassifier(n_estimators=500, 
                             min_samples_leaf=2, 
                             max_features=0.20, 
                             oob_score=True, 
                             random_state=20).fit(train_values[features],
                                                  train_values["status_group"])

######################
# Evaluate model
##################################################################

print("OOB Error Score: " + str(round(1 - clf.oob_score_, 4)))

## Evaluate model

In [158]:
from sklearn.model_selection import cross_val_score

# Make prediction
predicted = clf.predict(train_values[features])

# Confusion matrix
pd.crosstab(train_values.status_group, predicted, rownames=['Actual'], colnames=['Predicted'])

# Sorted feature importance
fi = sorted(list(zip(train_values[features], clf.feature_importances_)), key=lambda x: str(x[1]), reverse=True)
fi

# Evaluate a score by cross-validation
#scores = cross_val_score(clf, train_values[features], train_values["status_group"], cv=10)
#print("Accuracy: %0.4f (+/- %0.4f)" % (scores.mean(), scores.std() * 2))

Predicted,functional,functional needs repair,non functional
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
functional,31537,182,540
functional needs repair,1286,2698,333
non functional,1617,105,21102


[('funder_ida', 9.9293968618851346e-05),
 ('funder_no', 9.6841060795652234e-05),
 ('funder_st', 9.672118385998802e-06),
 ('funder_twe', 9.6241311703866266e-05),
 ('region_code_1971', 9.5870294289378499e-05),
 ('funder_chu', 9.4255557701772199e-05),
 ('funder_lam', 9.3835209623678914e-05),
 ('funder_the', 9.3542763544155819e-05),
 ('funder_ta', 9.2768376165127506e-05),
 ('lga_Longido', 9.2627442498962337e-05),
 ('lga_Kilindi', 8.9484572271170297e-05),
 ('funder_miz', 8.8535145939588872e-07),
 ('lga_Makete', 8.8497174432104018e-05),
 ('funder_nat', 8.7920889508381271e-05),
 ('funder_isl', 8.6259573091860333e-05),
 ('funder_cef', 8.4464029940762099e-06),
 ('source_unknown', 8.3123080902120801e-05),
 ('extraction_type_climax', 8.0883973366533707e-06),
 ('funder_dmd', 8.0530088738474092e-05),
 ('funder_coc', 8.0175481037901444e-05),
 ('lga_Sikonge', 8.0164236565765214e-05),
 ('scheme_management_SWC', 7.9421850731117087e-05),
 ('lga_Bukoba Urban', 7.9278461290664651e-05),
 ('extraction_type_

## Predict competition data set

In [162]:
######################
# Prediction for competition
##################################################################

prediction = clf.predict(test_values[features])
prediction_df = pd.DataFrame(prediction, columns=["status_group"])
submission = pd.merge(test_values.id.reset_index()["id"].to_frame(), prediction_df, right_index = True, left_index = True)
submission.head()
submission.to_csv("sub.csv", index = False)

Unnamed: 0,id,status_group
0,50785,non functional
1,51630,functional
2,17168,functional
3,45559,non functional
4,49871,functional
