# Pump it Up: Data Mining the Water Table

By: [Ville Heilala](https://heila.la), 2017

Datasource: http://taarifa.org/, http://maji.go.tz/, https://www.drivendata.org

The goal is to predict the operating condition of a waterpoint for each record in the dataset.

## Get missing gps values

Use the geocoder library and the Google Maps API to fill in missing data. The limit is 2500 values per day, so a single run can fetch at most that many.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sys
%matplotlib inline

from IPython.core.interactiveshell import InteractiveShell

# Have the notebook treat every top-level expression in a cell as
# interactive output, not just the final one.
InteractiveShell.ast_node_interactivity = "all"

# Record the interpreter and library versions this notebook was run with.
print("python v. " + ".".join(str(part) for part in sys.version_info[:3]))
print("numpy v. {}".format(np.version.version))
print("pandas v. {}".format(pd.__version__))

In [None]:
import geocoder

######################
# Load data
##################################################################

# Load the three CSV files. Each values frame gets a marker column set to
# True ("train" or "test") so its rows can be identified after merging.
train_values = pd.read_csv("/train_values.csv").assign(train=True)
train_labels = pd.read_csv("/train_labels.csv")
test_values = pd.read_csv("/test_values.csv").assign(test=True)

# Stack train and test rows into one frame with a fresh 0..n-1 index.
# A marker column that exists in only one input is NaN for the other
# input's rows.
data = pd.concat((train_values, test_values), ignore_index=True)

######################
# Process latitude and longitude data: round and replace
##################################################################

# Coordinate columns to normalise.
cols = ["latitude", "longitude"]


def _round_to_7_places(value):
    """Round a single coordinate to seven decimal places."""
    return round(value, 7)


for col in cols:
    rounded = data[col].map(_round_to_7_places)
    # Rounding can leave a negative zero behind; store it as a plain 0 so
    # later `== 0` checks and printed values are unambiguous.
    data[col] = rounded.replace(to_replace=-0.0, value=0)
    
######################
# Process latitude and longitude data: add missing values
##################################################################

# Rows whose latitude or longitude is 0 are treated as having no GPS fix.
# The mask is computed once, before any value is overwritten.
missing = (data.latitude == 0) | (data.longitude == 0)

# Geocode each affected lga once and use the result for that lga's missing
# rows only. Rows in the same lga that already have real coordinates are
# left untouched.
for lga in data.loc[missing, "lga"].dropna().unique():
    g = geocoder.google(lga)
    if not g.ok or not g.latlng:
        # No usable result (unknown name, quota exhausted, network error):
        # keep the placeholder zeros rather than crashing on g.latlng[0].
        print("geocoding failed -> " + str(lga))
        continue
    rows = missing & (data.lga == lga)
    data.loc[rows, "latitude"] = g.latlng[0]
    data.loc[rows, "longitude"] = g.latlng[1]

######################
# Process latitude and longitude data: find missing gps_height values
# and save them to file heights.csv
##################################################################    

from time import sleep

# Load the elevations fetched by earlier runs (id -> gps_height) so that an
# interrupted run, e.g. one stopped by the daily API quota, resumes where it
# left off instead of starting over.
try:
    heights = pd.read_csv("heights.csv")
    heights = pd.Series(heights.gps_height.values, index=heights.id).to_dict()
except FileNotFoundError:
    # First run: nothing has been cached yet.
    heights = {}

# A failed lookup is retried this many times before the row is skipped
# (one initial attempt plus 20 retries).
max_retries = 20

# Select the rows that need a lookup once, instead of rescanning the whole
# frame several times for every id.
pending = data.loc[data.gps_height == 0, ["id", "latitude", "longitude"]]

for i, lat, long in pending.itertuples(index=False, name=None):
    if i in heights:
        # Already resolved by a previous run.
        continue

    for _ in range(max_retries + 1):
        # Pause before every request to stay under the API rate limit.
        sleep(1)
        g = geocoder.elevation([lat, long])
        if g.ok:
            print("n -> " + str(len(heights)) + " -> " + str(g.meters))
            heights[i] = int(g.meters)
            # Checkpoint after every successful lookup so no fetched value
            # is lost if the run is interrupted.
            pd.DataFrame(
                list(heights.items()), columns=["id", "gps_height"]
            ).to_csv("heights.csv", index=False)
            break
        print("none")
    else:
        # Every attempt failed; leave this id out of the cache so a later
        # run tries it again.
        print("skip -> " + str(lat) + ", " + str(long))