# Load data

In [1]:
#imports
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
import sklearn.metrics as metrics
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
import sklearn.linear_model as linear_models
import sklearn.svm as svm
import sklearn.ensemble as ensemble
from sklearn.cross_validation import train_test_split
from IPython.display import display
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.grid_search import GridSearchCV
import numpy as np


In [2]:
# Notebook Settings
pd.set_option('display.max_columns', 50)
pd.set_option('precision', 4)
%matplotlib inline
from __future__ import print_function
from __future__ import division

In [3]:
training_labels_df = pd.read_csv('../../data/water_pump/training_labels.csv', header=0)
training_values_df = pd.read_csv('../../data/water_pump/training.csv', header=0)

#training_labels_df = pd.read_csv(training_labels_file)
#training_values_df = pd.read_csv(training_values_file)

#training_values_df['status_group'] = training_labels_df['status_group']

# Run classification algorithms

In [8]:
y_df = training_labels_df['status_group']
X_df = training_values_df[['construction_year','population','gps_height','longitude','latitude']] 

#cat_list = ['quantity','district_code','scheme_management','extraction_type_group','water_quality','basin']
cat_list = ['quantity','district_code','scheme_management','extraction_type_group','basin']

for cat_feature in cat_list:
    temp_df = pd.get_dummies(training_values_df[cat_feature])
    X_df = X_df.join(temp_df)
X_df.sample(5)

Unnamed: 0,construction_year,population,gps_height,longitude,latitude,dry,enough,insufficient,seasonal,unknown,0,1,2,3,4,5,6,7,8,13,23,30,33,43,53,...,WUG,Water Board,Water authority,afridev,gravity,india mark ii,india mark iii,mono,nira/tanira,other,other handpump,other motorpump,rope pump,submersible,swn 80,wind-powered,Internal,Lake Nyasa,Lake Rukwa,Lake Tanganyika,Lake Victoria,Pangani,Rufiji,Ruvuma / Southern Coast,Wami / Ruvu
51327,2010,60,335,37.6097,-6.1386,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
26225,0,0,0,32.203,-4.6788,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
18897,0,0,0,31.7413,-1.6136,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
47340,0,0,0,33.5542,-3.369,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6603,1983,500,1840,31.3311,-7.6533,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
# cleaning
construction_mean = X_df[X_df.construction_year>0].mean()
X_df['construction_year'].replace(0, construction_mean)

pop_mean = X_df[X_df.population > 0].mean()
X_df['population'].replace(0,pop_mean)

gps_height_mean = X_df[X_df.gps_height>0].mean();
X_df['gps_height'].replace(0,gps_height_mean);

In [14]:
# split training/test
X_train, X_test, y_train, y_test = train_test_split(X_df,
                                                    y_df, test_size=0.25, random_state=42)

In [17]:
# run classification algorithms
clf = linear_models.LogisticRegressionCV(scoring='accuracy',
                                         cv=5, n_jobs=4, Cs=8)
y_hat = clf.fit(X_train, y_train).predict(X_test)
print( "Accuracy with logistic regression = %4.4f" %
      metrics.accuracy_score(y_test, y_hat))

Accuracy with logistic regression = 0.6953


In [18]:
naive_b = GaussianNB()
y_predict_naive_b = naive_b.fit(X_train,y_train).predict(X_test)
print( "Accuracy with Naive Bayes = %4.4f " %
      accuracy_score(y_test,y_predict_naive_b))

Accuracy with Naive Bayes = 0.5279 


In [19]:
d_tree = DecisionTreeClassifier()
y_predict_d_tree = d_tree.fit(X_train,y_train).predict(X_test)

print("Accuracy with Decision Tree = %4.4f" %
      accuracy_score(y_test,y_predict_d_tree))

Accuracy with Decision Tree = 0.7421


In [22]:
grid = {'max_depth': [3, 4, 5, 6],
        'n_estimators': [40, 50, 60,  100]
       }
clf = GridSearchCV(RandomForestClassifier(), n_jobs=4,
                   param_grid=grid,
                   scoring='f1_micro', cv=8)

#clf.fit(X_train, y_train)
y_hat = clf.fit(X_train,y_train).predict(X_test)

print('Best Params: \n', clf.best_params_ )
print("Accuracy with Random Forest = %4.4f"  %
      accuracy_score(y_test, y_hat))

Best Params: 
 {'n_estimators': 100, 'max_depth': 6}
Accuracy with Random Forest = 0.6949


In [32]:
m = metrics.confusion_matrix(y_test, y_hat)
print(m)
m[:, 1] = m[:,1:].sum(axis=1)
m[1, :] = m[1:,:].sum(axis=0)

m = m[:2, :2]
print(m)

[[7753    0  269]
 [1054    0   57]
 [3151    0 2566]]
[[7753  269]
 [4205 2623]]


In [21]:
# commenting out for now since it takes too long
grid = {'C': 10 ** np.arange(-3, 4,0.5), 'gamma': 10 ** np.arange(-2.5, 2)}

clf = GridSearchCV(SVC(),n_jobs=3, param_grid=grid, scoring='f1_micro',cv=5)

# clf.fit(X_train, y_train)
# y_hat = clf.predict(X_test)
# # print("Accuracy with SVM = %4.4f" %
#       accuracy_score(y_test, y_hat))

In [None]:
from sklearn.metrics import confusion_matrix
# get y_pred and y_actual
y_pred = model.predict(X_test)
#convert y's to binaryconf_matrix = confusion_matrix(y_actual, y_pred)confusion_matrix[1,:] = confusion_matrix[:, 1:2].sum()
y_actual = y_test

np.