In [1]:
import pandas as pd
import numpy as np
import re

import time

from collections import Counter

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import Imputer
from sklearn import datasets, linear_model
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import KFold, cross_val_score, cross_val_predict
from sklearn.metrics import recall_score, accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, BaggingClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
import sklearn.datasets as datasets

import pandas_profiling

from sklearn.linear_model import LogisticRegression, LinearRegression
# import statsmodels.api as sm
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline
%config InlineBackend.figure_format = 'retina'

In [2]:
%%time
df_train = pd.read_csv('./clean_train_data.csv')

CPU times: user 412 ms, sys: 66.5 ms, total: 478 ms
Wall time: 585 ms


In [3]:
df_test = pd.read_csv('./clean_test_data.csv') 

In [5]:
df_train.shape, df_test.shape

((59400, 27), (14850, 26))

In [9]:
# Find all columns that are objects.
cols = df_train.select_dtypes(exclude=[np.number])

In [10]:
list(cols)

['date_recorded',
 'wpt_name',
 'basin',
 'region',
 'lga',
 'ward',
 'construction_year',
 'extraction_type_group',
 'extraction_type_class',
 'management',
 'management_group',
 'payment',
 'quality_group',
 'quantity',
 'source',
 'source_class',
 'waterpoint_type',
 'status_group']

In [11]:
df_train.dtypes

id                         int64
amount_tsh               float64
date_recorded             object
gps_height                 int64
longitude                float64
latitude                 float64
wpt_name                  object
num_private                int64
basin                     object
region                    object
region_code                int64
district_code              int64
lga                       object
ward                      object
population                 int64
construction_year         object
extraction_type_group     object
extraction_type_class     object
management                object
management_group          object
payment                   object
quality_group             object
quantity                  object
source                    object
source_class              object
waterpoint_type           object
status_group              object
dtype: object

In [63]:
# Create dataframe of 'best parameters' to be converted into dummy variables. 
cols2 = df_train[['waterpoint_type', 'construction_year', 'extraction_type_class','management',
                'management_group', 'payment', 'quality_group', 'quantity', 'basin',
                'region', 'source', 'source_class']]

In [64]:
list(cols2)

['waterpoint_type',
 'construction_year',
 'extraction_type_class',
 'management',
 'management_group',
 'payment',
 'quality_group',
 'quantity',
 'basin',
 'region',
 'source',
 'source_class']

In [65]:
# create dummy variables for columns in cols2
dummy_col = pd.get_dummies(cols2)

In [66]:
# Make sure status_group only has the 3 unique values I'm searching for.
df_train['status_group'].unique()

array(['functional', 'non functional', 'functional needs repair'],
      dtype=object)

In [67]:
dummy_col.head()

Unnamed: 0,waterpoint_type_cattle trough,waterpoint_type_communal standpipe,waterpoint_type_communal standpipe multiple,waterpoint_type_dam,waterpoint_type_hand pump,waterpoint_type_improved spring,waterpoint_type_other,construction_year_00s,construction_year_10s,construction_year_60s,...,source_machine dbh,source_other,source_rainwater harvesting,source_river,source_shallow well,source_spring,source_unknown,source_class_groundwater,source_class_surface,source_class_unknown
0,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,1,0,0
1,0,1,0,0,0,0,0,0,1,0,...,0,0,1,0,0,0,0,0,1,0
2,0,0,1,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,1,0
3,0,0,1,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,1,0,0
4,0,1,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,1,0


## Logistic Regression

In [None]:
y = df_train['status_group'].values
X = dummy_col.drop(['status_group', 'id', 'amount_tsh', 'num_private', 
                    'date_recorded', 'wpt_name', 'lga', 'ward'], axis=1)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

In [None]:
print(X.shape, y.shape)
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

In [None]:
ss = StandardScaler()
Xs = ss.fit_transform(X)

In [39]:
lr = LogisticRegression()
lr.fit(X_train, y_train)

print(lr.score(X_test, y_test))

0.7307744107744107


In [41]:
# Set up the parameters. Looking at C regularization strengths on a log scale.
gs_params = {
    'penalty':['l1','l2'],
    'solver':['liblinear'],
    'C':np.logspace(-5,0,100)
}

lr_gridsearch = GridSearchCV(LogisticRegression(), gs_params, cv=5, verbose=1)

In [42]:
%%time
lr_gridsearch = lr_gridsearch.fit(X_train, y_train)

Fitting 5 folds for each of 200 candidates, totalling 1000 fits


[Parallel(n_jobs=1)]: Done 1000 out of 1000 | elapsed: 24.6min finished


CPU times: user 23min 52s, sys: 58.9 s, total: 24min 51s
Wall time: 24min 36s


In [43]:
# best score on the training data:
lr_gridsearch.best_score_

0.7359371492704826

In [44]:
# best parameters on the training data:
lr_gridsearch.best_params_

{'C': 0.2477076355991709, 'penalty': 'l2', 'solver': 'liblinear'}

In [45]:
# assign the best estimator to a variable:
best_lr = lr_gridsearch.best_estimator_

In [46]:
# Score it on the testing data:
best_lr.score(X_test, y_test)

0.7301010101010101

In [47]:
coef_df = pd.DataFrame({
        'coef':best_lr.coef_[0],
        'feature':X.columns
    })

In [48]:
coef_df['abs_coef'] = np.abs(coef_df.coef)

In [49]:
# sort by absolute value of coefficient (magnitude)
coef_df.sort_values('abs_coef', ascending=False, inplace=True)

In [86]:
# Show non-zero coefs and predictors
coef_df[coef_df.coef != 0].head(10)

Unnamed: 0,coef,feature,abs_coef
51,-3.092123,quantity_dry,3.092123
6,-1.350119,waterpoint_type_other,1.350119
52,1.210503,quantity_enough,1.210503
21,-1.179681,management_company,1.179681
57,0.986747,basin_Lake Nyasa,0.986747
8,0.962425,construction_year_10s,0.962425
54,0.96197,quantity_seasonal,0.96197
5,0.94451,waterpoint_type_improved spring,0.94451
25,0.913428,management_private operator,0.913428
48,0.822496,quality_group_milky,0.822496


In [51]:
from sklearn.metrics import classification_report, confusion_matrix

y_preds = lr_gridsearch.predict(X_test)
print(classification_report(y_test, y_preds))
pd.DataFrame(confusion_matrix(y_test, y_preds), 
             columns=['Pred +', 'Pred Fix', 'Pred -'], 
             index=['Act +', 'Act Fix', 'Act -'])

                         precision    recall  f1-score   support

             functional       0.71      0.90      0.79      8091
functional needs repair       0.47      0.04      0.08      1075
         non functional       0.78      0.62      0.69      5684

            avg / total       0.72      0.73      0.70     14850



Unnamed: 0,Pred +,Pred Fix,Pred -
Act +,7268,23,800
Act Fix,845,44,186
Act -,2128,26,3530


In [87]:
preds2 = pd.DataFrame(y_preds)

In [88]:
n_test2 = pd.read_csv('./clean_test_data.csv')

In [93]:
x_final2 = n_test.drop(['id'], axis=1)
Xtrain2 = train.drop(['status_group'], axis=1)
Ytrain2 = train['status_group']

In [94]:
predict2 = pd.concat((n_test2['id'], preds), axis=1)

In [95]:
predict2.columns=['id', 'status_group']

In [97]:
predict.to_csv('./Submission_7.csv', index=False)

## Logistic Regression using top ten coefficents.

In [53]:
# Select top ten coefficents from above.
cols3 = dummy_col[['quantity_dry', 'waterpoint_type_other', 'quantity_enough', 'management_company',
                  'construction_year_10s', 'quantity_seasonal', 'waterpoint_type_improved spring', 
                  'management_private operator', 'quality_group_milky', 'region_Mbeya']]

In [54]:
y2 = df_train['status_group'].values
X2 = cols3

In [74]:
X_train1, X_test1, y_train1, y_test1 = train_test_split(X2, y2, test_size=0.25)

In [75]:
lr = LogisticRegression()
lr.fit(X_train1, y_train1)

print(lr.score(X_test1, y_test1))

0.6927946127946127


In [76]:
# Score is not as good as the origional logistic regression.  

In [77]:
gs_params = {
    'penalty':['l1','l2'],
    'solver':['liblinear'],
    'C':np.logspace(-5,0,100)
}

lr_gridsearch = GridSearchCV(LogisticRegression(), gs_params, cv=5, verbose=1)

In [84]:
%%time
lr_gridsearch1 = lr_gridsearch.fit(X_train1, y_train1)

Fitting 5 folds for each of 200 candidates, totalling 1000 fits


[Parallel(n_jobs=1)]: Done 1000 out of 1000 | elapsed:  4.6min finished


CPU times: user 4min 17s, sys: 13.4 s, total: 4min 30s
Wall time: 4min 38s


In [79]:
lr_gridsearch1.best_score_

0.6972615039281705

In [80]:
lr_gridsearch.best_params_

{'C': 0.19630406500402725, 'penalty': 'l2', 'solver': 'liblinear'}

In [81]:
best_lr = lr_gridsearch.best_estimator_

In [83]:
best_lr.score(X_test1, y_test1)

0.6929966329966329

In [None]:
# Score is still worse. 