In [1]:
import pandas as pd
import numpy as np
import re

import time

from collections import Counter

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import Imputer
from sklearn import datasets, linear_model
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import KFold, cross_val_score, cross_val_predict
from sklearn.metrics import recall_score, accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, BaggingClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
import sklearn.datasets as datasets

import pandas_profiling

from sklearn.linear_model import LogisticRegression, LinearRegression
# import statsmodels.api as sm
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline
%config InlineBackend.figure_format = 'retina'

In [2]:
%%time
df_train = pd.read_csv('./Datasets/clean_train_data.csv')

CPU times: user 488 ms, sys: 84.8 ms, total: 573 ms
Wall time: 648 ms


In [3]:
df_test = pd.read_csv('./Datasets/clean_test_data.csv') 

In [4]:
df_train.shape, df_test.shape

((59400, 27), (14850, 26))

In [5]:
# Find all columns that are objects.
cols = df_train.select_dtypes(exclude=[np.number])

In [6]:
list(cols)

['date_recorded',
 'wpt_name',
 'basin',
 'region',
 'lga',
 'ward',
 'construction_year',
 'extraction_type_group',
 'extraction_type_class',
 'management',
 'management_group',
 'payment',
 'quality_group',
 'quantity',
 'source',
 'source_class',
 'waterpoint_type',
 'status_group']

In [7]:
df_train.dtypes

id                         int64
amount_tsh               float64
date_recorded             object
gps_height                 int64
longitude                float64
latitude                 float64
wpt_name                  object
num_private                int64
basin                     object
region                    object
region_code                int64
district_code              int64
lga                       object
ward                      object
population                 int64
construction_year         object
extraction_type_group     object
extraction_type_class     object
management                object
management_group          object
payment                   object
quality_group             object
quantity                  object
source                    object
source_class              object
waterpoint_type           object
status_group              object
dtype: object

In [8]:
# Create dataframe of 'best parameters' to be converted into dummy variables. 
cols2 = df_train[['waterpoint_type', 'construction_year', 'extraction_type_class','management',
                'management_group', 'payment', 'quality_group', 'quantity', 'basin',
                'region', 'source', 'source_class']]

In [9]:
list(cols2)

['waterpoint_type',
 'construction_year',
 'extraction_type_class',
 'management',
 'management_group',
 'payment',
 'quality_group',
 'quantity',
 'basin',
 'region',
 'source',
 'source_class']

In [10]:
# create dummy variables for columns in cols2
dummy_col = pd.get_dummies(cols2)

In [11]:
# Make sure status_group only has the 3 unique values I'm searching for.
df_train['status_group'].unique()

array(['functional', 'non functional', 'functional needs repair'],
      dtype=object)

In [12]:
dummy_col.head()

Unnamed: 0,waterpoint_type_cattle trough,waterpoint_type_communal standpipe,waterpoint_type_communal standpipe multiple,waterpoint_type_dam,waterpoint_type_hand pump,waterpoint_type_improved spring,waterpoint_type_other,construction_year_1960s,construction_year_1970s,construction_year_1980s,...,source_machine dbh,source_other,source_rainwater harvesting,source_river,source_shallow well,source_spring,source_unknown,source_class_groundwater,source_class_surface,source_class_unknown
0,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,1,0,0
1,0,1,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,1,0
2,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
3,0,0,1,0,0,0,0,0,0,1,...,1,0,0,0,0,0,0,1,0,0
4,0,1,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,1,0


## Logistic Regression

In [13]:
y = df_train['status_group'].values
X = dummy_col

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

In [15]:
print(X.shape, y.shape)
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(59400, 99) (59400,)
(44550, 99) (44550,)
(14850, 99) (14850,)


In [16]:
ss = StandardScaler()
Xs = ss.fit_transform(X)

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


In [17]:
lr = LogisticRegression()
lr.fit(X_train, y_train)

lr.score(X_test, y_test)



0.7358249158249158

In [18]:
from sklearn.metrics import classification_report, confusion_matrix

y_preds = lr.predict(X_test)
print(classification_report(y_test, y_preds))
pd.DataFrame(confusion_matrix(y_test, y_preds), 
             columns=['Pred +', 'Pred Fix', 'Pred -'], 
             index=['Act +', 'Act Fix', 'Act -'])

                         precision    recall  f1-score   support

             functional       0.72      0.89      0.80      8159
functional needs repair       0.50      0.05      0.09      1071
         non functional       0.77      0.64      0.70      5620

              micro avg       0.74      0.74      0.74     14850
              macro avg       0.67      0.53      0.53     14850
           weighted avg       0.73      0.74      0.71     14850



Unnamed: 0,Pred +,Pred Fix,Pred -
Act +,7297,31,831
Act Fix,807,55,209
Act -,2021,24,3575


In [23]:
preds = pd.DataFrame(y_preds)

In [26]:
n_test = pd.read_csv('./Datasets/clean_test_data.csv')

In [27]:
predict = pd.concat((n_test['id'], preds), axis=1)

In [29]:
predict.columns=['id', 'status_group']

In [30]:
predict.head()

Unnamed: 0,id,status_group
0,50785,functional
1,51630,non functional
2,17168,functional
3,45559,functional
4,49871,functional


In [32]:
predict.to_csv('./Submissions/Submission_2.csv', index=False)