In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
pd.set_option('display.max_columns', None)  # Unlimited columns
import nbimporter
# Imported from my other notebook
from data_cleanup import cleanup1
from data_cleanup import cleanup2
from data_cleanup import cleanup3

Importing Jupyter notebook from data_cleanup.ipynb


In [162]:
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn_pandas import DataFrameMapper
from sklearn.metrics import accuracy_score
from category_encoders.target_encoder import TargetEncoder
from sklearn.ensemble import RandomForestClassifier

In [127]:
# Import all data
sample_submission = pd.read_csv('original_data/sample_submission.csv')
test_features = pd.read_csv('original_data/test_features.csv')
train_features = pd.read_csv('original_data/train_features.csv')
train_labels = pd.read_csv('original_data/train_labels.csv')

In [3]:
# Clean data with previously defined cleanup function
X_train = cleanup3(cleanup2(cleanup1(train_features)))
X_test = cleanup3(cleanup2(cleanup1(test_features)))

In [128]:
y_true = train_labels['status_group']
y_works = [1.0 if x == 'functional' else 0.0 for x in y_true]
y_broken = [1.0 if x == 'non functional' else 0.0 for x in y_true]
y_repair = [1.0 if x == 'functional needs repair' else 0.0 for x in y_true]

# OHE + Logistic Regression
I'll start by one-hot encoding all the categorical variables and running a simple logistic regression.  Many of the features have way too much cardinality for one-hot encoding.  Let's separate them into two lists by cardinality.

In [115]:
# Define groups of columns in X_train
numericals = ['amount_tsh',
                    'date_recorded',
                    'gps_height',
                    'longitude',
                    'latitude',
                    'num_private',
                    'population',
                    'construction_year']

alt_numericals = ['amount_tsh',
                    'date_recorded',
                    'gps_height',
#                     'longitude',
#                     'latitude',
                    'num_private',
                    'population',
                    'construction_year']

categoricals = ['funder',
                     'installer',
                     'wpt_name',
                     'basin',
                     'subvillage',
                     'region',
                     'region_code',
                     'district_code',
                     'lga',
                     'ward',
                     'public_meeting',
                     'scheme_management',
                     'scheme_name',
                     'permit',
                     'extraction_type',
                     'extraction_type_group',
                     'extraction_type_class',
                     'management',
                     'management_group',
                     'payment',
                     'payment_type',
                     'water_quality',
                     'quality_group',
                     'quantity',
                     'quantity_group',
                     'source',
                     'source_type',
                     'source_class',
                     'waterpoint_type',
                     'waterpoint_type_group']

trash_cols = ['amount_tsh_trash',
                     'construction_year_trash',
                     'gps_height_trash',
                     'latitude_trash',
                     'longitude_trash',
                     'num_private_trash',
                     'population_trash']

In [142]:
# # OHE that works
X_train_cats = X_train[categoricals]
# OHE = OneHotEncoder()
# X_train_cats_expanded = OHE.fit_transform(X_train_cats)
# X_train_cats_expanded

In [159]:
X_train_cats_expanded

<59400x630 sparse matrix of type '<class 'numpy.float64'>'
	with 2197800 stored elements in Compressed Sparse Row format>

In [150]:
target_encoder = TargetEncoder()
X_TE1 = target_encoder.fit_transform(X=X_train_cats, y=y_works)
target_encoder = TargetEncoder()
X_TE2 = target_encoder.fit_transform(X=X_train_cats, y=y_broken)
target_encoder = TargetEncoder()
X_TE3 = target_encoder.fit_transform(X=X_train_cats, y=y_repair)

In [151]:
X_TE1.head()

Unnamed: 0,funder,installer,wpt_name,basin,subvillage,region,region_code,district_code,lga,ward,public_meeting,scheme_management,scheme_name,permit,extraction_type,extraction_type_group,extraction_type_class,management,management_group,payment,payment_type,water_quality,quality_group,quantity,quantity_group,source,source_type,source_class,waterpoint_type,waterpoint_type_group
0,0.821818,0.563818,0.546613,0.653687,0.54604,0.782206,0.781698,0.5691,0.774823,0.520431,0.556899,0.515315,0.884892,0.517094,0.599253,0.599253,0.599253,0.504234,0.538236,0.752334,0.752334,0.565941,0.565941,0.652323,0.652323,0.62229,0.62229,0.54232,0.621485,0.576491
1,0.582615,0.563818,0.518072,0.497658,0.54604,0.449975,0.449975,0.555178,0.592179,0.520431,0.503299,0.498234,0.534426,0.554437,0.599253,0.599253,0.599253,0.59954,0.538236,0.448911,0.448911,0.565941,0.565941,0.523234,0.523234,0.603922,0.603922,0.545168,0.621485,0.576491
2,0.582615,0.625179,0.546613,0.600895,0.466135,0.6235,0.6235,0.621736,0.581169,0.520431,0.556899,0.515315,0.534426,0.554437,0.599253,0.599253,0.599253,0.504234,0.538236,0.677796,0.677796,0.565941,0.565941,0.652323,0.652323,0.385671,0.385671,0.545168,0.366213,0.576491
3,0.567644,0.534188,0.546613,0.371689,0.54604,0.30289,0.211559,0.317949,0.265823,0.520431,0.556899,0.515315,0.534426,0.554437,0.551217,0.53876,0.53876,0.504234,0.538236,0.448911,0.448911,0.565941,0.565941,0.025136,0.025136,0.489571,0.495355,0.54232,0.366213,0.576491
4,0.582615,0.555556,0.491419,0.497658,0.54604,0.520808,0.520156,0.537409,0.53048,0.520431,0.556899,0.498234,0.534426,0.554437,0.599253,0.599253,0.599253,0.504425,0.5,0.448911,0.448911,0.565941,0.565941,0.574074,0.574074,0.603922,0.603922,0.545168,0.621485,0.576491


In [152]:
X_TE2.head()

Unnamed: 0,funder,installer,wpt_name,basin,subvillage,region,region_code,district_code,lga,ward,public_meeting,scheme_management,scheme_name,permit,extraction_type,extraction_type_group,extraction_type_class,management,management_group,payment,payment_type,water_quality,quality_group,quantity,quantity_group,source,source_type,source_class,waterpoint_type,waterpoint_type_group
0,0.145455,0.363274,0.382219,0.297148,0.381613,0.19456,0.195094,0.386364,0.205674,0.408258,0.370195,0.421249,0.086331,0.407443,0.299888,0.299888,0.299888,0.426864,0.38735,0.179846,0.179846,0.357236,0.357236,0.275357,0.275357,0.302744,0.302744,0.397825,0.299278,0.339523
1,0.343075,0.363274,0.387952,0.405835,0.381613,0.519553,0.519553,0.369283,0.370112,0.408258,0.44991,0.449616,0.393655,0.376145,0.299888,0.299888,0.299888,0.301458,0.38735,0.475856,0.475856,0.357236,0.357236,0.380924,0.380924,0.259259,0.259259,0.33696,0.299278,0.339523
2,0.343075,0.247496,0.382219,0.345749,0.462151,0.315856,0.315856,0.321702,0.402597,0.408258,0.370195,0.421249,0.393655,0.376145,0.299888,0.299888,0.299888,0.426864,0.38735,0.276683,0.276683,0.357236,0.357236,0.275357,0.275357,0.577744,0.577744,0.33696,0.527609,0.339523
3,0.338694,0.42735,0.382219,0.555753,0.381613,0.624277,0.68157,0.65641,0.734177,0.408258,0.370195,0.421249,0.393655,0.376145,0.401134,0.420295,0.420295,0.426864,0.38735,0.475856,0.475856,0.357236,0.357236,0.96894,0.96894,0.466095,0.462131,0.397825,0.527609,0.339523
4,0.343075,0.4,0.426773,0.405835,0.381613,0.387515,0.388387,0.357125,0.411154,0.408258,0.370195,0.449616,0.393655,0.376145,0.299888,0.299888,0.299888,0.439317,0.444814,0.475856,0.475856,0.357236,0.357236,0.32321,0.32321,0.259259,0.259259,0.33696,0.299278,0.339523


In [153]:
X_TE3.head()

Unnamed: 0,funder,installer,wpt_name,basin,subvillage,region,region_code,district_code,lga,ward,public_meeting,scheme_management,scheme_name,permit,extraction_type,extraction_type_group,extraction_type_class,management,management_group,payment,payment_type,water_quality,quality_group,quantity,quantity_group,source,source_type,source_class,waterpoint_type,waterpoint_type_group
0,0.032727,0.072908,0.071167,0.049164,0.072347,0.023234,0.023208,0.044536,0.019504,0.07131,0.072906,0.063436,0.028777,0.075463,0.100859,0.100859,0.100859,0.068902,0.074414,0.06782,0.06782,0.076823,0.076823,0.07232,0.07232,0.074966,0.074966,0.059855,0.079237,0.083986
1,0.07431,0.072908,0.093976,0.096507,0.072347,0.030472,0.030472,0.075539,0.037709,0.07131,0.046791,0.05215,0.071919,0.069417,0.100859,0.100859,0.100859,0.099002,0.074414,0.075233,0.075233,0.076823,0.076823,0.095842,0.095842,0.136819,0.136819,0.117872,0.079237,0.083986
2,0.07431,0.127325,0.071167,0.053356,0.071713,0.060644,0.060644,0.056562,0.016234,0.07131,0.072906,0.063436,0.071919,0.069417,0.100859,0.100859,0.100859,0.068902,0.074414,0.04552,0.04552,0.076823,0.076823,0.07232,0.07232,0.036585,0.036585,0.117872,0.106177,0.083986
3,0.093661,0.038462,0.071167,0.072557,0.072347,0.072832,0.10687,0.025641,0.0,0.07131,0.072906,0.063436,0.071919,0.069417,0.047649,0.040945,0.040945,0.068902,0.074414,0.075233,0.075233,0.076823,0.076823,0.005924,0.005924,0.044334,0.042514,0.059855,0.106177,0.083986
4,0.07431,0.044444,0.081808,0.096507,0.072347,0.091677,0.091456,0.105466,0.058366,0.07131,0.072906,0.05215,0.071919,0.069417,0.100859,0.100859,0.100859,0.056258,0.055186,0.075233,0.075233,0.076823,0.076823,0.102716,0.102716,0.136819,0.136819,0.117872,0.079237,0.083986


In [155]:
X_TE_all = pd.concat([X_TE1,X_TE2,X_TE3], sort=False, axis=1)

In [None]:
X_TE_nums = pd.concat([X_TE_all, X_train[numericals]], sort=False, axis=1)

In [156]:
X_TE_all.shape

(59400, 90)

In [160]:
logreg = LogisticRegression(solver='lbfgs', multi_class='ovr', max_iter=500)
logreg.fit(X_train_cats_expanded, y_true)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=500, multi_class='ovr',
          n_jobs=None, penalty='l2', random_state=None, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False)

In [161]:
y_pred = logreg.predict(X_train_cats_expanded)
accuracy_score(y_true, y_pred)

0.7607912457912458

In [158]:
y_pred = logreg.predict(X_TE_all)
accuracy_score(y_true, y_pred)

0.7311784511784511

# RFC + OHE

In [166]:
RFC = RandomForestClassifier()
RFC.fit(X_train_cats_expanded, y_true)



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [167]:
y_pred = RFC.predict(X_train_cats_expanded)
accuracy_score(y_true, y_pred)

0.8871717171717172

# RFC + Target

In [168]:
RFC = RandomForestClassifier()
RFC.fit(X_TE_all, y_true)



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [169]:
y_pred = RFC.predict(X_TE_all)
accuracy_score(y_true, y_pred)

0.8851515151515151

# RFC + Target + nums

In [170]:
RFC = RandomForestClassifier()
RFC.fit(X_train[numericals], y_true)



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [171]:
y_pred = RFC.predict(X_train[numericals])
accuracy_score(y_true, y_pred)

0.969006734006734

# Gridsearch

In [174]:
gs = GridSearchCV(RandomForestClassifier(), cv=5, param_grid={},
                  scoring='accuracy', 
                  verbose=10)

gs.fit(X_train_cats_expanded, y_true)

Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV]  ................................................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] ....................... , score=0.7851190977190472, total=  24.4s
[CV]  ................................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   24.7s remaining:    0.0s


[CV] ........................ , score=0.782004881743961, total=  25.3s
[CV]  ................................................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   50.3s remaining:    0.0s


[CV] ....................... , score=0.7808080808080808, total=  25.0s
[CV]  ................................................................


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:  1.3min remaining:    0.0s


[CV] ....................... , score=0.7767676767676768, total=  26.4s
[CV]  ................................................................


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:  1.7min remaining:    0.0s


[CV] ....................... , score=0.7824549587472639, total=  29.9s


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  2.2min remaining:    0.0s
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  2.2min finished


GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params=None, iid='warn', n_jobs=None, param_grid={},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='accuracy', verbose=10)

In [175]:
gs = GridSearchCV(RandomForestClassifier(), cv=5, param_grid={},
                  scoring='accuracy', 
                  verbose=10)

gs.fit(X_TE_all, y_true)

Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV]  ................................................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] ....................... , score=0.7884016496927868, total=   2.0s
[CV]  ................................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    2.5s remaining:    0.0s


[CV] ....................... , score=0.7832674017338608, total=   2.1s
[CV]  ................................................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    5.0s remaining:    0.0s


[CV] ....................... , score=0.7852693602693602, total=   2.0s
[CV]  ................................................................


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    7.3s remaining:    0.0s


[CV] ....................... , score=0.7794612794612794, total=   2.3s
[CV]  ................................................................


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:   10.0s remaining:    0.0s


[CV] ....................... , score=0.7840545546388281, total=   1.9s


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   12.3s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   12.3s finished


GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params=None, iid='warn', n_jobs=None, param_grid={},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='accuracy', verbose=10)

# Other

In [72]:
# # # Use a mapper to apply transformations selectively
# mapper = DataFrameMapper(
#     [([col], None) for col in other_cols] +
#     [([col], RobustScaler()) for col in numericals] +
#     [([col], OneHotEncoder()) for col in categoricals]
# )

# # # Define an estimator and param_grid
# pipe = make_pipeline(
#     mapper, 
#     LogisticRegression(solver='lbfgs', multi_class='ovr',
#                       max_iter=500))


In [None]:
%%time
X_tweaked = mapper.fit_transform(X_train[numericals], y_true)

In [61]:
pd.DataFrame(X_tweaked, columns=mapper.transformed_names_).head()

In [62]:
%%time
pipe.fit(X_train,y_true)

CPU times: user 5.84 s, sys: 359 ms, total: 6.2 s
Wall time: 4.07 s


Pipeline(memory=None,
     steps=[('dataframemapper', DataFrameMapper(default=False, df_out=False,
        features=[(['id'], None), (['amount_tsh_trash'], None), (['construction_year_trash'], None), (['gps_height_trash'], None), (['latitude_trash'], None), (['longitude_trash'], None), (['num_private_trash'], None), (['popula...enalty='l2', random_state=None, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False))])

In [63]:
y_pred = pipe.predict(X_train)
accuracy_score(y_true, y_pred)

0.543080808080808

Alright, that was the score with all categories except for those that have thousands of possible values and make the final dataframe way too big.

# Make a submission file

In [None]:
# Clean up the test dataset
test1 = cleanup1(test_features)

# Extract the same columns used for training
X_test = test1[cols_to_keep]

# Run the prediction, using the pipeline fit to the training data
y_pred = pipe.predict(X_test)

# Make a dataframe with the answers
y_submit = pd.DataFrame({'id':test_features['id'],
                         'status_group':y_pred} )
# make a submission CSV file
y_submit.to_csv('DMAn.csv', index=False)