In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from wrangle import note, low_occurance
import category_encoders as ce
from sklearn.cluster import KMeans
from sklearn.metrics import accuracy_score


pd.options.mode.chained_assignment = None  # default='warn'

In [2]:
data         = pd.read_csv('train_features.csv')
feature_info = pd.read_csv('train_labels.csv')

feature_info = feature_info.drop(columns = 'id')

data = low_occurance(note(pd.concat([data, feature_info], 
                                    sort = False, axis = 1)))

data.head()

Unnamed: 0,id,amount_tsh,date_recorded,funder,gps_height,installer,longitude,latitude,wpt_name,num_private,...,water_quality,quality_group,quantity,quantity_group,source,source_type,source_class,waterpoint_type,waterpoint_type_group,status_group
0,69572,6000.0,2011-03-14,Roman,1390,Roman,34.938093,-9.856322,none,0,...,soft,good,enough,enough,spring,spring,groundwater,communal standpipe,communal standpipe,functional
1,8776,0.0,2013-03-06,Grumeti,1399,GRUMETI,34.698766,-2.147466,Zahanati,0,...,soft,good,insufficient,insufficient,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe,functional
2,34310,25.0,2013-02-25,Lottery Club,686,World vision,37.460664,-3.821329,Kwa Mahundi,0,...,soft,good,enough,enough,dam,dam,surface,communal standpipe multiple,communal standpipe,functional
3,67743,0.0,2013-01-28,Unicef,263,UNICEF,38.486161,-11.155298,small_fry,0,...,soft,good,dry,dry,machine dbh,borehole,groundwater,communal standpipe multiple,communal standpipe,non functional
4,19728,0.0,2011-07-13,small_fry,0,Artisan,31.130847,-1.825359,Shuleni,0,...,soft,good,seasonal,seasonal,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe,functional


In [13]:
def finish_dataframe (data=data):
    
    df = data[['latitude','longitude','gps_height','population','ward',
               'construction_year','amount_tsh','quantity','funder',
               'waterpoint_type', 'status_group','extraction_type',
               'source','management']]

    df['longitude'] = df['longitude'].replace(0 ,np.nan)
    df['latitude'] = df['latitude'].replace(-2.000000e-08 ,np.nan)

    df = df.dropna()
    

    n_clusters = [50, 6000]

    for n in n_clusters:
        kmeans = KMeans(n_clusters=n, n_jobs=-1)
        kmeans.fit(df[['longitude','latitude']])
        y_kmeans = kmeans.predict(df[['longitude','latitude']])

        df[str(n)+'_kmeans_clusters'] = y_kmeans

    df = df.drop(columns=['latitude','longitude'])
    
    ordinal = ce.OrdinalEncoder(return_df=True)
    df = ordinal.fit_transform(df)
    
    return df

df = finish_dataframe()

In [35]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(solver='lbfgs', max_iter=10000, multi_class='auto')

In [51]:
from sklearn.model_selection import cross_val_score

X = df.drop(columns='status_group')
y = df['status_group']

model.fit(X , y)

score  = (cross_val_score(model, X, y, 
                          cv = 10, 
                          scoring = 'accuracy',
                          n_jobs = -1,
                          verbose = 10))

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of  10 | elapsed:  2.5min remaining:  5.9min
[Parallel(n_jobs=-1)]: Done   5 out of  10 | elapsed:  2.8min remaining:  2.8min
[Parallel(n_jobs=-1)]: Done   7 out of  10 | elapsed:  3.1min remaining:  1.3min
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:  3.9min finished


In [52]:
print('Cross Validation Spread: ', score)
print('Cross Validation Mean: ', score.mean())
print('Cross Validation STDV: ', np.std(score))

Cross Validation Spread:  [0.63579103 0.63764357 0.63208596 0.63016491 0.63572355 0.6386212
 0.63843588 0.62861379 0.64003707 0.63410565]
Cross Validation Mean:  0.6351222611775732
Cross Validation STDV:  0.0036294907152368524


In [53]:
from xgboost import XGBClassifier

xgbc = XGBClassifier(booster='gbtree', colsample_bylevel=1, colsample_bytree=1, max_delta_step=0, 
                     max_depth=17, min_child_weight=1, missing=None, n_estimators=100, n_jobs=-1, 
                     objective='multi:softmax',num_class=3,andom_state=42, reg_alpha=0, reg_lambda=1, 
                     scale_pos_weight=1,seed=42, silent=True, subsample=1, eval_metric='merror')

In [56]:
from sklearn.model_selection import RandomizedSearchCV

param_distributions = {
    'base_score': np.arange(start=0.3, stop=0.72, step=0.02),
    'max_depth': np.arange(start=1, stop=100),
    'n_estimators': np.arange(start=50, stop=500),
}

RSCV = RandomizedSearchCV(estimator=xgbc, 
                          param_distributions=param_distributions, 
                          n_iter=120, 
                          scoring='accuracy', 
                          n_jobs=-1, 
                          cv=5, 
                          verbose=10, 
                          random_state=42, 
                          return_train_score=True)

X = df.drop(columns='status_group')
y = df['status_group']

RSCV.fit(X, y)

Fitting 5 folds for each of 120 candidates, totalling 600 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:  3.2min
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed: 11.2min
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed: 25.1min
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed: 29.2min
[Parallel(n_jobs=-1)]: Done  45 tasks      | elapsed: 33.5min
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed: 49.6min
[Parallel(n_jobs=-1)]: Done  69 tasks      | elapsed: 58.6min
[Parallel(n_jobs=-1)]: Done  82 tasks      | elapsed: 65.4min
[Parallel(n_jobs=-1)]: Done  97 tasks      | elapsed: 79.8min
[Parallel(n_jobs=-1)]: Done 112 tasks      | elapsed: 87.1min
[Parallel(n_jobs=-1)]: Done 129 tasks      | elapsed: 108.4min
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed: 129.0min
[Parallel(n_jobs=-1)]: Done 165 tasks      | elapsed: 146.2min
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:

RandomizedSearchCV(cv=5, error_score='raise-deprecating',
                   estimator=XGBClassifier(andom_state=42, base_score=0.5,
                                           booster='gbtree',
                                           colsample_bylevel=1,
                                           colsample_bynode=1,
                                           colsample_bytree=1,
                                           eval_metric='merror', gamma=0,
                                           learning_rate=0.1, max_delta_step=0,
                                           max_depth=17, min_child_weight=1,
                                           missing=None, n_estimators=100,
                                           n_jobs=-1, nthread=None, num_class=3,
                                           obje...
       427, 428, 429, 430, 431, 432, 433, 434, 435, 436, 437, 438, 439,
       440, 441, 442, 443, 444, 445, 446, 447, 448, 449, 450, 451, 452,
       453, 454, 455, 456, 457, 45

In [58]:
results_RSCV = pd.DataFrame(RSCV.cv_results_)
results_RSCV.sort_values(by='rank_test_score').head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_n_estimators,param_max_depth,param_base_score,params,split0_test_score,split1_test_score,...,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,split3_train_score,split4_train_score,mean_train_score,std_train_score
86,179.836953,4.776229,7.239346,0.837258,439,9,0.66,"{'n_estimators': 439, 'max_depth': 9, 'base_sc...",0.81008,0.807671,...,0.808194,0.0033,1,0.939868,0.944107,0.942602,0.944504,0.946148,0.943446,0.002115
22,108.149912,4.663385,3.7081,0.419618,165,13,0.62,"{'n_estimators': 165, 'max_depth': 13, 'base_s...",0.809802,0.802853,...,0.805878,0.002919,2,0.958839,0.960067,0.961318,0.959374,0.962454,0.96041,0.001316
80,99.125538,5.093529,3.53205,0.588358,142,14,0.7,"{'n_estimators': 142, 'max_depth': 14, 'base_s...",0.809802,0.80239,...,0.805433,0.003151,3,0.964861,0.963935,0.965116,0.963705,0.966345,0.964793,0.000942
93,46.376212,0.366455,1.08035,0.089693,86,11,0.42,"{'n_estimators': 86, 'max_depth': 11, 'base_sc...",0.807022,0.801927,...,0.804951,0.002914,4,0.904823,0.907787,0.908204,0.905846,0.91122,0.907576,0.002204
9,55.441087,1.074257,0.971262,0.046881,137,9,0.36,"{'n_estimators': 137, 'max_depth': 9, 'base_sc...",0.804614,0.803224,...,0.803654,0.003335,5,0.882401,0.886639,0.883906,0.882035,0.888938,0.884784,0.002634


In [61]:
RSCV.best_estimator_

XGBClassifier(andom_state=42, base_score=0.6600000000000004, booster='gbtree',
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
              eval_metric='merror', gamma=0, learning_rate=0.1,
              max_delta_step=0, max_depth=9, min_child_weight=1, missing=None,
              n_estimators=439, n_jobs=-1, nthread=None, num_class=3,
              objective='multi:softprob', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=1, seed=42, silent=True,
              subsample=1, verbosity=1)

In [None]:
from sklearn.model_selection import GridSearchCV

param_distributions = {
    'base_score': [0.65],
    'max_depth': [9,10,11,12,13,14],
    'gamma': [0,1,5],
    'n_estimators': [137, 142, 165]
}




GSCV = GridSearchCV(estimator=xgbc, 
                    param_grid=param_distributions, 
                    scoring='accuracy', 
                    n_jobs=-1,
                    refit=True, 
                    cv=5, 
                    verbose=10, 
                    return_train_score=True)

X = df.drop(columns='status_group')
y = df['status_group']

GSCV.fit(X, y)

Fitting 5 folds for each of 54 candidates, totalling 270 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:  2.6min
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:  2.8min
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:  5.4min
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  7.0min
[Parallel(n_jobs=-1)]: Done  45 tasks      | elapsed:  9.1min
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed: 10.9min
[Parallel(n_jobs=-1)]: Done  69 tasks      | elapsed: 14.0min
[Parallel(n_jobs=-1)]: Done  82 tasks      | elapsed: 17.5min
[Parallel(n_jobs=-1)]: Done  97 tasks      | elapsed: 19.8min
[Parallel(n_jobs=-1)]: Done 112 tasks      | elapsed: 22.0min
[Parallel(n_jobs=-1)]: Done 129 tasks      | elapsed: 24.8min
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed: 27.8min


In [None]:
results_GSCV = pd.DataFrame(GSCV.cv_results_)
results_sort_values(by='rank_test_score').head()