In [36]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix, accuracy_score
from xgboost import XGBClassifier
import lightgbm as lgb
from sklearn.ensemble import RandomForestClassifier
from catboost import CatBoostClassifier
%matplotlib inline

In [37]:
train = pd.read_csv('Train_v2.csv')
test = pd.read_csv('Test_v2.csv')

In [38]:
print('The shape of Train data is',train.shape)
print('-------------------------------------')
print('The shape of Test data is',test.shape)

The shape of Train data is (23524, 13)
-------------------------------------
The shape of Test data is (10086, 12)


In [39]:
train.dtypes

country                   object
year                       int64
uniqueid                  object
bank_account              object
location_type             object
cellphone_access          object
household_size             int64
age_of_respondent          int64
gender_of_respondent      object
relationship_with_head    object
marital_status            object
education_level           object
job_type                  object
dtype: object

In [40]:
train.head()

Unnamed: 0,country,year,uniqueid,bank_account,location_type,cellphone_access,household_size,age_of_respondent,gender_of_respondent,relationship_with_head,marital_status,education_level,job_type
0,Kenya,2018,uniqueid_1,Yes,Rural,Yes,3,24,Female,Spouse,Married/Living together,Secondary education,Self employed
1,Kenya,2018,uniqueid_2,No,Rural,No,5,70,Female,Head of Household,Widowed,No formal education,Government Dependent
2,Kenya,2018,uniqueid_3,Yes,Urban,Yes,5,26,Male,Other relative,Single/Never Married,Vocational/Specialised training,Self employed
3,Kenya,2018,uniqueid_4,No,Rural,Yes,5,34,Female,Head of Household,Married/Living together,Primary education,Formally employed Private
4,Kenya,2018,uniqueid_5,No,Urban,No,8,26,Male,Child,Single/Never Married,Primary education,Informally employed


In [41]:
edu = {'Tertiary education':5,'Secondary education':4,
    'Primary education':3,'Vocational/Specialised training':2,
     'Other/Dont know/RTA':1,'No formal education':0}

cat_data = ['country','relationship_with_head','marital_status','job_type']

# Label_Encoding for train data
train['bank_account'] = train['bank_account'].map({'No':0, 'Yes':1})
train['location_type'] = train['location_type'].map({'Rural':1, 'Urban':0})
train['cellphone_access'] = train['cellphone_access'].map({'No':0, 'Yes':1})
train['gender_of_respondent'] = train['gender_of_respondent'].map({'Male':1,'Female':0})
train['education_level'] = train['education_level'].map(edu)

train['age_of_respondent_'] = np.log1p(train['age_of_respondent'])
bins = [0,1,10,25]
labels = [1,2,3]
train['familysize'] = pd.cut(train['household_size'],bins=bins,labels=labels)
train['familysize'] = train['familysize'].astype('int64')

# Frequency encoding for TRAIN uniqueid
encoding1 = train.groupby('uniqueid').size()
train['enc_uniqueid'] = train.uniqueid.map(encoding1)

# Frequency encoding for TRAIN Job_types
encoding = train.groupby('job_type').size()
encoding = encoding/len(train)
train['enc_job_type'] = train.job_type.map(encoding)



# Label_Encoding for test data
test['location_type'] = test['location_type'].map({'Rural':1, 'Urban':0})
test['cellphone_access'] = test['cellphone_access'].map({'No':0, 'Yes':1})
test['gender_of_respondent'] = test['gender_of_respondent'].map({'Male':1,'Female':0})
test['education_level'] = test['education_level'].map(edu)

test['age_of_respondent_'] = np.log1p(test['age_of_respondent'])
bins = [0,1,10,25]
labels = [1,2,3]
test['familysize'] = pd.cut(test['household_size'],bins=bins,labels=labels)
test['familysize'] = test['familysize'].astype('int64')

# Frequency encoding for TEST uniqueid
encoding_test = test.groupby('uniqueid').size()
test['enc_uniqueid'] = test.uniqueid.map(encoding_test)

# Frequency encoding for TEST Job_types
encoding_test_1 = test.groupby('job_type').size()
encoding_test_1 = encoding_test_1/len(test)
test['enc_job_type'] = test.job_type.map(encoding_test_1)

#OneHot Encoding for Train Data
train = pd.get_dummies(train, prefix_sep='_', columns=cat_data)

#OneHot Encoding for Test Data
test = pd.get_dummies(test, prefix_sep='_', columns=cat_data)

In [42]:
#Dropping and Splitting columns
X_train_scaled = train.drop(['uniqueid','bank_account','age_of_respondent'], axis=1)
y = train['bank_account']
test = test.drop(['uniqueid','age_of_respondent'], axis=1)

In [43]:
test['familysize'] = test['familysize'].astype('int64')
train['familysize'] = train['familysize'].astype('int64')

In [44]:
# from sklearn.model_selection import KFold,StratifiedKFold, cross_val_score
# skf = StratifiedKFold(n_splits=10)
# from xgboost import XGBClassifier
# xgb = XGBClassifier()
# score = cross_val_score(xgb,X_train_scaled,y_train,cv=skf, scoring='accuracy')
# score.mean()

In [45]:
# errcb2=[]
# y_pred_totcb2=[]
# from sklearn.model_selection import KFold
# fold=KFold(n_splits=10)
# i=1
# for train_index, test_index in fold.split(X_train_scaled,y):
#     X_train, X_test = X_train_scaled.iloc[train_index],X_train_scaled.iloc[test_index]
#     y_train, y_test = y[train_index], y[test_index]
#     m2= XGBClassifier(min_child_weight=5, colsample_bylevel=0.8, gamma=0.5, subsample=0.6, colsample_bytree=0.6, max_depth=5)
#     m2.fit(X_train,y_train,eval_set=[(X_train,y_train),(X_test, y_test)], early_stopping_rounds=100,verbose=100)
#     preds=m2.predict(X_test)
#     print("err: ",1-accuracy_score(y_test,preds))
#     errcb2.append(1-accuracy_score(y_test,preds))
#     p2 = m2.predict(test)
#     y_pred_totcb2.append(p2)

In [46]:
#Rescalling X_train, X_test
scaler = MinMaxScaler(feature_range=(0,1))
X_train_scaled = scaler.fit_transform(X_train_scaled)
test = scaler.fit_transform(test)

#Data Splitting
X_train, X_test, y_train, y_test = train_test_split(X_train_scaled, y, stratify = y, test_size = 0.2, random_state=42) 

In [47]:
# %%time
# xgb = XGBClassifier()
# grid_param = {'min_child_weight': [1,5,10],
#              'gamma': [0.5,1,1.5,2,5],
#              'subsample': [0.6,0.8,1.0],
#               'learning_rate':[0.1,0.01,0.3],
#              'colsample_bytree': [0.6,0.8,1.0],
#              'max_depth': [3,4,5]}

# xgb_grid = GridSearchCV(xgb, grid_param)
# xgb_grid.fit(X_train,y_train)
# print(xgb_grid.best_params_)

In [48]:
# Fitting and Model Evaluation
rf = RandomForestClassifier(n_estimators=100, max_depth=5, criterion='entropy')
rf.fit(X_train,y_train)

# Validation
y_pred0 = rf.predict(X_test)

# Metrics
print(confusion_matrix(y_pred0,y_test)) #Confusion Matrix
print("Error Rate: ", 1 - accuracy_score(y_test,y_pred0))

pred_rf = rf.predict(test)

[[4021  565]
 [  22   97]]
Error Rate:  0.12476089266737511


In [51]:
# Fitting and Model Evaluation
xgb = XGBClassifier(max_depth=4,learning_rate=0.1,colsample_bylevel=0.8,colsample_bytree=0.8,
                    nthread=4, scale_pos_weight=1,seed=27, alpha=0.01,
                   subsample=0.8, gamma=0, min_child_weight=9)
xgb.fit(X_train,y_train)

# Validation
y_pred = xgb.predict(X_test)

# Metrics
print(confusion_matrix(y_pred,y_test)) #Confusion Matrix
print("Error Rate: ", 1 - accuracy_score(y_test,y_pred))

pred_xgb = xgb.predict(test)

[[3948  418]
 [  95  244]]
Error Rate:  0.10903294367693939


In [52]:
sub = pd.read_csv('Test_v2.csv')
sub_xgb = pd.DataFrame({"uniqueid": sub['uniqueid'] + " x " +sub['country'],
                   "bank_account":pred_xgb})

In [54]:
import lightgbm as lgb
lgbm = lgb.LGBMClassifier(colsample_bytree=0.1,bagging_fraction=0.8,
                          learning_rate=0.1,num_leaves=11,
                          num_iterations=200)
lgbm.fit(X_train,y_train)
y_pred1 = lgbm.predict(X_test)
print(confusion_matrix(y_test,y_pred1)) #Confusion Matrix
print("Error Rate: ", 1 - accuracy_score(y_test,y_pred1))

pred_lgb = lgbm.predict(test)



[[3966   77]
 [ 436  226]]
Error Rate:  0.10903294367693939


In [55]:
sub = pd.read_csv('Test_v2.csv')
sub_lgb = pd.DataFrame({"uniqueid": sub['uniqueid'] + " x " +sub['country'],
                   "bank_account":pred_lgb})

In [56]:

cat  = CatBoostClassifier(bootstrap_type='Bayesian',max_depth=4,learning_rate=0.01,od_wait=50, reg_lambda=3)
cat.fit(X_train,y_train)

y_pred2 = cat.predict(X_test)
print(confusion_matrix(y_test,y_pred2)) #Confusion Matrix
print("Error Rate: ", 1 - accuracy_score(y_test,y_pred2))

pred_cat = cat.predict(test)

0:	learn: 0.6849167	total: 12ms	remaining: 12s
1:	learn: 0.6774061	total: 17.7ms	remaining: 8.85s
2:	learn: 0.6701283	total: 25.8ms	remaining: 8.58s
3:	learn: 0.6609931	total: 33.8ms	remaining: 8.42s
4:	learn: 0.6520624	total: 41.5ms	remaining: 8.26s
5:	learn: 0.6433271	total: 49ms	remaining: 8.11s
6:	learn: 0.6361466	total: 56.3ms	remaining: 7.98s
7:	learn: 0.6295513	total: 64.1ms	remaining: 7.95s
8:	learn: 0.6230186	total: 71.8ms	remaining: 7.91s
9:	learn: 0.6161228	total: 78.4ms	remaining: 7.76s
10:	learn: 0.6097471	total: 86.1ms	remaining: 7.74s
11:	learn: 0.6033590	total: 93.5ms	remaining: 7.7s
12:	learn: 0.5971754	total: 101ms	remaining: 7.68s
13:	learn: 0.5910150	total: 107ms	remaining: 7.56s
14:	learn: 0.5848414	total: 116ms	remaining: 7.61s
15:	learn: 0.5796086	total: 129ms	remaining: 7.96s
16:	learn: 0.5737222	total: 136ms	remaining: 7.86s
17:	learn: 0.5680410	total: 143ms	remaining: 7.79s
18:	learn: 0.5618310	total: 150ms	remaining: 7.76s
19:	learn: 0.5556637	total: 158ms	re

174:	learn: 0.3197273	total: 1.41s	remaining: 6.65s
175:	learn: 0.3193379	total: 1.42s	remaining: 6.63s
176:	learn: 0.3189512	total: 1.43s	remaining: 6.67s
177:	learn: 0.3185907	total: 1.44s	remaining: 6.66s
178:	learn: 0.3183749	total: 1.45s	remaining: 6.66s
179:	learn: 0.3181658	total: 1.46s	remaining: 6.64s
180:	learn: 0.3178676	total: 1.46s	remaining: 6.63s
181:	learn: 0.3175605	total: 1.47s	remaining: 6.62s
182:	learn: 0.3171917	total: 1.48s	remaining: 6.6s
183:	learn: 0.3168420	total: 1.48s	remaining: 6.58s
184:	learn: 0.3164590	total: 1.49s	remaining: 6.57s
185:	learn: 0.3161454	total: 1.5s	remaining: 6.55s
186:	learn: 0.3159155	total: 1.5s	remaining: 6.53s
187:	learn: 0.3155412	total: 1.51s	remaining: 6.51s
188:	learn: 0.3153592	total: 1.51s	remaining: 6.49s
189:	learn: 0.3152034	total: 1.52s	remaining: 6.47s
190:	learn: 0.3149319	total: 1.52s	remaining: 6.45s
191:	learn: 0.3146297	total: 1.53s	remaining: 6.43s
192:	learn: 0.3142243	total: 1.53s	remaining: 6.41s
193:	learn: 0.3

350:	learn: 0.2931486	total: 3.08s	remaining: 5.69s
351:	learn: 0.2930359	total: 3.09s	remaining: 5.69s
352:	learn: 0.2929812	total: 3.09s	remaining: 5.67s
353:	learn: 0.2929541	total: 3.1s	remaining: 5.66s
354:	learn: 0.2929037	total: 3.11s	remaining: 5.64s
355:	learn: 0.2928508	total: 3.11s	remaining: 5.63s
356:	learn: 0.2927815	total: 3.12s	remaining: 5.62s
357:	learn: 0.2927247	total: 3.12s	remaining: 5.6s
358:	learn: 0.2926626	total: 3.13s	remaining: 5.58s
359:	learn: 0.2925784	total: 3.13s	remaining: 5.57s
360:	learn: 0.2924582	total: 3.14s	remaining: 5.56s
361:	learn: 0.2923344	total: 3.15s	remaining: 5.56s
362:	learn: 0.2922788	total: 3.17s	remaining: 5.56s
363:	learn: 0.2922291	total: 3.18s	remaining: 5.55s
364:	learn: 0.2921752	total: 3.18s	remaining: 5.54s
365:	learn: 0.2921594	total: 3.19s	remaining: 5.53s
366:	learn: 0.2920765	total: 3.2s	remaining: 5.52s
367:	learn: 0.2919967	total: 3.21s	remaining: 5.51s
368:	learn: 0.2919384	total: 3.21s	remaining: 5.5s
369:	learn: 0.29

526:	learn: 0.2846130	total: 4.47s	remaining: 4.01s
527:	learn: 0.2845698	total: 4.48s	remaining: 4.01s
528:	learn: 0.2845499	total: 4.5s	remaining: 4s
529:	learn: 0.2845187	total: 4.51s	remaining: 4s
530:	learn: 0.2844631	total: 4.53s	remaining: 4s
531:	learn: 0.2844558	total: 4.55s	remaining: 4s
532:	learn: 0.2844070	total: 4.56s	remaining: 4s
533:	learn: 0.2843748	total: 4.58s	remaining: 3.99s
534:	learn: 0.2843566	total: 4.58s	remaining: 3.98s
535:	learn: 0.2842923	total: 4.59s	remaining: 3.98s
536:	learn: 0.2842663	total: 4.6s	remaining: 3.96s
537:	learn: 0.2842374	total: 4.61s	remaining: 3.96s
538:	learn: 0.2842089	total: 4.62s	remaining: 3.95s
539:	learn: 0.2841892	total: 4.62s	remaining: 3.94s
540:	learn: 0.2841590	total: 4.63s	remaining: 3.93s
541:	learn: 0.2841364	total: 4.64s	remaining: 3.92s
542:	learn: 0.2841231	total: 4.64s	remaining: 3.91s
543:	learn: 0.2840612	total: 4.65s	remaining: 3.9s
544:	learn: 0.2840027	total: 4.66s	remaining: 3.89s
545:	learn: 0.2839823	total: 4

700:	learn: 0.2803677	total: 6.1s	remaining: 2.6s
701:	learn: 0.2803518	total: 6.11s	remaining: 2.59s
702:	learn: 0.2803340	total: 6.12s	remaining: 2.58s
703:	learn: 0.2803214	total: 6.13s	remaining: 2.58s
704:	learn: 0.2803101	total: 6.14s	remaining: 2.57s
705:	learn: 0.2802896	total: 6.14s	remaining: 2.56s
706:	learn: 0.2802717	total: 6.15s	remaining: 2.55s
707:	learn: 0.2802429	total: 6.16s	remaining: 2.54s
708:	learn: 0.2802131	total: 6.16s	remaining: 2.53s
709:	learn: 0.2802007	total: 6.17s	remaining: 2.52s
710:	learn: 0.2801722	total: 6.18s	remaining: 2.51s
711:	learn: 0.2801556	total: 6.18s	remaining: 2.5s
712:	learn: 0.2801379	total: 6.19s	remaining: 2.49s
713:	learn: 0.2801225	total: 6.2s	remaining: 2.48s
714:	learn: 0.2801097	total: 6.2s	remaining: 2.47s
715:	learn: 0.2800909	total: 6.21s	remaining: 2.46s
716:	learn: 0.2800630	total: 6.21s	remaining: 2.45s
717:	learn: 0.2800519	total: 6.22s	remaining: 2.44s
718:	learn: 0.2800432	total: 6.23s	remaining: 2.43s
719:	learn: 0.280

887:	learn: 0.2772271	total: 7.73s	remaining: 975ms
888:	learn: 0.2772059	total: 7.74s	remaining: 967ms
889:	learn: 0.2771867	total: 7.75s	remaining: 958ms
890:	learn: 0.2771555	total: 7.75s	remaining: 949ms
891:	learn: 0.2771418	total: 7.76s	remaining: 940ms
892:	learn: 0.2771214	total: 7.77s	remaining: 931ms
893:	learn: 0.2771077	total: 7.77s	remaining: 922ms
894:	learn: 0.2771002	total: 7.78s	remaining: 913ms
895:	learn: 0.2770841	total: 7.79s	remaining: 904ms
896:	learn: 0.2770704	total: 7.79s	remaining: 895ms
897:	learn: 0.2770555	total: 7.79s	remaining: 885ms
898:	learn: 0.2770503	total: 7.8s	remaining: 876ms
899:	learn: 0.2770386	total: 7.81s	remaining: 867ms
900:	learn: 0.2770232	total: 7.81s	remaining: 858ms
901:	learn: 0.2770037	total: 7.82s	remaining: 849ms
902:	learn: 0.2769918	total: 7.82s	remaining: 840ms
903:	learn: 0.2769793	total: 7.83s	remaining: 832ms
904:	learn: 0.2769709	total: 7.84s	remaining: 823ms
905:	learn: 0.2769606	total: 7.84s	remaining: 814ms
906:	learn: 0

In [57]:
sub = pd.read_csv('Test_v2.csv')
sub_cat = pd.DataFrame({"uniqueid": sub['uniqueid'] + " x " +sub['country'],
                   "bank_account":pred_cat})

In [59]:
sub_xgb['bank_account'].corr(sub_cat['bank_account'])

0.8909631598595562

In [60]:
sub_lgb['bank_account'].corr(sub_cat['bank_account'])

0.8608377760507137

In [34]:
# from pystacknet.pystacknet import StackNetClassifier
# model = [
#     [lgb.LGBMClassifier(colsample_bytree=0.1,bagging_fraction=0.8,
#                           learning_rate=0.1,num_leaves=11,
#                           num_iterations=200),
#     XGBClassifier(max_depth=4,learning_rate=0.1,colsample_bylevel=0.8,colsample_bytree=0.8,
#                     nthread=4, scale_pos_weight=1,seed=27, alpha=0.01,
#                    subsample=0.8, gamma=0, min_child_weight=9),
#     CatBoostClassifier(bootstrap_type='Bayesian',max_depth=4,learning_rate=0.01,od_wait=50, reg_lambda=3)],
    
#     [RandomForestClassifier(n_estimators=100, max_depth=5, criterion='entropy')]
# ]

# # Error_rate= 1 - accuracy_score(y_test,y_pred)

# model=StackNetClassifier(model, metric='accuracy', folds=4,
#                          restacking=True,
#                          use_retraining=True,
                         
#                          random_state=12345,n_jobs=1, verbose=1)

# model.fit(X_train,y_train)
# pred_stack = model.predict_proba(test)

In [58]:
# from sklearn.model_selection import KFold, StratifiedKFold
# from sklearn.model_selection import cross_val_score
# kf = KFold(n_splits=10, random_state=42)
# cross = cross_val_score(cat,X_train_scaled,y,cv=kf, scoring='accuracy')
# error_rate = 1 - cross.mean()
# print(error_rate)

In [None]:

# fea_imp = pd.DataFrame({'imp':xgb.feature_importances_, 'col': X_train_scaled.columns})
# fea_imp = fea_imp.sort_values(['imp', 'col'], ascending=[True, False]).iloc[-30:]
# _ = fea_imp.plot(kind='barh', x='col', y='imp', figsize=(20, 10))  

In [None]:
 0.10924548352816155

In [None]:
pred = xgb.predict(test)
pred

In [None]:
testpred = pd.DataFrame({'y_test':y_test,'y_pred':y_pred})
testpred.corr()

In [29]:
sub_file = pd.read_csv('Test_v2.csv')
sub = pd.DataFrame({"uniqueid": sub_file['uniqueid'] + " x " +sub_file['country'],
                   "bank_account":pred_cat})

In [35]:
# sub['bank_account'] = pred_stack

In [33]:
sub.to_csv('pred_stack.csv', index=False) #Submission File

Contact: lawrencebolu@gmail.com