In [1]:
import pandas as pd

In [2]:
# Load the training set and preview its first rows.
df = pd.read_csv('data/Train_v2.csv')
df.head()

Unnamed: 0,country,year,uniqueid,bank_account,location_type,cellphone_access,household_size,age_of_respondent,gender_of_respondent,relationship_with_head,marital_status,education_level,job_type
0,Kenya,2018,uniqueid_1,Yes,Rural,Yes,3,24,Female,Spouse,Married/Living together,Secondary education,Self employed
1,Kenya,2018,uniqueid_2,No,Rural,No,5,70,Female,Head of Household,Widowed,No formal education,Government Dependent
2,Kenya,2018,uniqueid_3,Yes,Urban,Yes,5,26,Male,Other relative,Single/Never Married,Vocational/Specialised training,Self employed
3,Kenya,2018,uniqueid_4,No,Rural,Yes,5,34,Female,Head of Household,Married/Living together,Primary education,Formally employed Private
4,Kenya,2018,uniqueid_5,No,Urban,No,8,26,Male,Child,Single/Never Married,Primary education,Informally employed


In [3]:
# No missing data: every column reports the full 23524 non-null entries.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23524 entries, 0 to 23523
Data columns (total 13 columns):
country                   23524 non-null object
year                      23524 non-null int64
uniqueid                  23524 non-null object
bank_account              23524 non-null object
location_type             23524 non-null object
cellphone_access          23524 non-null object
household_size            23524 non-null int64
age_of_respondent         23524 non-null int64
gender_of_respondent      23524 non-null object
relationship_with_head    23524 non-null object
marital_status            23524 non-null object
education_level           23524 non-null object
job_type                  23524 non-null object
dtypes: int64(3), object(10)
memory usage: 2.3+ MB


In [4]:
# Use undersampling (currently disabled)
# n_samples = df['bank_account'].value_counts().min()
# df = pd.concat([df[df['bank_account'] == value].sample(n=n_samples, random_state=42)
#            for value in df['bank_account'].unique()]).reset_index(drop=True).copy()

In [4]:
# Confirm the row/column count (unchanged, since the undersampling cell above is commented out).
df.shape

(23524, 13)

# Form the train-test data

In [5]:
def transform(df, columns=None):
    """Build the model feature matrix from a raw survey DataFrame.

    Three two-valued text columns are converted to booleans, three numeric
    columns are passed through unchanged, and four categorical columns are
    one-hot encoded with ``pd.get_dummies``.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data containing the columns named in ``base_cols`` and
        ``categorical_cols`` below.
    columns : sequence of str, optional
        Column layout to enforce on the result. ``pd.get_dummies`` only
        creates columns for categories present in ``df``, so two files with
        different categories yield different layouts. When given, the result
        is reindexed to exactly these columns: missing ones are filled with
        0 and extra ones are dropped. Pass the training matrix's columns when
        transforming a test or scoring file. Default ``None`` keeps the
        columns as produced.

    Returns
    -------
    pandas.DataFrame
        One row per row of ``df``.

    Raises
    ------
    AssertionError
        If the index-based merge changes the number of rows (which happens
        when ``df`` has a non-unique index).
    """
    base_cols = ['location_type', 'year', 'cellphone_access', 'household_size',
                 'age_of_respondent', 'gender_of_respondent']
    X = df[base_cols].copy()

    # Encode the two-valued text columns as booleans.
    X['cellphone_access'] = X['cellphone_access'] == 'Yes'
    X['location_type'] = X['location_type'] == 'Rural'
    X['gender_of_respondent'] = X['gender_of_respondent'] == 'Male'

    # One-hot encode the multi-valued categorical columns.
    categorical_cols = ['job_type', 'education_level', 'country', 'marital_status']
    X_cat = pd.get_dummies(df[categorical_cols])

    # Both frames carry df's index, so an index-on-index merge pairs each row
    # with its own dummies.
    merged_X = pd.merge(X, X_cat, how='inner', left_index=True, right_index=True)
    assert len(X) == len(merged_X), 'index merge changed the number of rows'

    if columns is not None:
        # Align to a reference layout (typically the training columns).
        merged_X = merged_X.reindex(columns=list(columns), fill_value=0)
    return merged_X

In [6]:
# Check for class imbalance in the target column.
y = df['bank_account']
y.value_counts()

No     20212
Yes     3312
Name: bank_account, dtype: int64

In [7]:
# Build the feature matrix for the full training file and preview it.
X = transform(df)
X.head()

Unnamed: 0,location_type,year,cellphone_access,household_size,age_of_respondent,gender_of_respondent,job_type_Dont Know/Refuse to answer,job_type_Farming and Fishing,job_type_Formally employed Government,job_type_Formally employed Private,...,education_level_Vocational/Specialised training,country_Kenya,country_Rwanda,country_Tanzania,country_Uganda,marital_status_Divorced/Seperated,marital_status_Dont know,marital_status_Married/Living together,marital_status_Single/Never Married,marital_status_Widowed
0,True,2018,True,3,24,False,0,0,0,0,...,0,1,0,0,0,0,0,1,0,0
1,True,2018,False,5,70,False,0,0,0,0,...,0,1,0,0,0,0,0,0,0,1
2,False,2018,True,5,26,True,0,0,0,0,...,1,1,0,0,0,0,0,0,1,0
3,True,2018,True,5,34,False,0,0,0,1,...,0,1,0,0,0,0,0,1,0,0
4,False,2018,False,8,26,True,0,0,0,0,...,0,1,0,0,0,0,0,0,1,0


In [8]:
# Check the dtypes and non-null counts of the engineered features.
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23524 entries, 0 to 23523
Data columns (total 31 columns):
location_type                                      23524 non-null bool
year                                               23524 non-null int64
cellphone_access                                   23524 non-null bool
household_size                                     23524 non-null int64
age_of_respondent                                  23524 non-null int64
gender_of_respondent                               23524 non-null bool
job_type_Dont Know/Refuse to answer                23524 non-null uint8
job_type_Farming and Fishing                       23524 non-null uint8
job_type_Formally employed Government              23524 non-null uint8
job_type_Formally employed Private                 23524 non-null uint8
job_type_Government Dependent                      23524 non-null uint8
job_type_Informally employed                       23524 non-null uint8
job_type_No Income            

In [9]:
from sklearn.model_selection import train_test_split

In [10]:
# Split features and labels in a single call so their rows stay aligned by
# construction, rather than relying on two independent calls shuffling
# identically. With the same random_state this yields the same split.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [12]:
# Sanity check: features and labels must have the same number of training rows.
X_train.shape, y_train.shape

((17643, 31), (17643,))

In [15]:
# # As said in other blog posts, scaling doesn't affect the performance of XGBoost!
# from sklearn.preprocessing import MinMaxScaler
# scaler = MinMaxScaler()
# X_train = scaler.fit_transform(X_train)
# X_test = scaler.transform(X_test)

In [13]:
from sklearn.model_selection import cross_validate

In [14]:
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

In [18]:
from sklearn.model_selection import GridSearchCV

# Exhaustive 5-fold grid search over the main XGBoost tree hyper-parameters
# (4 * 4 * 5 * 3 = 240 candidate settings).
clf = XGBClassifier(n_estimators=100)
params = {
    # Literal values avoid float artefacts such as 0.30000000000000004,
    # which `0.1 * i` produces for i == 3.
    'learning_rate': [0.1, 0.2, 0.3, 0.4],
    'max_depth': [3, 5, 7, 9],
    'gamma': [0.0, 0.1, 0.2, 0.3, 0.4],
    'min_child_weight': [1, 3, 5],
}
gsearch = GridSearchCV(clf, param_grid=params, cv=5, n_jobs=-1)
gsearch.fit(X_train, y_train);

In [19]:
# Take the best model found by the grid search and display its parameters.
clf = gsearch.best_estimator_
clf

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=1, gamma=0.4,
       learning_rate=0.30000000000000004, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
       nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=None, subsample=1, verbosity=1)

In [20]:
# Score of the grid-search model on the training split.
# The figure below is a value noted by the author, presumably from an earlier run (TODO confirm):
# 0.88896446182622
clf.score(X_train, y_train)

0.8942356742050671

In [21]:
# Score of the grid-search model on the held-out split.
# The figure below is a value noted by the author, presumably from an earlier run (TODO confirm):
# 0.891174970243156
clf.score(X_test, y_test)

0.8893045400442102

# Other model

In [15]:
# Hand-tuned alternative to the grid-search model, fitted on the training split.
clf = XGBClassifier(
    max_depth=5,
    min_child_weight=5,
    gamma=0.5,
    subsample=0.6,         # used to be equal to 1
    colsample_bytree=0.6,  # used to be 1
).fit(X_train, y_train)

In [16]:
# Score of the hand-tuned model on the training split.
# NOTE(review): the figure below is identical to the one noted in the grid-search
# section, so it was presumably copied from there rather than measured for this model.
# 0.88896446182622
clf.score(X_train, y_train)

0.8933854786600918

In [17]:
# Score of the hand-tuned model on the held-out split.
# NOTE(review): the figure below is identical to the one noted in the grid-search
# section, so it was presumably copied from there rather than measured for this model.
# 0.891174970243156
clf.score(X_test, y_test)

0.8896446182622003

# Make predictions

In [22]:
# Refit whichever model `clf` is currently bound to before predicting.
# NOTE(review): the recorded output shows the grid-search parameters (gamma=0.4,
# max_depth=3), but in top-to-bottom order `clf` is the hand-tuned "Other model"
# classifier at this point. Confirm which model the submission should use.
clf.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=1, gamma=0.4,
       learning_rate=0.30000000000000004, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
       nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=None, subsample=1, verbosity=1)

In [18]:
test_df = pd.read_csv('data/Test_v2.csv')
# One-hot encoding only creates columns for categories present in the file, so
# align the test features to the training layout: dummy columns missing from
# the test file are filled with 0, columns unseen in training are dropped, and
# the column order matches X_train.
# NOTE: this rebinds X, which previously held the training features.
X = transform(test_df).reindex(columns=X_train.columns, fill_value=0)

In [19]:
# Encode predictions as integers: 'No' -> 0, any other label -> 1.
pred = (clf.predict(X) != 'No').astype(int)

In [20]:
# Preview the raw test data (it has no bank_account column).
test_df.head()

Unnamed: 0,country,year,uniqueid,location_type,cellphone_access,household_size,age_of_respondent,gender_of_respondent,relationship_with_head,marital_status,education_level,job_type
0,Kenya,2018,uniqueid_6056,Urban,Yes,3,30,Male,Head of Household,Married/Living together,Secondary education,Formally employed Government
1,Kenya,2018,uniqueid_6060,Urban,Yes,7,51,Male,Head of Household,Married/Living together,Vocational/Specialised training,Formally employed Private
2,Kenya,2018,uniqueid_6065,Rural,No,3,77,Female,Parent,Married/Living together,No formal education,Remittance Dependent
3,Kenya,2018,uniqueid_6072,Rural,No,6,39,Female,Head of Household,Married/Living together,Primary education,Remittance Dependent
4,Kenya,2018,uniqueid_6073,Urban,No,3,16,Male,Child,Single/Never Married,Secondary education,Remittance Dependent


In [21]:
# Build the submission key "<uniqueid> x <country>" with vectorised string
# concatenation instead of a row-wise apply. astype(str) keeps the same text
# that str.format would produce for non-string values.
id_col = test_df['uniqueid'].astype(str) + ' x ' + test_df['country'].astype(str)

In [22]:
# Assemble the two-column submission table: key first, prediction second.
submission_df = pd.DataFrame(dict(uniqueid=id_col, bank_account=pred))

In [23]:
# Write the submission file without the DataFrame index column.
submission_df.to_csv('submission-6.csv', index=False)