In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# machine learning
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import (
    OneHotEncoder,
    StandardScaler,
    MinMaxScaler, 
    PolynomialFeatures,
    KBinsDiscretizer
)
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.linear_model import LogisticRegression
import warnings
warnings.filterwarnings('ignore')

In [2]:
fina = pd.read_csv('data/Train.csv')
fina.head()

Unnamed: 0,country,year,uniqueid,bank_account,location_type,cellphone_access,household_size,age_of_respondent,gender_of_respondent,relationship_with_head,marital_status,education_level,job_type
0,Kenya,2018,uniqueid_1,Yes,Rural,Yes,3,24,Female,Spouse,Married/Living together,Secondary education,Self employed
1,Kenya,2018,uniqueid_2,No,Rural,No,5,70,Female,Head of Household,Widowed,No formal education,Government Dependent
2,Kenya,2018,uniqueid_3,Yes,Urban,Yes,5,26,Male,Other relative,Single/Never Married,Vocational/Specialised training,Self employed
3,Kenya,2018,uniqueid_4,No,Rural,Yes,5,34,Female,Head of Household,Married/Living together,Primary education,Formally employed Private
4,Kenya,2018,uniqueid_5,No,Urban,No,8,26,Male,Child,Single/Never Married,Primary education,Informally employed


In [3]:
fina = fina.drop(['country','year','uniqueid','relationship_with_head'], axis=1)

In [4]:
le = LabelEncoder()
fina['bank_account'] = le.fit_transform(fina['bank_account'])

X = fina.drop('bank_account', axis=1)
y = fina['bank_account']

In [5]:
fina.dtypes

bank_account             int64
location_type           object
cellphone_access        object
household_size           int64
age_of_respondent        int64
gender_of_respondent    object
marital_status          object
education_level         object
job_type                object
dtype: object

In [6]:
X_train, X_test,y_train, y_test = train_test_split(X,y , test_size= 0.30, stratify=y)

In [7]:
num_features = list(X_train.select_dtypes(exclude=['object']).columns)
num_features

['household_size', 'age_of_respondent']

In [8]:
cat_features = list(X_train.select_dtypes(include=['object']).columns)
cat_features

['location_type',
 'cellphone_access',
 'gender_of_respondent',
 'marital_status',
 'education_level',
 'job_type']

In [9]:
num_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaling', StandardScaler())
    ])
cat_transformer = Pipeline(steps=
                           [
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(drop='first', handle_unknown='ignore'))
    ])

In [10]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num_transformer', num_transformer, num_features),
        ('cat_transformer', cat_transformer, cat_features)
    ])

In [11]:
baseline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('classifier', LogisticRegression(solver='saga', max_iter=10_000))
    ])

In [12]:
baseline.fit(X_train, y_train)

In [213]:
baseline_ran = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('classifier', RandomForestClassifier(n_estimators=100, random_state=42, n_jobs = -1, verbose = 1))])

In [174]:
baseline_ran.fit(X_train, y_train)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    1.2s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    2.7s finished


In [175]:
y_ran= baseline_ran.predict(X_train)
y_ra = baseline_ran.predict(X_test)

[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    0.1s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    0.1s finished


In [176]:
baseline_ran.score(X_train, y_train)

[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    0.1s finished


0.9710312158386979

In [177]:
baseline_ran.score(X_test,y_test)

[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    0.1s finished


0.8635590818928875

In [184]:
print(classification_report(y_train, y_ran))

              precision    recall  f1-score   support

           0       0.97      0.99      0.98     14148
           1       0.96      0.83      0.89      2318

    accuracy                           0.97     16466
   macro avg       0.97      0.91      0.94     16466
weighted avg       0.97      0.97      0.97     16466



In [185]:
print(classification_report(y_test, y_ra))

              precision    recall  f1-score   support

           0       0.90      0.94      0.92      6064
           1       0.52      0.40      0.45       994

    accuracy                           0.86      7058
   macro avg       0.71      0.67      0.69      7058
weighted avg       0.85      0.86      0.86      7058



In [13]:
param_grid = {
    'preprocessor__num_transformer__imputer__strategy': ['mean','median'],
    'preprocessor__num_transformer__scaling': [StandardScaler(),MinMaxScaler(), None],
    'classifier__penalty': ['l1', 'l2'],
    'classifier__C': [0.1, 1.0,10.]
}

In [14]:
from sklearn.model_selection import GridSearchCV

In [15]:
gscv = GridSearchCV(
    estimator=baseline,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5, 
    n_jobs=-1,
    verbose=1
)

In [16]:
gscv.fit(X_train, y_train)

Fitting 5 folds for each of 36 candidates, totalling 180 fits


In [17]:
pd.DataFrame(gscv.cv_results_).head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_classifier__C,param_classifier__penalty,param_preprocessor__num_transformer__imputer__strategy,param_preprocessor__num_transformer__scaling,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,6.455817,0.57302,0.042875,0.008153,0.1,l1,mean,StandardScaler(),"{'classifier__C': 0.1, 'classifier__penalty': ...",0.881906,0.880049,0.886729,0.879745,0.878227,0.881331,0.002942,21
1,2.572003,0.10231,0.036425,0.007072,0.1,l1,mean,MinMaxScaler(),"{'classifier__C': 0.1, 'classifier__penalty': ...",0.881603,0.877923,0.886729,0.878227,0.877316,0.880359,0.003518,23
2,17.102521,0.64812,0.040363,0.00391,0.1,l1,mean,,"{'classifier__C': 0.1, 'classifier__penalty': ...",0.880085,0.874279,0.886122,0.879441,0.874582,0.878902,0.004332,27
3,6.953498,0.696674,0.032732,0.004041,0.1,l1,median,StandardScaler(),"{'classifier__C': 0.1, 'classifier__penalty': ...",0.881906,0.880049,0.886729,0.879745,0.878227,0.881331,0.002942,21
4,2.881768,0.17602,0.034666,0.003374,0.1,l1,median,MinMaxScaler(),"{'classifier__C': 0.1, 'classifier__penalty': ...",0.881603,0.877923,0.886729,0.878227,0.877316,0.880359,0.003518,23


In [18]:
gscv.best_params_

{'classifier__C': 1.0,
 'classifier__penalty': 'l1',
 'preprocessor__num_transformer__imputer__strategy': 'mean',
 'preprocessor__num_transformer__scaling': StandardScaler()}

In [19]:
round(gscv.best_score_,6)

0.882181

In [20]:
best_model = gscv.best_estimator_
best_model

In [21]:
best_model.fit(X_train,y_train);

In [22]:
training_score = best_model.score(X_train,y_train)

# test score
test_score = best_model.score(X_test,y_test)

print(f'Train score: {round(training_score,6)}')
print(f'Test score : {round(test_score,6)}')

Train score: 0.882364
Test score : 0.884953


In [23]:
y_pred = best_model.predict(X_test)

In [24]:
print(classification_report(y_pred, y_test))

              precision    recall  f1-score   support

           0       0.98      0.90      0.94      6608
           1       0.32      0.70      0.44       450

    accuracy                           0.88      7058
   macro avg       0.65      0.80      0.69      7058
weighted avg       0.94      0.88      0.90      7058

