In [9]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.tree import DecisionTreeClassifier 

In [10]:
df = pd.read_csv("Financial_inclusion_dataset(1).csv")

In [11]:
df.head()

Unnamed: 0,country,year,uniqueid,bank_account,location_type,cellphone_access,household_size,age_of_respondent,gender_of_respondent,relationship_with_head,marital_status,education_level,job_type
0,Kenya,2018,uniqueid_1,Yes,Rural,Yes,3,24,Female,Spouse,Married/Living together,Secondary education,Self employed
1,Kenya,2018,uniqueid_2,No,Rural,No,5,70,Female,Head of Household,Widowed,No formal education,Government Dependent
2,Kenya,2018,uniqueid_3,Yes,Urban,Yes,5,26,Male,Other relative,Single/Never Married,Vocational/Specialised training,Self employed
3,Kenya,2018,uniqueid_4,No,Rural,Yes,5,34,Female,Head of Household,Married/Living together,Primary education,Formally employed Private
4,Kenya,2018,uniqueid_5,No,Urban,No,8,26,Male,Child,Single/Never Married,Primary education,Informally employed


In [12]:
df = df.drop(columns=['year', 'uniqueid', 'relationship_with_head', 'marital_status'], axis=1)

In [13]:
df['education_level'].unique()

array(['Secondary education', 'No formal education',
       'Vocational/Specialised training', 'Primary education',
       'Tertiary education', 'Other/Dont know/RTA'], dtype=object)

In [14]:
df.head()

Unnamed: 0,country,bank_account,location_type,cellphone_access,household_size,age_of_respondent,gender_of_respondent,education_level,job_type
0,Kenya,Yes,Rural,Yes,3,24,Female,Secondary education,Self employed
1,Kenya,No,Rural,No,5,70,Female,No formal education,Government Dependent
2,Kenya,Yes,Urban,Yes,5,26,Male,Vocational/Specialised training,Self employed
3,Kenya,No,Rural,Yes,5,34,Female,Primary education,Formally employed Private
4,Kenya,No,Urban,No,8,26,Male,Primary education,Informally employed


In [15]:
le = LabelEncoder()

In [17]:
for col in ['country', 'bank_account', 'location_type', 'cellphone_access', 'household_size', 'age_of_respondent', 
            'gender_of_respondent', 'education_level', 'job_type']:
    if col in df.columns:
        df[col] = le.fit_transform(df[col])

In [25]:
df.head()

Unnamed: 0,country,bank_account,location_type,cellphone_access,household_size,age_of_respondent,gender_of_respondent,education_level,job_type
0,0,1,0,1,2,8,0,3,9
1,0,0,0,0,4,54,0,0,4
2,0,1,1,1,4,10,1,5,9
3,0,0,0,1,4,18,0,2,3
4,0,0,1,0,7,10,1,2,5


In [27]:
df['bank_account'].value_counts(normalize=True)

bank_account
0    0.859208
1    0.140792
Name: proportion, dtype: float64

In [29]:
X = df.drop(columns='bank_account', axis=1)

In [31]:
y = df['bank_account']

In [33]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=30)

In [35]:
smote = SMOTE()
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

In [37]:
pipeline = Pipeline([
    ('ss', StandardScaler()),
    ('rfc', RandomForestClassifier())
])

In [39]:
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=30)

In [41]:
params = {
    'rfc__n_estimators': range(10, 100, 10),
    'rfc__max_depth': range(10, 50, 10)
}

In [43]:
model = GridSearchCV(
    pipeline,
    param_grid = params,
    cv = cv,
    n_jobs = 5,
    verbose =1
)

In [45]:
model.fit(X_train_smote, y_train_smote)

Fitting 5 folds for each of 36 candidates, totalling 180 fits


In [46]:
model.score(X_train_smote, y_train_smote)

0.9508308110445364

In [47]:
y_pred = model.predict(X_test)

In [48]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.92      0.84      0.88      4023
           1       0.37      0.55      0.44       682

    accuracy                           0.80      4705
   macro avg       0.65      0.70      0.66      4705
weighted avg       0.84      0.80      0.82      4705



In [49]:
pipeline1 = Pipeline([
    ('ss', StandardScaler()),
    ('dtc', DecisionTreeClassifier())
])

In [50]:
params1 = {
    'dtc__max_depth': range(10, 50, 10),
    'dtc__min_samples_split': range(10, 50,10),
    'dtc__min_samples_leaf': range(10, 50,10),
}

In [51]:
model1 = GridSearchCV(
    pipeline1,
    param_grid = params1,
    cv = cv,
    n_jobs = 5,
    verbose =1
)

In [52]:
model1.fit(X_train_smote, y_train_smote)

Fitting 5 folds for each of 64 candidates, totalling 320 fits


In [53]:
model1.score(X_train_smote, y_train_smote)

0.8560442275619248

In [54]:
y_pred1 = model1.predict(X_test)

In [55]:
print(classification_report(y_test, y_pred1))

              precision    recall  f1-score   support

           0       0.93      0.80      0.86      4023
           1       0.35      0.63      0.45       682

    accuracy                           0.78      4705
   macro avg       0.64      0.72      0.66      4705
weighted avg       0.84      0.78      0.80      4705



In [56]:
import joblib

In [57]:
joblib.dump(model, 'financial_inclusion_le.pkl')

['financial_inclusion_le.pkl']

In [58]:
joblib.dump(le, 'financial_inclusion_le.pkl')

['financial_inclusion_le.pkl']