In [124]:
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
%matplotlib inline
%pylab inline
pylab.rcParams['figure.figsize'] = (10,6)
color = sns.color_palette()

from tpot import TPOTClassifier
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from sklearn import metrics

%pylab is deprecated, use %matplotlib inline and import the required libraries.
Populating the interactive namespace from numpy and matplotlib


`%matplotlib` prevents importing * from pylab and numpy
  warn("pylab import has clobbered these variables: %s"  % clobbered +


In [125]:
# customize the display area of the dataset
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

In [126]:
df = pd.read_csv('Train.csv')

In [127]:
df.head()

Unnamed: 0,country,year,uniqueid,bank_account,location_type,cellphone_access,household_size,age_of_respondent,gender_of_respondent,relationship_with_head,marital_status,education_level,job_type
0,Kenya,2018,uniqueid_1,Yes,Rural,Yes,3,24,Female,Spouse,Married/Living together,Secondary education,Self employed
1,Kenya,2018,uniqueid_2,No,Rural,No,5,70,Female,Head of Household,Widowed,No formal education,Government Dependent
2,Kenya,2018,uniqueid_3,Yes,Urban,Yes,5,26,Male,Other relative,Single/Never Married,Vocational/Specialised training,Self employed
3,Kenya,2018,uniqueid_4,No,Rural,Yes,5,34,Female,Head of Household,Married/Living together,Primary education,Formally employed Private
4,Kenya,2018,uniqueid_5,No,Urban,No,8,26,Male,Child,Single/Never Married,Primary education,Informally employed


In [128]:
df.shape

(23524, 13)

In [129]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23524 entries, 0 to 23523
Data columns (total 13 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   country                 23524 non-null  object
 1   year                    23524 non-null  int64 
 2   uniqueid                23524 non-null  object
 3   bank_account            23524 non-null  object
 4   location_type           23524 non-null  object
 5   cellphone_access        23524 non-null  object
 6   household_size          23524 non-null  int64 
 7   age_of_respondent       23524 non-null  int64 
 8   gender_of_respondent    23524 non-null  object
 9   relationship_with_head  23524 non-null  object
 10  marital_status          23524 non-null  object
 11  education_level         23524 non-null  object
 12  job_type                23524 non-null  object
dtypes: int64(3), object(10)
memory usage: 2.3+ MB


In [130]:
df.isnull().sum()

country                   0
year                      0
uniqueid                  0
bank_account              0
location_type             0
cellphone_access          0
household_size            0
age_of_respondent         0
gender_of_respondent      0
relationship_with_head    0
marital_status            0
education_level           0
job_type                  0
dtype: int64

From the above analysis, there's no missing value in the dataset

In [131]:
for i in df.columns:
    print(i, df[i].unique())
    print("**********************")
    print("***********************")

country ['Kenya' 'Rwanda' 'Tanzania' 'Uganda']
**********************
***********************
year [2018 2016 2017]
**********************
***********************
uniqueid ['uniqueid_1' 'uniqueid_2' 'uniqueid_3' ... 'uniqueid_8757'
 'uniqueid_8758' 'uniqueid_8759']
**********************
***********************
bank_account ['Yes' 'No']
**********************
***********************
location_type ['Rural' 'Urban']
**********************
***********************
cellphone_access ['Yes' 'No']
**********************
***********************
household_size [ 3  5  8  7  1  6  4 10  2 11  9 12 16 15 13 14 21 18 17 20]
**********************
***********************
age_of_respondent [ 24  70  26  34  32  42  54  76  40  69  64  31  38  47  27  48  25  21
  18  22  58  55  62  29  35  45  67  19  80  66  50  33  28  51  16  17
  30  37  59  65  46  56  52  23  43  49  44  72  53  63  39  81  78  36
  20  60  95  71  57  85  68  41  61  75  86  73  93  74  88  90  77  84
  82  89  79  83  94  87

In [132]:
df.nunique()

country                      4
year                         3
uniqueid                  8735
bank_account                 2
location_type                2
cellphone_access             2
household_size              20
age_of_respondent           85
gender_of_respondent         2
relationship_with_head       6
marital_status               5
education_level              6
job_type                    10
dtype: int64

#### Convert all binary data into 1s and 0s.

In [133]:
def bank_account_num(x):
    if x == "Yes":
        return 1
    else:
        return 0


def gender_num(x):
    if x == "Male":
        return 1
    else:
        return 0

def location_num(x):
    if x == "Urban":
        return 1
    else:
        return 0

def phone_num(x):
    if x == "Yes":
        return 1
    else: 
        return 0


#### Convert the binary data into 1s and 0s

In [134]:
df['bank_account'] = df['bank_account'].apply(lambda x: bank_account_num(x))
df['cellphone_access'] = df['cellphone_access'].apply(lambda x: phone_num(x))
df['location_type'] = df['location_type'].apply(lambda x: location_num(x))
df['gender_of_respondent'] = df['gender_of_respondent'].apply(lambda x: gender_num(x))


In [135]:
maxi = max(df['age_of_respondent'])
df['age_of_respondent'] = df['age_of_respondent'].apply(lambda x: x/maxi)
maxi = max(df['household_size'])
df['household_size'] = df['household_size'].apply(lambda x: x/maxi)

In [136]:
df.head()

Unnamed: 0,country,year,uniqueid,bank_account,location_type,cellphone_access,household_size,age_of_respondent,gender_of_respondent,relationship_with_head,marital_status,education_level,job_type
0,Kenya,2018,uniqueid_1,1,0,1,0.142857,0.24,0,Spouse,Married/Living together,Secondary education,Self employed
1,Kenya,2018,uniqueid_2,0,0,0,0.238095,0.7,0,Head of Household,Widowed,No formal education,Government Dependent
2,Kenya,2018,uniqueid_3,1,1,1,0.238095,0.26,1,Other relative,Single/Never Married,Vocational/Specialised training,Self employed
3,Kenya,2018,uniqueid_4,0,0,1,0.238095,0.34,0,Head of Household,Married/Living together,Primary education,Formally employed Private
4,Kenya,2018,uniqueid_5,0,1,0,0.380952,0.26,1,Child,Single/Never Married,Primary education,Informally employed


In [137]:
pd.get_dummies(df, columns=['relationship_with_head'], prefix=['R.W.H'])
pd.get_dummies(df, columns=['year'], prefix=["Y"])
pd.get_dummies(df, columns=["marital_status"], prefix=["MS"])
pd.get_dummies(df, columns=["education_level"], prefix=["EL"])
pd.get_dummies(df, columns=["job_type"], prefix=["JT"])
pd.get_dummies(df, columns=["country"], prefix=["C"])

Unnamed: 0,year,uniqueid,bank_account,location_type,cellphone_access,household_size,age_of_respondent,gender_of_respondent,relationship_with_head,marital_status,education_level,job_type,C_Kenya,C_Rwanda,C_Tanzania,C_Uganda
0,2018,uniqueid_1,1,0,1,0.142857,0.24,0,Spouse,Married/Living together,Secondary education,Self employed,1,0,0,0
1,2018,uniqueid_2,0,0,0,0.238095,0.70,0,Head of Household,Widowed,No formal education,Government Dependent,1,0,0,0
2,2018,uniqueid_3,1,1,1,0.238095,0.26,1,Other relative,Single/Never Married,Vocational/Specialised training,Self employed,1,0,0,0
3,2018,uniqueid_4,0,0,1,0.238095,0.34,0,Head of Household,Married/Living together,Primary education,Formally employed Private,1,0,0,0
4,2018,uniqueid_5,0,1,0,0.380952,0.26,1,Child,Single/Never Married,Primary education,Informally employed,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23519,2018,uniqueid_2113,0,0,1,0.190476,0.48,0,Head of Household,Divorced/Seperated,No formal education,Other Income,0,0,0,1
23520,2018,uniqueid_2114,0,0,1,0.095238,0.27,0,Head of Household,Single/Never Married,Secondary education,Other Income,0,0,0,1
23521,2018,uniqueid_2115,0,0,1,0.238095,0.27,0,Parent,Widowed,Primary education,Other Income,0,0,0,1
23522,2018,uniqueid_2116,0,1,1,0.333333,0.30,0,Parent,Divorced/Seperated,Secondary education,Self employed,0,0,0,1


In [138]:
df.nunique()

country                      4
year                         3
uniqueid                  8735
bank_account                 2
location_type                2
cellphone_access             2
household_size              20
age_of_respondent           85
gender_of_respondent         2
relationship_with_head       6
marital_status               5
education_level              6
job_type                    10
dtype: int64

In [139]:
df.head()

Unnamed: 0,country,year,uniqueid,bank_account,location_type,cellphone_access,household_size,age_of_respondent,gender_of_respondent,relationship_with_head,marital_status,education_level,job_type
0,Kenya,2018,uniqueid_1,1,0,1,0.142857,0.24,0,Spouse,Married/Living together,Secondary education,Self employed
1,Kenya,2018,uniqueid_2,0,0,0,0.238095,0.7,0,Head of Household,Widowed,No formal education,Government Dependent
2,Kenya,2018,uniqueid_3,1,1,1,0.238095,0.26,1,Other relative,Single/Never Married,Vocational/Specialised training,Self employed
3,Kenya,2018,uniqueid_4,0,0,1,0.238095,0.34,0,Head of Household,Married/Living together,Primary education,Formally employed Private
4,Kenya,2018,uniqueid_5,0,1,0,0.380952,0.26,1,Child,Single/Never Married,Primary education,Informally employed


In [140]:
df.columns

Index(['country', 'year', 'uniqueid', 'bank_account', 'location_type', 'cellphone_access', 'household_size', 'age_of_respondent', 'gender_of_respondent', 'relationship_with_head', 'marital_status', 'education_level', 'job_type'], dtype='object')

### Machine Learning Model

### Drop the uniqueid and bank_account

In [141]:
X = df.drop(['uniqueid', 'bank_account', "country"], axis=1)
X

Unnamed: 0,year,location_type,cellphone_access,household_size,age_of_respondent,gender_of_respondent,relationship_with_head,marital_status,education_level,job_type
0,2018,0,1,0.142857,0.24,0,Spouse,Married/Living together,Secondary education,Self employed
1,2018,0,0,0.238095,0.70,0,Head of Household,Widowed,No formal education,Government Dependent
2,2018,1,1,0.238095,0.26,1,Other relative,Single/Never Married,Vocational/Specialised training,Self employed
3,2018,0,1,0.238095,0.34,0,Head of Household,Married/Living together,Primary education,Formally employed Private
4,2018,1,0,0.380952,0.26,1,Child,Single/Never Married,Primary education,Informally employed
...,...,...,...,...,...,...,...,...,...,...
23519,2018,0,1,0.190476,0.48,0,Head of Household,Divorced/Seperated,No formal education,Other Income
23520,2018,0,1,0.095238,0.27,0,Head of Household,Single/Never Married,Secondary education,Other Income
23521,2018,0,1,0.238095,0.27,0,Parent,Widowed,Primary education,Other Income
23522,2018,1,1,0.333333,0.30,0,Parent,Divorced/Seperated,Secondary education,Self employed


In [142]:
Y = df['bank_account']
Y

0        1
1        0
2        1
3        0
4        0
        ..
23519    0
23520    0
23521    0
23522    0
23523    0
Name: bank_account, Length: 23524, dtype: int64

In [143]:
df.head()

Unnamed: 0,country,year,uniqueid,bank_account,location_type,cellphone_access,household_size,age_of_respondent,gender_of_respondent,relationship_with_head,marital_status,education_level,job_type
0,Kenya,2018,uniqueid_1,1,0,1,0.142857,0.24,0,Spouse,Married/Living together,Secondary education,Self employed
1,Kenya,2018,uniqueid_2,0,0,0,0.238095,0.7,0,Head of Household,Widowed,No formal education,Government Dependent
2,Kenya,2018,uniqueid_3,1,1,1,0.238095,0.26,1,Other relative,Single/Never Married,Vocational/Specialised training,Self employed
3,Kenya,2018,uniqueid_4,0,0,1,0.238095,0.34,0,Head of Household,Married/Living together,Primary education,Formally employed Private
4,Kenya,2018,uniqueid_5,0,1,0,0.380952,0.26,1,Child,Single/Never Married,Primary education,Informally employed


In [144]:
X_train, X_test, y_train, y_test = train_test_split(X,Y, train_size=0.75, random_state=123)

In [145]:
# from tpot import TPOTClassifier
# tpot = TPOTClassifier(generations=5, population_size=100,cv=5, subsample=0.3, verbosity=2, n_jobs=-1)
# tpot.fit(X_train, y_train)

In [146]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(bootstrap=False, criterion="entropy", max_features=0.55, min_samples_leaf=8, min_samples_split=12, n_estimators=100)
clf.fit(X_train, y_train)

ValueError: could not convert string to float: 'Spouse'

In [None]:
y_pred = clf.predict(X_test)

from sklearn.metrics import accuracy_score, f1_score, precision_score, classification_report, confusion_matrix

print("Accuracy: %.2f% " (accuracy_score(y_test, y_pred)*100))
print("F1: %.2f% " (f1_score(y_test, y_pred)))

In [None]:
import xgboost as xgb

model1 = xgb.XGBClassifier()
mode2 = xgb.XGBClasifier(n_estimators=100, max_depth=8, learning_rate=01, , subsample=0.5)

model1.fit(X_train, y_train)
y_pred = model1.predict(X_test)

from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix

print("Accuracy: %.2f%" (accuracy_score(y_test, y_preds)*100))
print("F1: %.2f% " (f1_score(y_test, y_preds)))

In [None]:
model2.fit(X_train, X_test)
y_pred = model2.predict(X_test)

