In [None]:
!pip install sweetviz

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import scipy.stats as stats # t

from sklearn import model_selection as ms
from sklearn import preprocessing   as pp
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder

#Visualization
from matplotlib import pyplot as plt
import seaborn as sns
import sweetviz as sv


#Others
import pickle
import warnings

from IPython.display       import Image
from IPython.core.display  import HTML

# Silence library warnings for the rest of the session.
warnings.filterwarnings('ignore')

# Raise pandas' display limits so wide / long frames are not truncated.
for option_name, option_value in (
    ('display.max_rows', 500),
    ('display.max_columns', 500),
    ('display.width', 1000),
):
    pd.set_option(option_name, option_value)

import os

# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(root, file_name))

# Path handlers
data_path = '/kaggle/input/inclusao-financeira-na-africa/'

## 0.1. Helper Functions

In [None]:
def jupyter_settings():
    """Apply the plotting and display defaults used throughout this notebook.

    Runs the inline matplotlib/pylab magics, selects the 'bmh' matplotlib
    style with a 25x12 default figure and font size 24, injects CSS that
    stretches the notebook container to full width, removes pandas' row and
    column display limits, and finally applies seaborn's defaults.

    Takes no arguments and returns nothing; it only changes global state.
    """
    %matplotlib inline
    # NOTE(review): diagnostic_plots() uses the name `pylab` without importing
    # it; presumably that name (and `display` below) is provided by this
    # magic -- confirm before removing it.
    %pylab inline
    plt.style.use( 'bmh' )
    plt.rcParams['figure.figsize'] = [25, 12]
    plt.rcParams['font.size'] = 24
    # Widen the notebook container to the full browser width.
    display( HTML( '<style>.container { width:100% !important; }</style>') )
    # None removes the limit; this supersedes the 500-row/500-column limits
    # set with pd.set_option in the imports cell.
    pd.options.display.max_columns = None
    pd.options.display.max_rows = None
    pd.set_option( 'display.expand_frame_repr', False )
    # NOTE(review): sns.set() runs after the rcParams assignments above and may
    # reset some of them (e.g. the font size) -- confirm the intended order.
    sns.set()
    
def kde_target(var_name, df):
    """Plot the distribution of a variable, coloured by the value of the target.

    Draws one KDE curve of ``df[var_name]`` for rows where ``df['TARGET'] == 0``
    and one for rows where ``df['TARGET'] == 1``, then prints the correlation
    between the variable and the target together with the per-class medians.

    Parameters
    ----------
    var_name : str
        Name of the column of ``df`` to plot.
    df : pandas.DataFrame
        Frame holding ``var_name`` and a column named ``'TARGET'``.

    Returns
    -------
    None
    """
    # Build each row mask once and reuse it for both the medians and the plots.
    target_is_zero = df['TARGET'] == 0
    target_is_one = df['TARGET'] == 1

    # Correlation coefficient between the variable and the target
    corr = df['TARGET'].corr(df[var_name])

    # Medians for repaid vs not repaid.
    # `.loc` replaces `.ix`, which no longer exists in pandas >= 1.0.
    avg_repaid = df.loc[target_is_zero, var_name].median()
    avg_not_repaid = df.loc[target_is_one, var_name].median()

    plt.figure(figsize = (12, 6))

    # One density curve per target class
    sns.kdeplot(df.loc[target_is_zero, var_name], label = 'TARGET == 0')
    sns.kdeplot(df.loc[target_is_one, var_name], label = 'TARGET == 1')

    # Label the plot
    plt.xlabel(var_name)
    plt.ylabel('Density')
    plt.title('%s Distribution' % var_name)
    plt.legend()

    # Print out the correlation
    print('The correlation between %s and the TARGET is %0.4f' % (var_name, corr))
    # Print out the median values
    print('Median value for loan that was not repaid = %0.4f' % avg_not_repaid)
    print('Median value for loan that was repaid =     %0.4f' % avg_repaid)


def diagnostic_plots(df, variable):
    """Show a histogram and a normal Q-Q plot of ``df[variable]`` side by side.

    The histogram gives a quick look at the distribution; the Q-Q plot helps
    judge whether the variable is approximately normally distributed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the column to inspect.
    variable : str
        Name of the column of ``df`` to plot.

    Returns
    -------
    None
    """
    plt.figure(figsize=(15, 6))

    # Left panel: histogram
    plt.subplot(1, 2, 1)
    df[variable].hist()

    # Right panel: Q-Q plot against the normal distribution.
    # The imported pyplot module is passed explicitly instead of the `pylab`
    # global, so the function does not need `%pylab inline` to have been run.
    plt.subplot(1, 2, 2)
    stats.probplot(df[variable], dist="norm", plot=plt)

    plt.show()
    

jupyter_settings()

## 0.2 Loading Data

In [None]:
# Read the train and test CSV files from the competition input folder.
train_csv_path = data_path + 'train.csv'
test_csv_path = data_path + 'test.csv'

df_raw = pd.read_csv(train_csv_path, low_memory=False)
test_raw = pd.read_csv(test_csv_path, low_memory=False)

# 1.0. Descrição dos Dados

## 1.1 Loading Data

In [None]:
df1 = df_raw.copy()

In [None]:
df1.sample()

## 1.2. Data Dimensions and Columns

In [None]:
# Report the frame's dimensions, then show its column names.
n_rows, n_cols = df1.shape
print(f'number of rows: {n_rows}')
print(f'number of columns: {n_cols}')

df1.columns  # column names are already snake_case

## 1.3. Data Types

In [None]:
df1.info()

## 1.4. Check NA

In [None]:
df1.isna().sum()

## 1.5 Descriptive Statistics

In [None]:
# Build a sweetviz report from the whole of df1
report = sv.analyze(df1)
# Render the report inline in the notebook, using the full available width
report.show_notebook(w="100%", h="full") # if working in Kaggle

# 2.0 EDA

In [None]:
df2 = df1.copy()

# 3.0. FEATURE ENGINEERING

In [None]:
df3 = df2.copy()
df3.sample()

In [None]:
def idade(coluna):
    """Bucket a numeric age into one of four age-group labels.

    Returns "Jovem" for ages up to 26, "Adulto" for ages above 26 and up to
    35, "Velho" for ages above 35 and up to 48, and "Ultra Velho" for anything
    else (including values for which every comparison is false, such as NaN).
    """
    # Guard clauses: each upper bound is only reached once the previous,
    # smaller bound has already been ruled out.
    if coluna <= 26:
        return "Jovem"
    if coluna <= 35:
        return "Adulto"
    if coluna <= 48:
        return "Velho"
    return "Ultra Velho"

In [None]:
df3['age_of_respondent'] = df3['age_of_respondent'].map(idade)

## 3.1 Encoding Variables

In [None]:
# Binary flags: 1 when the cell equals the listed category, 0 for anything else.
positive_label_by_column = {
    'bank_account': 'Yes',
    'gender_of_respondent': 'Male',
    'location_type': 'Rural',
    'cellphone_access': 'Yes',
}
for column_name, positive_label in positive_label_by_column.items():
    df3[column_name] = df3[column_name].apply(
        lambda value, positive=positive_label: 1 if value == positive else 0
    )

# One-hot encoding, applied one column at a time in this fixed order.
dummy_prefix_by_column = {
    'relationship_with_head': 'rwh',
    'age_of_respondent': 'aor',
    'marital_status': 'ms',
    'job_type': 'jt',
    'country': 'c',
}
for column_name, dummy_prefix in dummy_prefix_by_column.items():
    df3 = pd.get_dummies(df3, columns=[column_name], prefix=[dummy_prefix])

# Label encoding of the education level (values are cast to str first).
education_as_text = df3['education_level'].astype(str)
df3['education_level'] = LabelEncoder().fit_transform(education_as_text)


In [None]:
# Replace household_size with its natural logarithm (overwrites the column in
# place, so re-running this cell applies the log a second time).
# NOTE(review): np.log yields -inf for 0 -- assumes household_size >= 1; TODO confirm.
df3['household_size'] = np.log(df3.household_size)

In [None]:
#df3 = pd.get_dummies(df3, columns=['education_level'], prefix = ['e'])

In [None]:
df3.head()

## 3.2 Discretization

# 4.0. Data Preparation

In [None]:
df4 = df3.copy()

In [None]:
df4.head()

## 4.1. Split dataframe into training and testing sets

In [None]:
from sklearn.model_selection import train_test_split

# Target: the encoded bank_account column.
Y = df4['bank_account'].copy()

# Features: everything except the id columns, the target, the year and the
# listed relationship_with_head dummies.
id_target_and_year_columns = ['uniqueid', 'bank_account', 'uid', 'year']
dropped_rwh_dummies = [
    'rwh_Head of Household',
    'rwh_Other non-relatives',
    'rwh_Other relative',
    'rwh_Parent',
    'rwh_Spouse',
]
X = df4.drop(id_target_and_year_columns + dropped_rwh_dummies, axis=1)

In [None]:
# 75/25 hold-out split, stratified on the target, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    train_size=0.75,
    stratify=Y,
    random_state=123,
)

#df5 = pd.concat([X_train,y_train],axis=1)

In [None]:
from boruta import BorutaPy
from sklearn.ensemble import RandomForestRegressor
# BorutaPy is given plain NumPy arrays built from the training split
X_train_n = X_train.values
y_train_n = y_train.values.ravel()


# Base estimator handed to Boruta (uses all cores)
# NOTE(review): the target (bank_account) is encoded as 0/1 earlier in this
# notebook, yet a regressor is used here -- confirm that a
# RandomForestClassifier was not intended.
rf = RandomForestRegressor( n_jobs=-1 )

# Fit Boruta on the training arrays; `boruta.support_` is read in the next cell
boruta = BorutaPy( rf, n_estimators='auto', verbose=2, random_state=42 ).fit( X_train_n, y_train_n)


In [None]:
# Boolean mask (one entry per feature column) produced by the Boruta fit.
cols_selected = boruta.support_.tolist()

# Names of the selected features
X_train_fs = X_train
selected_frame = X_train_fs.iloc[:, cols_selected]
cols_selected_boruta = selected_frame.columns.to_list()

# Names of the remaining features
all_feature_names = X_train_fs.columns
cols_not_selected_boruta = list(
    np.setdiff1d(all_feature_names, cols_selected_boruta)
)

In [None]:
# Hard-coded feature list used by every model below. It replaces the
# `cols_selected_boruta` computed from `boruta.support_` in the previous cell,
# so the modelling cells do not depend on the outcome of that Boruta run.
cols_selected_boruta = ['aor_Adulto', 'aor_Jovem', 'aor_Ultra Velho', 
                            'aor_Velho', 'c_Kenya', 'c_Rwanda', 'c_Tanzania', 
                            'c_Uganda', 'cellphone_access', 'gender_of_respondent', 
                            'household_size', 'jt_Dont Know/Refuse to answer', 
                            'jt_Farming and Fishing', 
                            'jt_Formally employed Government', 
                            'jt_Formally employed Private', 
                            'jt_Government Dependent', 'jt_Informally employed', 
                            'jt_No Income', 'jt_Other Income', 
                            'jt_Remittance Dependent', 'jt_Self employed', 
                            'location_type', 'ms_Divorced/Seperated', 
                            'ms_Dont know', 'ms_Married/Living together', 
                            'ms_Single/Never Married', 'ms_Widowed', 'rwh_Child']

In [None]:
X_train = X_train[cols_selected_boruta]
X_test = X_test[cols_selected_boruta]

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random-forest baseline. The forest is seeded (same seed as the hold-out
# split) so that the accuracy / F1 reported below are reproducible across
# kernel restarts; without a seed each run draws different feature subsets.
clf = RandomForestClassifier(
    bootstrap=False,
    criterion="entropy",
    max_features=0.55,
    min_samples_leaf=9,
    min_samples_split=12,
    n_estimators=200,
    random_state=123,
)
clf.fit(X_train, y_train)

In [None]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix

# Score the random forest on the hold-out split.
y_pred = clf.predict(X_test)

print(f"Accuracy: {accuracy_score(y_test, y_pred) * 100:.2f}%")
print(f"F1: {f1_score(y_test, y_pred) * 100:.2f}%")

In [None]:
import xgboost as xgb

# Three XGBoost candidates: library defaults, a deeper 100-tree model, and a
# 1500-tree model with a small learning rate.
model1 = xgb.XGBClassifier()

model2 = xgb.XGBClassifier(
    n_estimators=100,
    max_depth=8,
    learning_rate=0.1,
    subsample=0.5,
)

model3_params = dict(
    learning_rate=0.01,
    n_estimators=1500,
    max_depth=4,
    subsample=0.8,
    colsample_bytree=1,
    gamma=1,
)
model3 = xgb.XGBClassifier(**model3_params)

In [None]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix

# Fit the default XGBoost model and score it on the hold-out split.
model1.fit(X_train, y_train)
y_pred = model1.predict(X_test)

print(f"Accuracy: {accuracy_score(y_test, y_pred) * 100:.2f}%")
print(f"F1: {f1_score(y_test, y_pred) * 100:.2f}%")

In [None]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix

# Fit the 1500-tree XGBoost model and score it on the hold-out split.
model3.fit(X_train, y_train)
y_pred = model3.predict(X_test)

print(f"Accuracy: {accuracy_score(y_test, y_pred) * 100:.2f}%")
print(f"F1: {f1_score(y_test, y_pred) * 100:.2f}%")

# Test Submission

In [None]:
test = test_raw.copy()

In [None]:
test.sample()

## Variable Encoding

In [None]:
test['age_of_respondent'] = test['age_of_respondent'].map(idade)

In [None]:
# Binary flags: 1 when the cell equals the listed category, 0 for anything else.
# (bank_account is not encoded here.)
positive_label_by_column = {
    'gender_of_respondent': 'Male',
    'location_type': 'Rural',
    'cellphone_access': 'Yes',
}
for column_name, positive_label in positive_label_by_column.items():
    test[column_name] = test[column_name].apply(
        lambda value, positive=positive_label: 1 if value == positive else 0
    )

# One-hot encoding, applied one column at a time in this fixed order.
dummy_prefix_by_column = {
    'relationship_with_head': 'rwh',
    'age_of_respondent': 'aor',
    'marital_status': 'ms',
    'job_type': 'jt',
    'country': 'c',
}
for column_name, dummy_prefix in dummy_prefix_by_column.items():
    test = pd.get_dummies(test, columns=[column_name], prefix=[dummy_prefix])

# Label encoding of the education level (values are cast to str first).
education_as_text = test['education_level'].astype(str)
test['education_level'] = LabelEncoder().fit_transform(education_as_text)


In [None]:
# #test['bank_account']=test['bank_account'].apply(lambda x: 1 if x=='Yes' else 0)
# test['gender_of_respondent']=test['gender_of_respondent'].apply(lambda x: 1 if x=='Male' else 0)
# test['location_type']=test['location_type'].apply(lambda x:1 if x=='Rural' else 0)
# test['cellphone_access']=test['cellphone_access'].apply(lambda x:1 if x=='Yes' else 0)
# #One Hot Encoding
# test = pd.get_dummies(test, columns=['relationship_with_head'], prefix = ['rwh'])
# test = pd.get_dummies(test, columns=['age_of_respondent'], prefix = ['aor'])
# #test = pd.get_dummies(test, columns=['year'], prefix = ['y'])
# test = pd.get_dummies(test, columns=['marital_status'], prefix = ['ms'])
# test = pd.get_dummies(test, columns=['job_type'], prefix = ['jt'])
# test = pd.get_dummies(test, columns=['country'], prefix = ['c'])
# #Label Encoder
# test['education_level'] = LabelEncoder().fit_transform(test['education_level'].astype(str))

In [None]:
# Replace household_size with its natural logarithm (overwrites the column in
# place, so re-running this cell applies the log a second time).
# NOTE(review): np.log yields -inf for 0 -- assumes household_size >= 1; TODO confirm.
test['household_size'] = np.log(test.household_size)

## Training Model

In [None]:
# Refit the 1500-tree XGBoost configuration on every labelled row (no hold-out)
# to produce the submission model.
# Note: this rebinds X_train / y_train, which previously held the hold-out split.
Y = df4['bank_account'].copy()
X = df4.drop(['uniqueid', 'bank_account', 'uid', 'year'], axis=1)

X_train = X[cols_selected_boruta]
# Y is a Series with one label per row, so it is used whole. Indexing it with
# the feature names (as `Y[cols_selected_boruta]` did) looks those names up in
# the row index and raises KeyError.
y_train = Y

model3 = xgb.XGBClassifier(learning_rate=0.01,
                           n_estimators=1500,
                           max_depth=4,
                           subsample=0.8,
                           colsample_bytree=1,
                           gamma=1)
model3.fit(X_train, y_train)


In [None]:
# Create a list of columns to be used for the predictions
wanted_test_columns = X_train.columns
wanted_test_columns

In [None]:
# Make a prediction using the XGBoost model on the wanted columns.
# The dummy columns of `test` were generated independently of the training
# frame, so a category that never occurs in the test file would have no column
# and plain selection would raise KeyError. Align on the training columns
# instead (same names, same order), zero-filling any column that is absent,
# and report which ones were filled so the gap is never silent.
missing_test_columns = [col for col in wanted_test_columns if col not in test.columns]
if missing_test_columns:
    print(f'Columns absent from test, filled with 0: {missing_test_columns}')

test_features = test.reindex(columns=wanted_test_columns, fill_value=0)
predictions = model3.predict(test_features)

In [None]:
submission = pd.DataFrame({'uid': test.uid, 'bank_account': predictions})

In [None]:
submission.dtypes

In [None]:
submission['bank_account'].value_counts()

In [None]:
def yesno(coluna):
    """Map a prediction to its label: "No" when it equals 0, "Yes" otherwise."""
    return "No" if coluna == 0 else "Yes"

In [None]:
submission['bank_account'] = submission['bank_account'].map(yesno)

In [None]:
submission.sample(100)

In [None]:
# Write the submission dataframe to a CSV file (without the index column)
# in the Kaggle working directory, ready to be submitted
submission.to_csv('/kaggle/working/submission.csv', index=False)
print('Submission CSV is ready!')