In [38]:
import pandas as pd
import numpy as np

In [39]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder, StandardScaler

In [40]:
# Load the raw user records; the path is resolved relative to the notebook's
# working directory.
CSV_PATH = "user_information.csv"
df = pd.read_csv(CSV_PATH)

In [41]:
# Single seed for the notebook; seeding NumPy's global RNG makes every step
# that draws from it repeatable when the cells are run in order.
SEED: int = 42
np.random.seed(seed=SEED)

# Adjusting DATA

In [42]:
# Encode conta_corrente as sequential integers 1..k, numbered in the order
# returned by value_counts() (most frequent category first by default).
# NOTE(review): the codes follow frequency rank, not an intrinsic order of the
# categories -- confirm this matches the intended "ordinal" encoding.
cc_types = df["conta_corrente"].value_counts().keys()
tr_cc = {category: rank for rank, category in enumerate(cc_types, start=1)}

df["conta_corrente"] = df["conta_corrente"].map(tr_cc)

In [43]:
# Boolean flag stored as an integer: "yes" -> 1, "no" -> 0.
# Any value missing from the mapping becomes NaN, as usual with Series.map.
yes_no_to_int = {"yes": 1, "no": 0}
df["trabalhador_estrangeiro"] = df["trabalhador_estrangeiro"].map(yes_no_to_int)

In [44]:
def telefone_to_bool(x):
    """Return 1 for any value other than the literal "none", else 0."""
    if x != "none":
        return 1
    return 0


# Turn telefone into a 0/1 indicator.
df["telefone"] = df["telefone"].apply(telefone_to_bool)

In [45]:
# Translate historico_credito into integers, imposing an order on the labels.
# Labels are listed from the highest code (5) down to the lowest (0); each
# label's code is its distance from the end of the tuple.
# NOTE(review): the last key is spelled 'ohter' -- confirm that the data really
# uses this spelling; labels absent from the mapping become NaN after map().
_history_labels = (
    'existing credits paid back duly till now',
    'all credits at this bank paid back duly',
    'no credits taken/ all credits paid back duly',
    'critical account/ other credits existing (not at this bank)',
    'delay in paying off in the past',
    'ohter',
)
credit_history = dict(
    zip(_history_labels, range(len(_history_labels) - 1, -1, -1))
)

df["historico_credito"] = df["historico_credito"].map(credit_history)

In [46]:
# Loan purposes kept as their own category (described as covering at least 5%
# of the cases); every other value is collapsed into "other".
purposes = ['domestic appliances', 'car (new)', 'radio/television', 'car (used)', 'business']
is_kept_purpose = df["proposito_emprestimo"].isin(purposes)
df["proposito_emprestimo"] = df["proposito_emprestimo"].where(is_kept_purpose, "other")

In [47]:
# Keep only the civil status, i.e. the text after the last space of
# sexo_est_civil; the sex part is not considered useful here. The source
# column is dropped once civil_status has been derived from it.
df = (
    df.assign(
        civil_status=lambda frame: frame["sexo_est_civil"].apply(
            lambda label: label.rsplit(" ", 1)[-1]
        )
    )
    .drop(columns="sexo_est_civil")
)

In [48]:
# reserva_cc is treated as ordinal, so each label is replaced by an integer
# level between 0 and len(dict_reserva) - 1.

dict_reserva = {'unknown/ no savings account': 0, 
                '... < 100 DM': 1,
                '100 <= ... < 500 DM':2,
                '500 <= ... < 1000 DM ': 3,
                '.. >= 1000 DM ': 4
                }   

# Two of the keys above end with a space while the other three do not, so an
# exact-match map() silently produces NaN whenever a label's padding differs
# from the key's. Match on whitespace-stripped text instead; dict_reserva
# itself is left untouched.
_reserva_by_label = {label.strip(): level for label, level in dict_reserva.items()}

df.reserva_cc = df.reserva_cc.str.strip().map(_reserva_by_label)

In [49]:
# Ordinal encoding of tempo_emprego_atual: each label's position in the tuple
# below (starting at 0) is its integer level.
_emprego_labels = (
    "unemployed",
    "... < 1 year",
    "1 <= ... < 4 years",
    "4 <= ... < 7 years",
    ".. >= 7 years",
)
dict_tempo_emprego_atual = {
    label: level for level, label in enumerate(_emprego_labels)
}

df["tempo_emprego_atual"] = df["tempo_emprego_atual"].map(dict_tempo_emprego_atual)

In [50]:
# "none" and "guarantor" are reported to behave similarly, so both are grouped
# into class 0; "co-applicant" is class 1. This turns outros_fiadores into a
# binary column.

dict_fiadores = dict.fromkeys(("none", "guarantor"), 0)
dict_fiadores["co-applicant"] = 1

df["outros_fiadores"] = df["outros_fiadores"].map(dict_fiadores)

In [51]:
# "bank" and "stores" are reported to behave similarly, so both are grouped
# into class 1 and "none" is class 0, making outros_planos_financiamento a
# binary column.
# The mapping has its own name: reusing dict_fiadores here would overwrite the
# unrelated guarantor mapping defined for outros_fiadores.

dict_planos_financiamento = {
                "bank": 1, 
                "stores": 1,
                "none": 0
                }

df.outros_planos_financiamento = df.outros_planos_financiamento.map(dict_planos_financiamento)

In [53]:
# Columns selected for one-hot encoding.
# NOTE(review): 'outros_fiadores' has already been mapped to 0/1 earlier in
# this notebook -- confirm it still needs to be one-hot encoded.
cols_one = [
    'outros_fiadores',
    'civil_status',
    'proposito_emprestimo',
    'status_emprego',
    'propriedade',
]

In [55]:
# Split the data into train and test sets.
# pop() removes the "default" target column from df in place, so what is left
# in df is the feature matrix. Running this cell twice on the same df raises
# KeyError because the column is already gone.
y = df.pop("default")

# Pin the shuffle to SEED explicitly. Without random_state the split draws from
# NumPy's global RNG, so it changes whenever this cell is re-run or anything
# else consumes random numbers after np.random.seed(SEED).
# NOTE(review): in a clean top-to-bottom run with no other draws from the
# global RNG this should reproduce the previous split -- confirm if earlier
# results must match exactly.
train_x, test_x, train_y, test_y = train_test_split(
    df, y, test_size=.25, random_state=SEED
)