#Pre-processamento credit data

In [1]:
import pandas as pd
import numpy  as np

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
sns.set(palette = 'Accent_r', style = 'whitegrid')
plt.style.use('ggplot')

In [3]:
credito = pd.read_csv('credit_data.csv')

In [4]:
credito.head()

Unnamed: 0,i#clientid,income,age,loan,c#default
0,1,66155.925095,59.017015,8106.532131,0
1,2,34415.153966,48.117153,6564.745018,0
2,3,57317.170063,63.108049,8020.953296,0
3,4,42709.534201,45.751972,6103.64226,0
4,5,66952.688845,18.584336,8770.099235,1


In [5]:
a_rename = {
    'i#clientid':'ID',
    'income'    :'SALARIO',
    'age'       :'IDADE',
    'loan'      :'EMPRESTIMO',
    'c#default' :'APROVACAO'
}
credito.rename(columns = a_rename, inplace = True)

In [6]:
credito

Unnamed: 0,ID,SALARIO,IDADE,EMPRESTIMO,APROVACAO
0,1,66155.925095,59.017015,8106.532131,0
1,2,34415.153966,48.117153,6564.745018,0
2,3,57317.170063,63.108049,8020.953296,0
3,4,42709.534201,45.751972,6103.642260,0
4,5,66952.688845,18.584336,8770.099235,1
...,...,...,...,...,...
1995,1996,59221.044874,48.518179,1926.729397,0
1996,1997,69516.127573,23.162104,3503.176156,0
1997,1998,44311.449262,28.017167,5522.786693,1
1998,1999,43756.056605,63.971796,1622.722598,0


In [7]:
credito.describe()

Unnamed: 0,ID,SALARIO,IDADE,EMPRESTIMO,APROVACAO
count,2000.0,2000.0,1997.0,2000.0,2000.0
mean,1000.5,45331.600018,40.807559,4444.369695,0.1415
std,577.494589,14326.327119,13.624469,3045.410024,0.348624
min,1.0,20014.48947,-52.42328,1.37763,0.0
25%,500.75,32796.459717,28.990415,1939.708847,0.0
50%,1000.5,45789.117313,41.317159,3974.719419,0.0
75%,1500.25,57791.281668,52.58704,6432.410625,0.0
max,2000.0,69995.685578,63.971796,13766.051239,1.0


**O atributo idade tem valor mínimo negativo, ou seja, há idade negativa**

In [8]:
credito.loc[credito['IDADE'].isnull()]

Unnamed: 0,ID,SALARIO,IDADE,EMPRESTIMO,APROVACAO
28,29,59417.805406,,2082.625938,0
30,31,48528.852796,,6155.78467,0
31,32,23526.302555,,2862.010139,0


In [9]:
credito.loc[credito['IDADE'] < 0]

Unnamed: 0,ID,SALARIO,IDADE,EMPRESTIMO,APROVACAO
15,16,50501.726689,-28.218361,3977.287432,0
21,22,32197.620701,-52.42328,4244.057136,0
26,27,63287.038908,-36.496976,9595.286289,0


In [10]:
credito.loc[credito['IDADE'] < 0, 'IDADE'] = credito.loc[credito['IDADE'] > 0, 'IDADE'].mean()

## corrigindo valores nulos

In [11]:
#credito.loc[credito['IDADE'].isnull(), 'IDADE'] = credito.loc[credito['IDADE'].isnull() == True, 'IDADE'].mean()

In [12]:
credito.drop('ID', axis = 1, inplace = True)

In [13]:
X = credito.iloc[:, :-1]
y = credito.iloc[:, -1]

In [14]:
from sklearn.impute import SimpleImputer

In [15]:
imp = SimpleImputer(missing_values=np.nan, strategy='mean')
imp.fit(X)

SimpleImputer(add_indicator=False, copy=True, fill_value=None,
              missing_values=nan, strategy='mean', verbose=0)

In [16]:
X = imp.transform(X)

## Normalização

**Usado para algoritmos que usam distância euclidiana**

In [17]:
from sklearn.preprocessing import StandardScaler

In [18]:
scaler = StandardScaler()

In [19]:
scaled_X = scaler.fit_transform(X)

In [20]:
from sklearn.model_selection import train_test_split

In [21]:
np.random.seed(0)

X_train, X_test, y_train, y_test = train_test_split(scaled_X, y,
                                                    test_size = 0.20)

## Naive Bayes

In [22]:
from sklearn.naive_bayes import GaussianNB

### Parameters

**Priors : array-like of shape (n_classes,)**
> Prior probabilities of the classes. If specified the priors are not adjusted according to the data.

**var_smoothing : float, default=1e-9**
> Portion of the largest variance of all features that is added to variances for calculation stability.

In [23]:
nb = GaussianNB(priors = None, var_smoothing = 1e-09)

In [24]:
model = nb.fit(X_train, y_train)

In [25]:
from sklearn.metrics import mean_squared_log_error, confusion_matrix

In [26]:
pred = model.predict(X_test)

In [27]:
(1 - np.sqrt(mean_squared_log_error(y_test, pred)))*100

84.11800288669414

In [28]:
confusion_matrix(y_test, pred)

array([[342,   6],
       [ 15,  37]])

## Decision Tree Classifier

In [29]:
from sklearn.tree import DecisionTreeClassifier

In [30]:
mdl = DecisionTreeClassifier(criterion= 'gini', splitter = 'best',
                             max_depth = 2**6, min_samples_split = 3)

In [31]:
mdl.fit(X_train, y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=64, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=3,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [32]:
pred = model.predict(X_test)

In [33]:
(1 - np.sqrt(mean_squared_log_error(y_test, pred)))*100

84.11800288669414

In [34]:
confusion_matrix(y_test, pred)

array([[342,   6],
       [ 15,  37]])

## Random Forest Classifier

In [35]:
from sklearn.ensemble import RandomForestClassifier

In [36]:
mdl = RandomForestClassifier(n_jobs = -1, random_state = 0, n_estimators = 1000,
                           min_samples_leaf = 64, max_features = 'auto')

In [37]:
mdl.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=64, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=1000,
                       n_jobs=-1, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [38]:
pred = model.predict(X_test)

In [39]:
(1 - np.sqrt(mean_squared_log_error(y_test, pred)))*100

84.11800288669414

In [40]:
confusion_matrix(y_test, pred)

array([[342,   6],
       [ 15,  37]])

# Pré-processamento Census

**Se a renda da pessoa é maior ou menor que 50k por ano**

In [41]:
census = pd.read_csv('census.csv')

In [42]:
census.head()

Unnamed: 0,age,workclass,final-weight,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loos,hour-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [43]:
census.iloc[:, -1].unique()

array([' <=50K', ' >50K'], dtype=object)

## Encoding

In [44]:
!pip install category_encoders



In [45]:
X = census.iloc[:, :-1]
y = census.iloc[:, -1]

In [46]:
from sklearn.preprocessing import LabelEncoder

In [47]:
le = LabelEncoder()
y = le.fit_transform(y)

In [48]:
from category_encoders import OneHotEncoder

  import pandas.util.testing as tm


In [49]:
ohe = OneHotEncoder(cols = ['workclass', 'education', 'marital-status',
                            'occupation', 'relationship', 'race', 'sex',
                            'native-country'], 
                    use_cat_names = True,
                    drop_invariant = True)

In [50]:
X = ohe.fit_transform(X)

  elif pd.api.types.is_categorical(cols):


In [51]:
X.head()

Unnamed: 0,age,workclass_ State-gov,workclass_ Self-emp-not-inc,workclass_ Private,workclass_ Federal-gov,workclass_ Local-gov,workclass_ ?,workclass_ Self-emp-inc,workclass_ Without-pay,workclass_ Never-worked,final-weight,education_ Bachelors,education_ HS-grad,education_ 11th,education_ Masters,education_ 9th,education_ Some-college,education_ Assoc-acdm,education_ Assoc-voc,education_ 7th-8th,education_ Doctorate,education_ Prof-school,education_ 5th-6th,education_ 10th,education_ 1st-4th,education_ Preschool,education_ 12th,education-num,marital-status_ Never-married,marital-status_ Married-civ-spouse,marital-status_ Divorced,marital-status_ Married-spouse-absent,marital-status_ Separated,marital-status_ Married-AF-spouse,marital-status_ Widowed,occupation_ Adm-clerical,occupation_ Exec-managerial,occupation_ Handlers-cleaners,occupation_ Prof-specialty,occupation_ Other-service,...,native-country_ Cuba,native-country_ Jamaica,native-country_ India,native-country_ ?,native-country_ Mexico,native-country_ South,native-country_ Puerto-Rico,native-country_ Honduras,native-country_ England,native-country_ Canada,native-country_ Germany,native-country_ Iran,native-country_ Philippines,native-country_ Italy,native-country_ Poland,native-country_ Columbia,native-country_ Cambodia,native-country_ Thailand,native-country_ Ecuador,native-country_ Laos,native-country_ Taiwan,native-country_ Haiti,native-country_ Portugal,native-country_ Dominican-Republic,native-country_ El-Salvador,native-country_ France,native-country_ Guatemala,native-country_ China,native-country_ Japan,native-country_ Yugoslavia,native-country_ Peru,native-country_ Outlying-US(Guam-USVI-etc),native-country_ Scotland,native-country_ Trinadad&Tobago,native-country_ Greece,native-country_ Nicaragua,native-country_ Vietnam,native-country_ Hong,native-country_ Ireland,native-country_ Hungary
0,39,1,0,0,0,0,0,0,0,0,77516,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,13,1,0,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,50,0,1,0,0,0,0,0,0,0,83311,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,13,0,1,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,38,0,0,1,0,0,0,0,0,0,215646,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,9,0,0,1,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,53,0,0,1,0,0,0,0,0,0,234721,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,7,0,1,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,28,0,0,1,0,0,0,0,0,0,338409,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,13,0,1,0,0,0,0,0,0,0,0,1,0,...,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [52]:
scaled_X = scaler.fit_transform(X)

In [53]:
X_train, X_test, y_train, y_test = train_test_split(scaled_X, y,
                                                    test_size = 0.2)

### Parameters

**Priors : array-like of shape (n_classes,)**
> Prior probabilities of the classes. If specified the priors are not adjusted according to the data.

**var_smoothing : float, default=1e-9**
> Portion of the largest variance of all features that is added to variances for calculation stability.

In [54]:
nb = GaussianNB(priors = None, var_smoothing = 1e-09)

In [55]:
model = nb.fit(X_train, y_train)

In [56]:
pred = model.predict(X_test)

In [57]:
(1 - np.sqrt(mean_squared_log_error(y_test, pred)))*100

51.00590992987814

In [58]:
confusion_matrix(y_test, pred)

array([[1671, 3193],
       [  61, 1588]])

## Decision Tree Classifier

In [59]:
from sklearn.tree import DecisionTreeClassifier

In [60]:
mdl = DecisionTreeClassifier(criterion= 'gini', splitter = 'best',
                             max_depth = 2**6, min_samples_split = 3)

In [61]:
mdl.fit(X_train, y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=64, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=3,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [62]:
pred = model.predict(X_test)

In [63]:
(1 - np.sqrt(mean_squared_log_error(y_test, pred)))*100

51.00590992987814

In [64]:
confusion_matrix(y_test, pred)

array([[1671, 3193],
       [  61, 1588]])

## Random Forest Classifier

In [65]:
from sklearn.ensemble import RandomForestClassifier

In [66]:
mdl = RandomForestClassifier(n_jobs = -1, random_state = 0, n_estimators = 1000,
                           min_samples_leaf = 64, max_features = 'auto')

In [67]:
mdl.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=64, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=1000,
                       n_jobs=-1, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [68]:
pred = model.predict(X_test)

In [69]:
(1 - np.sqrt(mean_squared_log_error(y_test, pred)))*100

51.00590992987814

In [70]:
confusion_matrix(y_test, pred)

array([[1671, 3193],
       [  61, 1588]])