In [2]:
import pandas as pd
import numpy as np

In [3]:
data = pd.read_csv('credit_data.csv')

In [4]:
data.head()

Unnamed: 0,i#clientid,income,age,loan,c#default
0,1,66155.925095,59.017015,8106.532131,0
1,2,34415.153966,48.117153,6564.745018,0
2,3,57317.170063,63.108049,8020.953296,0
3,4,42709.534201,45.751972,6103.64226,0
4,5,66952.688845,18.584336,8770.099235,1


In [5]:
data.columns

Index(['i#clientid', 'income', 'age', 'loan', 'c#default'], dtype='object')

In [6]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 5 columns):
i#clientid    2000 non-null int64
income        2000 non-null float64
age           1997 non-null float64
loan          2000 non-null float64
c#default     2000 non-null int64
dtypes: float64(3), int64(2)
memory usage: 78.2 KB


In [7]:
# Temos três valores faltosos na coluna "AGE".

In [8]:
media_idade = np.round(data[data['age'] > 0]['age'].mean())
print(media_idade)

41.0


In [9]:
data = data.fillna(value = int(media_idade))

In [10]:
data['age'].dtype
# Não faz muito sentido termos a idade como float, portanto verificaremos a existência de valores inconsistentes

dtype('float64')

In [11]:
np.array_equal(data.age,data.age.astype(int))
# A coluna age possui números inconsistentes. 

False

In [12]:
data['age'].describe()

count    2000.000000
mean       40.807848
std        13.614244
min       -52.423280
25%        29.012888
50%        41.300710
75%        52.582340
max        63.971796
Name: age, dtype: float64

In [13]:
# Enxergamos a presença de valores inconsistentes, pois a idade não pode/deve ser negativa

In [14]:
data.loc[data.age < 0, 'age'] = media_idade

In [15]:
X = data[['income','age','loan']]

In [16]:
Y = data['c#default']

In [48]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.utils import resample

In [18]:
scaler = StandardScaler()

In [19]:
X = scaler.fit_transform(X)

In [20]:
X.shape

(2000, 3)

In [21]:
Y.shape

(2000,)

In [58]:
 X_treino, x_teste, y_treino, y_teste = train_test_split(X,Y,random_state = 301, test_size = 0.1, stratify = Y)

In [25]:
from sklearn.naive_bayes import GaussianNB
# Modelo probabilístico que analisa dados históricos. 

In [26]:
classifier = GaussianNB()

In [29]:
classifier.fit(X_treino,y_treino)

GaussianNB()

In [31]:
predicts = classifier.predict(x_teste)

In [47]:
print(classification_report(predicts, y_teste))

              precision    recall  f1-score   support

           0       0.98      0.93      0.96       181
           1       0.57      0.84      0.68        19

    accuracy                           0.93       200
   macro avg       0.78      0.89      0.82       200
weighted avg       0.94      0.93      0.93       200



In [33]:
confusion_matrix(predicts, y_teste)

array([[169,  12],
       [  3,  16]])

In [36]:
# Vemos que está deturpado o modelo e que acertamos mal a classe 2, isso se deve pelo fato de termos um problema desbalanceado

In [34]:
Y.value_counts()

0    1717
1     283
Name: c#default, dtype: int64

In [70]:
downsampled_class_1 = resample(data[data['c#default'] == 1], n_samples = 283, random_state = 301)

In [71]:
full_balanced = pd.concat([downsampled_class_1, data[data['c#default'] == 0][:283]])

In [72]:
full_balanced.shape

(566, 5)

In [73]:
X_balanced = full_balanced[['income','age','loan']]
Y_balanced = full_balanced['c#default']

In [74]:
X_treino_bal, x_teste_bal, y_treino_bal, y_teste_bal = train_test_split(X_balanced, Y_balanced, random_state = 301, stratify = Y_balanced, test_size = 0.1)

In [75]:
classifier.fit(X_treino_bal, y_treino_bal)

GaussianNB()

In [76]:
bal_predicts = classifier.predict(x_teste_bal)

In [77]:
print(classification_report(bal_predicts, y_teste_bal))

              precision    recall  f1-score   support

           0       0.86      1.00      0.92        24
           1       1.00      0.88      0.94        33

    accuracy                           0.93        57
   macro avg       0.93      0.94      0.93        57
weighted avg       0.94      0.93      0.93        57



In [78]:
confusion_matrix(bal_predicts,y_teste_bal)

array([[24,  0],
       [ 4, 29]])