# Padronização (z-score) e k-NN

In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('CSVs/credit_data.csv')
df.dropna(inplace=True)
df.head()

Unnamed: 0,i#clientid,income,age,loan,c#default
0,1,66155.925095,59.017015,8106.532131,0
1,2,34415.153966,48.117153,6564.745018,0
2,3,57317.170063,63.108049,8020.953296,0
3,4,42709.534201,45.751972,6103.64226,0
4,5,66952.688845,18.584336,8770.099235,1


## Sem padronização

In [3]:
X = df.iloc[:, 1:4].values
X

array([[6.61559251e+04, 5.90170151e+01, 8.10653213e+03],
       [3.44151540e+04, 4.81171531e+01, 6.56474502e+03],
       [5.73171701e+04, 6.31080495e+01, 8.02095330e+03],
       ...,
       [4.43114493e+04, 2.80171669e+01, 5.52278669e+03],
       [4.37560566e+04, 6.39717958e+01, 1.62272260e+03],
       [6.94365796e+04, 5.61526170e+01, 7.37883360e+03]])

In [5]:
y = df.iloc[:,4].values
y

array([0, 0, 0, ..., 1, 0, 0], dtype=int64)

In [6]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                   stratify=y, random_state=42)

In [10]:
np.mean(X_train[0]), np.median(X_train[0]), np.std(X_train[0])

(19398.143639007838, 8088.568018568091, 21938.187117654743)

In [11]:
np.mean(X_test[0]), np.median(X_test[0]), np.std(X_test[0])

(7339.915692550724, 706.619738681604, 9845.813185631645)

In [12]:
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)

KNeighborsClassifier()

In [13]:
pred = knn.predict(X_test)
accuracy_score(y_test, pred)

0.8325

## Com padronização

In [14]:
from sklearn.preprocessing import StandardScaler

In [15]:
z_train = StandardScaler()
z_test = StandardScaler()

In [16]:
X_train_scaled = z_train.fit_transform(X_train)
X_test_scaled = z_test.fit_transform(X_test)

In [17]:
X_train_scaled, X_test_scaled

(array([[ 0.31748319, -0.56715816,  1.1817256 ],
        [-0.21627756,  1.26958957,  0.92983393],
        [-1.12984237, -0.90286849, -0.8793038 ],
        ...,
        [ 1.0741458 , -0.2692364 ,  1.88177064],
        [-0.31881781,  1.11035342, -1.12115003],
        [-0.49612137, -0.98363313, -1.35772901]]),
 array([[-1.59428322,  0.96147305, -1.25162245],
        [ 0.94250522,  1.15285924,  1.91822308],
        [-0.26530077, -1.47669699, -0.59139514],
        ...,
        [-0.98486486,  0.19927988, -1.16586816],
        [-0.55803147, -0.52968768,  0.75759667],
        [ 0.51062994,  0.2206557 , -0.71602283]]))

In [19]:
min(X_train_scaled[0]), max(X_train_scaled[0])

(-0.5671581637177425, 1.1817256030796386)

In [20]:
np.mean(X_train_scaled[0]), np.median(X_train_scaled[0]), np.std(X_train_scaled[0])

(0.31068354279402416, 0.3174831890201764, 0.7139949971035978)

In [21]:
np.mean(X_train_scaled), np.median(X_train_scaled), np.std(X_train_scaled)

(-8.631514761268594e-16, -0.036932302269182614, 1.0000000000000002)

In [22]:
np.mean(X_test_scaled), np.median(X_test_scaled), np.std(X_test_scaled)

(-4.825769413704014e-16, -0.03983560801094515, 1.0)

In [23]:
knn = KNeighborsClassifier()
knn.fit(X_train_scaled, y_train)

pred = knn.predict(X_test_scaled)
accuracy_score(y_test, pred)

0.975