## 5.9 Wykorzystanie KNN do klasyfikacji

In [1]:
import pandas as pd

In [2]:
# Load the reduced credit-card dataset and display it transposed,
# so that every original column is shown as a row.
credit_cards_df = pd.read_parquet(path="../data/credit-cards-reduced.parquet")
credit_cards_df.transpose()

ID,1,2,3,4,5,6,7,8,9,10,...,29991,29992,29993,29994,29995,29996,29997,29998,29999,30000
LIMIT_BAL,20000,120000,90000,50000,50000,50000,500000,100000,140000,20000,...,140000,210000,10000,100000,80000,220000,150000,30000,80000,50000
AGE,24,26,34,37,57,37,29,23,28,35,...,41,34,43,38,34,39,43,37,41,46
PAY_1,2,-1,0,0,-1,0,0,0,0,-2,...,0,3,0,0,2,0,-1,4,1,0
PAY_2,2,2,0,0,0,0,0,-1,0,-2,...,0,2,0,-1,2,0,-1,3,-1,0
PAY_3,-1,0,0,0,-1,0,0,-1,2,-2,...,0,2,0,-1,2,0,-1,2,0,0
PAY_4,-1,0,0,0,0,0,0,0,0,-2,...,0,2,-2,0,2,0,-1,-1,0,0
PAY_5,-2,0,0,0,0,0,0,0,0,-1,...,0,2,-2,0,2,0,0,0,0,0
PAY_6,-2,2,0,0,0,0,0,-1,0,-1,...,0,2,-2,0,2,0,0,0,-1,0
BILL_AMT1,3913,2682,29239,46990,8617,64400,367965,11876,11285,0,...,138325,2500,8802,3042,72557,188948,1683,3565,-1645,47929
BILL_AMT2,3102,1725,14027,48233,5670,57069,412023,380,14096,0,...,137142,2500,10400,1427,77708,192815,1828,3356,78379,48905


In [3]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

In [4]:
# Hold out 20% of the rows for final evaluation; the seed is fixed for reproducibility.
# NOTE: despite the "X_" prefix, both frames still contain the "DEFAULT" target column,
# which is separated from the features right before fitting / scoring.
X_train, X_test = train_test_split(
    credit_cards_df,
    random_state=2020,
    test_size=0.2,
)

In [5]:
# Grid search over the number of neighbours (k = 1..9) for a plain KNN classifier,
# using 5-fold cross-validation and F1 as the selection metric.
grid_cv = GridSearchCV(
    estimator=KNeighborsClassifier(),
    param_grid={"n_neighbors": range(1, 10)},
    scoring="f1",
    cv=5,
    n_jobs=6,
    verbose=1,
)

In [6]:
# Split the training frame into features and the "DEFAULT" target, then run the search.
train_features = X_train.drop(columns="DEFAULT")
train_target = X_train["DEFAULT"]
grid_cv.fit(train_features, train_target)

Fitting 5 folds for each of 9 candidates, totalling 45 fits


In [7]:
# Mean cross-validated score of the best candidate; the metric is F1
# because grid_cv was created with scoring="f1".
grid_cv.best_score_

np.float64(0.2984339551992302)

In [8]:
# Keep a handle on the winning KNN model. GridSearchCV refits the best
# candidate on the full data passed to fit() (its default refit=True),
# so `knn` can be used for prediction directly.
knn = grid_cv.best_estimator_
knn

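KNN opiera się na funkcji odległości, która traktuje wszystkie cechy jednakowo, niezależnie od ich skali. Dlatego kolumny o dużych wartościach (np. `LIMIT_BAL`, rzędu dziesiątek i setek tysięcy) dominują w obliczanej odległości nad kolumnami o małych wartościach (np. `PAY_1`, rzędu kilku jednostek). Na potrzeby KNN powinniśmy zeskalować zmienne, aby uniknąć tego zjawiska.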

In [9]:
from sklearn.metrics import f1_score

In [10]:
# F1 of the tuned KNN (fitted on unscaled features) on the held-out rows.
test_features = X_test.drop(columns="DEFAULT")
test_target = X_test["DEFAULT"]
f1_score(y_true=test_target, y_pred=knn.predict(test_features))

np.float64(0.31594634873323396)

In [12]:
# Per-column summary statistics, transposed so that each column of the
# dataset becomes one row (makes the differences in scale easy to compare).
credit_cards_df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
LIMIT_BAL,30000.0,167484.322667,129747.661567,10000.0,50000.0,140000.0,240000.0,1000000.0
AGE,30000.0,35.4855,9.217904,21.0,28.0,34.0,41.0,79.0
PAY_1,30000.0,-0.0167,1.123802,-2.0,-1.0,0.0,0.0,8.0
PAY_2,30000.0,-0.133767,1.197186,-2.0,-1.0,0.0,0.0,8.0
PAY_3,30000.0,-0.1662,1.196868,-2.0,-1.0,0.0,0.0,8.0
PAY_4,30000.0,-0.220667,1.169139,-2.0,-1.0,0.0,0.0,8.0
PAY_5,30000.0,-0.2662,1.133187,-2.0,-1.0,0.0,0.0,8.0
PAY_6,30000.0,-0.2911,1.149988,-2.0,-1.0,0.0,0.0,8.0
BILL_AMT1,30000.0,51223.3309,73635.860576,-165580.0,3558.75,22381.5,67091.0,964511.0
BILL_AMT2,30000.0,49179.075167,71173.768783,-69777.0,2984.75,21200.0,64006.25,983931.0


In [14]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [15]:
# Tune k for KNN on standardized features.
#
# The scaler is placed INSIDE the estimator that GridSearchCV clones for every
# fold, so each fold's StandardScaler is fitted only on that fold's training
# part. Running the grid search as a step *after* the scaler would standardize
# with statistics computed on the whole training set, leaking information from
# the validation folds into the cross-validation scores used to pick k.
#
# `pipeline` still supports fit() / predict(): with refit=True the best
# scaler + KNN pipeline is refitted on all data passed to fit() and is
# available as `pipeline.best_estimator_`.
pipeline = GridSearchCV(
    Pipeline(steps=[
        ("scaler", StandardScaler()),
        ("classifier", KNeighborsClassifier()),
    ]),
    param_grid={
        # "<step name>__<parameter>" addresses a parameter of a pipeline step.
        "classifier__n_neighbors": range(1, 10)
    }, cv=5, scoring="f1", verbose=1, n_jobs=6, refit=True)

In [16]:
# Fit the scaling + KNN model selection on the training rows;
# the "DEFAULT" target column is split off from the features first.
train_features = X_train.drop(columns="DEFAULT")
train_target = X_train["DEFAULT"]
pipeline.fit(train_features, train_target)

Fitting 5 folds for each of 9 candidates, totalling 45 fits


In [17]:
# F1 of the fitted `pipeline` (scaling + KNN) on the held-out rows.
test_features = X_test.drop(columns="DEFAULT")
test_target = X_test["DEFAULT"]
f1_score(y_true=test_target, y_pred=pipeline.predict(test_features))

np.float64(0.4510079699953118)