### sklearn.neighbors.KNeighborsClassifier : 레이블(y) 범주형, 분류
### sklear.neighbors.KNeighborsRegressor : 레이블(y) 숫자, 회귀
#### 각 데이터간 거리를 측정하여 가까운 K개의 다른 데이터 레이블을 참조하여 분류
##### 최적의 K : 3 ~ 10 (일반적)
##### 데이터 수의 제곱근 값을 K로 정하는 방법도 존재
#### 주요 Hyperparameter
- n_neighbors : 기본값 5로, 1에 가까울수록 과대적합, 클수록 과소적합

##### KNeighborsClassifier / KNeighborsRegressor(n_neighbors=5, *, weights='uniform', algorithm='auto', leaf_size=30, p=2, metric='minkowski', metric_params=None, n_jobs=None, **kwargs)

# 분석 코드 - Classification

In [1]:
# 라이브러리 및 데이터 로드
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import randint
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.metrics import classification_report, confusion_matrix
df = pd.read_csv('../input/big-data-certification-study/breast-cancer-wisconsin.csv', encoding='utf-8')
df.head(), df.info(), df.shape

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 683 entries, 0 to 682
Data columns (total 11 columns):
 #   Column                       Non-Null Count  Dtype
---  ------                       --------------  -----
 0   code                         683 non-null    int64
 1   Clump_Thickness              683 non-null    int64
 2   Cell_Size                    683 non-null    int64
 3   Cell_Shape                   683 non-null    int64
 4   Marginal_Adhesion            683 non-null    int64
 5   Single_Epithelial_Cell_Size  683 non-null    int64
 6   Bare_Nuclei                  683 non-null    int64
 7   Bland_Chromatin              683 non-null    int64
 8   Normal_Nucleoli              683 non-null    int64
 9   Mitoses                      683 non-null    int64
 10  Class                        683 non-null    int64
dtypes: int64(11)
memory usage: 58.8 KB


(      code  Clump_Thickness  Cell_Size  Cell_Shape  Marginal_Adhesion  \
 0  1000025                5          1           1                  1   
 1  1002945                5          4           4                  5   
 2  1015425                3          1           1                  1   
 3  1016277                6          8           8                  1   
 4  1017023                4          1           1                  3   
 
    Single_Epithelial_Cell_Size  Bare_Nuclei  Bland_Chromatin  Normal_Nucleoli  \
 0                            2            1                3                1   
 1                            7           10                3                2   
 2                            2            2                3                1   
 3                            3            4                3                7   
 4                            2            1                3                1   
 
    Mitoses  Class  
 0        1      0  
 1        1      0

In [2]:
# 데이터 분할
X = df.drop(columns=['code', 'Class'])
y = df[['Class']]
X.shape, y.shape

((683, 9), (683, 1))

In [3]:
X_train, X_test, y_train, y_test=train_test_split(X,y,stratify=y,random_state=42)

In [4]:
# 정규화
scaler = MinMaxScaler()
scaler.fit(X_train)
mm_X_train=scaler.transform(X_train)
mm_X_test=scaler.transform(X_test)

In [5]:
# 모델 적용
model = KNeighborsClassifier()
model.fit(mm_X_train, y_train)
pred_train=model.predict(mm_X_train)
print(model.score(mm_X_train, y_train))

0.984375


In [6]:
# 혼동행렬 확인
cm_train=confusion_matrix(y_train, pred_train)
print('Train Data C.M. :\n', cm_train)

# 정상(0) 중 2명 오분류, 환자(1) 중 6명 오분류

Train Data C.M. :
 [[331   2]
 [  6 173]]


In [7]:
# 분류예측 보고서
cfr_train=classification_report(y_train,pred_train)
print('Train Data C.F.R. :\n', cfr_train)

Train Data C.F.R. :
               precision    recall  f1-score   support

           0       0.98      0.99      0.99       333
           1       0.99      0.97      0.98       179

    accuracy                           0.98       512
   macro avg       0.99      0.98      0.98       512
weighted avg       0.98      0.98      0.98       512



In [8]:
# Test Data 적용
pred_test = model.predict(mm_X_test)
print(model.score(mm_X_test, y_test))

0.9532163742690059


In [9]:
# 혼동행렬 확인
cm_test = confusion_matrix(y_test, pred_test)
print("Test Data C.M. :\n", cm_test)

# 정상 중 5명 오분류, 환자 중 3명 오분류

Test Data C.M. :
 [[106   5]
 [  3  57]]


In [10]:
# 분류예측 보고서 확인
cfr_test=classification_report(y_test, pred_test)
print('Test Data C.F.R. :\n', cfr_test)

Test Data C.F.R. :
               precision    recall  f1-score   support

           0       0.97      0.95      0.96       111
           1       0.92      0.95      0.93        60

    accuracy                           0.95       171
   macro avg       0.95      0.95      0.95       171
weighted avg       0.95      0.95      0.95       171



In [11]:
# Hyperparameter Tuning
# Grid Search
param_g={'n_neighbors': [x for x in range(1, 12, 2)]}
g_search=GridSearchCV(KNeighborsClassifier(),
                      param_g, cv=5,
                      return_train_score=True)
g_search.fit(mm_X_train, y_train)

GridSearchCV(cv=5, estimator=KNeighborsClassifier(),
             param_grid={'n_neighbors': [1, 3, 5, 7, 9, 11]},
             return_train_score=True)

In [12]:
print('Best Parameter :', g_search.best_params_)
print('Best Score :', round(g_search.best_score_, 4))
print('Test Set Score :', round(g_search.score(mm_X_test, y_test), 4))

Best Parameter : {'n_neighbors': 3}
Best Score : 0.9824
Test Set Score : 0.9532


In [13]:
# Randomized Search
param_r={'n_neighbors':randint(low=1, high=20)}
r_search=RandomizedSearchCV(KNeighborsClassifier(),
                            param_distributions=param_r,
                            cv=5, n_iter=20,
                            return_train_score=True)
r_search.fit(mm_X_train, y_train)

RandomizedSearchCV(cv=5, estimator=KNeighborsClassifier(), n_iter=20,
                   param_distributions={'n_neighbors': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7fa1a1a12190>},
                   return_train_score=True)

In [14]:
print('Best Parameter :', r_search.best_params_)
print('Best Score :', round(r_search.best_score_, 4))
print('Test Set Score :', round(r_search.score(mm_X_test, y_test), 4))

Best Parameter : {'n_neighbors': 9}
Best Score : 0.9726
Test Set Score : 0.9532


# 분석 코드 - Regression

In [15]:
# 데이터 로드
df2 = pd.read_csv('../input/big-data-certification-study/house_price.csv', encoding='utf-8')
df2.head()

Unnamed: 0,housing_age,income,bedrooms,households,rooms,house_value
0,23,6.777,0.141112,2.442244,8.10396,500000
1,49,6.0199,0.160984,2.726688,5.752412,500000
2,35,5.1155,0.249061,1.902676,3.888078,500000
3,32,4.7109,0.231383,1.913669,4.508393,500000
4,21,4.5625,0.255583,3.092664,4.667954,500000


In [16]:
# 데이터 분류
X=df2.drop(columns=['house_value'])
y=df2[['house_value']]
X.shape, y.shape

((17689, 5), (17689, 1))

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=42)

In [18]:
# 정규화 - Min Max Scaling
scaler.fit(X_train)
ms_X_train = scaler.transform(X_train)
ms_X_test = scaler.transform(X_test)

In [19]:
# 모델 적용
model = KNeighborsRegressor()
model.fit(ms_X_train, y_train)
pred_train=model.predict(ms_X_train)
model.score(ms_X_train, y_train)

0.7227068233803273

In [20]:
pred_test=model.predict(ms_X_test)
model.score(ms_X_test, y_test)

0.5910613917846521

In [21]:
# RMSE (Root Mean Squared Error)
# 회귀모델 평가지표
from sklearn.metrics import mean_squared_error
mse_train=mean_squared_error(y_train, pred_train)
mse_test=mean_squared_error(y_test, pred_test)
print('Train Data RMSE :', np.sqrt(mse_train))
print('Test  Data RMSE :', np.sqrt(mse_test))

Train Data RMSE : 50259.78603945866
Test  Data RMSE : 61135.22729166754


In [22]:
# Hyperparameter Tuning
# Grid Search
param_grid ={'n_neighbors':[x for x in range(1, 12, 2)]}
grid_search=GridSearchCV(KNeighborsRegressor(),
                         param_grid, cv=5,
                         return_train_score=True)
grid_search.fit(ms_X_train, y_train)

GridSearchCV(cv=5, estimator=KNeighborsRegressor(),
             param_grid={'n_neighbors': [1, 3, 5, 7, 9, 11]},
             return_train_score=True)

In [23]:
print('Best Parameter :', grid_search.best_params_)
print('Best Score :', round(grid_search.best_score_, 4))
print('Test Set Score :', round(grid_search.score(ms_X_test, y_test), 4))

Best Parameter : {'n_neighbors': 11}
Best Score : 0.6137
Test Set Score : 0.6195


In [24]:
# Randomized Search
r_param={'n_neighbors':randint(low=1, high=20)}
search_r=RandomizedSearchCV(KNeighborsRegressor(),
                            param_distributions=r_param,
                            cv=5, n_iter=20,
                            return_train_score=True)
search_r.fit(ms_X_train, y_train)

RandomizedSearchCV(cv=5, estimator=KNeighborsRegressor(), n_iter=20,
                   param_distributions={'n_neighbors': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7fa1a197d710>},
                   return_train_score=True)

In [25]:
print('Best Parameter :', search_r.best_params_)
print('Best Score :', round(search_r.best_score_, 4))
print('Test Set Score :', round(search_r.score(ms_X_test, y_test), 4))

Best Parameter : {'n_neighbors': 19}
Best Score : 0.6191
Test Set Score : 0.6275
