In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

np.random.seed(2022)

In [2]:
# Data Load
water = pd.read_csv("water_potability.csv")

In [3]:
data = water.drop(["Potability"], axis=1)
label = water["Potability"]

In [4]:
# Data EDA
data.describe()

Unnamed: 0,ph,Hardness,Solids,Chloramines,Sulfate,Conductivity,Organic_carbon,Trihalomethanes,Turbidity
count,2785.0,3276.0,3276.0,3276.0,2495.0,3276.0,3276.0,3114.0,3276.0
mean,7.080795,196.369496,22014.092526,7.122277,333.775777,426.205111,14.28497,66.396293,3.966786
std,1.59432,32.879761,8768.570828,1.583085,41.41684,80.824064,3.308162,16.175008,0.780382
min,0.0,47.432,320.942611,0.352,129.0,181.483754,2.2,0.738,1.45
25%,6.093092,176.850538,15666.690297,6.127421,307.699498,365.734414,12.065801,55.844536,3.439711
50%,7.036752,196.967627,20927.833607,7.130299,333.073546,421.884968,14.218338,66.622485,3.955028
75%,8.062066,216.667456,27332.762127,8.114887,359.95017,481.792304,16.557652,77.337473,4.50032
max,14.0,323.124,61227.196008,13.127,481.030642,753.34262,28.3,124.0,6.739


In [5]:
# 결측치 확인
data.isna()

Unnamed: 0,ph,Hardness,Solids,Chloramines,Sulfate,Conductivity,Organic_carbon,Trihalomethanes,Turbidity
0,True,False,False,False,False,False,False,False,False
1,False,False,False,False,True,False,False,False,False
2,False,False,False,False,True,False,False,False,False
3,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...
3271,False,False,False,False,False,False,False,False,False
3272,False,False,False,False,True,False,False,True,False
3273,False,False,False,False,True,False,False,False,False
3274,False,False,False,False,True,False,False,False,False


In [6]:
data.isna().sum()

ph                 491
Hardness             0
Solids               0
Chloramines          0
Sulfate            781
Conductivity         0
Organic_carbon       0
Trihalomethanes    162
Turbidity            0
dtype: int64

In [7]:
# Data Preprocess - 결측치 제거
# row 제거
data.isna().sum(axis=1)

0       1
1       1
2       1
3       0
4       0
       ..
3271    0
3272    2
3273    1
3274    1
3275    1
Length: 3276, dtype: int64

In [8]:
na_cnt = data.isna().sum(axis=1)
na_cnt

0       1
1       1
2       1
3       0
4       0
       ..
3271    0
3272    2
3273    1
3274    1
3275    1
Length: 3276, dtype: int64

In [9]:
drop_idx = na_cnt.loc[na_cnt > 0].index

In [10]:
drop_idx

Int64Index([   0,    1,    2,    8,   11,   13,   14,   16,   18,   20,
            ...
            3247, 3252, 3258, 3259, 3260, 3266, 3272, 3273, 3274, 3275],
           dtype='int64', length=1265)

In [11]:
drop_row = data.drop(drop_idx, axis=0)

In [12]:
drop_row.shape

(2011, 9)

In [13]:
data.shape

(3276, 9)

In [14]:
# column 제거
na_cnt = data.isna().sum()
drop_cols = na_cnt.loc[na_cnt > 0].index

In [15]:
drop_cols

Index(['ph', 'Sulfate', 'Trihalomethanes'], dtype='object')

In [16]:
data = data.drop(drop_cols, axis=1)

In [17]:
# Data Split
from sklearn.model_selection import train_test_split

train_data, test_data, train_label, test_label = train_test_split(
    data, label, train_size=0.7, random_state=2021
)

In [18]:
print(f"train_data size: {len(train_label)}, {len(train_label)/len(data):.2f}")
print(f"test_data size: {len(test_label)}, {len(test_label)/len(data):.2f}")

train_data size: 2293, 0.70
test_data size: 983, 0.30


In [19]:
# KNN
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier()

In [20]:
# Hyper Parameter tuning
# n_neighbors, p
from sklearn.model_selection import GridSearchCV

In [21]:
# 탐색 범위 지정
params = {
    "n_neighbors": [i for i in range(1, 12, 2)],
    "p": [1, 2]
}

In [22]:
params

{'n_neighbors': [1, 3, 5, 7, 9, 11], 'p': [1, 2]}

In [23]:
# 6*2*3 = 36개 모델 탐색
grid_cv = GridSearchCV(knn, param_grid=params, cv=3, n_jobs=-1)

In [24]:
grid_cv.fit(train_data, train_label)

GridSearchCV(cv=3, estimator=KNeighborsClassifier(), n_jobs=-1,
             param_grid={'n_neighbors': [1, 3, 5, 7, 9, 11], 'p': [1, 2]})

In [25]:
# 결과
print(f"Best score of paramter search is: {grid_cv.best_score_:.4f}")

Best score of paramter search is: 0.5652


In [26]:
grid_cv.best_params_

{'n_neighbors': 11, 'p': 1}

In [27]:
print("Best parameter of best score is")
print(f"\t n_neighbors: {grid_cv.best_params_['n_neighbors']}")
print(f"\t p: {grid_cv.best_params_['p']}")

Best parameter of best score is
	 n_neighbors: 11
	 p: 1


In [28]:
# 예측
train_pred = grid_cv.best_estimator_.predict(train_data)
test_pred = grid_cv.best_estimator_.predict(test_data)

In [29]:
# 평가
from sklearn.metrics import accuracy_score

train_acc = accuracy_score(train_label, train_pred)
test_acc = accuracy_score(test_label, test_pred)

In [30]:
print(f"train accuracy is {train_acc:.4f}")
print(f"test accuracy is {test_acc:.4f}")

train accuracy is 0.6520
test accuracy is 0.5595


In [31]:
# Data Scaling 시행
# KNN은 거리 기반 알고리즘이라 데이터의 크기에 영향을 받음
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

In [32]:
scaler.fit(train_data)

StandardScaler()

In [33]:
scaled_train_data = scaler.transform(train_data)
scaled_test_data = scaler.transform(test_data)

In [34]:
# 탐색
scaling_knn = KNeighborsClassifier()
scaling_grid_cv = GridSearchCV(scaling_knn, param_grid=params, n_jobs=-1)

In [35]:
scaling_grid_cv.fit(scaled_train_data, train_label)

GridSearchCV(estimator=KNeighborsClassifier(), n_jobs=-1,
             param_grid={'n_neighbors': [1, 3, 5, 7, 9, 11], 'p': [1, 2]})

In [36]:
scaling_grid_cv.best_score_

0.587011825593896

In [37]:
scaling_grid_cv.best_params_

{'n_neighbors': 9, 'p': 1}

In [38]:
# 평가
scaling_train_pred = scaling_grid_cv.best_estimator_.predict(scaled_train_data)
scaling_test_pred = scaling_grid_cv.best_estimator_.predict(scaled_test_data)

In [39]:
scaling_train_acc = accuracy_score(train_label, scaling_train_pred)
scaling_test_acc = accuracy_score(test_label, scaling_test_pred)

In [40]:
print(f"Scaled data train accuracy is {scaling_train_acc:.4f}")
print(f"Scaled data test accuracy is {scaling_test_acc:.4f}")

Scaled data train accuracy is 0.6829
Scaled data test accuracy is 0.5799


In [41]:
print(f"test accuracy is {test_acc:.4f}")
print(f"Scaled data test accuracy is {scaling_test_acc:.4f}")

test accuracy is 0.5595
Scaled data test accuracy is 0.5799


In [None]:
# 스케일링 시 성능이 더 좋아짐