In [None]:
from google.colab import drive
drive.mount('/gdrive', force_remount=True)

Mounted at /gdrive


In [1]:
import numpy as np
import pandas as pd
import matplotlib as plt

In [2]:
breast_cancer = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data', header = None)

In [3]:
breast_cancer.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,1000025,5,1,1,1,2,1,3,1,1,2
1,1002945,5,4,4,5,7,10,3,2,1,2
2,1015425,3,1,1,1,2,2,3,1,1,2
3,1016277,6,8,8,1,3,4,3,7,1,2
4,1017023,4,1,1,3,2,1,3,1,1,2


In [4]:
breast_cancer.columns = ['id_number', 'Clump_Thickness', 'Unif_Cell_Size', 'Unif_Cell_Shape','Marg_Adhesion', 
                         'Single_Epith_Cell_Size', 'Bare_Nuclei','Bland_Chromatin', 'Normal_Nucleoli', 'Mitoses', 'Class']

In [5]:
breast_cancer.head()

Unnamed: 0,id_number,Clump_Thickness,Unif_Cell_Size,Unif_Cell_Shape,Marg_Adhesion,Single_Epith_Cell_Size,Bare_Nuclei,Bland_Chromatin,Normal_Nucleoli,Mitoses,Class
0,1000025,5,1,1,1,2,1,3,1,1,2
1,1002945,5,4,4,5,7,10,3,2,1,2
2,1015425,3,1,1,1,2,2,3,1,1,2
3,1016277,6,8,8,1,3,4,3,7,1,2
4,1017023,4,1,1,3,2,1,3,1,1,2


In [6]:
breast_cancer.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 699 entries, 0 to 698
Data columns (total 11 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   id_number               699 non-null    int64 
 1   Clump_Thickness         699 non-null    int64 
 2   Unif_Cell_Size          699 non-null    int64 
 3   Unif_Cell_Shape         699 non-null    int64 
 4   Marg_Adhesion           699 non-null    int64 
 5   Single_Epith_Cell_Size  699 non-null    int64 
 6   Bare_Nuclei             699 non-null    object
 7   Bland_Chromatin         699 non-null    int64 
 8   Normal_Nucleoli         699 non-null    int64 
 9   Mitoses                 699 non-null    int64 
 10  Class                   699 non-null    int64 
dtypes: int64(10), object(1)
memory usage: 60.2+ KB


In [7]:
breast_cancer.Bare_Nuclei = breast_cancer.Bare_Nuclei.replace('?', np.NaN)

In [8]:
breast_cancer.isnull()

Unnamed: 0,id_number,Clump_Thickness,Unif_Cell_Size,Unif_Cell_Shape,Marg_Adhesion,Single_Epith_Cell_Size,Bare_Nuclei,Bland_Chromatin,Normal_Nucleoli,Mitoses,Class
0,False,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...
694,False,False,False,False,False,False,False,False,False,False,False
695,False,False,False,False,False,False,False,False,False,False,False
696,False,False,False,False,False,False,False,False,False,False,False
697,False,False,False,False,False,False,False,False,False,False,False


NaN 값의 갯수 세기

In [9]:
breast_cancer.isnull().values.sum()

16

최빈값으로 np.NaN을 채우기

In [10]:
breast_cancer.Bare_Nuclei = breast_cancer.Bare_Nuclei.fillna(breast_cancer.Bare_Nuclei.value_counts().index[0])

암인지 아닌지를 구별하는 코드 넣기 

In [11]:
breast_cancer['cancer_ind'] = 0 # 정상

In [12]:
breast_cancer.loc[breast_cancer['Class'] == 4, 'cancer_ind'] = 1 # 유방암 소견

불필요한 컬럼 제거

In [13]:
X_df = breast_cancer.drop(['id_number', 'Class', 'cancer_ind'], axis = 1)

In [14]:
y = breast_cancer.cancer_ind

### 데이터의 정규화

In [15]:
from sklearn.preprocessing import StandardScaler

In [16]:
scaler = StandardScaler()
X_scaled_df = scaler.fit_transform(X_df)

### 훈련 데이터와 테스트 데이터 나누기

In [17]:
from sklearn.model_selection import train_test_split

In [18]:
X_train, X_test, y_train, y_test = train_test_split(X_scaled_df, y, test_size=0.3, random_state = 42)

### Classifier 생성

In [19]:
from sklearn.neighbors import KNeighborsClassifier

In [20]:
knn = KNeighborsClassifier(n_neighbors=3)

In [21]:
knn.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=3, p=2,
                     weights='uniform')

### 성능 측정 - 분류를 위한 평가지표 사용

In [22]:
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score, roc_curve

In [23]:
y_pred = knn.predict(X_test)

In [24]:
confusion_matrix(y_test, y_pred)

array([[141,   2],
       [  3,  64]])

In [25]:
accuracy_score(y_test, y_pred)

0.9761904761904762

In [26]:
roc_auc_score(y_test, y_pred)

0.9706189333055005

### 최적의 이웃 개수(k)를 찾기 위한 작업

In [27]:
from sklearn.model_selection import GridSearchCV

In [28]:
grid_search = GridSearchCV(knn, {"n_neighbors":[1, 2, 3, 4, 5]}, n_jobs = -1, cv=7, scoring='roc_auc')

In [29]:
grid_search.fit(X_train, y_train)

GridSearchCV(cv=7, error_score=nan,
             estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30,
                                            metric='minkowski',
                                            metric_params=None, n_jobs=None,
                                            n_neighbors=3, p=2,
                                            weights='uniform'),
             iid='deprecated', n_jobs=-1,
             param_grid={'n_neighbors': [1, 2, 3, 4, 5]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='roc_auc', verbose=0)

In [30]:
grid_search.best_params_

{'n_neighbors': 4}

In [31]:
knn_best = grid_search.best_estimator_

In [32]:
y_pred = knn_best.predict(X_test)

In [33]:
confusion_matrix(y_test, y_pred)

array([[141,   2],
       [  6,  61]])

In [34]:
accuracy_score(y_test, y_pred)

0.9619047619047619

In [35]:
roc_auc_score(y_test, y_pred)

0.948230873604008