## □ kNN 머신러닝 알고리즘을 이용하여 유방암 데이터 분류 데이터 분석
    1단계 : 데이터 수집
    2단계 : 데이터 탐색과 준비
    3단계 : 데이터 모델로 훈련
    4단계 : 모델 성능 평가
    5단계 : 모델 성능개선

### ■ 1단계 : 데이터 수집
    569개의 진단 데이터셋이며 암 조직 검사 예시
    32개의 column(id, diagnosis, 30개의 수치형 특징)을 갖고 있으며, 수치형 특징은 디지털 이미지에 존재하는 세포핵의 특성을 나타냄
        반지름, 질감, 둘레, 넓이, 매끄러움, 조밀성, 오목함, 오목점, 대칭성, 프랙탈 차원
        정답(라벨) : diagnosis(진단) 양성(B) / 악성(M)

### ■ 2단계: 데이터 탐색과 준비

In [1]:
# Step 1: load the data.
# Text columns are kept as character vectors rather than factors.
# TRUE is spelled out because the shorthand T is an ordinary variable that
# can be reassigned.
wbcd <- read.csv("wisc_bc_data.csv", header = TRUE, stringsAsFactors = FALSE)
# Number of rows per diagnosis code.
table(wbcd$diagnosis)


  B   M 
357 212 

    1. 정답에 해당하는 라벨 column의 데이터 분포를 막대그래프로 시각화
        악성 데이터와 양성 데이터가 50:50으로 분포되어있는 것이 가장 이상적이나 보통은 그렇지 않기 때문에 데이터를 맞춰줄 필요가 있다
        (모델의 정확도가 낮을 때 고려해볼 필요가 있음)
        
    2. 수치형 변수 데이터의 분포를 파악
        - 히스토그램 / 밀도 그래프로 정규분포에 가까운지 확인
        
    3. 이상치가 있는지 확인해 볼 필요
        - 상자 그림(boxplot)으로 사분위수 범위를 벗어나는 값 확인
    
    4. 결측치가 많은 column이 무엇인지 확인
        - 결측치를 다른 값으로 치환하거나 삭제 → 파생변수 생성
        
    

In [None]:
# Step 2: convert diagnosis into a factor.
# The codes "B" and "M" become the labelled levels "Benign" and "Malignant".
wbcd$diagnosis <- factor(
  wbcd$diagnosis,
  levels = c("B", "M"),
  labels = c("Benign", "Malignant")
)

In [2]:
# Number of missing values in each column of wbcd.
vapply(wbcd, function(column) sum(is.na(column)), numeric(1))

In [3]:
# List the column names of wbcd.
names(wbcd)

In [4]:
library(outliers)
# Flag outliers in a numeric vector by applying grubbs.test() repeatedly.
#
# Each round tests the values that have not been flagged yet. While the
# reported p-value is below 0.05, the value named in the test's alternative
# text is recorded as an outlier and the test is run again without it.
#
# Args:
#   x: numeric vector to scan.
#
# Returns:
#   A data.frame with columns X (the input values) and Outlier (TRUE where the
#   value was flagged).
grubbs.flag <- function(x) {
  outliers <- NULL
  test <- x
  grubbs.result <- grubbs.test(test)
  pv <- grubbs.result$p.value
  # The NA guard ends the loop instead of failing with "missing value where
  # TRUE/FALSE needed" when the p-value is NA/NaN (e.g. a constant column).
  while (!is.na(pv) && pv < 0.05) {
    # The third space-separated token of the alternative text is taken as the
    # flagged value (presumably "<lowest|highest> value <v> is an outlier" --
    # confirm against the outliers package).
    flagged <- as.numeric(strsplit(grubbs.result$alternative, " ")[[1]][3])
    outliers <- c(outliers, flagged)
    remaining <- x[!x %in% outliers]
    # If the parsed value matched nothing, the same test would repeat forever.
    if (length(remaining) == length(test)) {
      break
    }
    test <- remaining
    grubbs.result <- grubbs.test(test)
    pv <- grubbs.result$p.value
  }
  data.frame(X = x, Outlier = (x %in% outliers))
}

### ※ 문제222. wbcd의 radius_mean에 이상치가 몇 개인지 확인하시오(TRUE가 몇 개인지)

In [17]:
# Run the repeated Grubbs check on radius_mean and show only the flagged rows.
rs <- grubbs.flag(wbcd$radius_mean)
rs[rs$Outlier, ]

Unnamed: 0,X,Outlier
166,27.22,True
276,28.11,True
461,27.42,True


### ※ 문제223. wbcd의 모든 column에 이상치가 각각 몇 개 있는지 아래와 같이 출력되게 하시오

In [20]:
# Print "<column> --> <number of flagged outliers>" for each scanned column.
# The scan starts at column 4, so the first three columns are not included.
feature_names <- colnames(wbcd)
for (idx in 4:length(feature_names)) {
  flagged <- grubbs.flag(wbcd[[feature_names[idx]]])
  n_outliers <- sum(flagged$Outlier)
  print(paste(feature_names[idx], "-->", n_outliers))
}

[1] "texture_mean --> 1"
[1] "perimeter_mean --> 3"
[1] "area_mean --> 6"
[1] "smoothness_mean --> 1"
[1] "compactness_mean --> 2"
[1] "concavity_mean --> 4"
[1] "points_mean --> 1"
[1] "symmetry_mean --> 2"
[1] "dimension_mean --> 6"
[1] "radius_se --> 7"
[1] "texture_se --> 5"
[1] "perimeter_se --> 14"
[1] "area_se --> 14"
[1] "smoothness_se --> 7"
[1] "compactness_se --> 12"
[1] "concavity_se --> 10"
[1] "points_se --> 6"
[1] "symmetry_se --> 13"
[1] "dimension_se --> 17"
[1] "radius_worst --> 1"
[1] "texture_worst --> 1"
[1] "perimeter_worst --> 1"
[1] "area_worst --> 8"
[1] "smoothness_worst --> 2"
[1] "compactness_worst --> 6"
[1] "concavity_worst --> 3"
[1] "points_worst --> 0"
[1] "symmetry_worst --> 5"
[1] "dimension_worst --> 3"


In [22]:
# Percentage of rows in each diagnosis class, rounded to one decimal place.
# The argument is spelled `digits` in full rather than relying on partial
# argument matching.
round(prop.table(table(wbcd$diagnosis)) * 100, digits = 1)


   B    M 
62.7 37.3 

In [24]:
# Step 3: shuffle the rows.
# Fixing the RNG seed makes the shuffle, and therefore the train/test split
# taken from it later, reproducible between runs.
set.seed(1)
# sample(nrow(wbcd)) is a random permutation of the row indices
# (for example, wbcd[sample(10), ] returns rows 1-10 in random order).
wbcd_shuffle <- wbcd[sample(nrow(wbcd)), ]
wbcd_shuffle

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,points_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,points_worst,symmetry_worst,dimension_worst
308,9047,B,12.940,16.17,83.18,507.6,0.09879,0.08836,0.032960,0.023900,...,13.860,23.02,89.69,580.9,0.11720,0.19580,0.18100,0.08388,0.3297,0.07834
209,893526,B,13.500,12.71,85.69,566.2,0.07376,0.03614,0.002758,0.004419,...,14.970,16.94,95.48,698.7,0.09023,0.05836,0.01379,0.02210,0.2267,0.06192
191,869104,M,16.110,18.05,105.10,813.0,0.09721,0.11370,0.094470,0.059430,...,19.920,25.27,129.00,1233.0,0.13140,0.22360,0.28020,0.12160,0.2792,0.08158
341,905189,B,16.140,14.86,104.30,800.0,0.09495,0.08501,0.055000,0.045280,...,17.710,19.58,115.90,947.9,0.12060,0.17220,0.23100,0.11290,0.2778,0.07012
534,87281702,M,16.460,20.11,109.30,832.9,0.09831,0.15560,0.179300,0.088660,...,17.790,28.45,123.50,981.2,0.14150,0.46670,0.58620,0.20350,0.3054,0.09519
370,886226,M,19.450,19.33,126.50,1169.0,0.10350,0.11880,0.137900,0.085910,...,25.700,24.57,163.10,1972.0,0.14970,0.31610,0.43170,0.19990,0.3379,0.08950
278,926954,M,16.600,28.08,108.30,858.1,0.08455,0.10230,0.092510,0.053020,...,18.980,34.12,126.70,1124.0,0.11390,0.30940,0.34030,0.14180,0.2218,0.07820
367,8711202,M,17.680,20.74,117.40,963.7,0.11150,0.16650,0.185500,0.105400,...,20.470,25.11,132.90,1302.0,0.14180,0.34980,0.35830,0.15150,0.2463,0.07738
109,923465,B,10.820,24.21,68.89,361.6,0.08192,0.06602,0.015480,0.008160,...,13.030,31.45,83.90,505.6,0.12040,0.16330,0.06194,0.03264,0.3059,0.07626
533,90769602,B,12.720,17.67,80.98,501.3,0.07896,0.04522,0.014020,0.018350,...,13.820,20.96,88.87,586.8,0.10680,0.09605,0.03469,0.03612,0.2165,0.06025


In [26]:
# Step 4: drop the first column (id) from the shuffled data.
wbcd2 <- wbcd_shuffle[, -1, drop = FALSE]
head(wbcd2)

Unnamed: 0,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,points_mean,symmetry_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,points_worst,symmetry_worst,dimension_worst
308,B,12.94,16.17,83.18,507.6,0.09879,0.08836,0.03296,0.0239,0.1735,...,13.86,23.02,89.69,580.9,0.1172,0.1958,0.181,0.08388,0.3297,0.07834
209,B,13.5,12.71,85.69,566.2,0.07376,0.03614,0.002758,0.004419,0.1365,...,14.97,16.94,95.48,698.7,0.09023,0.05836,0.01379,0.0221,0.2267,0.06192
191,M,16.11,18.05,105.1,813.0,0.09721,0.1137,0.09447,0.05943,0.1861,...,19.92,25.27,129.0,1233.0,0.1314,0.2236,0.2802,0.1216,0.2792,0.08158
341,B,16.14,14.86,104.3,800.0,0.09495,0.08501,0.055,0.04528,0.1735,...,17.71,19.58,115.9,947.9,0.1206,0.1722,0.231,0.1129,0.2778,0.07012
534,M,16.46,20.11,109.3,832.9,0.09831,0.1556,0.1793,0.08866,0.1794,...,17.79,28.45,123.5,981.2,0.1415,0.4667,0.5862,0.2035,0.3054,0.09519
370,M,19.45,19.33,126.5,1169.0,0.1035,0.1188,0.1379,0.08591,0.1776,...,25.7,24.57,163.1,1972.0,0.1497,0.3161,0.4317,0.1999,0.3379,0.0895


In [29]:
# Step 5: normalise the data.
#
# Min-max rescale a numeric vector so its smallest value maps to 0 and its
# largest to 1.
#
# Args:
#   x: numeric vector.
#
# Returns:
#   Numeric vector of the same length as x. NA entries stay NA and are ignored
#   when finding the minimum and maximum. A vector whose non-NA values are all
#   equal maps to 0 instead of NaN (0 / 0).
normalize <- function(x) {
  # Nothing to rescale: avoid min()/max() warnings on empty or all-NA input.
  if (length(x) == 0 || all(is.na(x))) {
    return(rep(NA_real_, length(x)))
  }
  lo <- min(x, na.rm = TRUE)
  span <- max(x, na.rm = TRUE) - lo
  if (span == 0) {
    scaled <- rep(0, length(x))
    scaled[is.na(x)] <- NA
    return(scaled)
  }
  (x - lo) / span
}
# Bring columns measured in different units onto a common 0-1 scale.
# Column 1 of wbcd2 is the label; every remaining column is normalised, so no
# column count is hard-coded.
wbcd_n <- as.data.frame(lapply(wbcd2[-1], normalize))
summary(wbcd_n)

  radius_mean      texture_mean    perimeter_mean     area_mean     
 Min.   :0.0000   Min.   :0.0000   Min.   :0.0000   Min.   :0.0000  
 1st Qu.:0.2233   1st Qu.:0.2185   1st Qu.:0.2168   1st Qu.:0.1174  
 Median :0.3024   Median :0.3088   Median :0.2933   Median :0.1729  
 Mean   :0.3382   Mean   :0.3240   Mean   :0.3329   Mean   :0.2169  
 3rd Qu.:0.4164   3rd Qu.:0.4089   3rd Qu.:0.4168   3rd Qu.:0.2711  
 Max.   :1.0000   Max.   :1.0000   Max.   :1.0000   Max.   :1.0000  
 smoothness_mean  compactness_mean concavity_mean     points_mean    
 Min.   :0.0000   Min.   :0.0000   Min.   :0.00000   Min.   :0.0000  
 1st Qu.:0.3046   1st Qu.:0.1397   1st Qu.:0.06926   1st Qu.:0.1009  
 Median :0.3904   Median :0.2247   Median :0.14419   Median :0.1665  
 Mean   :0.3948   Mean   :0.2606   Mean   :0.20806   Mean   :0.2431  
 3rd Qu.:0.4755   3rd Qu.:0.3405   3rd Qu.:0.30623   3rd Qu.:0.3678  
 Max.   :1.0000   Max.   :1.0000   Max.   :1.00000   Max.   :1.0000  
 symmetry_mean    dimension

In [41]:
# Step 6: split into train and test sets at a 9:1 ratio.
# The first train_num rows form the training set; the rest form the test set.
train_num <- round(nrow(wbcd_n) * 0.9, 0)
wbcd_train <- head(wbcd_n, train_num)
wbcd_test <- tail(wbcd_n, -train_num)

In [42]:
# Step 7: take the labels (column 1 of wbcd2) for the same train/test rows.
wbcd_train_labels <- head(wbcd2[[1]], train_num)
wbcd_test_labels <- tail(wbcd2[[1]], -train_num)

In [43]:
# Load the 'class' package. It is installed only when missing, so re-running
# this cell does not attempt a reinstall of a package that is already in use.
if (!requireNamespace("class", quietly = TRUE)) {
  install.packages("class")
}
library(class)

Installing package into 'C:/Users/knitwill/Documents/R/win-library/3.6'
(as 'lib' is unspecified)
"package 'class' is in use and will not be installed"

### ■ 3단계: 데이터로 모델 훈련

In [None]:
# Step 8: classify the test rows with kNN (k = 21) using the training rows
# and their labels; rs holds the predicted label for each test row.
rs <- knn(
  train = wbcd_train,
  test = wbcd_test,
  cl = wbcd_train_labels,
  k = 21
)

### ■ 4단계: 모델 성능평가

In [52]:
# Cross-tabulate the actual test labels against the k = 21 predictions.
x <- data.frame(
  실제 = wbcd_test_labels,
  예측 = rs
)
table(x)

    예측
실제  B  M
   B 34  0
   M  2 21


### ※ 문제224. 주어진 k값으로 모델을 훈련시키고 모델 성능평가 결과를 댓글로 올리시오

In [55]:
# Repeat the classification with k = 51 and cross-tabulate actual labels
# against the new predictions.
rs2 <- knn(
  train = wbcd_train,
  test = wbcd_test,
  cl = wbcd_train_labels,
  k = 51
)
x <- data.frame(
  실제 = wbcd_test_labels,
  예측 = rs2
)
table(x)

    예측
실제  B  M
   B 33  1
   M  2 21

### ※ 문제225. CrossTable 함수를 이용해서 모델의 성능을 확인하시오

In [56]:
library(gmodels)

"package 'gmodels' was built under R version 3.6.3"

In [58]:
# Cross-tabulate actual test labels (rows) against the k = 51 predictions
# (columns). FALSE is spelled out because the shorthand F is an ordinary
# variable that can be reassigned.
g2 <- CrossTable(x = wbcd_test_labels, y = rs2, chisq = FALSE)


 
   Cell Contents
|-------------------------|
|                       N |
| Chi-square contribution |
|           N / Row Total |
|           N / Col Total |
|         N / Table Total |
|-------------------------|

 
Total Observations in Table:  57 

 
                 | rs2 
wbcd_test_labels |         B |         M | Row Total | 
-----------------|-----------|-----------|-----------|
               B |        33 |         1 |        34 | 
                 |     7.039 |    11.199 |           | 
                 |     0.971 |     0.029 |     0.596 | 
                 |     0.943 |     0.045 |           | 
                 |     0.579 |     0.018 |           | 
-----------------|-----------|-----------|-----------|
               M |         2 |        21 |        23 | 
                 |    10.406 |    16.555 |           | 
                 |     0.087 |     0.913 |     0.404 | 
                 |     0.057 |     0.955 |           | 
                 |     0.035 |     0.368 |        

### ※ 문제226. 적절한 k 값을 알아내기 위한 시각화 코드를 참고해서 우리가 테스트한 변수로 변경해서 그래프를 시각화 하시오
### 　　　　　 시각화 한 그래프를 첨부