### 서포트 벡터 머신(SVM)
#### 거리를 통해 분류나 회귀 모델을 만들 때는 반드시 데이터 정규화나 표준화를 해줘야 한다.
#### 커널 기법은 기존의 데이터를 고차원 공간으로 확장하여 새로운 결정경계선을 만들어내는 방법이다.

In [1]:
# 필요한 패키지 불러오기
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
import sklearn.svm as svm
from sklearn.svm import SVC
import pandas as pd

In [5]:
# Read the raisin dataset from the local Excel workbook
raisin_xlsx = "C:/Users/qorud/OneDrive/바탕 화면/Github/Data Analysis Method/Raisin_Dataset.xlsx"
df = pd.read_excel(raisin_xlsx)

# Preview the first five rows
df.head(5)

Unnamed: 0,Area,MajorAxisLength,MinorAxisLength,Eccentricity,ConvexArea,Extent,Perimeter,Class
0,87524,442.246011,253.291155,0.819738,90546,0.758651,1184.04,Kecimen
1,75166,406.690687,243.032436,0.801805,78789,0.68413,1121.786,Kecimen
2,90856,442.267048,266.328318,0.798354,93717,0.637613,1208.575,Kecimen
3,45928,286.540559,208.760042,0.684989,47336,0.699599,844.162,Kecimen
4,79408,352.19077,290.827533,0.564011,81463,0.792772,1073.251,Kecimen


In [6]:
# Normalize the independent variables
#
# Before fitting the SVM, the frame is split into independent variables
# (features) and the dependent variable (target), and the features are
# min-max scaled. SVM is a distance-based model, so scaling is essential.

# Split into independent and dependent variables
# NOTE(review): 'Area' is dropped together with the target column; the reason
# is not stated here -- confirm that excluding it is intended.
df_x = df.drop(['Area','Class'],axis=1)
df_y = df[['Class']]

# Apply min-max normalization.
# The scaler instance gets its own name so the imported MinMaxScaler class is
# not overwritten; rebinding the class name to an instance made this cell fail
# with "TypeError: 'MinMaxScaler' object is not callable" when re-run.
# NOTE(review): the scaler is fitted on all rows before the train/test split,
# so test-set min/max values influence the scaling -- consider fitting on the
# training split only.
minmax_scaler = MinMaxScaler()
df_minmax = minmax_scaler.fit_transform(df_x)

# Re-attach the column names (fit_transform returns a plain array)
df_x = pd.DataFrame(data=df_minmax, columns=df_x.columns)

df_x.head()

Unnamed: 0,MajorAxisLength,MinorAxisLength,Eccentricity,ConvexArea,Extent,Perimeter
0,0.280714,0.314376,0.767872,0.255504,0.831422,0.271791
1,0.234638,0.284945,0.738636,0.208864,0.667854,0.241842
2,0.280741,0.351778,0.733009,0.268084,0.565754,0.283594
3,0.078935,0.18662,0.548194,0.084089,0.701809,0.108284
4,0.164011,0.422064,0.350968,0.219472,0.906315,0.218493


In [7]:
# Build the training and test sets (6:4 split)
#
# There are only 900 observations in total, so a 60/40 split is used.
# The recorded output shows the rows distributed as 540:360.
x_train, x_test, y_train, y_test = train_test_split(
    df_x,
    df_y,
    random_state=10,
    test_size=0.4,
)

# Check that the training and test sets were split as expected
print('train data 개수: ', x_train.shape[0])
print('test data 개수: ', x_test.shape[0])

train data 개수:  540
test data 개수:  360


In [8]:
# Evaluate a linear SVM
#
# The solver is capped at 1000 iterations (max_iter=1000). Too few iterations
# can leave the model underfitted: the author observed roughly 67% training
# accuracy with max_iter=10 versus roughly 86% with max_iter=1000.
#
# The previous call also passed degree=3 and gamma='auto', but scikit-learn
# ignores `degree` for every kernel except 'poly' and `gamma` only applies to
# 'rbf'/'poly'/'sigmoid'. With kernel='linear' they had no effect, so they are
# removed to avoid implying a 3rd-degree model.
svm_model = SVC(kernel='linear', C=10, max_iter=1000)

# y_train is a single-column DataFrame; flatten it to 1-D so fit() does not
# emit a DataConversionWarning ("y = column_or_1d(y, warn=True)").
svm_model.fit(x_train, y_train.values.ravel())

# Accuracy on the training set, then on the test set
print(svm_model.score(x_train,y_train))
print(svm_model.score(x_test, y_test))

0.8648148148148148
0.8638888888888889


  y = column_or_1d(y, warn=True)


In [9]:
# Check RBF-SVM performance for C values 1 through 30
#
# C is varied from 1 to 30 to see how accuracy changes, which serves as a
# simple hyperparameter search for C. In the recorded run the test accuracy
# does not vary much, but it rises to about 89% from C=10 and starts to drop
# again around C=21; C=15 was judged to be the best value.
# NOTE(review): C is being chosen by test-set accuracy, which makes the
# reported test score optimistic -- consider a validation split or
# cross-validation for the selection.

# y_train is a single-column DataFrame; flatten it once, outside the loop, so
# each fit() call does not emit a DataConversionWarning.
y_train_1d = y_train.values.ravel()

scores = []
for C_point in range(1, 31):
    svc=SVC(kernel='rbf',C=C_point,max_iter = 1000)
    C_model=svc.fit(x_train, y_train_1d)
    train_score = C_model.score(x_train, y_train)
    test_score = C_model.score(x_test, y_test)
    print("rbf SVM : C:{}, train set score:{:2f}, test set score:{:2f}".format
          (C_point,train_score, test_score))
    # Keep [train, test] accuracy per C value for later inspection
    scores.append([train_score, test_score])

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


rbf SVM : C:1, train set score:0.857407, test set score:0.883333
rbf SVM : C:2, train set score:0.864815, test set score:0.880556
rbf SVM : C:3, train set score:0.862963, test set score:0.886111


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


rbf SVM : C:4, train set score:0.861111, test set score:0.880556
rbf SVM : C:5, train set score:0.861111, test set score:0.880556
rbf SVM : C:6, train set score:0.862963, test set score:0.883333


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


rbf SVM : C:7, train set score:0.866667, test set score:0.883333
rbf SVM : C:8, train set score:0.864815, test set score:0.883333
rbf SVM : C:9, train set score:0.864815, test set score:0.883333


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


rbf SVM : C:10, train set score:0.864815, test set score:0.891667
rbf SVM : C:11, train set score:0.864815, test set score:0.891667
rbf SVM : C:12, train set score:0.862963, test set score:0.891667


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


rbf SVM : C:13, train set score:0.864815, test set score:0.891667
rbf SVM : C:14, train set score:0.864815, test set score:0.891667
rbf SVM : C:15, train set score:0.864815, test set score:0.897222


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


rbf SVM : C:16, train set score:0.864815, test set score:0.891667
rbf SVM : C:17, train set score:0.864815, test set score:0.891667
rbf SVM : C:18, train set score:0.864815, test set score:0.891667


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


rbf SVM : C:19, train set score:0.864815, test set score:0.891667
rbf SVM : C:20, train set score:0.864815, test set score:0.891667
rbf SVM : C:21, train set score:0.864815, test set score:0.888889


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


rbf SVM : C:22, train set score:0.864815, test set score:0.883333
rbf SVM : C:23, train set score:0.864815, test set score:0.883333
rbf SVM : C:24, train set score:0.862963, test set score:0.886111


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


rbf SVM : C:25, train set score:0.861111, test set score:0.886111
rbf SVM : C:26, train set score:0.861111, test set score:0.888889
rbf SVM : C:27, train set score:0.862963, test set score:0.888889


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


rbf SVM : C:28, train set score:0.861111, test set score:0.886111
rbf SVM : C:29, train set score:0.861111, test set score:0.886111
rbf SVM : C:30, train set score:0.861111, test set score:0.886111


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [10]:
# Check RBF-SVM performance for several gamma values (0.1 to 20), with C=10
#
# In the recorded run, gamma values of about 5 to 10 also give high accuracy.

# y_train is a single-column DataFrame; flatten it once, outside the loop, so
# each fit() call does not emit a DataConversionWarning.
y_train_1d = y_train.values.ravel()

scores = []
for gamma_point in [0.1,0.5,1,5,10,20]:
    svc=SVC(kernel='rbf',C=10,gamma=gamma_point,max_iter = 1000)
    model=svc.fit(x_train, y_train_1d)
    train_score = model.score(x_train, y_train)
    test_score = model.score(x_test, y_test)
    print("rbf SVM : gamma:{}, train set score:{:2f}, test set score:{:2f}".format
          (gamma_point,train_score, test_score))
    # Keep [train, test] accuracy per gamma value for later inspection
    scores.append([train_score, test_score])

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


rbf SVM : gamma:0.1, train set score:0.862963, test set score:0.866667
rbf SVM : gamma:0.5, train set score:0.864815, test set score:0.880556
rbf SVM : gamma:1, train set score:0.861111, test set score:0.877778


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


rbf SVM : gamma:5, train set score:0.862963, test set score:0.880556
rbf SVM : gamma:10, train set score:0.864815, test set score:0.883333
rbf SVM : gamma:20, train set score:0.879630, test set score:0.872222


