<a href="https://colab.research.google.com/github/Chromis07/dataStudy/blob/main/ML/SVM(support_vector_machine)_%ED%83%80%EC%9D%B4%ED%83%80%EB%8B%89_%EC%83%9D%EC%A1%B4%EC%9E%90_%EC%98%88%EC%B8%A1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

문제 정의 : SVM(support vector machine)을 사용하여 타이타닉 승객의 생존(1) / 사망(0) 여부 예측

In [None]:
import pandas as pd
import seaborn as sns

In [None]:
# 한글 깨짐 방지
import matplotlib as mpl
import matplotlib.pyplot as plt
 
%config InlineBackend.figure_format = 'retina'
 
!apt -qq -y install fonts-nanum
 
import matplotlib.font_manager as fm

fontpath = '/usr/share/fonts/truetype/nanum/NanumBarunGothic.ttf'
font = fm.FontProperties(fname=fontpath, size=9)
plt.rc('font', family='NanumBarunGothic') 
mpl.font_manager._rebuild()

plt.rcParams['axes.unicode_minus'] = False # 마이너스 기호 깨짐 방지

fonts-nanum is already the newest version (20170925-1).
0 upgraded, 0 newly installed, 0 to remove and 39 not upgraded.


[Step 1] 데이터 준비 - Seaborn에서 제공하는 titanic dataset 가져오기

In [None]:
# Fetch seaborn's bundled "titanic" example dataset as a DataFrame.
dataset_name = 'titanic'
df = sns.load_dataset(dataset_name)

[Step 2] 데이터 탐색 및 데이터 전처리

In [None]:
# Preview the first five rows of the raw data.
df.head(n=5)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [None]:
# Print each column's dtype and non-null count to spot columns with missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   survived     891 non-null    int64   
 1   pclass       891 non-null    int64   
 2   sex          891 non-null    object  
 3   age          714 non-null    float64 
 4   sibsp        891 non-null    int64   
 5   parch        891 non-null    int64   
 6   fare         891 non-null    float64 
 7   embarked     889 non-null    object  
 8   class        891 non-null    category
 9   who          891 non-null    object  
 10  adult_male   891 non-null    bool    
 11  deck         203 non-null    category
 12  embark_town  889 non-null    object  
 13  alive        891 non-null    object  
 14  alone        891 non-null    bool    
dtypes: bool(2), category(2), float64(2), int64(4), object(5)
memory usage: 80.6+ KB


In [None]:
# Remove `deck` (cabin deck, mostly NaN) and `embark_town` (embarkation city,
# which duplicates the information already carried by `embarked`).
columns_to_drop = ['deck', 'embark_town']
rdf = df.drop(columns=columns_to_drop)

In [None]:
# Discard every row whose `age` is missing (177 rows), then re-check the
# column summary. This is removal of incomplete rows, not class-balancing
# under-sampling.
rdf = rdf.dropna(subset=['age'])
rdf.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 714 entries, 0 to 890
Data columns (total 13 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   survived    714 non-null    int64   
 1   pclass      714 non-null    int64   
 2   sex         714 non-null    object  
 3   age         714 non-null    float64 
 4   sibsp       714 non-null    int64   
 5   parch       714 non-null    int64   
 6   fare        714 non-null    float64 
 7   embarked    712 non-null    object  
 8   class       714 non-null    category
 9   who         714 non-null    object  
 10  adult_male  714 non-null    bool    
 11  alive       714 non-null    object  
 12  alone       714 non-null    bool    
dtypes: bool(2), category(1), float64(2), int64(4), object(4)
memory usage: 63.6+ KB


In [None]:
# Find the most common embarkation code (first letter of the port name);
# it is the value used to fill the missing entries in `embarked`.
embarked_counts = rdf['embarked'].value_counts()
most_freq = embarked_counts.idxmax()
most_freq

'S'

In [None]:
# Replace the NaN values in `embarked` with the most frequent embarkation code.
# The filled column is assigned back to the frame rather than calling
# fillna(inplace=True) on the selected column: that chained in-place form
# emits a FutureWarning on recent pandas and leaves `rdf` unchanged once
# Copy-on-Write is active, so the NaNs would silently survive.
rdf['embarked'] = rdf['embarked'].fillna(most_freq)

[Step 3] 분석에 사용할 feature 선택

In [None]:
# Columns kept for the analysis:
#   survived - survival flag (target)
#   pclass   - ticket class
#   sex      - passenger sex
#   sibsp    - number of siblings / spouses aboard
#   parch    - number of parents / children aboard
#   embarked - port of embarkation (C = Cherbourg, Q = Queenstown, S = Southampton)
# Note: `age` is not part of this selection.
selected_columns = ['survived', 'pclass', 'sex', 'sibsp', 'parch', 'embarked']
ndf = rdf[selected_columns]
ndf.head()

Unnamed: 0,survived,pclass,sex,sibsp,parch,embarked
0,0,3,male,1,0,S
1,1,1,female,1,0,C
2,1,3,female,0,0,S
3,1,1,female,1,0,S
4,0,3,male,0,0,S


In [None]:
# One-hot encoding: turn categorical values into numeric indicator columns
# that the model can consume.
#   sex:      female / male -> columns `female`, `male`
#   embarked: C / Q / S     -> one indicator column per port
#
# dtype=int keeps the indicators as 0/1 integers; since pandas 2.0 the
# default dtype of get_dummies is bool, which would show True/False instead.
oneshot_sex = pd.get_dummies(ndf['sex'], dtype=int)
oneshot_sex

Unnamed: 0,female,male
0,0,1
1,1,0
2,1,0
3,1,0
4,0,1
...,...,...
885,1,0
886,0,1
887,1,0
889,0,1


In [None]:
# One-hot encode `sex` into the indicator columns `female` and `male`
# and append them to the working frame.
#
# dtype=int keeps the indicators as 0/1 integers; since pandas 2.0 the
# default dtype of get_dummies is bool, which would show True/False instead.
oneshot_sex = pd.get_dummies(ndf['sex'], dtype=int)
ndf = pd.concat([ndf, oneshot_sex], axis=1)

ndf.head()

Unnamed: 0,survived,pclass,sex,sibsp,parch,embarked,female,male
0,0,3,male,1,0,S,0,1
1,1,1,female,1,0,C,1,0
2,1,3,female,0,0,S,1,0
3,1,1,female,1,0,S,1,0
4,0,3,male,0,0,S,0,1


In [None]:
# One-hot encode `embarked` (C / Q / S) into the indicator columns
# `town_C`, `town_Q`, `town_S` and append them to the working frame.
#
# dtype=int keeps the indicators as 0/1 integers; since pandas 2.0 the
# default dtype of get_dummies is bool, which would show True/False instead.
oneshot_embarked = pd.get_dummies(ndf['embarked'], prefix='town', dtype=int)
ndf = pd.concat([ndf, oneshot_embarked], axis=1)

ndf.head()

Unnamed: 0,survived,pclass,sex,sibsp,parch,embarked,female,male,town_C,town_Q,town_S
0,0,3,male,1,0,S,0,1,0,0,1
1,1,1,female,1,0,C,1,0,1,0,0
2,1,3,female,0,0,S,1,0,0,0,1
3,1,1,female,1,0,S,1,0,0,0,1
4,0,3,male,0,0,S,0,1,0,0,1


In [None]:
# The original categorical columns are no longer needed once their
# one-hot indicator columns have been appended.
encoded_sources = ['sex', 'embarked']
ndf = ndf.drop(columns=encoded_sources)
ndf.head()

Unnamed: 0,survived,pclass,sibsp,parch,female,male,town_C,town_Q,town_S
0,0,3,1,0,0,1,0,0,1
1,1,1,1,0,1,0,1,0,0
2,1,3,0,0,1,0,0,0,1
3,1,1,1,0,1,0,0,0,1
4,0,3,0,0,0,1,0,0,1


[Step 4] 데이터 셋 구분 - train / test

In [None]:
# Separate the working frame into inputs (X, the "questions") and the
# target (y, the "answer key").
feature_names = [
    'pclass', 'sibsp', 'parch',
    'female', 'male',
    'town_C', 'town_Q', 'town_S',
]
X = ndf[feature_names]  # independent variables
y = ndf['survived']     # dependent variable

In [None]:
# Rescale the explanatory variables with StandardScaler. This standardizes
# each column (the result contains negative values and values above 1), so
# it is not a 0~1 min-max normalization.
# NOTE(review): the scaler is fitted on all rows before the train/test split
# below, so test-set statistics influence the preprocessing.

from sklearn import preprocessing

scaler = preprocessing.StandardScaler()
X = scaler.fit_transform(X)
X

array([[ 0.91123237,  0.52457013, -0.50589515, ..., -0.47180795,
        -0.20203051,  0.53307848],
       [-1.47636364,  0.52457013, -0.50589515, ...,  2.11950647,
        -0.20203051, -1.87589641],
       [ 0.91123237, -0.55170307, -0.50589515, ..., -0.47180795,
        -0.20203051,  0.53307848],
       ...,
       [-1.47636364, -0.55170307, -0.50589515, ..., -0.47180795,
        -0.20203051,  0.53307848],
       [-1.47636364, -0.55170307, -0.50589515, ...,  2.11950647,
        -0.20203051, -1.87589641],
       [ 0.91123237, -0.55170307, -0.50589515, ..., -0.47180795,
         4.94974747, -1.87589641]])

In [None]:
# Hold out 30% of the rows as the test set; the remaining 70% is used for training.
from sklearn.model_selection import train_test_split

split_parts = train_test_split(X, y, test_size=0.3, random_state=7)
X_train, X_test, y_train, y_test = split_parts

# Report the (rows, columns) shape of each split.
print(f'train의 개수 : {X_train.shape}')
print(f'test의 개수 : {X_test.shape}')

train의 개수 : (499, 8)
test의 개수 : (215, 8)


[Step 5] SVM 분류 모형 - sklearn 사용

In [None]:
from sklearn import svm

# Build the classifier object.
# The kernel is the function that maps the data into another vector space;
# the radial basis function (Gaussian RBF) kernel is used here.
# Other kernels mentioned for reference: linear, poly (polynomial), sigmoid.
kernel_name = 'rbf'
svm_model = svm.SVC(kernel=kernel_name)

In [None]:
# Train the model on the training split, then predict labels (y_hat)
# for the held-out test split.
fitted_model = svm_model.fit(X_train, y_train)
y_hat = fitted_model.predict(X_test)

[Step 6] 모형 성능 평가

In [None]:
from sklearn import metrics

# Score the fitted model on the test split and print it as a percentage.
test_accuracy = svm_model.score(X_test, y_test)
print(f"test set 정확도 : {test_accuracy*100:.2f}")

test set 정확도 : 77.67


In [None]:
# Evaluate the model: build the text report of per-class precision,
# recall, f1-score and support from the true and predicted test labels.
svm_report = metrics.classification_report(y_true=y_test, y_pred=y_hat)
print(svm_report)

              precision    recall  f1-score   support

           0       0.74      0.95      0.83       126
           1       0.89      0.53      0.66        89

    accuracy                           0.78       215
   macro avg       0.81      0.74      0.75       215
weighted avg       0.80      0.78      0.76       215

