In [1]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler
data = pd.read_csv('Fvote.csv', encoding = 'utf-8')
data.head()

Unnamed: 0,gender_female,gender_male,region_Chungcheung,region_Honam,region_Others,region_Sudo,region_Youngnam,edu,income,age,score_gov,score_progress,score_intention,vote,parties
0,0,1,0,0,0,0,1,3,3,3,2,2,4.0,1,2
1,0,1,0,0,1,0,0,2,3,3,2,4,3.0,0,3
2,0,1,0,1,0,0,0,1,2,4,1,3,2.8,1,4
3,1,0,0,0,0,1,0,2,1,3,5,4,2.6,1,1
4,0,1,0,0,0,1,0,1,2,4,4,3,2.4,1,1


## 데이터셋 분할

In [2]:
X = data[data.columns[0:13]]
y = data[['vote']]

print(X.shape, y.shape)

(211, 13) (211, 1)


In [3]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, stratify = y)

In [4]:
print(X_train.shape,'\n', X_test.shape)

(158, 13) 
 (53, 13)


## 모델 적용

In [5]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()

# 비 무작위 교차검증
## cross_val_score

In [6]:
from sklearn.model_selection import cross_val_score
scores = cross_val_score(model, X_train, y_train, cv=5) # cv = 교차검증 횟수
print('5개 Test Set 정확도 : ', scores)
print("정확도 평균 : ", scores.mean())

# 비 무작위이므로 데이터셋의 순서 그대로 그룹 분할
# 즉, 편향 존재 시 문제 야기 가능

5개 Test Set 정확도 :  [0.71875    0.6875     0.8125     0.58064516 0.80645161]
정확도 평균 :  0.7211693548387096


# 무작위 교차검증
## K-Fold

In [7]:
from sklearn.model_selection import KFold
kfold = KFold(n_splits = 5, shuffle = True, random_state=42)
# n_splits = 그룹 수
# shuffle = 순서 무작위
score = cross_val_score(model, X_train, y_train, cv=kfold)
print('5개 Fold의 정확도 : ', score)

5개 Fold의 정확도 :  [0.71875    0.6875     0.625      0.70967742 0.77419355]


# 임의 분할 교차검증
## ShuffleSplit

In [8]:
# 훈련/테스트 데이터 구성 시 다른 교차검증에 사용된 데이터도 랜덤으로 선택되게 하는 방법
# 즉, 전체 데이터 중 일부는 훈련 또는 테스트 데이터 모두에서 선택되지 않을 수 있음

from sklearn.model_selection import ShuffleSplit
shuffle_split = ShuffleSplit(test_size = 0.5, train_size = 0.5, random_state=42)
# test_size / train_size 설정
score = cross_val_score(model, X_train, y_train, cv=shuffle_split)
print("교차검증 정확도 :", score)

교차검증 정확도 : [0.73417722 0.69620253 0.70886076 0.73417722 0.65822785 0.67088608
 0.72151899 0.65822785 0.69620253 0.70886076]


# Train - Validity - Test 분할

In [9]:
# train_test_split을 두 번 진행하여 Train Data를 Train - Validity Data로 분할
# train_size 옵션 설정 가능 (Default : 7:3)

X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, random_state=1)
# X_train_val / y_train_val : Train + Validity Data
X_train, X_valid, y_train, y_valid = train_test_split(X_train_val, y_train_val, random_state=2)
# Train Data와 Validity Data 분할

In [10]:
model.fit(X_train, y_train)
scores=cross_val_score(model, X_train, y_train, cv=5)
print("Accuracy of Cross Validation :", scores)
print("Mean of Accuracy :", scores.mean())

Accuracy of Cross Validation : [0.58333333 0.66666667 0.70833333 0.65217391 0.65217391]
Mean of Accuracy : 0.652536231884058


In [11]:
# Validity Data 정확도
model.score(X_valid, y_valid)

0.65

In [12]:
# Test Data 정확도
model.score(X_test, y_test)

0.6981132075471698