In [1]:
# 데이터 및 라이브러리 로드
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
from scipy.stats import randint
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

df = pd.read_csv('../input/big-data-certification-study/Fvote.csv', encoding='utf-8')
df.head()

Unnamed: 0,gender_female,gender_male,region_Chungcheung,region_Honam,region_Others,region_Sudo,region_Youngnam,edu,income,age,score_gov,score_progress,score_intention,vote,parties
0,0,1,0,0,0,0,1,3,3,3,2,2,4.0,1,2
1,0,1,0,0,1,0,0,2,3,3,2,4,3.0,0,3
2,0,1,0,1,0,0,0,1,2,4,1,3,2.8,1,4
3,1,0,0,0,0,1,0,2,1,3,5,4,2.6,1,1
4,0,1,0,0,0,1,0,1,2,4,4,3,2.4,1,1


In [2]:
# 데이터셋 분할
X = df.drop(columns=['vote', 'parties'])
y = df[['parties']]
df.shape, X.shape, y.shape

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)

In [3]:
# 모델 훈련 (로지스틱 회귀)
model = LogisticRegression(C=35)
model.fit(X_train, y_train)

LogisticRegression(C=35)

In [4]:
# 훈련데이터 예측범주 저장 및 정확도 확인
pred_train=model.predict(X_train)
model.score(X_train, y_train)

0.6265822784810127

In [5]:
# 테스트데이터 예측범주 저장 및 정확도 확인
pred_test=model.predict(X_test)
model.score(X_test, y_test)

0.5094339622641509

In [6]:
# 혼동행렬 확인
confusion_train=confusion_matrix(y_train, pred_train)
confusion_test=confusion_matrix(y_test, pred_test)
print(' 훈련데이터 혼동행렬 :\n', confusion_train)
print('\n\n','테스트데이터 혼동행렬 :\n', confusion_test)

 훈련데이터 혼동행렬 :
 [[20  2  3 12]
 [ 1 26  2 11]
 [ 4  2  7  6]
 [ 7  7  2 46]]


 테스트데이터 혼동행렬 :
 [[ 6  2  1  4]
 [ 1  9  0  3]
 [ 1  3  0  2]
 [ 2  5  2 12]]


In [7]:
# Hyperparameter Tuning - Grid
param_grid={'C':[0.0001,0.001,0.01,0.1,1,10,100]}
grid_search=GridSearchCV(LogisticRegression(),
                         param_grid, cv=5,
                         return_train_score=True)
grid_search.fit(X_train, y_train)

print('Best Parameter :', grid_search.best_params_)
print('Best Score :',round(grid_search.best_score_,3))

Best Parameter : {'C': 0.1}
Best Score : 0.556


In [8]:
print('Test Set Score:', round(grid_search.score(X_test, y_test), 3))

Test Set Score: 0.547


In [9]:
# Hyperparameter Tuning - Randomized
param_distribs = {'C': randint(low=0.001, high=100)}
random_search=RandomizedSearchCV(LogisticRegression(),
                                 param_distributions=param_distribs,
                                 cv=5, n_iter=200,
                                 return_train_score=True)
random_search.fit(X_train, y_train)

print('Best Parameter :', random_search.best_params_)
print('Best Score :',round(random_search.best_score_, 3))

Best Parameter : {'C': 2}
Best Score : 0.544
