## 0617 과제 - pima indian 데이터셋 분류

In [3]:
import pandas as pd

In [None]:
# 변수 설명
# 1. Number of times pregnant
# 2. Plasma glucose concentration a 2 hours in an oral glucose tolerance test
# 3. Diastolic blood pressure (mm Hg)
# 4. Triceps skin fold thickness (mm)
# 5. 2-Hour serum insulin (mu U/ml)
# 6. Body mass index (weight in kg/(height in m)^2)
# 7. Diabetes pedigree function
# 8. Age (years)
# 9. Class variable (0 or 1)

In [5]:
# 데이터 불러오기
pima= pd.read_csv('pima-indians-diabetes.csv')
pima.head()

Unnamed: 0,6,148,72,35,0,33.6,0.627,50,1
0,1,85,66,29,0,26.6,0.351,31,0
1,8,183,64,0,0,23.3,0.672,32,1
2,1,89,66,23,94,28.1,0.167,21,0
3,0,137,40,35,168,43.1,2.288,33,1
4,5,116,74,0,0,25.6,0.201,30,0


In [16]:
# 컬럼명 생성하기
pima.columns = ["pregnant", "pgc", "bp","skin","insulin","bodymass","dpf","age","class"]

In [17]:
pima.head() # 확인

Unnamed: 0,pregnant,pgc,bp,skin,insulin,bodymass,dpf,age,class
0,1,85,66,29,0,26.6,0.351,31,0
1,8,183,64,0,0,23.3,0.672,32,1
2,1,89,66,23,94,28.1,0.167,21,0
3,0,137,40,35,168,43.1,2.288,33,1
4,5,116,74,0,0,25.6,0.201,30,0


In [30]:
pima_data=pima.loc[:,pima.columns!="class"]  #독립변수
pima_data.head()

Unnamed: 0,pregnant,pgc,bp,skin,insulin,bodymass,dpf,age
0,1,85,66,29,0,26.6,0.351,31
1,8,183,64,0,0,23.3,0.672,32
2,1,89,66,23,94,28.1,0.167,21
3,0,137,40,35,168,43.1,2.288,33
4,5,116,74,0,0,25.6,0.201,30


In [110]:
pima_data2 = pima_data.to_numpy() # 독립변수 array 버전

In [92]:
pima_target = pima['class'] # 종속변수
pima_target.head()

0    0
1    1
2    0
3    1
4    0
Name: class, dtype: int64

In [93]:
type(pima['class'])

pandas.core.series.Series

In [56]:
pima['class'].value_counts()

0    500
1    267
Name: class, dtype: int64

## 1. Decision Tree

In [34]:
from sklearn.tree import DecisionTreeClassifier

In [62]:
# train/test set 분리
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(pima_data, pima_target, stratify = pima_target, test_size=0.2,random_state=2021)

In [97]:
dt_clf = DecisionTreeClassifier(random_state=2020)

In [98]:
# 학습 수행
dt_clf.fit(X_train,y_train)

DecisionTreeClassifier(random_state=2020)

In [99]:
# 예측 수행
pred_dt = dt_clf.predict(X_test)

In [100]:
# 모델 평가
from sklearn.metrics import accuracy_score
score = accuracy_score(y_test, pred_dt)
print(f'결정트리 예측 정확도{score:.4f}')

결정트리 예측 정확도0.6558


In [101]:
# 교차검증방식
from sklearn.model_selection import cross_val_score

In [119]:
cross_val_score(DecisionTreeClassifier(), pima_data, pima_target, cv=5)

array([0.73376623, 0.66883117, 0.73202614, 0.74509804, 0.7124183 ])

- 하이퍼파라미터 튜닝

In [120]:
from sklearn.model_selection import GridSearchCV

In [129]:
dtc = DecisionTreeClassifier(random_state=2010)

In [130]:
# parameter 를 Dictionary 형태로 설정
params = {
    'max_depth':[2,3,4,5,6],
    'min_samples_split':[2,3,4]
}

In [131]:
grid_dtc = GridSearchCV(dtc, param_grid=params, cv=5)  # cv: 교차검증세트 수

In [132]:
grid_dtc.fit(X_train,y_train)  # 학습 수행

GridSearchCV(cv=5, estimator=DecisionTreeClassifier(random_state=2010),
             param_grid={'max_depth': [2, 3, 4, 5, 6],
                         'min_samples_split': [2, 3, 4]})

In [133]:
# 최적 파라미터와 최고 정확도
grid_dtc.best_params_ , grid_dtc.best_score_

({'max_depth': 2, 'min_samples_split': 2}, 0.7390910302545648)

In [134]:
# 최적 예측 
pred = grid_dtc.best_estimator_.predict(X_test)
accuracy_score(y_test, pred)

0.7597402597402597

## 2. SVM

In [39]:
from sklearn.svm import SVC

In [104]:
# 학습, 테스트 데이터 분리
X_train, X_test, y_train, y_test = train_test_split(pima_data, pima_target, stratify = pima_target, test_size=0.2,random_state=2021)

In [105]:
sv_clf = SVC()  # 객체 생성
sv_clf.fit(X_train,y_train)  #학습 수행

SVC()

In [106]:
# 예측 수행
pred_sv = sv_clf.predict(X_test)

In [107]:
# 모델 평가
score = accuracy_score(y_test, pred_sv)
print(f'SVM 예측 정확도{score:.4f}')

SVM 예측 정확도0.7403


In [108]:
# 교차검증방식
cross_val_score(SVC(), pima_data, pima['class'], cv=5)

array([0.74675325, 0.73376623, 0.77777778, 0.78431373, 0.75163399])

- 최적화

In [137]:
sv_clf.get_params()

{'C': 1.0,
 'break_ties': False,
 'cache_size': 200,
 'class_weight': None,
 'coef0': 0.0,
 'decision_function_shape': 'ovr',
 'degree': 3,
 'gamma': 'scale',
 'kernel': 'rbf',
 'max_iter': -1,
 'probability': False,
 'random_state': None,
 'shrinking': True,
 'tol': 0.001,
 'verbose': False}

In [138]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [139]:
pipe_svc = Pipeline([('scl', StandardScaler()), ('clf', SVC(random_state=1))])

param_range = [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0]
param_grid = [
    {'clf__C': param_range, 'clf__kernel': ['linear']},
    {'clf__C': param_range, 'clf__gamma': param_range, 'clf__kernel': ['rbf']}]

grid_svc  = GridSearchCV(estimator=pipe_svc, param_grid=param_grid,
                  scoring='accuracy', cv=10, n_jobs=1)

In [142]:
# 학습 수행
grid_svc.fit(X_train,y_train)

GridSearchCV(cv=10,
             estimator=Pipeline(steps=[('scl', StandardScaler()),
                                       ('clf', SVC(random_state=1))]),
             n_jobs=1,
             param_grid=[{'clf__C': [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0,
                                     1000.0],
                          'clf__kernel': ['linear']},
                         {'clf__C': [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0,
                                     1000.0],
                          'clf__gamma': [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0,
                                         100.0, 1000.0],
                          'clf__kernel': ['rbf']}],
             scoring='accuracy')

In [145]:
# 최적 파라미터와 최고 정확도
print(grid_svc.best_params_)
print(grid_svc.best_score_)

{'clf__C': 100.0, 'clf__gamma': 0.001, 'clf__kernel': 'rbf'}
0.7736118455843469


In [144]:
# 최적 예측 
pred = grid_svc.best_estimator_.predict(X_test)
accuracy_score(y_test, pred)

0.7792207792207793

## 3. Logistic Regression

In [50]:
from sklearn.linear_model import LogisticRegression

In [149]:
# 스케일링
scaled_pima_data = StandardScaler().fit_transform(pima_data)

In [150]:
# 학습, 테스트 데이터 분리
X_train, X_test, y_train, y_test = train_test_split(scaled_pima_data, pima_target, stratify=pima_target, test_size=0.2)

In [151]:
lr_clf = LogisticRegression(random_state=2021) # 객체 생성
lr_clf.fit(X_train,y_train)  # 학습 수행

LogisticRegression(random_state=2021)

In [152]:
# 예측 수행
pred_lr = lr_clf.predict(X_test)

In [153]:
# 모델 평가
score = accuracy_score(y_test, pred_lr)
print(f'로지스틱회귀 예측 정확도{score:.4f}')

로지스틱회귀 예측 정확도0.7987


In [155]:
# 교차검증방식
cross_val_score(LogisticRegression(), scaled_pima_data, pima_target, cv=5)

array([0.77272727, 0.74675325, 0.75816993, 0.81045752, 0.77124183])

- 하이퍼파라미터 튜닝

In [159]:
params = { "penalty" : ["l2", "l1"], "C" : [0.01, 0.1, 1, 5, 10] } 
lr = LogisticRegression() 
grid_lr = GridSearchCV(lr, param_grid=params, cv=5, n_jobs=-1, scoring="accuracy")
grid_lr.fit(scaled_pima_data, pima_target)


 0.76926407        nan 0.76926407        nan]


GridSearchCV(cv=5, estimator=LogisticRegression(), n_jobs=-1,
             param_grid={'C': [0.01, 0.1, 1, 5, 10], 'penalty': ['l2', 'l1']},
             scoring='accuracy')

In [160]:
# 최적 파라미터와 최고 정확도
print("best param : {}".format(grid_lr.best_params_)) 
print("best acc : {}".format(grid_lr.best_score_))

best param : {'C': 1, 'penalty': 'l2'}
best acc : 0.7718699601052542


In [161]:
# 최적 예측 
pred = grid_lr.best_estimator_.predict(X_test)
accuracy_score(y_test, pred)

0.7987012987012987