In [1]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

In [2]:
# Load the Pima Indians diabetes CSV (`pd` comes from the notebook's import cell).
# The first 9 lines of the file are skipped and no header row is read, so the
# columns are labelled 0..8; in the preview below the last column holds 0/1 values.
# NOTE(review): presumably the 9 skipped lines are a comment preamble — confirm
# against the file.
df = pd.read_csv("pima-indians-diabetes.csv", skiprows=9, header=None)
df.head()


Unnamed: 0,0,1,2,3,4,5,6,7,8
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [3]:
# NumPy array holding the whole table (features and label together);
# the cell displays its dimensions.
pima = df.to_numpy()
pima.shape

(768, 9)

In [4]:
# Column-wise split: everything except the last column is a feature,
# the last column is the target.
X, y = pima[:, :-1], pima[:, -1]

In [5]:
# Sanity check: dimensions of the feature matrix and of the target vector.
tuple(arr.shape for arr in (X, y))

((768, 8), (768,))

In [6]:
# Same split taken directly from the DataFrame: all columns but the last become
# the feature array, and column 8 (a Series) is converted to a NumPy array.
X = df.iloc[:, :-1].to_numpy()
y = df[8].to_numpy()

# 표준화 (Standardization, StandardScaler)

In [7]:
# Standardize every feature column with StandardScaler (imported in the
# notebook's import cell).
# NOTE(review): the scaler is fitted on all rows, before the train/test split,
# so test-set statistics influence the scaling. Consider fitting it on the
# training split only.
pima_std = StandardScaler().fit_transform(X)

In [8]:
# Summary statistics of the standardized features.
# A dedicated name is used so the raw `df` loaded earlier is not overwritten:
# cells that read `df[8]` keep working when they are re-run.
df_std = pd.DataFrame(pima_std)
df_std.describe()

Unnamed: 0,0,1,2,3,4,5,6,7
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,2.5442610000000002e-17,3.614007e-18,-1.3272440000000001e-17,7.994184000000001e-17,-3.556183e-17,2.295979e-16,2.462585e-16,1.8576e-16
std,1.000652,1.000652,1.000652,1.000652,1.000652,1.000652,1.000652,1.000652
min,-1.141852,-3.783654,-3.572597,-1.288212,-0.6928906,-4.060474,-1.189553,-1.041549
25%,-0.8448851,-0.6852363,-0.3673367,-1.288212,-0.6928906,-0.5955785,-0.6889685,-0.7862862
50%,-0.2509521,-0.1218877,0.1496408,0.1545332,-0.4280622,0.0009419788,-0.3001282,-0.3608474
75%,0.6399473,0.6057709,0.5632228,0.7190857,0.4120079,0.5847705,0.4662269,0.6602056
max,3.906578,2.444478,2.734528,4.921866,6.652839,4.455807,5.883565,4.063716


훈련/테스트 데이터셋 분리

In [9]:
# Stratified hold-out split of the standardized features: 20% of the rows go to
# the test set, class proportions follow y, and the seed is fixed.
X_train, X_test, y_train, y_test = train_test_split(
    pima_std,
    y,
    test_size=0.2,
    random_state=2021,
    stratify=y,
)

결정 트리 ( Decision Tree )

In [10]:
# Seeded decision tree plus the hyper-parameter grid to search:
# depths 2..6 and minimum split sizes 2..4.
dtc = DecisionTreeClassifier(random_state=2021)
params = {
    'max_depth': list(range(2, 7)),
    'min_samples_split': list(range(2, 5)),
}

In [11]:
# 5-fold cross-validated grid search over the tree grid, scored by accuracy;
# the cell displays the winning parameter combination.
grid_dt = GridSearchCV(
    estimator=dtc,
    param_grid=params,
    scoring='accuracy',
    cv=5,
)
grid_dt.fit(X=X_train, y=y_train)
grid_dt.best_params_

{'max_depth': 2, 'min_samples_split': 2}

In [12]:
# Best tree found by the grid search, scored on the held-out test split.
best_dt = grid_dt.best_estimator_
best_dt.score(X=X_test, y=y_test)

0.7337662337662337

서포트 벡터 머신 ( Support Vector Machine )

In [13]:
# Seeded support-vector classifier and a coarse grid for C spanning
# 0.01 to 100 in factors of ten.
svc = SVC(random_state=2021)
params = dict(C=[0.01, 0.1, 1, 10, 100])

In [14]:
# First pass: 5-fold accuracy grid search over the coarse C grid.
grid_sv = GridSearchCV(
    estimator=svc,
    param_grid=params,
    scoring='accuracy',
    cv=5,
)
grid_sv.fit(X=X_train, y=y_train)
grid_sv.best_params_

{'C': 1}

In [15]:
# Second pass: denser C grid from 0.25 to 7, around the coarse-search winner (C = 1).
params = dict(C=[0.25, 0.5, 0.75, 1, 2, 3, 4, 5, 6, 7])
grid_sv = GridSearchCV(
    estimator=svc,
    param_grid=params,
    scoring='accuracy',
    cv=5,
)
grid_sv.fit(X=X_train, y=y_train)
grid_sv.best_params_

{'C': 0.25}

In [16]:
# Third pass: C values in steps of 0.05 around the previous winner (C = 0.25).
params = dict(C=[0.1, 0.15, 0.2, 0.25, 0.3, 0.35])
grid_sv = GridSearchCV(
    estimator=svc,
    param_grid=params,
    scoring='accuracy',
    cv=5,
)
grid_sv.fit(X=X_train, y=y_train)
grid_sv.best_params_

{'C': 0.25}

In [17]:
# Final pass: C values in steps of 0.025 around C = 0.25. The grid search
# fitted here is the one used for the test-set evaluation.
params = dict(C=[0.2, 0.225, 0.25, 0.275, 0.3])
grid_sv = GridSearchCV(
    estimator=svc,
    param_grid=params,
    scoring='accuracy',
    cv=5,
)
grid_sv.fit(X=X_train, y=y_train)
grid_sv.best_params_

{'C': 0.25}

In [18]:
# Best SVC from the last grid search, scored on the held-out test split.
best_sv = grid_sv.best_estimator_
best_sv.score(X=X_test, y=y_test)

0.7662337662337663

로지스틱 회귀 ( Logistic Regression )

In [19]:
# Logistic regression with library defaults (no hyper-parameter search),
# trained on the standardized training split.
lrc = LogisticRegression()
lrc.fit(X=X_train, y=y_train)

LogisticRegression()

In [20]:
# Test-set score of the logistic-regression model.
lrc.score(X=X_test, y=y_test)

0.8051948051948052

# 정규화 (Min-Max Normalization, MinMaxScaler)

In [21]:
# Rescale every feature column with MinMaxScaler (imported in the notebook's
# import cell).
# NOTE(review): the scaler is fitted on all rows, before the train/test split,
# so test-set minima/maxima influence the scaling. Consider fitting it on the
# training split only.
pima_mm = MinMaxScaler().fit_transform(X)

In [22]:
# Summary statistics of the min-max scaled features.
df2 = pd.DataFrame(data=pima_mm)
df2.describe()

Unnamed: 0,0,1,2,3,4,5,6,7
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,0.22618,0.60751,0.566438,0.207439,0.094326,0.47679,0.168179,0.204015
std,0.19821,0.160666,0.158654,0.161134,0.136222,0.117499,0.141473,0.196004
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.058824,0.497487,0.508197,0.0,0.0,0.406855,0.070773,0.05
50%,0.176471,0.58794,0.590164,0.232323,0.036052,0.4769,0.125747,0.133333
75%,0.352941,0.704774,0.655738,0.323232,0.150414,0.545455,0.234095,0.333333
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


훈련/테스트 데이터셋 분리

In [23]:
# Stratified hold-out split of the min-max scaled features: 20% of the rows go
# to the test set, class proportions follow y, and the seed is fixed.
X_train2, X_test2, y_train2, y_test2 = train_test_split(
    pima_mm,
    y,
    test_size=0.2,
    random_state=2021,
    stratify=y,
)

결정 트리 ( Decision Tree )

In [24]:
# Seeded decision tree for the min-max scaled data, with the grid to search:
# depths 2..6 and minimum split sizes 2..4.
dtc2 = DecisionTreeClassifier(random_state=2021)
params2 = {
    'max_depth': list(range(2, 7)),
    'min_samples_split': list(range(2, 5)),
}

In [25]:
# 5-fold cross-validated grid search over the tree grid, scored by accuracy;
# the cell displays the winning parameter combination.
grid_dt2 = GridSearchCV(
    estimator=dtc2,
    param_grid=params2,
    scoring='accuracy',
    cv=5,
)
grid_dt2.fit(X=X_train2, y=y_train2)
grid_dt2.best_params_

{'max_depth': 2, 'min_samples_split': 2}

In [26]:
# Best tree found by the grid search, scored on the held-out test split.
best_dt2 = grid_dt2.best_estimator_
best_dt2.score(X=X_test2, y=y_test2)

0.7337662337662337

서포트 벡터 머신 ( Support Vector Machine )

In [27]:
# Seeded support-vector classifier for the min-max scaled data and a coarse
# grid for C spanning 0.01 to 100 in factors of ten.
svc2 = SVC(random_state=2021)
params2 = dict(C=[0.01, 0.1, 1, 10, 100])

In [28]:
# First pass: 5-fold accuracy grid search over the coarse C grid.
grid_sv2 = GridSearchCV(
    estimator=svc2,
    param_grid=params2,
    scoring='accuracy',
    cv=5,
)
grid_sv2.fit(X=X_train2, y=y_train2)
grid_sv2.best_params_

{'C': 1}

In [29]:
# Second pass: denser C grid from 0.25 to 7, around the coarse-search winner (C = 1).
params2 = dict(C=[0.25, 0.5, 0.75, 1, 2, 3, 4, 5, 6, 7])
grid_sv2 = GridSearchCV(
    estimator=svc2,
    param_grid=params2,
    scoring='accuracy',
    cv=5,
)
grid_sv2.fit(X=X_train2, y=y_train2)
grid_sv2.best_params_

{'C': 0.5}

In [30]:
# Third pass: C values between 0.3 and 0.7, around the previous winner (C = 0.5).
params2 = dict(C=[0.3, 0.35, 0.4, 0.5, 0.6, 0.7])
grid_sv2 = GridSearchCV(
    estimator=svc2,
    param_grid=params2,
    scoring='accuracy',
    cv=5,
)
grid_sv2.fit(X=X_train2, y=y_train2)
grid_sv2.best_params_

{'C': 0.6}

In [31]:
# Fourth pass: finer C values between 0.525 and 0.65, around the previous winner (C = 0.6).
params2 = dict(C=[0.525, 0.55, 0.6, 0.625, 0.65])
grid_sv2 = GridSearchCV(
    estimator=svc2,
    param_grid=params2,
    scoring='accuracy',
    cv=5,
)
grid_sv2.fit(X=X_train2, y=y_train2)
grid_sv2.best_params_

{'C': 0.525}

In [32]:
# Final pass: the previous winner (C = 0.525) sat on the lower edge of its grid,
# so this grid extends below it (0.45 to 0.55). The grid search fitted here is
# the one used for the test-set evaluation.
params2 = dict(C=[0.45, 0.475, 0.5, 0.525, 0.55])
grid_sv2 = GridSearchCV(
    estimator=svc2,
    param_grid=params2,
    scoring='accuracy',
    cv=5,
)
grid_sv2.fit(X=X_train2, y=y_train2)
grid_sv2.best_params_

{'C': 0.525}

In [33]:
# Best SVC from the last grid search, scored on the held-out test split.
best_sv2 = grid_sv2.best_estimator_
best_sv2.score(X=X_test2, y=y_test2)

0.7857142857142857

로지스틱 회귀 ( Logistic Regression )

In [34]:
# Logistic regression with library defaults (no hyper-parameter search),
# trained on the min-max scaled training split.
lrc2 = LogisticRegression()
lrc2.fit(X=X_train2, y=y_train2)

LogisticRegression()

In [35]:
# Test-set score of the logistic-regression model trained on min-max scaled data.
lrc2.score(X=X_test2, y=y_test2)

0.7922077922077922

# 결과
- Decision Tree 정확도 
    - 표준화 (StandardScaler) 0.7337662337662337 
    - 정규화 (MinMaxScaler) 0.7337662337662337
- Support Vector Machine 정확도 
    - 표준화 (StandardScaler) 0.7662337662337663 
    - 정규화 (MinMaxScaler) 0.7857142857142857
- Logistic Regression 정확도 
    - 표준화 (StandardScaler) 0.8051948051948052 
    - 정규화 (MinMaxScaler) 0.7922077922077922

- Decision Tree 정확도 : 표준화 (StandardScaler) 와 정규화 (MinMaxScaler) 모두 같음
- Support Vector Machine 정확도 : 정규화 (MinMaxScaler) 가 더 높음
- Logistic Regression 정확도 : 표준화 (StandardScaler) 가 더 높음