In [None]:
from sklearn.datasets import load_diabetes
import pandas as pd
load_diabetes = load_diabetes()
load_diabetes.keys()


In [None]:
df = pd.DataFrame(data = load_diabetes.data, columns = load_diabetes.feature_names)
df.head()

# 머신러닝 프로세스
1. 데이터 수집
2. 데이터 전처리
3. 데이터 탐색

4. 모델 선택
5. 모델 학습
6. 모델 평가
7. 모델 개선
8. 모델 배포

# 데이터 수집

In [None]:
from sklearn.datasets import load_iris
# NOTE(review): this rebinds `load_iris` from the loader function to the object it
# returns. Later cells read `.data`, `.target`, `.target_names` and `.feature_names`
# through this name, so it cannot be renamed in this cell alone.
load_iris = load_iris()
iris_data = load_iris.data  # feature values, used as X in the cells below
iris_data # X

In [None]:
# Target values of the loaded dataset; used as y in the cells below.
iris_label = load_iris.target # y
iris_label

In [None]:
# Display the class names and the feature names together as a tuple.
load_iris.target_names, load_iris.feature_names

## 데이터 분할

In [8]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing (80/20 split) with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    iris_data,       # X
    iris_label,      # y
    test_size=0.2,
    random_state=42,
)

## 모델 생성

In [None]:
from sklearn.tree import DecisionTreeClassifier
# Seed the tree so the fitted model, its accuracy and the plotted tree are the same
# on every run; the other classifiers in this notebook are seeded as well.
dt_clf = DecisionTreeClassifier(random_state=42)
dt_clf

## 모델 학습

In [None]:
# Train the decision tree on the training split.
dt_clf.fit(X_train, y_train)

In [None]:
# Predict labels for the held-out test rows and display them.
pred = dt_clf.predict(X_test)
pred

## 평가

In [None]:
from sklearn.metrics import accuracy_score
# Compare the predictions against the true test labels.
accuracy_score(y_test, pred) # accuracy

# 의사결정 나무 시각화

In [87]:
import graphviz
from sklearn.tree import export_graphviz

In [None]:
# Write the fitted tree to a Graphviz DOT file, then load it back for inline rendering.
export_graphviz(
    dt_clf,
    out_file="tree.dot",
    class_names=load_iris.target_names,
)
# export_graphviz writes the file as UTF-8; read it back with the same encoding
# instead of depending on the platform's default locale encoding.
with open("tree.dot", encoding="utf-8") as f:
    dot_graph = f.read()
graphviz.Source(dot_graph)

# 교차검증

In [None]:
from sklearn.model_selection import KFold
# 5-fold splitter; no shuffle argument is passed, so the library default applies.
kfold = KFold(n_splits=5)
# Seeded classifier that is refit on each fold in the next cell.
iris_clf = DecisionTreeClassifier(random_state=156)
# Display the feature and label arrays that will be split.
iris_data, iris_label

In [None]:
# Manual k-fold cross-validation: refit the classifier on each fold and print its accuracy.
for train_index, test_index in kfold.split(iris_data):
    # Index features and labels with the same arrays so rows stay aligned.
    X_train, y_train = iris_data[train_index], iris_label[train_index]
    X_test, y_test = iris_data[test_index], iris_label[test_index]

    # fit() returns the estimator itself, so training and prediction can be chained.
    pred = iris_clf.fit(X_train, y_train).predict(X_test)
    accuracy = accuracy_score(y_test, pred)
    print(f"Accuracy: {accuracy:.4f}")

# stratified split

In [None]:
from sklearn.datasets import load_iris
# Load the dataset again under the name `iris`.
iris = load_iris()
# Features as a DataFrame, with the target appended as a `label` column.
iris_df = pd.DataFrame(data = iris.data, columns=iris.feature_names )
iris_df['label']  = iris.target
# Number of rows per class label.
iris_df['label'].value_counts()

In [None]:
from sklearn.model_selection import StratifiedKFold
skf = StratifiedKFold(n_splits=5)
# For every stratified fold, print the class distribution of the train and test labels.
for train_index, test_index in skf.split(iris_df, iris_df['label']):
    y_train, y_test = (
        iris_df['label'].iloc[rows] for rows in (train_index, test_index)
    )
    print(f'train 분포: {y_train.value_counts()}')
    print(f'test 분포: {y_test.value_counts()}')

In [None]:
from sklearn.model_selection import StratifiedKFold
skf = StratifiedKFold(n_splits=5)

# Stratified 5-fold cross-validation with a fresh, seeded tree for every fold.
for train_index, test_index in skf.split(iris_df, iris_df['label']):
    train_data, test_data = iris_df.iloc[train_index], iris_df.iloc[test_index]

    # Keep only the feature columns in X; the `label` column is the target.
    X_train, y_train = train_data[iris.feature_names], train_data['label']
    X_test, y_test = test_data[iris.feature_names], test_data['label']

    model = DecisionTreeClassifier(random_state=156).fit(X_train, y_train)
    pred = model.predict(X_test)
    acc = accuracy_score(y_test, pred)
    print(f'acc : {acc}')

# cross_val_score()의 적용

In [130]:
from sklearn.model_selection import cross_val_score, cross_validate

In [None]:
# Features and target taken directly from the loaded iris dataset.
X = iris.data
y = iris.target
model = DecisionTreeClassifier(random_state=156)
# Cross-validated accuracy with cv given as an integer (5).
cross_val_score(model,X,y, cv = 5, scoring='accuracy')

In [None]:
# Same scoring call, this time passing an explicit StratifiedKFold splitter as cv.
skf = StratifiedKFold(n_splits=5)
cross_val_score(model,X,y,cv=skf, scoring='accuracy')

In [None]:
# cross_validate returns a mapping of results; show only the `test_score` entry.
results = cross_validate(model, X,y, cv=skf, scoring='accuracy')
results['test_score']

# GridSearchCV - 하이퍼파라미터 + 교차검증

In [118]:
# Fresh 80/20 split of the iris arrays for the grid search (seed 121).
X_train, X_test, y_train, y_test = train_test_split(iris_data, iris_label, test_size=0.2, random_state=121)

In [None]:
from sklearn.model_selection import GridSearchCV
dt_clf2 = DecisionTreeClassifier(random_state=121)
# Candidate values for each hyperparameter; every combination is evaluated.
params = {
    'max_depth': [1, 2, 3],
    'min_samples_split': [2, 3],
}
# 3-fold search; refit=True requests a final refit so best_estimator_ is available.
grid_dtree = GridSearchCV(
    dt_clf2,
    param_grid=params,
    cv=3,
    refit=True,
)
grid_dtree.fit(X_train, y_train)

In [None]:
# Tabulate the grid-search results for inspection.
pd.DataFrame(grid_dtree.cv_results_)

In [None]:
# Evaluate the estimator selected by the grid search on the held-out test split.
b_model = grid_dtree.best_estimator_ # best model found by the search
pred = b_model.predict(X_test)
accuracy_score(y_test, pred)

In [None]:
grid_dtree.best_params_  # parameter combination with the best score

# titanic dataset 으로 분류 모델 만들기

In [None]:
import pandas as pd
# Load the Titanic CSV from the relative ./data directory.
# NOTE(review): `df` was bound to a different DataFrame earlier in the notebook; this rebinds it.
df = pd.read_csv('./data/titanic.csv')
df

In [None]:
# Missing-value count per column, followed by the frame's info summary.
print(df.isnull().sum())
df.info()

In [137]:
# First attempt: only two columns as features, `Survived` as the target.
X = df[['Pclass', 'SibSp']]
y = df['Survived']

# Split into train and test sets (test_size=0.2)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Create the model
from sklearn.tree import DecisionTreeClassifier
# Seeded so that reruns produce the same tree and the same accuracy.
# The bare `dt_clf` expression that used to follow was dropped: only the last
# expression of a cell is displayed, so it had no effect.
dt_clf = DecisionTreeClassifier(random_state=42)

# Train the model
dt_clf.fit(X_train, y_train)

# Predict on the held-out test split
pred = dt_clf.predict(X_test)
pred

In [None]:
# Evaluate the model
from sklearn.metrics import accuracy_score
accuracy_score(y_test, pred) # accuracy

In [None]:
# Same missing-value / info check as run earlier on this frame.
print(df.isnull().sum())
df.info()

In [None]:
# Sex: male=1, female=0
# Only encode while the column is still non-numeric. Re-running an unguarded
# .map() on the already-encoded 0/1 values would turn every entry into NaN.
if not pd.api.types.is_numeric_dtype(df['Sex']):
    df['Sex'] = df['Sex'].map({'male': 1, 'female': 0})
df.head()

In [None]:
# Fill missing Age values with the column mean.
# Assign the result back to the column: calling fillna(..., inplace=True) on the
# selected column is chained assignment, which warns on pandas 2.x and does not
# update `df` when Copy-on-Write is active.
df['Age'] = df['Age'].fillna(df['Age'].mean())

In [None]:
# Missing-value count per column after the Age fill.
print(df.isnull().sum())

In [145]:
# Feature set now also includes Sex and Age alongside Pclass and SibSp.
X = df[['Pclass','Sex', 'Age', 'SibSp']]
y = df['Survived']

# Split into train and test sets (test_size=0.2)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Create the model
from sklearn.tree import DecisionTreeClassifier
# Seeded so that reruns produce the same tree and the same accuracy.
# The bare `dt_clf` expression that used to follow was dropped: only the last
# expression of a cell is displayed, so it had no effect.
dt_clf = DecisionTreeClassifier(random_state=42)

# Train the model
dt_clf.fit(X_train, y_train)

# Predict on the held-out test split
pred = dt_clf.predict(X_test)
pred

In [None]:
# Evaluate the model
from sklearn.metrics import accuracy_score
accuracy_score(y_test, pred) # accuracy

In [175]:
# Re-split the current X / y 80/20 with seed 121 for the grid search below.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=121)

In [None]:
from sklearn.model_selection import GridSearchCV
dt_clf2 = DecisionTreeClassifier(random_state=121)
# Candidate values for each hyperparameter; every combination is evaluated.
params = {
    'max_depth': [1, 2, 3],
    'min_samples_split': [2, 3],
}
# 3-fold search; refit=True requests a final refit so best_estimator_ is available.
grid_dtree = GridSearchCV(
    dt_clf2,
    param_grid=params,
    cv=3,
    refit=True,
)
grid_dtree.fit(X_train, y_train)

In [None]:
# Tabulate the grid-search results for inspection.
pd.DataFrame(grid_dtree.cv_results_)

In [None]:
# Score the estimator selected by the grid search on the held-out test split.
b_model = grid_dtree.best_estimator_
pred = b_model.predict(X_test)
accuracy_score(y_test, pred)

# 피처 스케일링

In [None]:
from sklearn.datasets import load_iris
# Reload iris and rebuild the feature-only DataFrame (no label column this time).
iris = load_iris()
iris_df = pd.DataFrame(data=iris.data, columns=iris.feature_names)
# Per-column mean of the raw features.
iris_df.mean()

In [None]:
# Per-column variance of the raw features.
iris_df.var()

In [None]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(iris_df)
iris_scaled = scaler.transform(iris_df)
# Report the statistics per feature (axis=0) so they line up with the per-column
# mean / variance shown for the raw DataFrame; a global mean()/var() would collapse
# all features into a single number.
iris_scaled.mean(axis=0), iris_scaled.var(axis=0)

# LogisticRegression 모델

- 스케일링 안 한 데이터
- 스케일링 한 데이터

In [186]:
from sklearn.linear_model import LogisticRegression
# 80/20 split of the raw iris features (seed 11), shared by both models below.
X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target, test_size=0.2, random_state=11)

In [None]:
# Baseline: logistic regression trained on the unscaled features.
no_scaling_model = LogisticRegression(random_state=11)
no_scaling_model.fit(X_train, y_train)
pred_1 = no_scaling_model.predict(X_test)
acc_1 = accuracy_score(y_test, pred_1)
acc_1

In [190]:
from sklearn.preprocessing import StandardScaler
# Fit the scaler on the training rows only, then apply that same fitted scaler
# to both the training and the test rows.
scaler1 = StandardScaler().fit(X_train)
X_train_scaled, X_test_scaled = scaler1.transform(X_train), scaler1.transform(X_test)

In [None]:
# Same model configuration, trained on the standardized features.
scaling_model = LogisticRegression(random_state=11)
scaling_model.fit(X_train_scaled, y_train)
pred_2 = scaling_model.predict(X_test_scaled)
acc_2 = accuracy_score(y_test, pred_2)
# Show scaled vs. unscaled accuracy side by side.
acc_2, acc_1

# wine 데이터로 스케일링하기

In [3]:
from sklearn.datasets import load_wine
import pandas as pd

In [5]:
# Load the wine dataset and view its features as a DataFrame.
wine = load_wine()
wine_df = pd.DataFrame(data=wine.data, columns=wine.feature_names)
wine_df.head(2)

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline
0,14.23,1.71,2.43,15.6,127.0,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065.0
1,13.2,1.78,2.14,11.2,100.0,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050.0


In [7]:
# Append the class label as a `target` column, then print the frame summary.
wine_df['target'] = wine.target
wine_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 178 entries, 0 to 177
Data columns (total 14 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   alcohol                       178 non-null    float64
 1   malic_acid                    178 non-null    float64
 2   ash                           178 non-null    float64
 3   alcalinity_of_ash             178 non-null    float64
 4   magnesium                     178 non-null    float64
 5   total_phenols                 178 non-null    float64
 6   flavanoids                    178 non-null    float64
 7   nonflavanoid_phenols          178 non-null    float64
 8   proanthocyanins               178 non-null    float64
 9   color_intensity               178 non-null    float64
 10  hue                           178 non-null    float64
 11  od280/od315_of_diluted_wines  178 non-null    float64
 12  proline                       178 non-null    float64
 13  targe

In [12]:
# Values present in target and how many rows each has, ordered by class id
wine_df['target'].value_counts().sort_index()

target
0    59
1    71
2    48
Name: count, dtype: int64

In [25]:
# Features and target taken directly from the loaded wine dataset.
X = wine.data
y = wine.target

In [None]:
# Logistic regression setup: 80/20 split, random_state=11
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=11)

In [30]:
from sklearn.metrics import accuracy_score
# Baseline: logistic regression trained on the unscaled wine features.
no_scaling_model = LogisticRegression(max_iter=3000) # max_iter raised so the solver can converge
no_scaling_model.fit(X_train, y_train)
pred_1 = no_scaling_model.predict(X_test)
acc_1 = accuracy_score(y_test, pred_1)
acc_1

0.9722222222222222

# StandardScaling 후 성능

In [28]:
from sklearn.preprocessing import StandardScaler
# Fit the scaler on the training rows only, then apply that same fitted scaler
# to both the training and the test rows.
scaler1 = StandardScaler().fit(X_train)
X_train_scaled, X_test_scaled = scaler1.transform(X_train), scaler1.transform(X_test)

In [29]:
# Logistic regression trained on the standardized wine features.
scaling_model = LogisticRegression(random_state=11)
scaling_model.fit(X_train_scaled, y_train)
pred_2 = scaling_model.predict(X_test_scaled)
acc_2 = accuracy_score(y_test, pred_2)
# Show scaled vs. unscaled accuracy side by side.
acc_2, acc_1

(1.0, 0.9722222222222222)

In [33]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

# 알고리즘에 따른 스케일링의 효과 확인

In [83]:
# Fit each classifier on the unscaled split and print its test accuracy.
models = {
    'KNN': KNeighborsClassifier(n_neighbors=5),
    # Seed the forest so reruns give the same score; without a seed its accuracy
    # moves between runs and that noise can be mistaken for a scaling effect.
    'RF': RandomForestClassifier(random_state=11),
    'LR': LogisticRegression(max_iter=3000)
}
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    print(f'{name} 모델의 정확도: {acc:.5f}')

KNN 모델의 정확도: 0.75000
RF 모델의 정확도: 1.00000
LR 모델의 정확도: 0.97222


# 스케일링 이후 데이터로 성능 비교

In [84]:
# Fit each classifier on the standardized split and print its test accuracy.
models = {
    'KNN': KNeighborsClassifier(n_neighbors=5),
    # Seed the forest so reruns give the same score; without a seed its accuracy
    # moves between runs and that noise can be mistaken for a scaling effect.
    'RF': RandomForestClassifier(random_state=11),
    'LR': LogisticRegression(max_iter=3000)
}
for name, model in models.items():
    model.fit(X_train_scaled, y_train)
    y_pred = model.predict(X_test_scaled)
    acc = accuracy_score(y_test, y_pred)
    print(f'{name} 모델의 정확도: {acc:.5f}')

KNN 모델의 정확도: 1.00000
RF 모델의 정확도: 0.97222
LR 모델의 정확도: 1.00000
