In [4]:
import seaborn as sns

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

from sklearn.metrics import accuracy_score, r2_score, precision_score, recall_score, roc_auc_score
# Import the five evaluation metrics used below from sklearn.metrics

In [5]:
# Load seaborn's "penguins" example dataset; the cells below treat it as a DataFrame.
penguins = sns.load_dataset("penguins")

In [6]:
# Drop every row that is missing any of the four measurements used as model
# features. Checking bill_length_mm alone only works while the same rows happen
# to be missing all four columns. The result is reassigned instead of using
# inplace=True so the frame is not mutated in place.
penguins = penguins.dropna(
    subset=["bill_length_mm", "bill_depth_mm", "flipper_length_mm", "body_mass_g"]
)
# Show the remaining missing-value count per column as the cell output.
penguins.isna().sum()

species              0
island               0
bill_length_mm       0
bill_depth_mm        0
flipper_length_mm    0
body_mass_g          0
sex                  9
dtype: int64

In [7]:
# Data preparation (exploratory data analysis, EDA)

# Features: the four numeric body measurements.
feature_columns = ["bill_length_mm", "bill_depth_mm", "flipper_length_mm", "body_mass_g"]
x = penguins[feature_columns]

# Target: species as an integer code (Adelie -> 0, Gentoo -> 1, anything else -> 2).
species_codes = {"Adelie": 0, "Gentoo": 1}
y = penguins["species"].map(lambda name: species_codes.get(name, 2))

In [8]:
#  Build and train models that predict the penguin species using
#  naive Bayes, kNN (k=3), and a decision tree (max depth 3).

# For each model, print the accuracy, R2 score, precision score,
# recall score, and ROC AUC score.


In [9]:
# Sanity check: (rows, columns) of the feature frame.
x.shape

(342, 4)

In [10]:
# Sanity check: the target must have the same number of rows as x.
y.shape

(342,)

In [24]:
#  1-1 Scaling
#  The features have different distributions (scales), so standardize them.
#  NOTE(review): the scaler is fit on every row of x, so any test split taken
#  from x_scale has already influenced the scaling statistics (data leakage).
x_scale = StandardScaler().fit_transform(x)

In [30]:
# 2. Data split - hold out 30% of the rows as a validation ("mock exam") set.
# Split the raw features first and fit the scaler on the training rows only;
# scaling before the split would let test-set statistics leak into training.
x_train_raw, x_test_raw, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=1)
scaler = StandardScaler().fit(x_train_raw)
x_train = scaler.transform(x_train_raw)
x_test = scaler.transform(x_test_raw)

In [35]:
# The three models are kept in a list so the same train/evaluate loop can run over all of them.
# random_state is fixed on the decision tree because scikit-learn permutes the
# candidate features at each split, so ties between equally good splits would
# otherwise make the fitted tree differ between runs.
model_list = [
    GaussianNB(),
    KNeighborsClassifier(n_neighbors=3),
    DecisionTreeClassifier(max_depth=3, random_state=1),
]
# Asking too many questions (i.e. growing a deep tree) can cause overfitting.
# Overfitting: the model fits the training data so closely that it predicts new data poorly.

In [40]:
for model in model_list:
    # 4. Train
    model.fit(x_train, y_train)

    # 5. Evaluate: hard class predictions plus per-class probabilities (needed for ROC AUC).
    predict = model.predict(x_test)
    proba = model.predict_proba(x_test)

    # Each metric is wrapped in a lambda so it is computed right before its own
    # line is printed, in the order listed here.
    # Note: R2 treats the integer species codes as numeric values, so it depends
    # on the arbitrary code assigned to each species.
    report = (
        ("accuracy", lambda: accuracy_score(y_test, predict)),
        ("r2 score", lambda: r2_score(y_test, predict)),
        ("precision score", lambda: precision_score(y_test, predict, average=None)),
        ("recall score", lambda: recall_score(y_test, predict, average=None)),
        ("roc auc score", lambda: roc_auc_score(y_test, proba, multi_class='ovo')),
    )

    print(f"**{type(model).__name__}**")
    for label, compute in report:
        print(f"{label} : {compute()}")

**GaussianNB**
accuracy : 0.9611650485436893
r2 score : 0.7138888888888888
precision score : [0.93617021 1.         0.9375    ]
recall score : [0.97777778 1.         0.83333333]
roc auc score : 0.9921810699588477
**KNeighborsClassifier**
accuracy : 0.9902912621359223
r2 score : 0.9284722222222223
precision score : [0.97826087 1.         1.        ]
recall score : [1.         1.         0.94444444]
roc auc score : 0.9997942386831276
**DecisionTreeClassifier**
accuracy : 0.9611650485436893
r2 score : 0.7675347222222222
precision score : [0.93617021 1.         0.94117647]
recall score : [0.97777778 0.975      0.88888889]
roc auc score : 0.9951748971193416
