### 목표 : 생선 분류 모델
- 데이터   : fish.csv
- feature  : 5개 - Weight, Length, Diagonal, Height, Width
- target   : 1개 - Species
- 방법     : 지도학습 + 다중분류

1. 모듈 로딩, 데이터 준비
<hr>

In [4]:
import pandas as pd
import numpy as np


In [5]:
# Prepare the data: load the fish measurements CSV into a DataFrame.
# NOTE: the path is relative, so it resolves against the current working directory.
data = '../data/fish.csv'
fish = pd.read_csv(data)

# Preview the first two rows to sanity-check the parsed columns.
fish.head(2)

Unnamed: 0,Species,Weight,Length,Diagonal,Height,Width
0,Bream,242.0,25.4,30.0,11.52,4.02
1,Bream,290.0,26.3,31.2,12.48,4.3056


2. 학습 데이터 준비

2 - 1. feature / target 분리

In [6]:
# Split by column position: the first column is the label,
# every remaining column is used as an input feature.
target = fish.iloc[:, 0]
feature = fish.iloc[:, 1:]


In [7]:
# Report the shapes of the feature table and the target column, one per line.
print(f'feature: {feature.shape}', f'target: {target.shape}', sep='\n')

feature: (159, 5)
target: (159,)


In [8]:
from sklearn.preprocessing import LabelEncoder

# Fit a label encoder on the target values (fit() returns the encoder itself,
# so construction and fitting can be chained), then display the integer code
# assigned to every row.
encoder = LabelEncoder().fit(target)
encoder.transform(target)

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 4, 4, 4,
       4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 6, 6, 6, 6, 6, 6, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 5, 5, 5, 5, 5, 5, 5, 5, 5,
       5, 5, 5, 5, 5])

# target class quantity check

In [9]:
# Number of distinct values in the target column, i.e. how many classes there are.
target.nunique()

7

In [10]:
# Share of rows belonging to each target class, expressed as a percentage.
target.value_counts().div(target.shape[0]).mul(100)

Species
Perch        35.220126
Bream        22.012579
Roach        12.578616
Pike         10.691824
Smelt         8.805031
Parkki        6.918239
Whitefish     3.773585
Name: count, dtype: float64

2 - 2. 학습 / 테스트용 데이터셋 준비

In [11]:
from sklearn.model_selection import train_test_split

In [12]:
# Split into train/test sets. stratify=target keeps the class proportions
# the same in both splits; random_state fixes the shuffle for reproducibility.
xtrain, xtest, ytrain, ytest = train_test_split(
    feature,
    target,
    random_state=11,
    stratify=target,
)

In [13]:
# Report the shapes of both splits, one per line.
print(
    f'[train dataset] {xtrain.shape}, {ytrain.shape}',
    f'[test dataset] {xtest.shape}, {ytest.shape}',
    sep='\n',
)

[train dataset] (119, 5), (119,)
[test dataset] (40, 5), (40,)


3. 학습 진행

In [14]:
from sklearn.linear_model import LogisticRegression

In [15]:
# Create the classifier and fit it on the training split.
#   max_iter : upper bound on the number of solver iterations
#   solver   : optimisation algorithm used to fit the model
#   tol      : (not set here) stopping tolerance that lets the solver finish
#              before max_iter is used up
# NOTE(review): newer scikit-learn releases reportedly deprecate 'liblinear'
# for multiclass targets -- confirm against the installed version.
md = LogisticRegression(
    solver='liblinear',
    max_iter=20000,
)
md.fit(xtrain, ytrain)

In [17]:
# Show the class labels stored on the fitted model.
print('labels', md.classes_)
# Decode the integer code 1 back to its original label via the fitted encoder;
# the result is stored in `labels` but not displayed in this cell.
labels = encoder.inverse_transform([1])

labels ['Bream' 'Parkki' 'Perch' 'Pike' 'Roach' 'Smelt' 'Whitefish']


In [None]:
# Inspect the fitted model's attributes.
print(f'classes_ : {md.classes_}')
print(f'feature_names_in : {md.feature_names_in_}')
print(f'max_iter : {md.max_iter}')
# len(...) shows how many rows coef_ has / how many entries intercept_ has,
# followed by the values themselves.
print(f'coef : {len(md.coef_)},\n {md.coef_}')
print(f'intercept :{len(md.intercept_)}, \n{md.intercept_}')

4. 평가

In [None]:
# Compare the model's score on the training split with its score on the test split.
print(f' [ Train score ] : {md.score(xtrain, ytrain)} \n [ Test score ] : {md.score(xtest, ytest)}')

5. 모델 활용

In [None]:
# Predict the class of the first test row (kept as a one-row DataFrame)
# and display the prediction next to the corresponding true label.
ypred = md.predict(xtest.iloc[:1])

ypred, ytest.iloc[:1]

In [None]:
# Per-class probabilities for the first test row (passed as a one-row DataFrame).
print(md.predict_proba(xtest.iloc[[0]]))

In [None]:
# Class probabilities for the first five test rows, rounded to 3 decimals,
# displayed together with their true labels. The class labels are printed
# first for reference.
print(md.classes_)
md.predict_proba(xtest.iloc[:5]).round(3), ytest.iloc[:5].tolist()

In [None]:
# Column index of the highest probability for each of the first five test rows.
result = np.argmax(md.predict_proba(xtest.iloc[:5]), axis=1)
result

In [None]:
# Pair the predicted class label (looked up from the argmax indices in `result`)
# with the true label for the first five test rows.
data = {
    'pre y': list(md.classes_[result]),
    'true y': ytest.iloc[:5].tolist(),
}

In [None]:
# Display the contents of `data` as a table, one column per key.
pd.DataFrame(data)