In [1]:
import pandas as pd

# Location of the fish measurements CSV file.
FISH_CSV_URL = "https://bit.ly/fish_csv_data"

fish = pd.read_csv(FISH_CSV_URL)
fish.head(5)  # show the first five rows of the DataFrame

Unnamed: 0,Species,Weight,Length,Diagonal,Height,Width
0,Bream,242.0,25.4,30.0,11.52,4.02
1,Bream,290.0,26.3,31.2,12.48,4.3056
2,Bream,340.0,26.5,31.1,12.3778,4.6961
3,Bream,363.0,29.0,33.5,12.73,4.4555
4,Bream,430.0,29.0,34.0,12.444,5.134


In [2]:
# pd.unique: the distinct values of the given column, duplicates removed
species_names = pd.unique(fish['Species'])
print(species_names)

['Bream' 'Roach' 'Whitefish' 'Parkki' 'Perch' 'Pike' 'Smelt']


In [3]:
# 'Species' is the target (label) column; the other columns are the training features
import numpy as np
feature_columns = ['Weight', 'Length', 'Diagonal', 'Height', 'Width']
fish_input = fish[feature_columns].to_numpy()
print(fish_input[:5])  # preview the first five feature rows

[[242.      25.4     30.      11.52     4.02  ]
 [290.      26.3     31.2     12.48     4.3056]
 [340.      26.5     31.1     12.3778   4.6961]
 [363.      29.      33.5     12.73     4.4555]
 [430.      29.      34.      12.444    5.134 ]]


In [4]:
# Target (label) data: the 'Species' column as a NumPy array
fish_target = fish.loc[:, 'Species'].to_numpy()

In [5]:
from sklearn.model_selection import train_test_split  # splits data into a training set and a test set

# Fix the shuffle seed so that the split -- and every score printed further
# down -- is the same on each Restart & Run All. Without random_state the
# split is different on every execution.
# NOTE: the outputs recorded in this notebook came from an unseeded run, so
# they will differ from what a re-execution prints.
train_input, test_input, train_target, test_target = train_test_split(
    fish_input, fish_target, random_state=42
)

In [6]:
# Preprocessing: convert the features to standard scores
from sklearn.preprocessing import StandardScaler
ss = StandardScaler().fit(train_input)  # the scaler is fitted on the training set only
# Apply the same fitted scaler to the training set and then to the test set
train_scaled, test_scaled = (ss.transform(split) for split in (train_input, test_input))
print(train_scaled[:5])

[[-0.63884753 -0.38907831 -0.48984986 -0.67688657 -0.22434665]
 [ 1.29024871  0.86938877  0.7680652   0.47868835  1.83378305]
 [ 0.77984771  0.58764241  0.77680072  1.83751676  0.79542171]
 [-0.95990623 -1.0089203  -1.0838653  -0.91054727 -0.97616098]
 [-1.17943354 -0.86804712 -0.85674175 -0.69855549 -0.78555126]]


In [7]:
# k-nearest neighbors classifier: classification and probability prediction
from sklearn.neighbors import KNeighborsClassifier

kn = KNeighborsClassifier(n_neighbors=3).fit(train_scaled, train_target)  # train

# Accuracy on the training set first, then on the test set
for features, labels in ((train_scaled, train_target), (test_scaled, test_target)):
    print(kn.score(features, labels))

0.8739495798319328
0.825


In [8]:
# Class labels known to the classifier, listed in alphabetical order
knn_classes = kn.classes_
print(knn_classes)

['Bream' 'Parkki' 'Perch' 'Pike' 'Roach' 'Smelt' 'Whitefish']


In [9]:
# Predicted species for the first five test samples
knn_predictions = kn.predict(test_scaled[:5])
print(knn_predictions)

['Bream' 'Bream' 'Whitefish' 'Smelt' 'Smelt']


In [10]:
# Probabilities behind the classification: kn.predict_proba(...) returns them per class
proba = kn.predict_proba(test_scaled[:5])
print(proba.round(decimals=3))

[[0.667 0.333 0.    0.    0.    0.    0.   ]
 [1.    0.    0.    0.    0.    0.    0.   ]
 [0.    0.    0.    0.    0.333 0.    0.667]
 [0.    0.    0.    0.    0.    1.    0.   ]
 [0.    0.    0.    0.    0.    1.    0.   ]]


In [11]:
# Look up the nearest training neighbors of the fourth test sample and show their labels.
# Reading neighbor classes as probabilities is somewhat awkward, which is why
# logistic regression is used next.
neighbors = kn.kneighbors(test_scaled[3:4])
distances, indexes = neighbors
print(train_target[indexes])

[['Smelt' 'Smelt' 'Smelt']]


In [12]:
# Logistic regression -- binary classification (Bream vs. Smelt)
is_bream = train_target == 'Bream'
is_smelt = train_target == 'Smelt'
bream_smelt_indexes = is_bream | is_smelt  # boolean mask over the training rows
train_bream_smelt = train_scaled[bream_smelt_indexes]
target_bream_smelt = train_target[bream_smelt_indexes]
print(target_bream_smelt[:5])

['Bream' 'Bream' 'Bream' 'Smelt' 'Bream']


In [13]:
from sklearn.linear_model import LogisticRegression

# Binary model trained only on the Bream/Smelt subset of the training data
lr = LogisticRegression()
lr.fit(X=train_bream_smelt, y=target_bream_smelt)  # train

In [14]:
# Predictions for the first five Bream/Smelt training samples
binary_predictions = lr.predict(train_bream_smelt[:5])
print(binary_predictions)

['Bream' 'Bream' 'Bream' 'Smelt' 'Bream']


In [29]:
# Class probabilities via predict_proba
# NOTE(review): the recorded output of this cell has seven columns, so it appears
# to have been produced while `lr` was bound to the seven-class model rather than
# the binary one. Re-run the notebook top to bottom to refresh it.
binary_proba = lr.predict_proba(train_bream_smelt[:5])
print(binary_proba.round(decimals=3))

[[0.999 0.    0.    0.    0.    0.    0.001]
 [0.998 0.    0.    0.    0.    0.    0.001]
 [1.    0.    0.    0.    0.    0.    0.   ]
 [0.    0.003 0.05  0.    0.002 0.944 0.   ]
 [0.984 0.003 0.    0.    0.    0.    0.013]]


In [30]:
# For the binary model the entries are [negative class, positive class]
# NOTE(review): the recorded output lists seven classes, i.e. it looks stale
# (produced with the multiclass model) -- re-run top to bottom to refresh it.
lr_classes = lr.classes_
print(lr_classes)

['Bream' 'Parkki' 'Perch' 'Pike' 'Roach' 'Smelt' 'Whitefish']


In [31]:
# Learned weights (slopes) and intercept
# NOTE(review): the recorded output is identical to the multiclass model's
# coefficients printed later in the notebook, so it looks stale.
weights, bias = lr.coef_, lr.intercept_
print(weights, bias)

[[-1.03881759 -1.08996043  1.56174571  7.17907078 -0.36274998]
 [-0.40459288 -1.82276695 -2.49335844  5.77405923 -2.42788968]
 [ 3.92120043  5.01721555 -7.38947659 -5.98443065  3.91978161]
 [ 0.14721792  3.5204334   3.67712744 -3.13079715 -1.75500921]
 [-3.40879264 -3.8954388   4.06954957 -0.57686294  2.06554291]
 [-1.17588421  0.67822368  0.82257148 -4.36595298 -3.47910424]
 [ 1.95966897 -2.40770645 -0.24815918  1.10491372  2.03942859]] [ 0.32935567 -0.84169741  2.8376061   0.12592264  1.53665929 -6.64380779
  2.65596151]


In [32]:
# decision_function evaluates the equation built from the learned weights and
# intercept and returns its raw result for each sample.
# NOTE(review): the recorded output has seven values per sample, so it looks
# stale (produced with the multiclass model).
first_five_bream_smelt = train_bream_smelt[:5]
decisions = lr.decision_function(first_five_bream_smelt)
# These raw values still have to be mapped into the 0-1 range; that mapping is the sigmoid function.
print(decisions)

[[ 12.99501804   4.51353842  -4.77488903  -1.98298327   0.33340647
  -17.3131581    6.22906747]
 [ 12.55976563   4.91792119  -4.98403809  -2.1305062    0.17209288
  -16.52109679   5.98586138]
 [ 16.40386634   4.73572312  -4.46293852  -2.28577366  -0.72248848
  -21.94106335   8.27267455]
 [-11.27275409   1.9177728    4.58017972  -2.75610403   1.58178581
    7.52629558  -1.57717578]
 [ 10.03176435   4.19310018  -3.17305612  -2.07023646  -0.0738382
  -14.6168423    5.70910855]]


In [33]:
# Map the decision values into the 0-1 range with the sigmoid function
from scipy.special import expit  # expit is the sigmoid function
sigmoid_values = expit(decisions)
print(sigmoid_values.round(decimals=3))

[[1.    0.989 0.008 0.121 0.583 0.    0.998]
 [1.    0.993 0.007 0.106 0.543 0.    0.997]
 [1.    0.991 0.011 0.092 0.327 0.    1.   ]
 [0.    0.872 0.99  0.06  0.829 0.999 0.171]
 [1.    0.985 0.04  0.112 0.482 0.    0.997]]


In [34]:
# Multiclass classification with logistic regression; probabilities come from the softmax function.
# NOTE: this rebinds `lr`, replacing the binary Bream/Smelt model created earlier.
lr = LogisticRegression(C=15, max_iter=1000).fit(train_scaled, train_target)  # train

train_accuracy = lr.score(train_scaled, train_target)  # training set
test_accuracy = lr.score(test_scaled, test_target)  # test set
print(train_accuracy, test_accuracy, sep='\n')

0.8991596638655462
0.925


In [35]:
# Predicted species for the first five test samples (multiclass model)
multi_predictions = lr.predict(test_scaled[:5])
print(multi_predictions)

['Bream' 'Bream' 'Roach' 'Smelt' 'Smelt']


In [36]:
# Class labels of the multiclass model; the probability columns follow this order
multi_classes = lr.classes_
print(multi_classes)

['Bream' 'Parkki' 'Perch' 'Pike' 'Roach' 'Smelt' 'Whitefish']


In [37]:
# Per-class probabilities for the first five test samples, rounded to three decimals
proba = lr.predict_proba(test_scaled[:5])
print(proba.round(decimals=3))

[[0.531 0.38  0.    0.    0.051 0.    0.038]
 [0.997 0.002 0.    0.    0.    0.    0.001]
 [0.024 0.032 0.188 0.004 0.438 0.    0.315]
 [0.    0.004 0.114 0.    0.004 0.879 0.   ]
 [0.    0.003 0.078 0.    0.003 0.916 0.   ]]


In [38]:
# Learned weights and intercepts of the multiclass model
multi_weights, multi_bias = lr.coef_, lr.intercept_
print(multi_weights, multi_bias)

[[-1.03881759 -1.08996043  1.56174571  7.17907078 -0.36274998]
 [-0.40459288 -1.82276695 -2.49335844  5.77405923 -2.42788968]
 [ 3.92120043  5.01721555 -7.38947659 -5.98443065  3.91978161]
 [ 0.14721792  3.5204334   3.67712744 -3.13079715 -1.75500921]
 [-3.40879264 -3.8954388   4.06954957 -0.57686294  2.06554291]
 [-1.17588421  0.67822368  0.82257148 -4.36595298 -3.47910424]
 [ 1.95966897 -2.40770645 -0.24815918  1.10491372  2.03942859]] [ 0.32935567 -0.84169741  2.8376061   0.12592264  1.53665929 -6.64380779
  2.65596151]


In [43]:
# decision_function: raw per-class values for the first five test samples
first_five_test = test_scaled[:5]
decisions = lr.decision_function(first_five_test)
print(decisions.round(decimals=3))

[[ 5.2170e+00  4.8820e+00 -4.4130e+00 -3.3880e+00  2.8740e+00 -7.7520e+00
   2.5800e+00]
 [ 1.1714e+01  5.5020e+00 -5.6430e+00 -2.2880e+00  3.8100e-01 -1.4791e+01
   5.1260e+00]
 [ 8.0000e-03  3.1400e-01  2.0800e+00 -1.8670e+00  2.9290e+00 -6.0610e+00
   2.5980e+00]
 [-1.1549e+01  1.8540e+00  5.2220e+00 -3.3250e+00  1.7760e+00  7.2680e+00
  -1.2470e+00]
 [-1.1193e+01  1.5540e+00  4.8460e+00 -2.4080e+00  1.5370e+00  7.3090e+00
  -1.6450e+00]]


In [40]:
from scipy.special import softmax

# Apply softmax row by row (axis=1) to the decision values
proba = softmax(decisions, axis=1)
print(proba.round(decimals=3))

[[0.531 0.38  0.    0.    0.051 0.    0.038]
 [0.997 0.002 0.    0.    0.    0.    0.001]
 [0.024 0.032 0.188 0.004 0.438 0.    0.315]
 [0.    0.004 0.114 0.    0.004 0.879 0.   ]
 [0.    0.003 0.078 0.    0.003 0.916 0.   ]]
