# Machine_learning_fitting

# Split data
*to avoid overfitting*
* Training dataset
* Test dataset

In [1]:
def split_data(data, prob):
    """Randomly split data into two lists in proportions [prob, 1 - prob].

    Each row is assigned independently: it goes to the first list when a
    fresh ``random.random()`` draw is below ``prob`` and to the second list
    otherwise, so the split sizes are only approximately ``prob`` and
    ``1 - prob``.

    Args:
        data: Iterable of rows to distribute.
        prob: Probability that a row lands in the first list.

    Returns:
        A 2-tuple ``(first, second)`` of lists that together contain every
        row of ``data``, each in its original relative order.
    """
    # Imported locally so the function works without a module-level
    # ``import random``.
    import random

    results = [], []
    for row in data:
        results[0 if random.random() < prob else 1].append(row)
    # Hand the two buckets back so callers can unpack them.
    return results
        
def train_test_split(x, y, test_pct):
    """Split paired inputs and labels into training and test portions.

    The values of ``x`` and ``y`` are zipped into (x, y) pairs so that each
    input stays with its label, the pairs are divided by ``split_data``
    with probability ``1 - test_pct`` of landing in the training portion,
    and each portion is then un-zipped back into separate sequences.

    Args:
        x: Iterable of inputs.
        y: Iterable of labels, aligned with ``x``.
        test_pct: Fraction of the pairs intended for the test portion.

    Returns:
        ``(x_train, x_test, y_train, y_test)``, each a tuple. A portion
        that received no rows yields empty tuples.
    """
    data = list(zip(x, y))  # pair corresponding values
    train, test = split_data(data, 1 - test_pct)  # split the dataset of pairs
    # zip(*[]) produces nothing, so unpacking an empty portion into two names
    # would raise ValueError; fall back to a pair of empty tuples instead.
    x_train, y_train = zip(*train) if train else ((), ())
    x_test, y_test = zip(*test) if test else ((), ())
    return x_train, x_test, y_train, y_test

sklearn's train_test_split

In [5]:
import numpy as np
from sklearn.model_selection import train_test_split

# Toy dataset: the integers 0..11 arranged as six two-column rows, with one
# integer label (0..5) per row.
X = np.arange(12).reshape((6, 2))
y = list(range(6))
print("<데이터 split 전>")
print(X)
print(y)

# Ask for a test_size of 0.33; rows and labels are split together so each
# row keeps its label.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
)
print("\n<데이터 split 후>")
print(X_train, y_train)
print(X_test, y_test)

<데이터 split 전>
[[ 0  1]
 [ 2  3]
 [ 4  5]
 [ 6  7]
 [ 8  9]
 [10 11]]
[0, 1, 2, 3, 4, 5]

<데이터 split 후>
[[ 2  3]
 [10 11]
 [ 4  5]
 [ 8  9]] [1, 5, 2, 4]
[[6 7]
 [0 1]] [3, 0]


# Correctness
## accuracy
* correct/total   
$(tp+tn)/(tp+tn+fp+fn)$

In [6]:
def accuracy(tp, fp, fn, tn):
    """Return the fraction of predictions that were correct.

    Args:
        tp: Number of true positives.
        fp: Number of false positives.
        fn: Number of false negatives.
        tn: Number of true negatives.

    Returns:
        ``(tp + tn) / (tp + fp + fn + tn)``.
    """
    return (tp + tn) / (tp + fp + fn + tn)

## Precision(정밀도)

 *how accurate our positive predictions were*

> Positive 라고 예측한 것 중에 True positive

* TP/(TP+FP)

In [7]:
def precision(tp, fp, fn, tn):
    """Return the fraction of positive predictions that were correct.

    Args:
        tp: Number of true positives.
        fp: Number of false positives.
        fn: Number of false negatives (unused; accepted so that all metric
            functions share one signature).
        tn: Number of true negatives (unused; same reason).

    Returns:
        ``tp / (tp + fp)``, or ``0.0`` when there were no positive
        predictions at all.
    """
    predicted_positive = tp + fp
    if predicted_positive == 0:
        # No positive predictions: the ratio is undefined, so report 0.0
        # rather than raising ZeroDivisionError.
        return 0.0
    return tp / predicted_positive

## Recall(재현율)

*what fraction of the positives our model identified*

> Positive 인 것 중에 True positive

* TP/(TP+FN)

In [14]:
def recall(tp, fp, fn, tn):
    """Return the fraction of actual positives that were identified.

    Args:
        tp: Number of true positives.
        fp: Number of false positives (unused; accepted so that all metric
            functions share one signature).
        fn: Number of false negatives.
        tn: Number of true negatives (unused; same reason).

    Returns:
        ``tp / (tp + fn)``, or ``0.0`` when there were no actual positives.
    """
    actual_positive = tp + fn
    if actual_positive == 0:
        # No actual positives: the ratio is undefined, so report 0.0
        # rather than raising ZeroDivisionError.
        return 0.0
    return tp / actual_positive

## f1_score

* 2xPxR/(P+R)

> Precision 과 Recall 간의 조화평균
* 어느 한쪽으로 치우치지 않을 때 조화롭게 높은 값
* 경우에 따라 Precision 과 Recall 의 중요도가 다르다.

* Precision 과 Recall 은 일반적으로 trade-off 관계 (한쪽을 높이면 다른 쪽이 낮아지는 경향)

In [15]:
def f1_score(tp, fp, fn, tn):
    """Return the harmonic mean of precision and recall.

    Args:
        tp: Number of true positives.
        fp: Number of false positives.
        fn: Number of false negatives.
        tn: Number of true negatives.

    Returns:
        ``2 * p * r / (p + r)`` where ``p`` and ``r`` are the results of
        ``precision`` and ``recall`` for the same counts, or ``0.0`` when
        both of them are zero.
    """
    p = precision(tp, fp, fn, tn)
    r = recall(tp, fp, fn, tn)
    if p + r == 0:
        # Both metrics are zero (no true positives); the harmonic mean would
        # divide by zero, so report 0.0 instead.
        return 0.0
    return 2 * p * r / (p + r)

In [16]:
# One set of confusion-matrix counts, in the (tp, fp, fn, tn) order that the
# metric functions take, reused for all three metrics.
counts = (70, 4930, 13930, 981070)
print("precision(70, 4930, 13930, 981070) = {}".format(precision(*counts)))
print("recall(70, 4930, 13930, 981070) = {}".format(recall(*counts)))
print("f1_score(70, 4930, 13930, 981070) = {}".format(f1_score(*counts)))

precision(70, 4930, 13930, 981070) = 0.014
recall(70, 4930, 13930, 981070) = 0.005
f1_score(70, 4930, 13930, 981070) = 0.00736842105263158


sklearn's confusion matrix and classification_report

In [26]:
from sklearn.metrics import confusion_matrix, classification_report

y_true = ["cat", "ant", "cat", "cat", "ant", "cat"]
y_pred = ["ant", "ant", "cat", "cat", "ant", "cat"]
# Transpose the matrix so that the predicted values run down the rows.
print(
    confusion_matrix(
        y_true,
        y_pred,
        labels=["ant", "cat"],
    ).transpose()
)

[[2 1]
 [0 3]]


|               | ant  | cat  |
| ------------- | ---- | ---- |
| predict "ant" | 2    | 1    |
| predict "cat" | 0    | 3    |

| *ant 기준*        | ant  | not ant |
| ----------------- | ---- | ------- |
| predict "ant"     | 2(TP)    | 1(FP)       |
| predict "not ant" | 0(FN)    | 3(TN)       |

| *cat 기준*        | not cat | cat  |
| ----------------- | ------- | ---- |
| predict "not cat" | 2(TN)       | 1(FN)    |
| predict "cat"     | 0(FP)       | 3(TP)    |

> confusion matrix 에서 예측기준에 따라 TP, FP, TN, FN 이 달라진다. 

In [28]:
# Build the per-class text report for the ant/cat predictions, then show it.
report = classification_report(y_true, y_pred, labels=["ant", "cat"])
print(report)

              precision    recall  f1-score   support

         ant       0.67      1.00      0.80         2
         cat       1.00      0.75      0.86         4

    accuracy                           0.83         6
   macro avg       0.83      0.88      0.83         6
weighted avg       0.89      0.83      0.84         6



In [30]:
from sklearn.metrics import confusion_matrix, classification_report
# Same ant/cat example extended with a third class, "bird".
y_true = ["cat", "ant", "cat", "cat", "ant", "cat", "bird", "bird"]
y_pred = ["ant", "ant", "cat", "cat", "ant", "cat", "bird", "bird"]
# Transposed, as in the two-class example, so predictions run down the rows.
print(
    confusion_matrix(
        y_true,
        y_pred,
        labels=["ant", "cat", "bird"],
    ).transpose()
)
print(
    classification_report(
        y_true,
        y_pred,
        labels=["ant", "cat", "bird"],
    )
)

[[2 1 0]
 [0 3 0]
 [0 0 2]]
              precision    recall  f1-score   support

         ant       0.67      1.00      0.80         2
         cat       1.00      0.75      0.86         4
        bird       1.00      1.00      1.00         2

    accuracy                           0.88         8
   macro avg       0.89      0.92      0.89         8
weighted avg       0.92      0.88      0.88         8

