## Digits

## 1. 모듈 임포트

In [1]:
from sklearn.datasets import load_digits   ## 데이터 지정
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import pandas as pd

## 2. 데이터 준비, 3. 데이터 이해하기

In [2]:
# Load the digits dataset and keep handles on the feature matrix and the labels.
digits = load_digits()
digits_data, digits_label = digits.data, digits.target

# Each call prints a header line followed by the corresponding value on its own line.
print("< Feature names >", digits.feature_names, sep="\n")
print("\n< Target names >", digits.target_names, sep="\n")
print("\n< Data shape >", digits_data.shape, sep="\n")
print("\n< Label shape >", digits_label.shape, sep="\n")

# Dump the description text attached to the dataset object.
print(digits.DESCR)


< Feature names >
['pixel_0_0', 'pixel_0_1', 'pixel_0_2', 'pixel_0_3', 'pixel_0_4', 'pixel_0_5', 'pixel_0_6', 'pixel_0_7', 'pixel_1_0', 'pixel_1_1', 'pixel_1_2', 'pixel_1_3', 'pixel_1_4', 'pixel_1_5', 'pixel_1_6', 'pixel_1_7', 'pixel_2_0', 'pixel_2_1', 'pixel_2_2', 'pixel_2_3', 'pixel_2_4', 'pixel_2_5', 'pixel_2_6', 'pixel_2_7', 'pixel_3_0', 'pixel_3_1', 'pixel_3_2', 'pixel_3_3', 'pixel_3_4', 'pixel_3_5', 'pixel_3_6', 'pixel_3_7', 'pixel_4_0', 'pixel_4_1', 'pixel_4_2', 'pixel_4_3', 'pixel_4_4', 'pixel_4_5', 'pixel_4_6', 'pixel_4_7', 'pixel_5_0', 'pixel_5_1', 'pixel_5_2', 'pixel_5_3', 'pixel_5_4', 'pixel_5_5', 'pixel_5_6', 'pixel_5_7', 'pixel_6_0', 'pixel_6_1', 'pixel_6_2', 'pixel_6_3', 'pixel_6_4', 'pixel_6_5', 'pixel_6_6', 'pixel_6_7', 'pixel_7_0', 'pixel_7_1', 'pixel_7_2', 'pixel_7_3', 'pixel_7_4', 'pixel_7_5', 'pixel_7_6', 'pixel_7_7']

< Target names >
[0 1 2 3 4 5 6 7 8 9]

< Data shape >
(1797, 64)

< Label shape >
(1797,)
.. _digits_dataset:

Optical recognition of handwritten d

## 4. train, test 데이터 분리

In [3]:
# Split the features and labels into train and test partitions.
# test_size is the value that was changed between experiment runs;
# random_state is fixed so the same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(
    digits_data,
    digits_label,
    test_size=0.5,
    random_state=8,
)

## 5. 모델별 학습

In [4]:
### Decision tree
from sklearn.tree import DecisionTreeClassifier   # load the model class

# Configure the tree with a fixed seed and fit it on the training split
# (fit() hands back the estimator, so construction and fitting are chained).
decision_tree = DecisionTreeClassifier(random_state=32).fit(X_train, y_train)
y_pred = decision_tree.predict(X_test)

# NOTE(review): the banner text says "Train result", but the report that
# follows compares predictions against y_test, i.e. the held-out split.
print(
    "------------------------------Train result------------------------------",
    classification_report(y_test, y_pred),
    sep="\n",
)

------------------------------Train result------------------------------
              precision    recall  f1-score   support

           0       0.93      1.00      0.96        89
           1       0.80      0.81      0.80        91
           2       0.77      0.90      0.83        88
           3       0.82      0.82      0.82        95
           4       0.87      0.86      0.86        91
           5       0.96      0.83      0.89        96
           6       0.92      0.94      0.93        81
           7       0.83      0.86      0.84        91
           8       0.83      0.68      0.75        93
           9       0.83      0.86      0.84        84

    accuracy                           0.85       899
   macro avg       0.86      0.86      0.85       899
weighted avg       0.86      0.85      0.85       899



In [5]:
### Random Forest
from sklearn.ensemble import RandomForestClassifier   # load the model class

# Bind the forest to its own name: the original reused `decision_tree`, which
# mislabeled the estimator and overwrote the fitted tree from the previous cell.
random_forest = RandomForestClassifier(random_state=32)   # fixed seed for repeatable runs
random_forest.fit(X_train, y_train)
y_pred = random_forest.predict(X_test)

# NOTE(review): the banner text says "Train result", but the report below is
# computed against y_test, i.e. the held-out split.
print("------------------------------Train result------------------------------")
print(classification_report(y_test, y_pred))

------------------------------Train result------------------------------
              precision    recall  f1-score   support

           0       0.98      1.00      0.99        89
           1       0.95      0.97      0.96        91
           2       0.96      0.99      0.97        88
           3       1.00      0.96      0.98        95
           4       1.00      0.97      0.98        91
           5       0.96      0.94      0.95        96
           6       1.00      0.99      0.99        81
           7       0.99      1.00      0.99        91
           8       0.96      0.92      0.94        93
           9       0.91      0.96      0.94        84

    accuracy                           0.97       899
   macro avg       0.97      0.97      0.97       899
weighted avg       0.97      0.97      0.97       899



In [6]:
### Support Vector Machine
from sklearn import svm   # load the module that provides SVC

# Bind the classifier to its own name: the original reused `decision_tree`,
# which mislabeled the estimator and overwrote the previously fitted model.
# NOTE(review): per the scikit-learn SVC docs, random_state is only used for
# probability estimates, so it is presumably a no-op here — confirm; it is kept
# to mirror the other cells.
svc_model = svm.SVC(random_state=32)
svc_model.fit(X_train, y_train)
y_pred = svc_model.predict(X_test)

# NOTE(review): the banner text says "Train result", but the report below is
# computed against y_test, i.e. the held-out split.
print("------------------------------Train result------------------------------")
print(classification_report(y_test, y_pred))


------------------------------Train result------------------------------
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        89
           1       0.95      1.00      0.97        91
           2       1.00      1.00      1.00        88
           3       0.99      0.99      0.99        95
           4       1.00      0.98      0.99        91
           5       0.98      0.99      0.98        96
           6       1.00      1.00      1.00        81
           7       1.00      1.00      1.00        91
           8       0.97      0.94      0.95        93
           9       0.99      0.98      0.98        84

    accuracy                           0.99       899
   macro avg       0.99      0.99      0.99       899
weighted avg       0.99      0.99      0.99       899



In [7]:
### Stochastic Gradient Descent
from sklearn.linear_model import SGDClassifier   # load the model class

# Bind the classifier to its own name: the original reused `decision_tree`,
# which mislabeled the estimator and overwrote the previously fitted model.
sgd_classifier = SGDClassifier(random_state=32)   # fixed seed for repeatable runs
sgd_classifier.fit(X_train, y_train)
y_pred = sgd_classifier.predict(X_test)

# NOTE(review): the banner text says "Train result", but the report below is
# computed against y_test, i.e. the held-out split.
print("------------------------------Train result------------------------------")
print(classification_report(y_test, y_pred))


------------------------------Train result------------------------------
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        89
           1       0.95      0.86      0.90        91
           2       0.96      0.98      0.97        88
           3       0.93      0.99      0.96        95
           4       1.00      0.97      0.98        91
           5       0.96      0.95      0.95        96
           6       0.99      0.99      0.99        81
           7       0.99      0.99      0.99        91
           8       0.82      0.91      0.86        93
           9       0.96      0.89      0.93        84

    accuracy                           0.95       899
   macro avg       0.96      0.95      0.95       899
weighted avg       0.95      0.95      0.95       899



In [8]:
### Logistic Regression
from sklearn.linear_model import LogisticRegression   # load the model class

# Bind the classifier to its own name: the original reused `decision_tree`,
# which mislabeled the estimator and overwrote the previously fitted model.
# max_iter is raised to 5000 as in the original cell.
logistic_regression = LogisticRegression(random_state=32, max_iter=5000)
logistic_regression.fit(X_train, y_train)
y_pred = logistic_regression.predict(X_test)

# NOTE(review): the banner text says "Train result", but the report below is
# computed against y_test, i.e. the held-out split.
print("------------------------------Train result------------------------------")
print(classification_report(y_test, y_pred))

------------------------------Train result------------------------------
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        89
           1       0.94      0.97      0.95        91
           2       0.99      0.98      0.98        88
           3       0.99      0.98      0.98        95
           4       0.99      0.98      0.98        91
           5       0.95      0.92      0.93        96
           6       0.99      1.00      0.99        81
           7       0.96      1.00      0.98        91
           8       0.97      0.89      0.93        93
           9       0.91      0.96      0.94        84

    accuracy                           0.97       899
   macro avg       0.97      0.97      0.97       899
weighted avg       0.97      0.97      0.97       899



## 6. 평가 및 회고

#### DT: Decision Tree, RF: Random Forest, SVM: Support Vector Machine, SGD: Stochastic Gradient Descent, LR: Logistic Regression
### 모델별 예측정확도 결과는 아래와 같다.

#### Test size: 0.1일때
#### DT: 82%, RF: 98%, SVM: 98%, SGD: 98%, LR: 97%

#### Test size: 0.2일때
#### DT: 86%, RF: 97%, SVM: 98%, SGD: 95%, LR: 96%

#### Test size: 0.3일때
#### DT: 84%, RF: 97%, SVM: 99%, SGD: 95%, LR: 97%

#### Test size: 0.4일때
#### DT: 85%, RF: 97%, SVM: 99%, SGD: 95%, LR: 97%

#### Test size: 0.5일때
#### DT: 85%, RF: 97%, SVM: 99%, SGD: 95%, LR: 97%

### SVM이 Test size에 관계없이 가장 높거나 공동 1위였다(0.1일 때는 RF, SGD와 같은 98%). 베스트 모델. 하지만 DT를 제외한 나머지도 95% 이상으로 나쁘지 않았다.
### 이 데이터셋은 Test size에 영향을 받는 폭이 크지 않다. 모두 정확도가 높은 편. 2~3% 차이는 알고리즘의 차이 때문이라고 단정하기 쉽지 않다.
### DT의 경우 82~86%로 들쭉날쭉했는데, 경로의 무작위성 영향을 많이 받는 것처럼 보였다.
### RF는 DT에 비해 안정적이고 효과적으로 보인다. 아무래도 무작위성을 준 여러 개의 DT를 학습시켜 그 예측을 종합하는 앙상블 특성 때문에 그런 것으로 보인다.
