In [9]:
# Import the modules used throughout the notebook
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Prepare the data: load the handwritten-digits dataset bundled with scikit-learn
digits = load_digits()

In [10]:
# Inspect which fields the loaded dataset object exposes
digits.keys()

dict_keys(['data', 'target', 'frame', 'feature_names', 'target_names', 'images', 'DESCR'])

In [31]:
# Understand the data

# Select the feature matrix
digits_data = digits.data
# Check its size: (number of samples, number of features)
print(digits_data.shape)
# There are 64 features per sample.

(1797, 64)


In [12]:
# Each digit is an 8x8-pixel image, so each of the 1797 samples carries 64 values.
# Show the first sample as a flat vector.
digits_data[0]

array([ 0.,  0.,  5., 13.,  9.,  1.,  0.,  0.,  0.,  0., 13., 15., 10.,
       15.,  5.,  0.,  0.,  3., 15.,  2.,  0., 11.,  8.,  0.,  0.,  4.,
       12.,  0.,  0.,  8.,  8.,  0.,  0.,  5.,  8.,  0.,  0.,  9.,  8.,
        0.,  0.,  4., 11.,  0.,  1., 12.,  7.,  0.,  0.,  2., 14.,  5.,
       10., 12.,  0.,  0.,  0.,  0.,  6., 13., 10.,  0.,  0.,  0.])

In [7]:
# Select the label data
# The labels are the ground-truth answers for the samples.
digits_label = digits.target
# Print how many labels there are.
print(digits_label.shape)
# The label count equals the sample count because every sample has exactly one label.
digits_label
# Looking at the values, the labels are integers from 0 to 9.

(1797,)


array([0, 1, 2, ..., 8, 9, 8])

In [8]:
# Display the target (class) names
digits.target_names

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [14]:
# Print the dataset description
print(digits.DESCR)
# Per the description: the bitmaps were extracted with NIST preprocessing programs.
# Of 43 people in total, 30 contributed to the training set and a different 13 to
# the test set; this dataset is a copy of the test set.

.. _digits_dataset:

Optical recognition of handwritten digits dataset
--------------------------------------------------

**Data Set Characteristics:**

    :Number of Instances: 5620
    :Number of Attributes: 64
    :Attribute Information: 8x8 image of integer pixels in the range 0..16.
    :Missing Attribute Values: None
    :Creator: E. Alpaydin (alpaydin '@' boun.edu.tr)
    :Date: July; 1998

This is a copy of the test set of the UCI ML hand-written digits datasets
https://archive.ics.uci.edu/ml/datasets/Optical+Recognition+of+Handwritten+Digits

The data set contains images of hand-written digits: 10 classes where
each class refers to a digit.

Preprocessing programs made available by NIST were used to extract
normalized bitmaps of handwritten digits from a preprinted form. From a
total of 43 people, 30 contributed to the training set and different 13
to the test set. 32x32 bitmaps are divided into nonoverlapping blocks of
4x4 and the number of on pixels are counted in each blo

In [18]:
# Split into train and test sets: 20% of the samples are held out for evaluation,
# and the fixed random_state keeps the split identical across runs.
X_train, X_test, y_train, y_test = train_test_split(
    digits_data,
    digits_label,
    test_size=0.2,
    random_state=7,
)

In [21]:
# Decision tree
from sklearn.tree import DecisionTreeClassifier

# fit() returns the estimator itself, so construction and training are chained.
decision_tree = DecisionTreeClassifier(random_state=32).fit(X_train, y_train)
y_pred = decision_tree.predict(X_test)

print(classification_report(y_true=y_test, y_pred=y_pred))
# Recorded accuracy: 86%

              precision    recall  f1-score   support

           0       1.00      0.98      0.99        43
           1       0.81      0.81      0.81        42
           2       0.79      0.82      0.80        40
           3       0.79      0.91      0.85        34
           4       0.83      0.95      0.89        37
           5       0.90      0.96      0.93        28
           6       0.84      0.93      0.88        28
           7       0.96      0.82      0.89        33
           8       0.88      0.65      0.75        43
           9       0.78      0.78      0.78        32

    accuracy                           0.86       360
   macro avg       0.86      0.86      0.86       360
weighted avg       0.86      0.86      0.85       360



In [23]:
# Random forest (RandomForestClassifier)
from sklearn.ensemble import RandomForestClassifier

# fit() returns the estimator itself, so construction and training are chained.
random_forest = RandomForestClassifier(random_state=32).fit(X_train, y_train)
random_y_pred = random_forest.predict(X_test)

print(classification_report(y_true=y_test, y_pred=random_y_pred))
# Recorded accuracy: 96%

              precision    recall  f1-score   support

           0       1.00      0.98      0.99        43
           1       0.93      1.00      0.97        42
           2       1.00      1.00      1.00        40
           3       1.00      1.00      1.00        34
           4       0.93      1.00      0.96        37
           5       0.90      0.96      0.93        28
           6       1.00      0.96      0.98        28
           7       0.94      0.97      0.96        33
           8       1.00      0.84      0.91        43
           9       0.94      0.94      0.94        32

    accuracy                           0.96       360
   macro avg       0.96      0.96      0.96       360
weighted avg       0.97      0.96      0.96       360



In [25]:
# Support vector machine (SVC with its default hyperparameters)
from sklearn import svm

# fit() returns the estimator itself, so construction and training are chained.
SVM_model = svm.SVC().fit(X_train, y_train)
SVM_y_pred = SVM_model.predict(X_test)

print(classification_report(y_true=y_test, y_pred=SVM_y_pred))
# Recorded accuracy: 99%

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        43
           1       0.95      1.00      0.98        42
           2       1.00      1.00      1.00        40
           3       1.00      1.00      1.00        34
           4       1.00      1.00      1.00        37
           5       0.93      1.00      0.97        28
           6       1.00      1.00      1.00        28
           7       1.00      1.00      1.00        33
           8       1.00      0.93      0.96        43
           9       1.00      0.97      0.98        32

    accuracy                           0.99       360
   macro avg       0.99      0.99      0.99       360
weighted avg       0.99      0.99      0.99       360



In [26]:
# SGD Classifier
from sklearn.linear_model import SGDClassifier

# SGDClassifier shuffles the training data between epochs, so an unseeded model
# produces a different report on every run. Pin random_state (the same value the
# tree-based models in this notebook use) so that a re-run is reproducible.
sgd_model = SGDClassifier(random_state=32)
sgd_model.fit(X_train, y_train)
sgd_y_pred = sgd_model.predict(X_test)

print(classification_report(y_test, sgd_y_pred))
# The recorded (unseeded) run reached 94% accuracy; the seeded model may differ
# slightly, so refresh this figure after re-running the cell.

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        43
           1       0.95      0.83      0.89        42
           2       0.95      1.00      0.98        40
           3       0.91      0.94      0.93        34
           4       0.97      1.00      0.99        37
           5       0.93      0.96      0.95        28
           6       0.96      0.96      0.96        28
           7       0.94      0.97      0.96        33
           8       0.95      0.84      0.89        43
           9       0.86      0.97      0.91        32

    accuracy                           0.94       360
   macro avg       0.94      0.95      0.94       360
weighted avg       0.95      0.94      0.94       360



In [29]:
# Logistic Regression (default settings, raw pixel features)
from sklearn.linear_model import LogisticRegression

# NOTE: on the recorded run this fit stopped at the solver's iteration limit and
# emitted a convergence warning; the following cell retries with scaled features.
logistic_model = LogisticRegression().fit(X_train, y_train)
logistic_y_pred = logistic_model.predict(X_test)

print(classification_report(y_true=y_test, y_pred=logistic_y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        43
           1       0.95      0.95      0.95        42
           2       0.98      1.00      0.99        40
           3       0.94      0.97      0.96        34
           4       0.97      1.00      0.99        37
           5       0.82      0.96      0.89        28
           6       1.00      0.96      0.98        28
           7       0.97      0.97      0.97        33
           8       0.92      0.81      0.86        43
           9       0.97      0.91      0.94        32

    accuracy                           0.95       360
   macro avg       0.95      0.95      0.95       360
weighted avg       0.95      0.95      0.95       360



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [30]:
# Logistic Regression, retried with standardized features
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# The unscaled fit hit the iteration limit, so the features are standardized
# inside a pipeline before the classifier sees them.
# TODO: study feature scaling in more depth.
logistic_model = make_pipeline(StandardScaler(), LogisticRegression()).fit(X_train, y_train)
logistic_y_pred = logistic_model.predict(X_test)

print(classification_report(y_true=y_test, y_pred=logistic_y_pred))
# Recorded accuracy: 96%

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        43
           1       0.93      0.93      0.93        42
           2       0.98      1.00      0.99        40
           3       0.97      1.00      0.99        34
           4       1.00      0.97      0.99        37
           5       0.88      1.00      0.93        28
           6       1.00      0.93      0.96        28
           7       1.00      1.00      1.00        33
           8       0.95      0.88      0.92        43
           9       0.91      0.91      0.91        32

    accuracy                           0.96       360
   macro avg       0.96      0.96      0.96       360
weighted avg       0.96      0.96      0.96       360



In [None]:
'''
모델비교 결과
--------------------------------------
DecisionTree : 86% accuracy
RandomForestClassifier : 96% accuracy
SVM : 99% accuracy
SGD Classifier : 94% accuracy
Logistic Regression : 96% accuracy
--------------------------------------

DecisionTree는 다른 모델과 비교해봤을때, 정확도가 너무 떨어지고
RandomForestClassifier, SGD Classifier 는 정확도가 높긴하지만 SVM보다는 떨어진다.
Logistic Regression의 경우 정확도는 높게 나오지만 SVM과는 다르게
데이터를 추가적으로 정리해주어야하는 번거로움이 있다.
이에 반해 SVM은 추가적으로 무언가를 해야하지도 않고, precision이나 recall을 봐도
문제가 있다고 판단되지 않는다.
그러므로 SVM을 선택하는 것이 적절하다고 판단된다.

'''