<a href="https://colab.research.google.com/github/DongGwan0505/MNIST-classification-project/blob/main/he_image_classification_of_MNIST_dataset_by_using_machine_learning_techniques.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
## 기본설정
# Python ≥3.5 is required
import sys
assert sys.version_info >= (3, 5)
# Is this notebook running on Colab or Kaggle?
IS_COLAB = "google.colab" in sys.modules
IS_KAGGLE = "kaggle_secrets" in sys.modules
# Scikit-Learn ≥0.20 is required
import sklearn
assert sklearn.__version__ >= "0.20"
# Common imports
import numpy as np
import os

# to make this notebook's output stable across runs
np.random.seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# Where to save the figures
PROJECT_ROOT_DIR = "."
CHAPTER_ID = "classification"
IMAGES_PATH = os.path.join( PROJECT_ROOT_DIR, "images", CHAPTER_ID)
os.makedirs( IMAGES_PATH, exist_ok=True)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    path = os.path.join(IMAGES_PATH, fig_id + "." + fig_extension)
    print("Saving figure", fig_id)
    if tight_layout :
        plt.tight_layout()
    plt.savefig (path, format=fig_extension, dpi=resolution)


In [None]:
## MNIST 데이터 로드
from sklearn.datasets import fetch_openml
mnist = fetch_openml('mnist_784', version=1, as_frame=False)

X, y = mnist["data"], mnist["target"]
# MNIST 데이터셋은 28x28 픽셀의 이미지가 784개의 특성으로 펼쳐져 있기 때문에,
70,000개의 이미지가 있다면 X.shape는 (70000, 784)이다

y = y.astype(np.uint8)
#문자인 y를 8bits의 unsigned int로 변환한다.

X = X / 255.0
#Pixel=Pixel/255

X_train, X_test, y_train, y_test = X[:60000], X[60000:], y[:60000], y[60000:]
#training data와 test data로 분리한다.

In [1]:
## Softmax Regression
softmax_reg = LogisticRegression(multi_class="multinomial", solver="lbfgs", max_iter=1000, C=1.0, random_state=42)
softmax_reg.fit(X_train, y_train)
#일단 C=1로 설정한 후 진행해보았다.
softmax_reg.predict([X_train[0]])

softmax_reg.predict_proba([X_train[0]])

y = y.astype(np.uint8)
X = X / 255.0
#위는 이전 코드와 동일하다.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X.astype(np.float64))
#데이터 X에 대해 standard scaler를 적용하고 train data와 test data로 나누었다.
X_train, X_test, y_train, y_test = X_scaled[:60000], X_scaled[60000:], y[:60000], y[60000:]


NameError: name 'LogisticRegression' is not defined

In [None]:
## SVM (suppott vector machine)

from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import accuracy_score

# 스케일링 및 PCA
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
# PCA를 적용하여 데이터 차원 축소, 설명된 분산의 95%를 유지하도록 설정
pca = PCA(n_components=0.95)
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

# 모델 훈련
svm_clf = SVC(kernel='rbf', C=1, gamma=0.001)
svm_clf.fit(X_train_pca, y_train)
# 성능 평가
y_pred = svm_clf.predict(X_test_pca)
accuracy = accuracy_score(y_test, y_pred)
print(f"테스트 세트 정확도: {accuracy:.2f}")


In [None]:
## Random Forrest

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

rf_clf = RandomForestClassifier
(max_features="sqrt", max_samples=None, max_depth=None, n_estimators=20, random_state=42)
# max_features="sqrt", max_samples=None, max_depth=None로 설정했다.
rf_clf.fit(X_train, y_train)

y_pred = rf_clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")
