<a href="https://colab.research.google.com/github/Charles980903/Proj2/blob/main/Untitled5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import requests

try:
    from tqdm import tqdm
except ImportError:
    tqdm = lambda x, total, unit: x  # If tqdm doesn't exist, replace it with a function that does nothing
    print('**** Could not import tqdm. Please install tqdm for download progressbars! (pip install tqdm) ****')

# Python2 compatibility
try:
    input = raw_input
except NameError:
    pass

download_dict = {
    '1) Kuzushiji-MNIST (10 classes, 28x28, 70k examples)': {
        '1) MNIST data format (ubyte.gz)':
            ['http://codh.rois.ac.jp/kmnist/dataset/kmnist/train-images-idx3-ubyte.gz',
            'http://codh.rois.ac.jp/kmnist/dataset/kmnist/train-labels-idx1-ubyte.gz',
            'http://codh.rois.ac.jp/kmnist/dataset/kmnist/t10k-images-idx3-ubyte.gz',
            'http://codh.rois.ac.jp/kmnist/dataset/kmnist/t10k-labels-idx1-ubyte.gz'],
        '2) NumPy data format (.npz)':
            ['http://codh.rois.ac.jp/kmnist/dataset/kmnist/kmnist-train-imgs.npz',
            'http://codh.rois.ac.jp/kmnist/dataset/kmnist/kmnist-train-labels.npz',
            'http://codh.rois.ac.jp/kmnist/dataset/kmnist/kmnist-test-imgs.npz',
            'http://codh.rois.ac.jp/kmnist/dataset/kmnist/kmnist-test-labels.npz'],
    },
    '2) Kuzushiji-49 (49 classes, 28x28, 270k examples)': {
        '1) NumPy data format (.npz)':
            ['http://codh.rois.ac.jp/kmnist/dataset/k49/k49-train-imgs.npz',
            'http://codh.rois.ac.jp/kmnist/dataset/k49/k49-train-labels.npz',
            'http://codh.rois.ac.jp/kmnist/dataset/k49/k49-test-imgs.npz',
            'http://codh.rois.ac.jp/kmnist/dataset/k49/k49-test-labels.npz'],
    },
    '3) Kuzushiji-Kanji (3832 classes, 64x64, 140k examples)': {
        '1) Folders of images (.tar)':
            ['http://codh.rois.ac.jp/kmnist/dataset/kkanji/kkanji.tar'],
    }

}

# Download a list of files
def download_list(url_list):
    for url in url_list:
        path = url.split('/')[-1]
        r = requests.get(url, stream=True)
        with open(path, 'wb') as f:
            total_length = int(r.headers.get('content-length'))
            print('Downloading {} - {:.1f} MB'.format(path, (total_length / 1024000)))

            for chunk in tqdm(r.iter_content(chunk_size=1024), total=int(total_length / 1024) + 1, unit="KB"):
                if chunk:
                    f.write(chunk)
    print('All dataset files downloaded!')

# Ask the user about which path to take down the dict
def traverse_dict(d):
    print('Please select a download option:')
    keys = sorted(d.keys())  # Print download options
    for key in keys:
        print(key)

    userinput = input('> ').strip()

    try:
        selection = int(userinput) - 1
    except ValueError:
        print('Your selection was not valid')
        traverse_dict(d)  # Try again if input was not valid
        return

    selected = keys[selection]

    next_level = d[selected]
    if isinstance(next_level, list):  # If we've hit a list of downloads, download that list
        download_list(next_level)
    else:
        traverse_dict(next_level)     # Otherwise, repeat with the next level

traverse_dict(download_dict)

Please select a download option:
1) Kuzushiji-MNIST (10 classes, 28x28, 70k examples)
2) Kuzushiji-49 (49 classes, 28x28, 270k examples)
3) Kuzushiji-Kanji (3832 classes, 64x64, 140k examples)
> 1
Please select a download option:
1) MNIST data format (ubyte.gz)
2) NumPy data format (.npz)
> 2
Downloading kmnist-train-imgs.npz - 18.0 MB


100%|██████████| 17954/17954 [00:10<00:00, 1698.90KB/s]


Downloading kmnist-train-labels.npz - 0.0 MB


100%|██████████| 30/30 [00:00<00:00, 211.12KB/s]


Downloading kmnist-test-imgs.npz - 3.0 MB


100%|██████████| 3008/3008 [00:02<00:00, 1091.28KB/s]


Downloading kmnist-test-labels.npz - 0.0 MB


100%|██████████| 6/6 [00:00<00:00, 14691.08KB/s]

All dataset files downloaded!





In [6]:
# Load the Kuzushiji-MNIST data
train_images = np.load('kmnist-train-imgs.npz')['arr_0']
train_labels = np.load('kmnist-train-labels.npz')['arr_0']
test_images = np.load('kmnist-test-imgs.npz')['arr_0']
test_labels = np.load('kmnist-test-labels.npz')['arr_0']

train_images = train_images.astype('float32') / 255.0
test_images = test_images.astype('float32') / 255.0

# Flatten the images for input into the Stacked Autoencoder
train_images_flattened = train_images.reshape(-1, 784)
test_images_flattened = test_images.reshape(-1, 784)

# Split the training data into training and validation sets
x_train, x_val, y_train, y_val = train_test_split(train_images_flattened, train_labels, test_size=0.2, random_state=42)

In [15]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, Dropout
from tensorflow.keras.models import Model
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from tensorflow.keras.datasets import mnist
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score, train_test_split
from tensorflow.keras.callbacks import EarlyStopping


In [16]:
def build_sae(bottleneck_size=64):
    # Encoder
    input_layer = Input(shape=(784,))
    encoder = Dense(800, activation='relu')(input_layer)
    encoder = Dense(200, activation='relu')(encoder)
    bottleneck = Dense(bottleneck_size, activation='relu')(encoder)

    # Decoder
    decoder = Dense(200, activation='relu')(bottleneck)
    decoder = Dense(800, activation='relu')(decoder)
    output_layer = Dense(784, activation='sigmoid')(decoder)

    # Build model
    autoencoder = Model(inputs=input_layer, outputs=output_layer)
    return autoencoder


In [1]:
from itertools import product

def extract_features(autoencoder, data):

    encoder_model = Model(inputs=autoencoder.input, outputs=autoencoder.layers[3].output)
    features = encoder_model.predict(data)
    return features

# Define ranges for hyperparameters to test
bottleneck_sizes = [32, 64, 128]
batch_sizes = [64, 128]
patience_values = [5, 10]


cv_results = {}



for bottleneck_size, batch_size, patience in product(bottleneck_sizes, batch_sizes, patience_values):
    print(f"Training SAE with bottleneck size {bottleneck_size}, batch size {batch_size}, patience {patience}")

    # Build the autoencoder
    autoencoder = build_sae(bottleneck_size)
    autoencoder.compile(optimizer='adam', loss='mse')

    # Early stopping callback
    early_stopping = EarlyStopping(monitor='val_loss', patience=patience, restore_best_weights=True)

    # Train the autoencoder with early stopping
    autoencoder.fit(
        x_train, x_train,
        epochs=50,  # Set a high epoch limit, early stopping will determine the actual stopping point
        batch_size=batch_size,
        validation_data=(x_val, x_val),
        callbacks=[early_stopping],
        verbose=0
    )

    # Extract features from the bottleneck layer
    train_features = extract_features(autoencoder, x_train)
    val_features = extract_features(autoencoder, x_val)

    # Scale the features for the SVM classifier
    scaler = StandardScaler()
    train_features_scaled = scaler.fit_transform(train_features)
    val_features_scaled = scaler.transform(val_features)

    # Train an SVM classifier using cross-validation
    svm_classifier = SVC(kernel='rbf', C=1.0, gamma='scale', random_state=42)
    scores = cross_val_score(svm_classifier, train_features_scaled, y_train, cv=5)
    avg_accuracy = np.mean(scores)

    # Store the results
    cv_results[(bottleneck_size, batch_size, patience)] = avg_accuracy
    print(f"Bottleneck size {bottleneck_size}, Batch size {batch_size}, Patience {patience} - CV Accuracy: {avg_accuracy:.4f}")


Training SAE with bottleneck size 32, batch size 64, patience 5


NameError: name 'build_sae' is not defined

In [None]:
_