In [1]:
import os
import numpy as np
import pandas as pd
import struct

def read_ubyte_images(filename):
    """
    Read images from an IDX3 ubyte file.

    The file is expected to start with a 16-byte big-endian header
    (magic number, number of images, rows, columns) followed by one
    unsigned byte per pixel.

    Parameters:
    filename (str): Path to the ubyte file containing images.

    Returns:
    np.array: A uint8 numpy array of shape (num_images, rows, cols).

    Raises:
    ValueError: If the header is shorter than 16 bytes, the magic number
        is not 2051, or the number of pixel bytes does not match the
        dimensions declared in the header.
    """
    header_size = 16
    expected_magic = 2051  # IDX magic for unsigned-byte data with 3 dimensions

    with open(filename, 'rb') as f:
        header = f.read(header_size)
        # A short read would otherwise surface as an opaque struct.error
        if len(header) != header_size:
            raise ValueError(
                f"{filename!r}: expected a {header_size}-byte header, "
                f"got {len(header)} bytes"
            )
        magic, num_images, rows, cols = struct.unpack('>IIII', header)
        # Reject label files or unrelated binaries before interpreting the payload
        if magic != expected_magic:
            raise ValueError(
                f"{filename!r}: bad magic number {magic}, "
                f"expected {expected_magic} for an image file"
            )
        # Everything after the header is raw pixel data
        pixels = np.fromfile(f, dtype=np.uint8)

    expected_pixels = num_images * rows * cols
    if pixels.size != expected_pixels:
        raise ValueError(
            f"{filename!r}: header declares {num_images}x{rows}x{cols} = "
            f"{expected_pixels} pixels but the file contains {pixels.size}"
        )
    return pixels.reshape(num_images, rows, cols)

def read_ubyte_labels(filename):
    """
    Read labels from an IDX1 ubyte file.

    The file is expected to start with an 8-byte big-endian header
    (magic number, number of items) followed by one unsigned byte per label.

    Parameters:
    filename (str): Path to the ubyte file containing labels.

    Returns:
    np.array: A 1-D uint8 numpy array containing the label data.

    Raises:
    ValueError: If the header is shorter than 8 bytes, the magic number
        is not 2049, or the number of label bytes does not match the
        item count declared in the header.
    """
    header_size = 8
    expected_magic = 2049  # IDX magic for unsigned-byte data with 1 dimension

    with open(filename, 'rb') as f:
        header = f.read(header_size)
        # A short read would otherwise surface as an opaque struct.error
        if len(header) != header_size:
            raise ValueError(
                f"{filename!r}: expected a {header_size}-byte header, "
                f"got {len(header)} bytes"
            )
        magic, num_items = struct.unpack('>II', header)
        # Reject image files or unrelated binaries before interpreting the payload
        if magic != expected_magic:
            raise ValueError(
                f"{filename!r}: bad magic number {magic}, "
                f"expected {expected_magic} for a label file"
            )
        # Everything after the header is raw label data
        labels = np.fromfile(f, dtype=np.uint8)

    # The declared count was previously ignored; a mismatch means the file
    # is truncated or padded and the labels would not line up with images.
    if labels.size != num_items:
        raise ValueError(
            f"{filename!r}: header declares {num_items} labels "
            f"but the file contains {labels.size}"
        )
    return labels

def save_to_csv(images, labels, filename):
    """
    Write images and their labels to a CSV file, one sample per row.

    Each image is flattened into a single row of pixel values; the
    label is placed in a leading column named 'label'. The DataFrame
    index is not written.

    Parameters:
    images (np.array): Array of image data; the first axis indexes samples.
    labels (np.array): Array of labels, one per image.
    filename (str): Filename for the output CSV file.
    """
    sample_count = images.shape[0]

    # One row per sample, one column per pixel
    pixel_table = pd.DataFrame(images.reshape(sample_count, -1))

    # Labels go in front of the pixel columns
    pixel_table.insert(0, 'label', labels)

    pixel_table.to_csv(filename, index=False)


In [2]:
from pathlib import Path
# Directory that holds the raw ubyte files and receives the CSV output
data_dir = 'data'

# File paths for the ubyte files
train_images_file = os.path.join(data_dir, 'train-images.idx3-ubyte')
train_labels_file = os.path.join(data_dir, 'train-labels.idx1-ubyte')
test_images_file = os.path.join(data_dir, 't10k-images.idx3-ubyte')
test_labels_file = os.path.join(data_dir, 't10k-labels.idx1-ubyte')

# Read the ubyte files
train_images = read_ubyte_images(train_images_file)
train_labels = read_ubyte_labels(train_labels_file)
test_images = read_ubyte_images(test_images_file)
test_labels = read_ubyte_labels(test_labels_file)

# Fail before writing anything if images and labels do not line up;
# otherwise the mismatch only shows up part-way through the CSV export.
assert len(train_images) == len(train_labels), "train images/labels count mismatch"
assert len(test_images) == len(test_labels), "test images/labels count mismatch"

# Split the training data into two sets
# Integer division: with an odd count the second part gets the extra sample
split_index = len(train_images) // 2
train_images_1 = train_images[:split_index]
train_labels_1 = train_labels[:split_index]
train_images_2 = train_images[split_index:]
train_labels_2 = train_labels[split_index:]

# Create the output folder if it doesn't exist (follows data_dir rather
# than a hard-coded name, so changing data_dir keeps everything consistent)
Path(data_dir).mkdir(parents=True, exist_ok=True)

# Save the first half of the training data to CSV
save_to_csv(train_images_1, train_labels_1, os.path.join(data_dir, 'mnist_train_part1.csv'))

# Save the second half of the training data to CSV
save_to_csv(train_images_2, train_labels_2, os.path.join(data_dir, 'mnist_train_part2.csv'))

# Save testing data to CSV
save_to_csv(test_images, test_labels, os.path.join(data_dir, 'mnist_test.csv'))

print(f"MNIST data has been successfully split and saved to CSV format in the '{data_dir}' folder.")

MNIST data has been successfully split and saved to CSV format in the 'data' folder.
