In [9]:
import numpy as np
import matplotlib.pyplot as plt
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten
import tensorflow as tf
from sklearn.cluster import KMeans
from sklearn.metrics import pairwise_distances_argmin_min
from sklearn.preprocessing import StandardScaler
from sklearn.manifold import TSNE
import pandas as pd

def _read_image_csv(csv_path, image_shape):
    """Read one CSV split and return ``(images, labels)``.

    The file must contain a ``label`` column; every other column is treated
    as one pixel of a flattened image.
    """
    frame = pd.read_csv(csv_path)
    labels = frame['label'].values

    # Build a new frame without the label instead of mutating ``frame`` in
    # place, then restore the 2-D image layout.
    pixels = frame.drop(columns=['label']).values
    images = pixels.reshape(-1, *image_shape)

    # Scale raw 0-255 intensities into the [0, 1] range.
    return images / 255.0, labels


def load_overhead_mnist(train_path='train.csv', test_path='test.csv', image_shape=(28, 28)):
    """Load the training and test splits from CSV files.

    Parameters
    ----------
    train_path, test_path : str
        Locations of the CSV files. The defaults match the original
        hard-coded file names, so ``load_overhead_mnist()`` behaves as before.
    image_shape : tuple of int
        Shape each flattened pixel row is reshaped to (default ``(28, 28)``).

    Returns
    -------
    (train_images, train_labels), (test_images, test_labels)
        Images are float arrays of shape ``(n_samples, *image_shape)`` with
        values divided by 255; labels are the contents of the ``label`` column.

    Raises
    ------
    KeyError
        If a file has no ``label`` column.
    ValueError
        If the number of pixel columns does not match ``image_shape``.
    """
    train_split = _read_image_csv(train_path, image_shape)
    test_split = _read_image_csv(test_path, image_shape)
    return train_split, test_split

def perform_kmeans_and_select_samples(train_images, train_labels, num_clusters=150, select_fraction=0.2):
    """Cluster the training images and keep the samples nearest each centroid.

    The images are flattened to one row per sample, clustered with k-means,
    and from every cluster the ``select_fraction`` of members closest to the
    centroid (at least one) is kept.

    Parameters
    ----------
    train_images : numpy.ndarray
        Array whose first axis indexes samples; remaining axes are flattened.
    train_labels : numpy.ndarray
        Labels aligned with ``train_images`` along the first axis.
    num_clusters : int
        Number of k-means clusters.
    select_fraction : float
        Fraction of each cluster to keep, in ``(0, 1]``. Defaults to the
        previously hard-coded 0.2.

    Returns
    -------
    (selected_train_images, selected_train_labels)
        The selected images are returned *flattened*, shape
        ``(n_selected, n_pixels)``, ordered cluster by cluster and, within a
        cluster, from nearest to farthest from the centroid.

    Raises
    ------
    ValueError
        If ``select_fraction`` is outside ``(0, 1]``.
    """
    if not 0 < select_fraction <= 1:
        raise ValueError("select_fraction must be in the interval (0, 1].")

    print("Performing k-means clustering...")
    n_samples = train_images.shape[0]
    n_features = int(np.prod(train_images.shape[1:]))
    train_images_reshaped = train_images.reshape((n_samples, n_features))

    # n_init is pinned explicitly: its default differs between scikit-learn
    # releases, and leaving it unset makes the clustering result (and a
    # FutureWarning) depend on the installed version.
    kmeans = KMeans(n_clusters=num_clusters, n_init=10, random_state=0)
    kmeans.fit(train_images_reshaped)
    centroids = kmeans.cluster_centers_
    cluster_labels = kmeans.labels_

    training_indices = []
    for cluster_id in range(num_clusters):
        cluster_indices = np.where(cluster_labels == cluster_id)[0]
        # Euclidean distance of every cluster member to its own centroid.
        distances = np.linalg.norm(
            train_images_reshaped[cluster_indices] - centroids[cluster_id], axis=1
        )
        # Always keep at least one representative, even for tiny clusters.
        num_select = max(int(select_fraction * len(cluster_indices)), 1)
        nearest_indices = np.argsort(distances)[:num_select]
        training_indices.extend(cluster_indices[nearest_indices])

    selected_train_images = train_images_reshaped[training_indices]
    selected_train_labels = train_labels[training_indices]
    print("Selected training set based on clustering.")

    return selected_train_images, selected_train_labels

def build_neural_network(input_shape, num_classes=10):
    """Build and compile a small fully connected classifier.

    Parameters
    ----------
    input_shape : tuple of int
        Shape of one input sample (without the batch axis). The leading
        ``Flatten`` layer collapses it to a vector.
    num_classes : int
        Size of the softmax output layer. Defaults to 10, the value that was
        previously hard-coded.

    Returns
    -------
    A compiled Keras ``Sequential`` model using the Adam optimizer and
    sparse categorical cross-entropy, so labels are expected as integer
    class ids rather than one-hot vectors.
    """
    model = Sequential([
        Flatten(input_shape=input_shape),
        Dense(128, activation='relu'),
        Dense(64, activation='relu'),
        Dense(num_classes, activation='softmax')
    ])
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    return model



def _fill_nan_pixels(images, pixel_means):
    """Overwrite NaN entries of ``images`` in place with the per-pixel mean.

    ``pixel_means`` has the shape of a single image. It is broadcast along the
    sample axis so that each NaN receives the mean of its own (row, column)
    position rather than a value looked up by row index alone.
    """
    nan_mask = np.isnan(images)
    images[nan_mask] = np.broadcast_to(pixel_means, images.shape)[nan_mask]


def main():
    """Run the pipeline: load, clean, select by clustering, scale, train, evaluate."""
    (train_images, train_labels), (test_images, test_labels) = load_overhead_mnist()

    # Check for NaN values in the datasets
    train_has_nan = np.isnan(train_images).any()
    test_has_nan = np.isnan(test_images).any()
    if train_has_nan or test_has_nan:
        # Means come from the training split only, so nothing from the test
        # split leaks into the preprocessing.
        pixel_means = np.nanmean(train_images, axis=0)
        # A pixel that is NaN in every training sample has no mean; use 0.
        pixel_means = np.where(np.isnan(pixel_means), 0.0, pixel_means)

        if train_has_nan:
            print("NaN values found in train_images. Filling with the mean of each column.")
            _fill_nan_pixels(train_images, pixel_means)
        if test_has_nan:
            print("NaN values found in test_images. Filling with the training mean of each column.")
            _fill_nan_pixels(test_images, pixel_means)

    selected_train_images, selected_train_labels = perform_kmeans_and_select_samples(train_images, train_labels)

    # The selection step already returns one flattened row per image.
    num_pixels = selected_train_images.shape[1]

    # Scaling the data: fit on the training subset only, then reuse for test.
    scaler = StandardScaler()
    selected_train_images = scaler.fit_transform(selected_train_images)
    test_images_reshaped = test_images.reshape((test_images.shape[0], num_pixels))
    test_images_reshaped = scaler.transform(test_images_reshaped)

    # Seed TensorFlow's global generator so repeated runs start from the same
    # weight initialisation.
    tf.random.set_seed(0)

    model = build_neural_network((num_pixels,))
    model.fit(selected_train_images, selected_train_labels, epochs=30, verbose=2)

    # Evaluate the model on the test data
    _, test_accuracy = model.evaluate(test_images_reshaped, test_labels, verbose=2)
    print(f"Test accuracy: {test_accuracy*100:.2f}%")
# Entry point: run the pipeline when this code is executed directly (which
# includes running it as a notebook cell), but not when it is imported.
if __name__ == "__main__":
    main()



Performing k-means clustering...




Selected training set based on clustering.
Epoch 1/30
53/53 - 1s - loss: 1.5904 - accuracy: 0.4496 - 1s/epoch - 25ms/step
Epoch 2/30
53/53 - 0s - loss: 1.1024 - accuracy: 0.6421 - 344ms/epoch - 6ms/step
Epoch 3/30
53/53 - 0s - loss: 0.9497 - accuracy: 0.6888 - 348ms/epoch - 7ms/step
Epoch 4/30
53/53 - 0s - loss: 0.7856 - accuracy: 0.7320 - 318ms/epoch - 6ms/step
Epoch 5/30
53/53 - 0s - loss: 0.7218 - accuracy: 0.7572 - 183ms/epoch - 3ms/step
Epoch 6/30
53/53 - 0s - loss: 0.8472 - accuracy: 0.7302 - 188ms/epoch - 4ms/step
Epoch 7/30
53/53 - 0s - loss: 0.6075 - accuracy: 0.7944 - 186ms/epoch - 4ms/step
Epoch 8/30
53/53 - 0s - loss: 0.5328 - accuracy: 0.8213 - 215ms/epoch - 4ms/step
Epoch 9/30
53/53 - 0s - loss: 0.5234 - accuracy: 0.8177 - 189ms/epoch - 4ms/step
Epoch 10/30
53/53 - 0s - loss: 0.4652 - accuracy: 0.8387 - 203ms/epoch - 4ms/step
Epoch 11/30
53/53 - 0s - loss: 0.4319 - accuracy: 0.8519 - 178ms/epoch - 3ms/step
Epoch 12/30
53/53 - 0s - loss: 0.4100 - accuracy: 0.8585 - 198ms/e

# The trailing `t ms/step` value is the average time taken to process one step (one batch) during that epoch. The loss measures how far the model's predictions are from the actual labels; a lower value signifies a better match. Accuracy is the fraction of samples whose label the model predicted correctly, i.e. the number of correct predictions divided by the total number of samples.

## Epoch m/n signifies that epoch m out of n total epochs is being run. An epoch is one full pass through the entire training set.
## 53/53 signifies that 53 out of 53 batches have been processed. The selected training set was divided into 53 batches (Keras uses a batch size of 32 unless another value is given).




In [7]:
import pandas as pd



def _show_columns(csv_name):
    """Read ``csv_name``, print its column index, and return the DataFrame."""
    frame = pd.read_csv(csv_name)
    print(f"Columns in {csv_name}:", frame.columns)
    return frame


# Inspect the header of each split to confirm the label/pixel column names.
train_df = _show_columns('train.csv')
test_df = _show_columns('test.csv')


Columns in train.csv: Index(['label', 'pixel1', 'pixel2', 'pixel3', 'pixel4', 'pixel5', 'pixel6',
       'pixel7', 'pixel8', 'pixel9',
       ...
       'pixel775', 'pixel776', 'pixel777', 'pixel778', 'pixel779', 'pixel780',
       'pixel781', 'pixel782', 'pixel783', 'pixel784'],
      dtype='object', length=785)
Columns in test.csv: Index(['label', 'pixel1', 'pixel2', 'pixel3', 'pixel4', 'pixel5', 'pixel6',
       'pixel7', 'pixel8', 'pixel9',
       ...
       'pixel775', 'pixel776', 'pixel777', 'pixel778', 'pixel779', 'pixel780',
       'pixel781', 'pixel782', 'pixel783', 'pixel784'],
      dtype='object', length=785)
