In [None]:
# standard library
import math
import os

from scipy.ndimage import rotate
import pandas as pd
import numpy as np
# libraries for data visualization
import matplotlib.pyplot as plt
# libraries for model selection
import xgboost as xgb
# libraries for model evaluation
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error
#library to read matlab files
from scipy.io import loadmat
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GridSearchCV


<h1>Load and Parse Data</h1>

In [None]:
# Location of the MNIST .mat file. Set the MNIST_MAT_PATH environment variable to
# point at your own copy; when it is unset the original local path is used.
MNIST_MAT_PATH = os.environ.get('MNIST_MAT_PATH', '/Users/deep/Desktop/ML_Project/MNIST.mat')

# Import the MNIST.mat file
mat_data = loadmat(MNIST_MAT_PATH)
# Print the keys of the MATLAB file to understand what data it contains
print(mat_data.keys())

# Training features and labels; ravel() flattens the label array to 1-D
pixel_features = mat_data['train_fea']
pixel_labels = mat_data['train_gnd'].ravel()

# Held-out test features and labels, handled the same way
pixel_test_features = mat_data['test_fea']
pixel_test_labels = mat_data['test_gnd'].ravel()

# Normalize the pixel features by dividing by 255
# (assumes raw intensities are in 0-255 — TODO confirm against the file)
pixel_features = pixel_features / 255
pixel_test_features = pixel_test_features / 255


<h1>Encode Labels</h1>

In [None]:
# Learn the label -> integer-code mapping from the training labels only, then apply
# that same fitted mapping to both label arrays so the codes lie in [0, num_class).
label_encoder = LabelEncoder().fit(pixel_labels)
pixel_labels = label_encoder.transform(pixel_labels)
pixel_test_labels = label_encoder.transform(pixel_test_labels)

<h1>Visualize Raw Data</h1>

In [None]:
# Preview one training image every SAMPLE_STRIDE rows so the samples span the whole set.
SAMPLE_STRIDE = 3750
num_images = len(pixel_features) // SAMPLE_STRIDE

# Matplotlib renders the preview: a 4x4 grid of panels
fig, axes = plt.subplots(4, 4, figsize=(28, 28))
axes = axes.ravel()  # Flatten the 4x4 grid of axes into a 1-D array of 16 subplots

# Vertical spacing is a property of the whole figure, so set it once instead of
# once per image inside the loop.
fig.subplots_adjust(hspace=0.8)

# Draw at most one image per panel. Capping at len(axes) replaces the former
# hard-coded `idx < 60000` check, which was exactly 16 panels * 3750 rows.
for i in range(min(num_images, len(axes))):
    idx = i * SAMPLE_STRIDE  # row of the i-th sampled image
    image = pixel_features[idx].reshape(28, 28)  # view the flat row as a 28x28 image
    axes[i].imshow(image, cmap='gray')
    axes[i].set_title(f"Label: {pixel_labels[idx]}", fontsize=8)
    axes[i].axis('off')

plt.show()

<h1> Augment Data </h1>

In [None]:
# Seed for the shuffles below. Fixing it makes the shuffled order — and therefore any
# positional train/validation split taken from it — reproducible across kernel restarts.
SHUFFLE_SEED = 42
rng = np.random.default_rng(SHUFFLE_SEED)

# View every flattened row as a 28x28 image: (number of images, 28, 28).
# -1 lets numpy infer the number of images.
image_data_reshaped = pixel_features.reshape(-1, 28, 28)
image_data_reshaped2 = pixel_test_features.reshape(-1, 28, 28)

# Re-orient every image by swapping its two image axes (a per-image transpose).
# This is exactly what the previous two-step form produced: np.rot90(..., k=3,
# axes=(1, 2)) — a 270-degree turn, not 180 — followed by np.flip along axis 2.
train_features_2D = np.swapaxes(image_data_reshaped, 1, 2)    # training images
train_features_2D2 = np.swapaxes(image_data_reshaped2, 1, 2)  # test images (despite the name)

# Flatten the images again for XGBoost (back to the original 2-D feature shape)
flattened_images = train_features_2D.reshape(pixel_features.shape)
flattened_images2 = train_features_2D2.reshape(pixel_test_features.shape)

# Generate shuffled row indices, one independent permutation per data set
shuffled_indices = rng.permutation(flattened_images.shape[0])
shuffled_indices2 = rng.permutation(flattened_images2.shape[0])

# Apply the same permutation to features and labels so each pair stays aligned
train_features_shuffled = flattened_images[shuffled_indices]
train_labels_shuffled = pixel_labels[shuffled_indices]

# The "...2" variables hold the test set, shuffled with its own permutation
train_features_shuffled2 = flattened_images2[shuffled_indices2]
train_labels_shuffled2 = pixel_test_labels[shuffled_indices2]

# training Data
rotated_images = train_features_shuffled
rotated_labels = train_labels_shuffled

# testing Data
rotated_test_images = train_features_shuffled2
rotated_test_labels = train_labels_shuffled2


print(rotated_images.shape)
print(rotated_labels.shape)
print(rotated_test_images.shape)
print(rotated_test_labels.shape)


<h1> Visualization of Augmented Training Data </h1>

In [None]:
# Show the first 100 re-oriented, shuffled training images in a 10x10 grid.
fig, axes = plt.subplots(10, 10, figsize=(28, 28))
axes = axes.ravel()  # Flatten the 10x10 grid of axes into an array of 100 subplots

# Spacing belongs to the figure as a whole: set it once rather than on every iteration
fig.subplots_adjust(hspace=0.8)

# Fill the panels in order; stop early if there are fewer images than panels
for i in range(min(len(axes), len(rotated_images))):
    image = rotated_images[i].reshape(28, 28)  # each flat row is viewed as a 28x28 image
    axes[i].imshow(image, cmap='gray')
    axes[i].set_title(f"Label: {rotated_labels[i]}", fontsize=8)
    axes[i].axis('off')

plt.show()

<h1> Visualization of Augmented Testing Data </h1>

In [None]:

# Show the first 100 re-oriented, shuffled test images in a 10x10 grid.
fig, axes = plt.subplots(10, 10, figsize=(28, 28))
axes = axes.ravel()  # Flatten the 10x10 grid of axes into an array of 100 subplots

# Spacing belongs to the figure as a whole: set it once rather than on every iteration
fig.subplots_adjust(hspace=0.8)

# Fill the panels in order; stop early if there are fewer images than panels
for i in range(min(len(axes), len(rotated_test_images))):
    image = rotated_test_images[i].reshape(28, 28)  # each flat row is viewed as a 28x28 image
    axes[i].imshow(image, cmap='gray')
    axes[i].set_title(f"Label: {rotated_test_labels[i]}", fontsize=8)
    axes[i].axis('off')

plt.show()

<h1>Split Data: Train = Train, Validation</h1>

In [None]:
# Positional hold-out split of the shuffled training data:
# rows before index 48000 are used for training, the remaining rows for validation.
train_rows = slice(None, 48000)
val_rows = slice(48000, None)

pixel_train_features, pixel_train_labels = rotated_images[train_rows], rotated_labels[train_rows]
pixel_val_features, pixel_val_labels = rotated_images[val_rows], rotated_labels[val_rows]

# Report the four resulting shapes, one per line
print(
    pixel_train_features.shape,
    pixel_train_labels.shape,
    pixel_val_features.shape,
    pixel_val_labels.shape,
    sep="\n",
)

<h1> Xgboost </h1>

In [None]:

# Wrap each (features, labels) pair in an xgb.DMatrix: train, validation, then test.
pixel_train, pixel_val, pixel_test = (
    xgb.DMatrix(features, label=labels)
    for features, labels in (
        (pixel_train_features, pixel_train_labels),
        (pixel_val_features, pixel_val_labels),
        (rotated_test_images, rotated_test_labels),
    )
)
print(rotated_test_labels.shape)

<h1> Parameters </h1>

In [None]:

# XGBoost training configuration, kept in one place so it is easy to tune.
params = dict(
    num_class=10,               # number of target classes
    max_depth=5,                # maximum tree depth
    eta=0.5,                    # learning rate
    eval_metric='mlogloss',     # metric reported during evaluation
    objective='multi:softmax',  # make sure the softmax objective is the one in use
)

