In [4]:
import os
import csv
import torch
import random
import hashlib
import itertools
import numpy as np
import polars as pl
from PIL import Image
from tqdm import tqdm
import torch.nn as nn
import matplotlib.pyplot as plt
from collections import Counter
import torch.nn.functional as nnF
from datetime import date, datetime
import torchvision.transforms as transforms
import torchvision.transforms.functional as F
from torch.utils.data import DataLoader, Dataset

from sklearn.feature_extraction.image import img_to_graph

In [5]:
# Fix the seed of Python's built-in RNG so that the random.sample draws
# further down pick the same labels/indices on every run.
# Note: this call seeds only the `random` module.
RANDOM_SEED = 234213
random.seed(RANDOM_SEED)

In [6]:
# collecting training image names and paths into lists
training_images_output_path = "../../replicatingAlexNet/images/training_images_processed/"

# The directory is listed once and the paths are derived from the names, so
# training_images_processed_path[i] always refers to
# training_images_processed_names[i] (os.listdir returns entries in arbitrary
# order, so two independent listings should not be relied on to agree).
# NOTE(review): the order here is still whatever os.listdir returns; confirm
# that nothing downstream pairs these lists by position with
# training_labels_for_CEloss unless the CSV was written in the same order.
training_images_processed_names = [
    f for f in os.listdir(training_images_output_path)
    if f.endswith('.JPEG')
]

training_images_processed_path = [
    training_images_output_path + f for f in training_images_processed_names
]

# collecting validation image names and paths into lists
validation_images_output_path = "../../replicatingAlexNet/images/validation_images_processed/"

# sorting names in ascending order; this is crucial to ensure that the
# proper validation labels are attached to the file names
validation_images_processed_names = sorted(
    f for f in os.listdir(validation_images_output_path)
    if f.endswith('.JPEG')
)

# Built from the *sorted* names so the path list stays index-aligned with the
# name list (previously only the names were sorted, leaving the paths in
# os.listdir order).
validation_images_processed_path = [
    validation_images_output_path + f for f in validation_images_processed_names
]

# one label per row of the CSV, taken from its "made_up_label" column
labels_w_information = pl.read_csv("../../replicatingAlexNet/images/labels_w_information.csv")
training_labels_for_CEloss = labels_w_information["made_up_label"].to_list()

In [7]:
# Sanity check: print the graph that sklearn's img_to_graph builds from the
# first training image, returned as a dense numpy array.
# The image is opened in a `with` block so its file handle is closed as soon
# as the pixel data has been converted to a tensor (a bare Image.open inside
# the call chain leaves the file open).
with Image.open(training_images_processed_path[0]) as first_training_image:
    first_image_tensor = F.pil_to_tensor(first_training_image)

print(
    img_to_graph(
        first_image_tensor,
        return_as=np.ndarray
    )
)

[[17  0  0 ...  0  0  0]
 [ 0 17  3 ...  0  0  0]
 [ 0  3 14 ...  0  0  0]
 ...
 [ 0  0  0 ... 11  0  0]
 [ 0  0  0 ...  0 11  2]
 [ 0  0  0 ...  0  2  9]]


In [8]:
# USING SOME PREVIOUSLY WRITTEN CODE

# keeping only labels with at least 1200 training examples (count >= 1200)
# and then randomly selecting 25 of those labels
labels_thousand_plus = [
    label for label, count in Counter(training_labels_for_CEloss).items()
    if count >= 1200
]
labels_thousand_plus_subset = random.sample(labels_thousand_plus, k=25)

# mapping the subset to numbers between 0-24 (position in the sampled list)
# this is needed for the CE loss function
labels_subset_mapped = {
    original_label: new_label
    for new_label, original_label in enumerate(labels_thousand_plus_subset)
}

# a dictionary with training-example indices as keys and their (original)
# labels as values, restricted to examples whose label is one of the 25
# sampled labels; membership is checked against the dict keys of
# labels_subset_mapped (hash lookup) instead of scanning the list for every
# training label
label_w_index = {
    index: label
    for index, label in enumerate(training_labels_for_CEloss)
    if label in labels_subset_mapped
}

# randomly selecting 256 * 128 = 32,768 of those indices, which are then used
# to select the subset of inputs and labels in training_subset
# (random.sample raises ValueError if fewer indices than that are available)
random_indices = random.sample(list(label_w_index), k=256 * 128)

In [11]:
# reading the validation ground-truth IDs shipped with the ImageNet devkit:
# every line of the file is parsed as one integer, in file order
with open("../../replicatingAlexNet/devkit-1.0/data/ILSVRC2010_validation_ground_truth.txt") as ground_truth_file:
    validation_ILSVRC2010_IDs = [
        int(raw_line.rstrip()) for raw_line in ground_truth_file
    ]

In [12]:
# converting the IDs to labels for the validation data by shifting every ID
# down by one (presumably 1-based devkit IDs -> 0-based labels; confirm that
# this matches the "made_up_label" numbering used for training)
validation_true_labels = [class_id - 1 for class_id in validation_ILSVRC2010_IDs]

# zip() silently stops at the shorter input, so a count mismatch between the
# image names and the ground-truth labels would go unnoticed and the
# name -> label pairing below could not be trusted; fail loudly instead
if len(validation_images_processed_names) != len(validation_true_labels):
    raise ValueError(
        f"Found {len(validation_images_processed_names)} validation images but "
        f"{len(validation_true_labels)} ground-truth labels; cannot pair them by position."
    )

# set for constant-time membership checks against the 25 sampled labels
_subset_label_set = set(labels_thousand_plus_subset)

# image file name -> original label, only for images whose label is one of
# the 25 sampled labels (names and labels are paired by position)
validation_subset_25_labels = {
    image_name: label
    for image_name, label in zip(validation_images_processed_names, validation_true_labels)
    if label in _subset_label_set
}