In [59]:
import numpy as np
import kagglehub
import os
from collections import defaultdict
from scipy.sparse import csr_matrix

In [2]:
if not os.path.exists("/content/data"):
  !kaggle datasets download -d hojjatk/mnist-dataset -p /content
  !unzip /content/mnist-dataset.zip -d /content/data
  !rm /content/mnist-dataset.zip

Dataset URL: https://www.kaggle.com/datasets/hojjatk/mnist-dataset
License(s): copyright-authors
Downloading mnist-dataset.zip to /content
 91% 20.0M/22.0M [00:01<00:00, 19.6MB/s]
100% 22.0M/22.0M [00:01<00:00, 12.9MB/s]
Archive:  /content/mnist-dataset.zip
  inflating: /content/data/t10k-images-idx3-ubyte/t10k-images-idx3-ubyte  
  inflating: /content/data/t10k-images.idx3-ubyte  
  inflating: /content/data/t10k-labels-idx1-ubyte/t10k-labels-idx1-ubyte  
  inflating: /content/data/t10k-labels.idx1-ubyte  
  inflating: /content/data/train-images-idx3-ubyte/train-images-idx3-ubyte  
  inflating: /content/data/train-images.idx3-ubyte  
  inflating: /content/data/train-labels-idx1-ubyte/train-labels-idx1-ubyte  
  inflating: /content/data/train-labels.idx1-ubyte  


In [3]:
import numpy as np
import struct

def read_idx(filename):
    with open(filename, 'rb') as f:
        zero, data_type, dims = struct.unpack('>HBB', f.read(4))
        shape = tuple(struct.unpack('>I', f.read(4))[0] for d in range(dims))
        return np.frombuffer(f.read(), dtype=np.uint8).reshape(shape)

# Assuming the file is in the /content/mnist-dataset directory
images = read_idx('/content/data/train-images-idx3-ubyte/train-images-idx3-ubyte')
labels = read_idx('/content/data/train-labels-idx1-ubyte/train-labels-idx1-ubyte')

print(images.shape)  # Print the shape of the image data
print(labels.shape)  # Print the shape of the label datan

(60000, 28, 28)
(60000,)


In [4]:
idxInImage = [np.where(labels == i)[0] for i in range(10)]
idxInImage

[array([    1,    21,    34, ..., 59952, 59972, 59987]),
 array([    3,     6,     8, ..., 59979, 59984, 59994]),
 array([    5,    16,    25, ..., 59983, 59985, 59991]),
 array([    7,    10,    12, ..., 59978, 59980, 59996]),
 array([    2,     9,    20, ..., 59943, 59951, 59975]),
 array([    0,    11,    35, ..., 59968, 59993, 59997]),
 array([   13,    18,    32, ..., 59982, 59986, 59998]),
 array([   15,    29,    38, ..., 59963, 59977, 59988]),
 array([   17,    31,    41, ..., 59989, 59995, 59999]),
 array([    4,    19,    22, ..., 59973, 59990, 59992])]

In [5]:
class Neuron:
    def __init__(self, pixelNum):
        self.pixelNum = pixelNum
        self.connections = defaultdict(int)

In [6]:
neurons = [Neuron(i) for i in range(28*28)]

In [56]:
num_pixels = images.shape[1] * images.shape[2]  # Total number of neurons (one per pixel)
connection_matrix = np.zeros((num_pixels, num_pixels), dtype=np.int32)  # Efficient storage

In [7]:
# Set the pixel threshold
pixelThreshold = 100

# Get image dimensions (MNIST images are 28x28)
image_height, image_width = images.shape[1], images.shape[2]

# Find all pixel indices where the value > pixelThreshold
image_idx, row_idx, col_idx = np.where(images > pixelThreshold)

# Convert (row, col) to a single pixel index
pixel_nums = row_idx * image_width + col_idx

# Get unique image indices and where they appear
unique_images, start_indices = np.unique(image_idx, return_index=True)

# Split pixel numbers efficiently for each image
image_pixel_lists = np.split(pixel_nums, start_indices[1:])

# Create a dictionary mapping image index -> list of pixel numbers
image_pixel_dict = dict(zip(unique_images, image_pixel_lists))

# Print pixel numbers for the first few images
for img_id in sorted(image_pixel_dict.keys())[:5]:  # Print for first 5 images
    print(f"Image {img_id}: {image_pixel_dict[img_id]}")

Image 0: [156 157 158 160 161 162 163 179 180 181 182 183 184 185 186 187 188 189
 190 204 205 206 207 208 209 210 211 212 213 232 233 234 235 236 237 238
 239 240 241 261 262 263 264 265 269 291 292 319 320 321 348 349 377 378
 379 380 406 407 408 409 435 436 437 438 465 466 467 493 494 495 519 520
 521 522 523 545 546 547 548 549 550 551 571 572 573 574 575 576 577 598
 599 600 601 602 603 623 624 625 626 627 628 629 649 650 651 652 653 654
 655 656 676 677 678 679 680 681 682]
Image 1: [128 129 130 155 156 157 158 159 182 183 184 185 186 187 209 210 211 212
 213 215 216 217 235 236 237 238 239 240 241 243 244 245 263 264 265 266
 267 268 269 272 273 290 291 292 293 296 300 301 317 318 319 320 328 329
 330 344 345 346 356 357 358 372 373 384 385 386 399 400 401 412 413 414
 427 428 429 440 441 442 455 456 467 468 469 483 484 494 495 496 511 512
 521 522 523 539 540 547 548 549 550 567 568 569 573 574 575 576 577 595
 596 597 598 599 600 601 602 603 604 623 624 625 626 627 628 629 630

In [8]:
# Iterate through images and update neuron connections
for img_id, pixels in zip(unique_images, image_pixel_lists):
    # Each pixel in the image is connected to every other pixel in that image
    for i in range(len(pixels)):
        for j in range(i + 1, len(pixels)):  # Avoid redundant loops
            p1, p2 = pixels[i], pixels[j]
            neurons[p1].connections[p2] += 1
            neurons[p2].connections[p1] += 1  # Since connections are bidirectional


# Print sample neuron connections (e.g., first 5 neurons)
for i in range(5):
    print(f"Neuron {i} connections: {dict(neurons[i].connections)}")

KeyboardInterrupt: 

In [57]:
# Iterate through images and update connections efficiently
for img_id, pixels in zip(unique_images, image_pixel_lists):
    # Create all pairwise pixel connections (avoid looping manually)
    pixel_combinations = np.array(np.meshgrid(pixels, pixels)).T.reshape(-1, 2)

    # Remove self-connections
    pixel_combinations = pixel_combinations[pixel_combinations[:, 0] != pixel_combinations[:, 1]]

    # Get row and column indices for batch updates
    row_indices = pixel_combinations[:, 0]
    col_indices = pixel_combinations[:, 1]

    # Use NumPy's advanced indexing to increment values in bulk
    np.add.at(connection_matrix, (row_indices, col_indices), 1)  # Vectorized update

In [53]:
np.array(np.meshgrid(pixels, pixels)).T.reshape(-1, 2)

array([[ 71,  71],
       [ 71,  72],
       [ 71,  73],
       ...,
       [606, 604],
       [606, 605],
       [606, 606]])

In [60]:
num_pixels = 28 * 28  # Total number of pixels in MNIST images
num_images = images.shape[0]

# Find all pixel indices where value > pixelThreshold
image_idx, row_idx, col_idx = np.where(images > pixelThreshold)

# Convert (row, col) to a 1D pixel index
pixel_nums = row_idx * 28 + col_idx

# Create a sparse binary activation matrix
# - Rows represent images
# - Columns represent pixels
# - A 1 in position (i, j) means pixel `j` is active in image `i`
activation_matrix = csr_matrix(
    (np.ones(len(image_idx), dtype=np.int8), (image_idx, pixel_nums)),
    shape=(num_images, num_pixels)
)

# Compute co-occurrence matrix using matrix multiplication
co_occurrence_matrix = activation_matrix.T @ activation_matrix

# Convert to CSR format for efficient lookups
co_occurrence_matrix = co_occurrence_matrix.tocsr()

In [69]:
co_occurrence_matrix.toarray()[131]

array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    1,    8,    8,    2,    2,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    1,    8,   21,   33,   43,   75,  -56,   66,   22,
       -106,   59,   16,    4,    1,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    2,    2,    6,   14,   33,   73, -106,   22,
        -87,  -89,   99,   12,    5,   22,  -15,  -64,   50,   11,    2,
          0,    0,    0,    0,    0,    0,    2,    4,    2,   11,   27,
         43,  104,  -16,  -71,  -43,  120,  -10,  102,  -37,  -68, -121,
        -56,  -86,   -1, -111,   36,    7,    0,    0,    0,    0,    0,
          1,    4,    9,   14,   29,   75,  -85,   