In [1]:
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import os
import pickle
from collections import Counter
import json
import subprocess

print(tf.__version__)

gpus = tf.config.list_physical_devices('GPU')
if gpus:
  try:
    # Currently, memory growth needs to be the same across GPUs
    for gpu in gpus:
      tf.config.experimental.set_memory_growth(gpu, True)
    logical_gpus = tf.config.list_logical_devices('GPU')
    print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
  except RuntimeError as e:
    # Memory growth must be set before GPUs have been initialized
    print(e)

2025-10-08 21:35:54.251462: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE3 SSE4.1 SSE4.2 AVX AVX2 AVX512F AVX512_VNNI AVX512_BF16 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


2.16.2
1 Physical GPUs, 1 Logical GPUs


2025-10-08 21:35:56.469777: I external/local_xla/xla/stream_executor/rocm/rocm_executor.cc:926] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2025-10-08 21:36:00.064968: I external/local_xla/xla/stream_executor/rocm/rocm_executor.cc:926] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2025-10-08 21:36:00.065003: I external/local_xla/xla/stream_executor/rocm/rocm_executor.cc:926] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2025-10-08 21:36:00.065987: I external/local_xla/xla/stream_executor/rocm/rocm_executor.cc:926] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2025-10-08 21:36:00.066027: I external/local_xla/xla/stream_executor/rocm/rocm_executor.

In [3]:
# clean for debug purpose only 

trainData = None
valData = None
testData = None
model = None

tf.keras.backend.clear_session(free_memory=True)


## Loading the data

TODO

In [2]:
# load images and labels 


labelSet = Counter()
dataDict = {}
vocab = set()


datasetLen = 0
with open("./A2_train_v3.jsonl", "r") as jsonFile:
	for line in jsonFile:
		datasetLen += 1
		loadedLine = json.loads(line)
		if loadedLine["Image_ID"] not in dataDict:
			dataDict[loadedLine["Image_ID"]] = []

		labelSet[loadedLine["Label"]] += 1

		hypo = [''.join(char for char in word if char.isalnum()) for word in loadedLine["Hypothesis"].lower().split()]
		vocab.update(hypo)

		dataDict[loadedLine["Image_ID"]].append((hypo, loadedLine["Label"]))

labelTuple = tuple(labelSet.keys())
vocab = list(vocab)

vocabIndex = {vocab[i]: i for i in range(len(vocab))}

print(len(dataDict.keys()))
print(len(vocabIndex))
print(datasetLen)
print(labelSet)

19573
9274
39129
Counter({'entailment': 19619, 'contradiction': 19510})


## Creating the tensorflow dataset

TODO

In [3]:
# load the glove embeddings 

def getGlove():
  print('Downloading glove')
  subprocess.run(['wget', 'https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip'])
  subprocess.run(['unzip', '-q glove.6B.zip'])

def generateMatrix(dim):
    print('parsing glove data')
    embeddingMatrix = np.zeros((len(vocab), dim))
    embeddedVocab = []
    
    with open(f'glove.6B.{dim}d.txt', encoding="utf-8") as gloveFile:
      for line in gloveFile:
        values = line.split()
        word = values[0]
    
        if word in vocab:
          embeddingMatrix[vocabIndex[word]] = np.asarray(values[1:], dtype='float32')
          embeddedVocab.append(word)

    print(f'embedded {len(embeddedVocab)} out of {len(vocab)}')
    return embeddingMatrix


embeddingDim = 200
embeddingMatrix = None

if not os.path.isfile(f'glove.6B.{embeddingDim}d.txt'):
  getGlove()

if os.path.isfile(f'embeddingMatrix.{embeddingDim}d.pkl'):
  with open(f'embeddingMatrix.{embeddingDim}d.pkl', 'rb') as f:
    embeddingMatrix = pickle.load(f)

  print('loaded matrix')
else:
  embeddingMatrix = generateMatrix(embeddingDim)

  with open(f'embeddingMatrix.{embeddingDim}d.pkl', 'ab') as f:
    pickle.dump(embeddingMatrix, f)

  print('saved matrix')

loaded matrix


In [68]:
maxLen = 128

X1array = []
X2array = []
YArray = []

for key, hypoAndLabels in dataDict.items():
	img = f'./A2_Images/{key}.jpg'

	for hypo, label in hypoAndLabels:
		label = tf.convert_to_tensor([labelTuple.index(label)])
		label.set_shape([1])

		hypo = [vocabIndex[word] for word in hypo]
		hypo = tf.convert_to_tensor(tf.keras.preprocessing.sequence.pad_sequences([hypo], maxlen=maxLen)[0])
		hypo.set_shape([maxLen])

		X1array.append(img)
		X2array.append(hypo)
		YArray.append(label)

X1Numpy = np.array(X1array)
X2Numpy = np.array(X2array, dtype='uint16')
YNumpy = np.array(YArray, dtype='uint8')

dataset = tf.data.Dataset.from_tensor_slices(({'image': X1Numpy, 'text': X2Numpy}, YNumpy))

def getImage(path):
	img = tf.io.read_file(path)
	img = tf.io.decode_image(img, channels=3, dtype=tf.uint8)
	img = tf.image.resize(img, (512, 512))
	img = tf.image.convert_image_dtype(img, dtype=tf.bfloat16)
	img = tf.reshape(img, (1, 512, 512, 3))
	img = tf.image.extract_patches(
		images=img,
		sizes=[1, 16, 16, 1],
		strides=[1, 16, 16, 1],
		rates=[1, 1, 1, 1],
		padding='VALID'
	)
	img = tf.reshape(img, (32,32,768))

	return img

def getImageWrapper(x, y):
	img = tf.py_function(func=getImage, inp=[x['image']], Tout=tf.bfloat16)
	img.set_shape([32,32,768])

	x['image'] = img 
	return x, y

dataset = dataset.map(getImageWrapper, num_parallel_calls=tf.data.AUTOTUNE)

In [69]:
for i in dataset.take(1):
	print(i)

({'image': <tf.Tensor: shape=(32, 32, 768), dtype=bfloat16, numpy=
array([[[55, 116, 161, ..., 50.25, 114, 165],
        [41.25, 111, 162, ..., 216, 208, 187],
        [214, 212, 190, ..., 206, 199, 180],
        ...,
        [157, 70, 13.625, ..., 175, 176, 162],
        [173, 171, 158, ..., 180, 178, 165],
        [179, 180, 166, ..., 168, 166, 153]],

       [[52.25, 118.5, 162, ..., 53, 120, 163],
        [48, 114, 162, ..., 210, 204, 181],
        [216, 209, 184, ..., 203, 202, 182],
        ...,
        [164, 29, 1.25, ..., 186, 188, 167],
        [175, 176, 162, ..., 184, 182, 169],
        [183, 181, 168, ..., 170, 168, 155]],

       [[41.25, 105, 153, ..., 118, 155, 178],
        [58.5, 122, 166, ..., 151, 154, 145],
        [215, 208, 184, ..., 191, 192, 171],
        ...,
        [190, 52.75, 5.28125, ..., 181, 179, 166],
        [187, 188, 169, ..., 187, 185, 172],
        [186, 184, 171, ..., 168, 166, 153]],

       ...,

       [[157, 157, 142, ..., 173, 172, 160],
    

2025-10-08 22:12:02.576742: I tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


In [50]:
image = np.random.random((64, 64, 1)).astype("float32") # 1 RGB image
patches = tf.keras.ops.image.extract_patches(image, (16, 16))

print(patches)
print(tf.reshape(patches, (256, 4, 4)).numpy())


tf.Tensor(
[[[0.8390292  0.7480758  0.90529305 ... 0.79180294 0.9660142  0.04925289]
  [0.88415784 0.12577565 0.28305644 ... 0.908674   0.7082081  0.07784933]
  [0.14504912 0.8904532  0.25477633 ... 0.5163142  0.61102545 0.63277656]
  [0.4768772  0.7293987  0.89062566 ... 0.31080365 0.19454598 0.73740184]]

 [[0.04842119 0.78201604 0.5962284  ... 0.393729   0.64133334 0.30180177]
  [0.74369717 0.80858225 0.8433     ... 0.6428573  0.7747854  0.8645563 ]
  [0.35362944 0.45882687 0.21380596 ... 0.83703375 0.08469753 0.13021818]
  [0.41641042 0.12799005 0.35094154 ... 0.36266577 0.86535305 0.18672238]]

 [[0.36918664 0.23469527 0.29032108 ... 0.21911213 0.06178004 0.77791804]
  [0.87476987 0.68608606 0.0249742  ... 0.12947448 0.14006604 0.9795229 ]
  [0.50127435 0.38305467 0.7084395  ... 0.1350239  0.37582183 0.73694515]
  [0.36680695 0.8297529  0.6200732  ... 0.6234115  0.6546367  0.80751973]]

 [[0.46511182 0.80500436 0.09670209 ... 0.12187485 0.54915303 0.60303885]
  [0.23125681 0.80721

In [8]:
testSize = int(datasetLen * 0.05)
valSize = int(datasetLen * 0.05)
trainSize = int(datasetLen - testSize - valSize)
batchSize = 100

def optimize(ds):
	ds = ds.batch(batchSize) 
	ds = ds.cache() 
	ds = ds.prefetch(buffer_size=tf.data.AUTOTUNE)
	
	return ds

def getTest(ds):
	ds = ds.take(testSize) 
	ds = optimize(ds)
	
	return ds

def getVal(ds):
	ds = ds.skip(testSize)
	ds = ds.take(valSize) 
	ds = optimize(ds)

	return ds

def getTrain(ds):
	ds = ds.skip(valSize + testSize)
	ds = ds.take(trainSize)
	ds = optimize(ds)

	return ds

testDS = getTest(dataset)
valDS = getVal(dataset)
trainDS = getTrain(dataset)


print(f"test data batches {tf.data.experimental.cardinality(testDS).numpy()}")
print(f"val data batches {tf.data.experimental.cardinality(valDS).numpy()}")
print(f"train data batches {tf.data.experimental.cardinality(trainDS).numpy()}")
print(f'ratios test:{testSize} val:{valSize} train:{trainSize}')


test data batches 20
val data batches 20
train data batches 353
ratios test:1956 val:1956 train:35217


# Training

ToDo


In [None]:
# Image Encoder 
class PatchEncoder(Layer):
    def __init__(self, num_patches=196, projection_dim=768):
        super(PatchEncoder, self).__init__()
        self.num_patches = num_patches
        self.projection_dim = projection_dim
        w_init = tf.random_normal_initializer()
        class_token = w_init(shape=(1, projection_dim), dtype="float32")
        self.class_token = tf.Variable(initial_value=class_token, trainable=True)
        self.projection = Dense(units=projection_dim)
        self.position_embedding = Embedding(input_dim=num_patches+1, output_dim=projection_dim)

    def call(self, patch):
        batch = tf.shape(patch)[0]
        # reshape the class token embedins
        class_token = tf.tile(self.class_token, multiples = [batch, 1])
        class_token = tf.reshape(class_token, (batch, 1, self.projection_dim))
        # calculate patches embeddings
        patches_embed = self.projection(patch)
        patches_embed = tf.concat([patches_embed, class_token], 1)
        # calcualte positional embeddings
        positions = tf.range(start=0, limit=self.num_patches+1, delta=1)
        positions_embed = self.position_embedding(positions)
        # add both embeddings
        encoded = patches_embed + positions_embed
        return encoded

inputImageLayer = tf.keras.layers.Input(shape=(224, 224, 3), name='image')


# imagenet pre made model
baseModel = tf.keras.applications.MobileNetV3Large(
  weights='imagenet',
  include_top=False,
  pooling='max'
)

baseModel.trainable = False
convoLayers = baseModel(inputImageLayer, training=False)
denseImage = tf.keras.layers.Dense(256, activation='relu', name='denseImage')(convoLayers)

imageOut = tf.keras.layers.LayerNormalization()(denseImage)


# Text Encoder 

inputTextLayer = tf.keras.layers.Input(shape=(maxLen,), name='text')
embeddingText = tf.keras.layers.Embedding(len(vocab), embeddingDim, mask_zero=True, weights=[embeddingMatrix], trainable=False, name='embedding')(inputTextLayer)
attentionText = tf.keras.layers.MultiHeadAttention(8, maxLen, name='attentionText', output_shape=(256))(denseText, denseText, denseText)
attentionText = tf.keras.layers.MultiHeadAttention(8, maxLen, name='attentionText', output_shape=(256))(denseText, denseText, denseText)

textOut = tf.keras.layers.LayerNormalization()(attentionText)

# Entanglement decoder 
decoderInput = tf.keras.layers.Concatenate()([imageOut, textOut])

dense1 = tf.keras.layers.Dense(256, activation='relu')(decoderInput)
dense2 = tf.keras.layers.Dense(256, activation='relu')(dense1)

decoderOutput = tf.keras.layers.Dense(1, name='output', activation='sigmoid')(dense2)

model = tf.keras.Model(inputs=[inputImageLayer, inputTextLayer], outputs=decoderOutput)

model.compile(
  optimizer='adam',
  loss='binary_crossentropy',
  metrics=['accuracy']
)
model.summary()


  return MobileNetV3(


NameError: name 'denseText' is not defined

In [None]:
# Training

history = model.fit(
  trainDS,
	validation_data=valDS,
  epochs=8,
  batch_size=batchSize
)

In [None]:
model.save('./final.keras')