In [1]:
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import os
import pickle
from collections import Counter
import json
import subprocess
from vit_keras import vit

print(tf.__version__)

gpus = tf.config.list_physical_devices('GPU')
if gpus:
  try:
    # Currently, memory growth needs to be the same across GPUs
    for gpu in gpus:
      tf.config.experimental.set_memory_growth(gpu, True)
    logical_gpus = tf.config.list_logical_devices('GPU')
    print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
  except RuntimeError as e:
    # Memory growth must be set before GPUs have been initialized
    print(e)

2025-10-11 13:17:21.958470: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE3 SSE4.1 SSE4.2 AVX AVX2 AVX512F AVX512_VNNI AVX512_BF16 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


2.16.2
1 Physical GPUs, 1 Logical GPUs


2025-10-11 13:17:24.985079: I external/local_xla/xla/stream_executor/rocm/rocm_executor.cc:926] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2025-10-11 13:17:27.450845: I external/local_xla/xla/stream_executor/rocm/rocm_executor.cc:926] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2025-10-11 13:17:27.450891: I external/local_xla/xla/stream_executor/rocm/rocm_executor.cc:926] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2025-10-11 13:17:27.451671: I external/local_xla/xla/stream_executor/rocm/rocm_executor.cc:926] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2025-10-11 13:17:27.451724: I external/local_xla/xla/stream_executor/rocm/rocm_executor.

## Loading the data

TODO

In [2]:
# load images and labels 

labelSet = Counter()
dataDict = {}
vocab = set()

datasetLen = 0
with open("./A2_train_v3.jsonl", "r") as jsonFile:
	for line in jsonFile:
		datasetLen += 1
		loadedLine = json.loads(line)
		if loadedLine["Image_ID"] not in dataDict:
			dataDict[loadedLine["Image_ID"]] = []

		labelSet[loadedLine["Label"]] += 1

		hypo = [''.join(char for char in word if char.isalnum()) for word in loadedLine["Hypothesis"].lower().split()]
		vocab.update(hypo)

		dataDict[loadedLine["Image_ID"]].append((hypo, loadedLine["Label"]))

labelTuple = tuple(labelSet.keys())
vocab = list(vocab)

vocabIndex = {vocab[i]: i for i in range(len(vocab))}

print(len(dataDict.keys()))
print(len(vocabIndex))
print(datasetLen)
print(labelSet)

# load the glove embeddings 

def getGlove():
  print('Downloading glove')
  subprocess.run(['wget', 'https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip'])
  subprocess.run(['unzip', '-q glove.6B.zip'])

def generateMatrix(dim):
    print('parsing glove data')
    embeddingMatrix = np.zeros((len(vocab), dim))
    embeddedVocab = []
    
    with open(f'glove.6B.{dim}d.txt', encoding="utf-8") as gloveFile:
      for line in gloveFile:
        values = line.split()
        word = values[0]
    
        if word in vocab:
          embeddingMatrix[vocabIndex[word]] = np.asarray(values[1:], dtype='float32')
          embeddedVocab.append(word)

    print(f'embedded {len(embeddedVocab)} out of {len(vocab)}')
    return embeddingMatrix


embeddingDim = 200
embeddingMatrix = None

if not os.path.isfile(f'glove.6B.{embeddingDim}d.txt'):
  getGlove()

if os.path.isfile(f'embeddingMatrix.{embeddingDim}d.pkl'):
  with open(f'embeddingMatrix.{embeddingDim}d.pkl', 'rb') as f:
    embeddingMatrix = pickle.load(f)

  print('loaded matrix')
else:
  embeddingMatrix = generateMatrix(embeddingDim)

  with open(f'embeddingMatrix.{embeddingDim}d.pkl', 'ab') as f:
    pickle.dump(embeddingMatrix, f)

  print('saved matrix')

19573
9274
39129
Counter({'entailment': 19619, 'contradiction': 19510})
loaded matrix


## Creating the tensorflow dataset

TODO

In [None]:
maxLen = 128

X1array = []
X2array = []
YArray = []

for key, hypoAndLabels in dataDict.items():
	img = f'./A2_Images/{key}.jpg'

	for hypo, label in hypoAndLabels:
		label = tf.convert_to_tensor([labelTuple.index(label)])
		label.set_shape([1])

		hypo = [vocabIndex[word] for word in hypo]
		hypo = tf.convert_to_tensor(tf.keras.preprocessing.sequence.pad_sequences([hypo], maxlen=maxLen)[0])
		hypo.set_shape([maxLen])

		X1array.append(img)
		X2array.append(hypo)
		YArray.append(label)

X1Numpy = np.array(X1array)
X2Numpy = np.array(X2array, dtype='uint16')
YNumpy = np.array(YArray, dtype='uint8')

dataset = tf.data.Dataset.from_tensor_slices(({'image': X1Numpy, 'text': X2Numpy}, YNumpy))

imageChannels = 3
imageRes = 224
patchesPerImage = 8

patchRes = int(imageRes / patchesPerImage) 
patchNum = patchesPerImage ** 2 * imageChannels 

def getImage(path):
	img = tf.io.read_file(path)
	img = tf.io.decode_image(img, channels=imageChannels, dtype=tf.float32)
	img = tf.image.resize(img, (imageRes, imageRes))

	return img

def getImageWrapper(x, y):
	img = tf.py_function(func=getImage, inp=[x['image']], Tout=tf.float32)
	img.set_shape([imageRes, imageRes, imageChannels])

	x['image'] = img 
	return x, y

dataset = dataset.map(getImageWrapper, num_parallel_calls=8)

testSize = int(datasetLen * 0.05)
valSize = int(datasetLen * 0.05)
trainSize = int(datasetLen - testSize - valSize)
batchSize = 100

def optimize(ds, name):
	ds = ds.batch(batchSize) 
	ds = ds.cache(name + 'dataset.cache') 
	ds = ds.prefetch(buffer_size=tf.data.AUTOTUNE)
	
	return ds

def getTest(ds):
	ds = ds.take(testSize) 
	ds = optimize(ds, 'test')
	
	return ds

def getVal(ds):
	ds = ds.skip(testSize)
	ds = ds.take(valSize) 
	ds = optimize(ds, 'val')

	return ds

def getTrain(ds):
	ds = ds.skip(valSize + testSize)
	ds = ds.take(trainSize)
	ds = optimize(ds, 'train')

	return ds

testDS = getTest(dataset)
valDS = getVal(dataset)
trainDS = getTrain(dataset)


print(f"test data batches {tf.data.experimental.cardinality(testDS).numpy()}")
print(f"val data batches {tf.data.experimental.cardinality(valDS).numpy()}")
print(f"train data batches {tf.data.experimental.cardinality(trainDS).numpy()}")
print(f'ratios test:{testSize} val:{valSize} train:{trainSize}')


test data batches 20
val data batches 20
train data batches 353
ratios test:1956 val:1956 train:35217


In [None]:
# clear dataset cache

for f in os.listdir('./'):  
	if 'dataset.cache' in f:
		os.remove(f)

# Training

ToDo


In [None]:
densSize = 256
imageTransformerLen = 256

class PositionEncoder(tf.keras.Layer):
	def __init__(self, patchRes, patchNum, name=None):
		super(PositionEncoder, self).__init__(name=name)
		self.patchRes = patchRes
		self.patchNum = patchNum
		w_init = tf.random_normal_initializer()
		classToken = w_init(shape=(1, patchNum), dtype=tf.float32)
		self.classToken = tf.Variable(initial_value=classToken, trainable=True)
		self.projection =tf.keras.layers.Dense(units=patchNum)
		self.positionEmbedding = tf.keras.layers.Embedding(input_dim=patchRes+1, output_dim=patchNum, mask_zero=True)

	def call(self, patch):
		batch = tf.shape(patch)[0]
		classToken = tf.tile(self.classToken, multiples = [batch, 1])
		classToken = tf.reshape(classToken, (batch, 1, self.patchNum))
		patchesEmbed = self.projection(patch)
		patchesEmbed = tf.concat([patchesEmbed, classToken], 1)
		positions = tf.range(start=0, limit=self.patchRes+1, delta=1)
		positionsEmbed = self.positionEmbedding(positions)
		encoded = patchesEmbed + positionsEmbed
		return encoded

class Transformer(tf.keras.Layer):
	def __init__(self, dim, heads=4, dropout=0.1, name=None):
		super(Transformer, self).__init__(name=name)
		self.norm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
		self.attn = tf.keras.layers.MultiHeadAttention(num_heads=heads, key_dim=dim, dropout=dropout)
		self.norm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
		self.dense1 = tf.keras.layers.Dense(dim * 2, activation='relu')
		self.dense2 = tf.keras.layers.Dense(dim, activation='relu')
		self.drop = tf.keras.layers.Dropout(dropout)

	def call(self, inputLayers):
		attention = self.attn(inputLayers, inputLayers)
		attention = tf.keras.layers.Add()([attention, inputLayers])
		attention = self.norm1(attention)

		mlp = self.drop(attention)
		mlp = self.dense1(mlp)
		mlp = self.drop(mlp)
		mlp = self.dense2(mlp)
		mlp = tf.keras.layers.Add()([mlp, attention])
		mlp = self.norm2(mlp)

		return mlp

# Image Encoder 
inputImageLayer = tf.keras.layers.Input(shape=(imageRes, imageRes, imageChannels), name='image')

baseModel = tf.keras.applications.MobileNetV3Large(
	weights='imagenet',  
	input_shape=(imageRes, imageRes, imageChannels),
	include_top=False,
	pooling='max'
)

baseModel.trainable = False
imageOut = baseModel(inputImageLayer, training=False)
# pretrainedLayers = tf.keras.layers.Reshape((49, -1))(pretrainedLayers)
# imageOut = tf.keras.layers.ZeroPadding1D((1, 0), name='imageOut')(pretrainedLayers)

# imageOut = tf.keras.layers.GlobalMaxPooling1D(name='textOut')(pretrainedLayers)


# convoLayers = tf.keras.layers.Conv2D(imageTransformerLen, 16, strides=16, activation='relu')(inputImageLayer)
# convoLayers = tf.keras.layers.Reshape((-1, imageTransformerLen))(convoLayers)

# transformerImage = Transformer(imageTransformerLen)(convoLayers)
# transformerImage = Transformer(imageTransformerLen)(transformerImage)
# transformerImage = Transformer(imageTransformerLen)(transformerImage)

# imageOut = tf.keras.layers.AveragePooling1D(4)(transformerImage)
# imageOut = tf.keras.layers.ZeroPadding1D((1, 0), name='imageOut')(imageOut)


# Text Encoder 
inputTextLayer = tf.keras.layers.Input(shape=(maxLen,), name='text')

embeddingText = tf.keras.layers.Embedding(len(vocab), embeddingDim, weights=[embeddingMatrix], trainable=False)(inputTextLayer)
reshapeText = tf.keras.layers.Reshape((embeddingDim, maxLen))(embeddingText)
embeddingPositionText = PositionEncoder(embeddingDim, maxLen, name='embeddingText')(reshapeText)

transformerText = Transformer(maxLen)(embeddingPositionText)
# transformerText = Transformer(maxLen)(transformerText)
# transformerText = Transformer(maxLen)(transformerText)

textOut = tf.keras.layers.GlobalMaxPooling1D(name='textOut')(transformerText)

# Entanglement decoder 
decoderInput = tf.keras.layers.Concatenate(name='decoderInput')([imageOut, textOut])

dense = tf.keras.layers.Dense(densSize, activation='relu')(decoderInput)
dense = tf.keras.layers.Dense(densSize, activation='relu')(dense)
dense = tf.keras.layers.Dense(densSize, activation='relu')(dense)

decoderOutput = tf.keras.layers.Dense(1, name='output', activation='sigmoid')(dense)

model = tf.keras.Model(inputs=[inputImageLayer, inputTextLayer], outputs=decoderOutput)

model.compile(
  optimizer='adam',
  loss='binary_crossentropy',
  metrics=['accuracy']
)
model.summary()


In [87]:
# Training

history = model.fit(
  trainDS,
	validation_data=valDS,
  epochs=8,
  batch_size=batchSize
)

Epoch 1/8
[1m353/353[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m38s[0m 82ms/step - accuracy: 0.5501 - loss: 0.6890 - val_accuracy: 0.6048 - val_loss: 0.6674
Epoch 2/8
[1m353/353[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 60ms/step - accuracy: 0.6020 - loss: 0.6659 - val_accuracy: 0.5706 - val_loss: 0.7081
Epoch 3/8
[1m353/353[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 59ms/step - accuracy: 0.5801 - loss: 0.6757 - val_accuracy: 0.5716 - val_loss: 0.6783
Epoch 4/8
[1m353/353[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 59ms/step - accuracy: 0.5892 - loss: 0.6726 - val_accuracy: 0.5700 - val_loss: 0.6795
Epoch 5/8
[1m353/353[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 59ms/step - accuracy: 0.5614 - loss: 0.6817 - val_accuracy: 0.4949 - val_loss: 0.6894
Epoch 6/8
[1m129/353[0m [32m━━━━━━━[0m[37m━━━━━━━━━━━━━[0m [1m13s[0m 58ms/step - accuracy: 0.5729 - loss: 0.6809

KeyboardInterrupt: 

In [None]:
model.save('a2-models/pretrained-image-b16.keras')

In [None]:
tf.keras.backend.clear_session(free_memory=True)
import gc 
gc.collect()

In [80]:
model = tf.keras.models.load_model('a2-models/firstGood.keras')

TypeError: <class 'keras.src.models.functional.Functional'> could not be deserialized properly. Please ensure that components that are Python object instances (layers, models, etc.) returned by `get_config()` are explicitly deserialized in the model's `from_config()` method.

config={'module': 'keras.src.models.functional', 'class_name': 'Functional', 'config': {}, 'registered_name': 'Functional', 'build_config': {'input_shape': None}, 'compile_config': {'optimizer': {'module': 'keras.optimizers', 'class_name': 'Adam', 'config': {'name': 'adam', 'learning_rate': 0.0010000000474974513, 'weight_decay': None, 'clipnorm': None, 'global_clipnorm': None, 'clipvalue': None, 'use_ema': False, 'ema_momentum': 0.99, 'ema_overwrite_frequency': None, 'loss_scale_factor': None, 'gradient_accumulation_steps': None, 'beta_1': 0.9, 'beta_2': 0.999, 'epsilon': 1e-07, 'amsgrad': False}, 'registered_name': None}, 'loss': 'binary_crossentropy', 'loss_weights': None, 'metrics': ['accuracy'], 'weighted_metrics': None, 'run_eagerly': False, 'steps_per_execution': 1, 'jit_compile': True}}.

Exception encountered: Could not locate class 'PositionEncoder'. Make sure custom classes are decorated with `@keras.saving.register_keras_serializable()`. Full object config: {'module': None, 'class_name': 'PositionEncoder', 'config': {'name': 'embeddingImage', 'patchRes': 256, 'patchNum': 192, 'trainable': True, 'dtype': {'module': 'keras', 'class_name': 'DTypePolicy', 'config': {'name': 'float32'}, 'registered_name': None, 'shared_object_id': 125264365974432}}, 'registered_name': 'PositionEncoder', 'build_config': {'input_shape': [None, 256, 192]}, 'name': 'embeddingImage', 'inbound_nodes': [{'args': [{'class_name': '__keras_tensor__', 'config': {'shape': [None, 256, 192], 'dtype': 'float32', 'keras_history': ['image', 0, 0]}}], 'kwargs': {}}]}