In [33]:
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import os

print(tf.__version__)

gpus = tf.config.list_physical_devices('GPU')
if gpus:
  try:
    # Currently, memory growth needs to be the same across GPUs
    for gpu in gpus:
      tf.config.experimental.set_memory_growth(gpu, True)
    logical_gpus = tf.config.list_logical_devices('GPU')
    print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
  except RuntimeError as e:
    # Memory growth must be set before GPUs have been initialized
    print(e)

2.16.2
1 Physical GPUs, 1 Logical GPUs


In [32]:
# clean for debug purpose only 

trainData = None
valData = None
testData = None
model = None

tf.keras.backend.clear_session(free_memory=True)


## Loading the data

TODO

In [3]:
# load images and labels 
from collections import Counter
import json

labelSet = Counter()
dataDict = {}
vocab = set()


datasetLen = 0
with open("./A2_train_v3.jsonl", "r") as jsonFile:
	for line in jsonFile:
		datasetLen += 1
		loadedLine = json.loads(line)
		if loadedLine["Image_ID"] not in dataDict:
			dataDict[loadedLine["Image_ID"]] = []

		labelSet[loadedLine["Label"]] += 1

		hypo = [''.join(char for char in word if char.isalnum()) for word in loadedLine["Hypothesis"].lower().split()]
		vocab.update(hypo)

		dataDict[loadedLine["Image_ID"]].append((hypo, loadedLine["Label"]))

labelTuple = tuple(labelSet.keys())
vocab = list(vocab)

vocabIndex = {vocab[i]: i for i in range(len(vocab))}

print(len(dataDict.keys()))
print(len(vocabIndex))
print(datasetLen)
print(labelSet)

19573
9274
39129
Counter({'entailment': 19619, 'contradiction': 19510})


## Creating the tensorflow dataset

TODO

In [4]:
embeddingDim = 100
embeddingMatrix = np.zeros((len(vocab), embeddingDim))
embeddedVocab = []

with open(f'glove.6B.{embeddingDim}d.txt', encoding="utf-8") as gloveFile:
	for line in gloveFile:
		values = line.split()
		word = values[0]

		if word in vocab:
			embeddingMatrix[vocabIndex[word]] = np.asarray(values[1:], dtype='float32')
			embeddedVocab.append(word)
        

In [5]:
maxLen = 128

X1array = []
X2array = []
YArray = []

for key, hypoAndLabels in dataDict.items():
	img = f'./A2_Images/{key}.jpg'

	for hypo, label in hypoAndLabels:
		label = tf.convert_to_tensor([labelTuple.index(label)])
		label.set_shape([1])

		hypo = [vocabIndex[word] for word in hypo]
		hypo = tf.convert_to_tensor(tf.keras.preprocessing.sequence.pad_sequences([hypo], maxlen=maxLen)[0])
		hypo.set_shape([maxLen])

		X1array.append(img)
		X2array.append(hypo)
		YArray.append(label)

X1Numpy = np.array(X1array)
X2Numpy = np.array(X2array)
YNumpy = np.array(YArray, dtype='uint8')

dataset = tf.data.Dataset.from_tensor_slices(({'image': X1Numpy, 'text': X2Numpy}, YNumpy))

def getImage(path):
	img = tf.io.read_file(path)
	img = tf.io.decode_image(img, channels=3)
	img = tf.image.resize(img, (224, 224), method='nearest')
	return img

def getImageWrapper(x, y):
	img = tf.py_function(func=getImage, inp=[x['image']], Tout=tf.uint8)
	img.set_shape([224, 224, 3])

	x['image'] = img 
	return x, y

dataset = dataset.map(getImageWrapper, num_parallel_calls=tf.data.AUTOTUNE)

In [36]:
for i in dataset.take(1):
	print(i)

({'image': <tf.Tensor: shape=(224, 224, 3), dtype=uint8, numpy=
array([[[ 55, 118, 162],
        [ 53, 117, 165],
        [ 45, 113, 162],
        ...,
        [179, 177, 164],
        [175, 173, 161],
        [175, 173, 160]],

       [[ 61, 122, 169],
        [ 54, 118, 166],
        [ 53, 119, 169],
        ...,
        [180, 178, 165],
        [174, 173, 155],
        [176, 174, 159]],

       [[ 58, 122, 168],
        [ 57, 121, 169],
        [ 54, 120, 170],
        ...,
        [177, 175, 162],
        [179, 178, 158],
        [176, 174, 159]],

       ...,

       [[129, 132, 141],
        [149, 149, 157],
        [168, 168, 170],
        ...,
        [157, 130, 123],
        [158, 140, 130],
        [155, 140, 133]],

       [[  4,   5,  26],
        [ 15,  12,  31],
        [ 28,  24,  38],
        ...,
        [155, 128, 121],
        [158, 139, 132],
        [155, 140, 133]],

       [[  8,   7,  12],
        [ 12,   8,   7],
        [ 17,  12,   8],
        ...,
        [1

2025-10-06 16:13:41.403357: I tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


In [34]:
testSize = int(datasetLen * 0.05)
valSize = int(datasetLen * 0.05)
trainSize = int(datasetLen - testSize - valSize)
batchSize = 100

def optimize(ds):
	ds = ds.batch(batchSize) 
	ds = ds.cache() 
	ds = ds.prefetch(buffer_size=tf.data.AUTOTUNE)
	
	return ds

def getTest(ds):
	ds = ds.take(testSize) 
	ds = optimize(ds)
	
	return ds

def getVal(ds):
	ds = ds.skip(testSize)
	ds = ds.take(valSize) 
	ds = optimize(ds)

	return ds

def getTrain(ds):
	ds = ds.skip(valSize + testSize)
	ds = ds.take(trainSize)
	ds = optimize(ds)

	return ds

testDS = getTest(dataset)
valDS = getVal(dataset)
trainDS = getTrain(dataset)


print(f"test data batches {tf.data.experimental.cardinality(testDS).numpy() * batchSize}")
print(f"val data batches {tf.data.experimental.cardinality(valDS).numpy() * batchSize}")
print(f"train data batches {tf.data.experimental.cardinality(trainDS).numpy() * batchSize}")
print(f'ratios test:{testSize} val:{valSize} train:{trainSize}')


test data batches 2000
val data batches 2000
train data batches 35300
ratios test:1956 val:1956 train:35217


# Training

The performance metric I chose to optimize for needed to handle heavily imbalanced data so I chose to take the accuracy of my model at predicting each class then taking the weighted arithmetic mean of all of those. I could have also used the macro or micro F1 score but I chose to use accuracy because it is a simple metric that has more real world meaning than the F1 score. For medical this algorithm to be useful in medical diagnosis I want to target an average accuracy of 97.5%, while this is a fairly low accuracy target since the goal of this project is to classify a large number of cells I figure that the count and ratio of call types is the more important than getting every single cell correctly labeled. 

Below is the code I used to compile, train, and evaluate all the models.
I created my own custom loss function based off sparse categorical cross entropy but I added weights to compensate for the class imbalance.
I created accuracy metrics for each class. 

In [55]:
# Compiling
inputImageLayer = tf.keras.layers.Input(shape=(224, 224, 3), name='image')

# imagenet pre made model
baseModel = tf.keras.applications.MobileNetV3Large(
	weights='imagenet',  
	include_top=False,
	pooling='max'
)

baseModel.trainable = False
convoLayers = baseModel(inputImageLayer, training=False)
denseImage = tf.keras.layers.Dense(256, activation='relu', name='denseImage')(convoLayers)

inputTextLayer = tf.keras.layers.Input(shape=(maxLen,), name='text')
embeddingText = tf.keras.layers.Embedding(len(vocab), embeddingDim, mask_zero=True, weights=[embeddingMatrix], trainable=False, name='embedding')(inputTextLayer)
denseText = tf.keras.layers.TimeDistributed(tf.keras.layers.Dense(128, activation='relu'), name='denseText')(embeddingText)
attentionText = tf.keras.layers.MultiHeadAttention(8, maxLen, name='attentionText')(denseText, denseText, denseText)
denseText2 = tf.keras.layers.Dense(256, activation='relu', name='denseText2')(attentionText)

combinedLayer = tf.keras.layers.Dot((2, 1), normalize=True, name='combined')([denseText2, denseImage])
dense1 = tf.keras.layers.Dense(256, activation='relu')(combinedLayer)
dense2 = tf.keras.layers.Dense(256, activation='relu')(dense1)
output = tf.keras.layers.Dense(1, name='output', activation='sigmoid')(dense2)

model = tf.keras.Model(inputs=[inputImageLayer, inputTextLayer], outputs=output)

model.compile(
  optimizer='adam',
  loss='binary_crossentropy',
  metrics=['accuracy']
)
model.summary()


In [56]:
# Training

history = model.fit(
  trainDS,
	validation_data=valDS,
  epochs=8,
  batch_size=batchSize
)

Epoch 1/8
[1m353/353[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m37s[0m 76ms/step - accuracy: 0.6585 - loss: 0.6015 - val_accuracy: 0.7111 - val_loss: 0.5598
Epoch 2/8
[1m353/353[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 42ms/step - accuracy: 0.7231 - loss: 0.5288 - val_accuracy: 0.7050 - val_loss: 0.5464
Epoch 3/8
[1m353/353[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 42ms/step - accuracy: 0.7394 - loss: 0.5041 - val_accuracy: 0.7101 - val_loss: 0.5288
Epoch 4/8
[1m353/353[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 40ms/step - accuracy: 0.7517 - loss: 0.4877 - val_accuracy: 0.7224 - val_loss: 0.5187
Epoch 5/8
[1m353/353[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 40ms/step - accuracy: 0.7609 - loss: 0.4708 - val_accuracy: 0.7290 - val_loss: 0.5237
Epoch 6/8
[1m353/353[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 40ms/step - accuracy: 0.7711 - loss: 0.4560 - val_accuracy: 0.7224 - val_loss: 0.5359
Epoch 7/8
[1m353/353

In [43]:
model.save('./final.keras')