# NOTE :

### Kindly give the path to your datasets properly. I have adjusted based on the file structure on kaggle. Adjustment is needed only in last cell

### Due to limited compute and storage resources, we have considered to process and work with only 3 subjects [p00,p01,p02].
### For the leave-out strategy we have used only p00

## "TODO" sections :  where we can add all 15 subject ids instead of 3, when good compute and storage units are available

In [2]:
!pip install tensorflow==2.15.1



In [3]:
!pip install tf-models-official==2.15

Collecting tf-models-official==2.15
  Downloading tf_models_official-2.15.0-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting gin-config (from tf-models-official==2.15)
  Downloading gin_config-0.5.0-py3-none-any.whl.metadata (2.9 kB)
Collecting immutabledict (from tf-models-official==2.15)
  Downloading immutabledict-4.2.0-py3-none-any.whl.metadata (3.4 kB)
Collecting pycocotools (from tf-models-official==2.15)
  Downloading pycocotools-2.0.8-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.1 kB)
Collecting sacrebleu (from tf-models-official==2.15)
  Downloading sacrebleu-2.4.2-py3-none-any.whl.metadata (58 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m58.0/58.0 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
Collecting seqeval (from tf-models-official==2.15)
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metada

In [5]:
import tensorflow as tf
tf.version.VERSION

'2.15.1'

In [7]:
import tensorflow as tf
from sklearn.metrics import mean_absolute_error
import random
import numpy as np
import os, glob
from tensorflow.keras.layers import Flatten, Dense, BatchNormalization, Input
from tensorflow.keras.models import Model
from PIL import Image

gpus=tf.config.experimental.list_physical_devices('GPU')
for gpu in gpus:
    tf.config.experimental.set_memory_growth(gpu,True)
print(gpus)
import matplotlib.pyplot as plt

# Ensure the same seed for reproducibility
random.seed(12)
np.random.seed(12)
tf.random.set_seed(12)

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


In [8]:
# If you have any custom objects (e.g., custom metrics or layers), define/import them here.
class AngularError(tf.keras.metrics.Metric):
    def __init__(self, name='mean_angular_error', **kwargs):
        super().__init__(name=name, **kwargs)
        self.total_error = self.add_weight(name='total_error', initializer='zeros')
        self.num_samples = self.add_weight(name='num_samples', initializer='zeros')

    def update_state(self, y_true, y_pred, sample_weight=None):
        y_true = tf.math.l2_normalize(y_true, axis=-1)
        y_pred = tf.math.l2_normalize(y_pred, axis=-1)
        dot_product = tf.reduce_sum(y_true * y_pred, axis=-1)
        dot_product = tf.clip_by_value(dot_product, -1.0, 1.0)
        angular_error = tf.acos(dot_product)
        angular_error = angular_error * 57.296
        self.total_error.assign_add(tf.reduce_sum(angular_error))
        self.num_samples.assign_add(tf.cast(tf.shape(y_true)[0], tf.float32))

    def result(self):
        return self.total_error / self.num_samples

    def reset_state(self):
        self.total_error.assign(0.0)
        self.num_samples.assign(0.0)


# IMPLEMENTING THE GAZE MODEL

In [9]:
# Clear any previous session
tf.keras.backend.clear_session()

# Load pretrained EfficientNetV2 for face feature extraction
effcnt_net = tf.keras.applications.EfficientNetV2B0(include_top=False, include_preprocessing=True, pooling=None)
effcnt_net.trainable = True
g_face = Model(inputs=effcnt_net.inputs, outputs=effcnt_net.outputs, name='g_face')

# Load pretrained VGG16 for eye feature extraction
vgg16 = tf.keras.applications.VGG16(include_top=False, pooling=None)
vgg16.trainable = True
vgg16_processor = tf.keras.applications.vgg16.preprocess_input
g_eye = Model(inputs=vgg16.inputs, outputs=vgg16.outputs, name='g_eye')

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/efficientnet_v2/efficientnetv2-b0_notop.h5
Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/vgg16/vgg16_weights_tf_dim_ordering_tf_kernels_notop.h5


In [10]:
# this did training for Gazer

# Define the Gaze Model using the pretrained models
class GazeModel(tf.keras.Model):
    def __init__(self):
        super(GazeModel, self).__init__()
        self.g_face = g_face
        self.g_eye = g_eye
        self.flat = Flatten()
        self.embedding = tf.keras.layers.Embedding(15, 6, embeddings_regularizer=tf.keras.regularizers.L2(0.01), mask_zero=True, name='subject_embedding')
        self.embedding.trainable = False
        self.MLP = tf.keras.Sequential([
            Dense(1280, activation='relu'),
            BatchNormalization(),
            Dense(3, name='gaze_location')
        ], name='MLP')

    def call(self, input_dict):
        # Transpose to (batch_size, height, width, channels)
        face_image = input_dict['face']
        flipped_face_image = tf.image.flip_left_right(face_image)
        left_eye_image = input_dict['lefteye']
        right_eye_image = input_dict['righteye']

        face_features = self.g_face(face_image)
        flipped_face_features = self.g_face(flipped_face_image)
        left_features = self.g_eye(left_eye_image)
        right_features = self.g_eye(right_eye_image)

        face_features = self.flat(face_features)
        flipped_face_features = self.flat(flipped_face_features)
        left_features = self.flat(left_features)
        right_features = self.flat(right_features)
        
        embedding = self.embedding(input_dict['id'])
        rot_mat = tf.reshape(input_dict['rotation_matrix'], [tf.shape(input_dict['rotation_matrix'])[0], -1])
        eye_coords = tf.reshape(input_dict['eye_coords'], [tf.shape(input_dict['eye_coords'])[0], -1])
        
        total = tf.concat([face_features, flipped_face_features, left_features,
                           right_features, embedding, rot_mat, eye_coords], 1)
        total = self.MLP(total)
        return total

# Define transformations using TensorFlow
def preprocess_image(image, target_size):
    image = tf.image.resize(image, target_size)
    image = tf.keras.applications.efficientnet_v2.preprocess_input(image)
    return image

class GazeDataset(tf.data.Dataset):
    def __new__(cls, subject_to_leave_out=None, batch_size=8, validation=False):
        def _generator():
            root_dir = 'processed_data'
            # TODO: Use the full list of subject ids if proper compute resources available
            subjects = ['p00', 'p01', 'p02']
            transform_face = lambda img: preprocess_image(img, (224, 224))
            transform_eye = lambda img: preprocess_image(img, (112, 112))

            for subject in subjects:
                if validation:
                    if subject != subject_to_leave_out:
                        continue
                else:
                    if subject == subject_to_leave_out:
                        continue
                
                person_dir = os.path.join(root_dir, 'Image', subject)
                for image_name in os.listdir(os.path.join(person_dir, 'face')):
                    face_image_path = os.path.join(person_dir, 'face', image_name)
                    left_eye_image_path = os.path.join(person_dir, 'lefteye', image_name)
                    right_eye_image_path = os.path.join(person_dir, 'righteye', image_name)
                    rotation_matrix_path = os.path.join(person_dir, 'rotation_matrix', image_name.replace('.jpg', '.npy'))
                    rotation_matrix_flipped_path = os.path.join(person_dir, 'rotation_matrix_flipped', image_name.replace('.jpg', '.npy'))
                    gaze_2d_path = os.path.join(person_dir, '2d_gaze', image_name.replace('.jpg', '.npy'))
                    gaze_3d_path = os.path.join(person_dir, '3d_gaze', image_name.replace('.jpg', '.npy'))
                    gaze_3d_flipped_path = os.path.join(person_dir, '3d_gaze_flipped', image_name.replace('.jpg', '.npy'))
                    eye_coords_path = os.path.join(person_dir, 'eye_coords', image_name.replace('.jpg', '.npy'))

                    face_image = Image.open(face_image_path).convert('RGB')
                    left_eye_image = Image.open(left_eye_image_path).convert('RGB')
                    right_eye_image = Image.open(right_eye_image_path).convert('RGB')

                    face_image = transform_face(np.array(face_image))
                    left_eye_image = transform_eye(np.array(left_eye_image))
                    right_eye_image = transform_eye(np.array(right_eye_image))

                    rotation_matrix = np.load(rotation_matrix_path)
                    rotation_matrix_flipped = np.load(rotation_matrix_flipped_path)
                    gaze_2d = np.load(gaze_2d_path)
                    gaze_3d = np.load(gaze_3d_path)
                    gaze_3d_flipped = np.load(gaze_3d_flipped_path)
                    eye_coords = np.load(eye_coords_path)

                    yield face_image, left_eye_image, right_eye_image, rotation_matrix, rotation_matrix_flipped, gaze_2d, gaze_3d, gaze_3d_flipped, eye_coords, subject

        return tf.data.Dataset.from_generator(
            _generator,
            output_signature=(
                tf.TensorSpec(shape=(224, 224, 3), dtype=tf.float32),
                tf.TensorSpec(shape=(112, 112, 3), dtype=tf.float32),
                tf.TensorSpec(shape=(112, 112, 3), dtype=tf.float32),
                tf.TensorSpec(shape=(3, 3), dtype=tf.float32),
                tf.TensorSpec(shape=(3, 3), dtype=tf.float32),
                tf.TensorSpec(shape=(2,), dtype=tf.float32),
                tf.TensorSpec(shape=(3,), dtype=tf.float32),
                tf.TensorSpec(shape=(3,), dtype=tf.float32),
                tf.TensorSpec(shape=(6,), dtype=tf.float32),
                tf.TensorSpec(shape=(), dtype=tf.string)
            )
        ).batch(batch_size)
# Define custom Huber loss function
def custom_huber_loss(y_true, y_pred, delta=1.5):
    error = y_true - y_pred
    is_small_error = tf.abs(error) <= delta

    small_error_loss = tf.square(error) / 2
    big_error_loss = delta * (tf.abs(error) - delta / 2)

    return tf.where(is_small_error, small_error_loss, big_error_loss)

# Calculate gaze loss as the average of Huber losses
def gaze_loss(y_true, y_pred, delta=1.5):
    huber_losses = custom_huber_loss(y_true, y_pred, delta)
    return tf.reduce_mean(huber_losses)

# Train the Gaze Model
def train_gaze_model(model, train_dataloader, val_dataloader, optimizer, num_epochs=2):
    for epoch in range(num_epochs):
        running_loss = 0.0
        num_batches = 0
        for i, (face_image, left_eye_image, right_eye_image, rotation_matrix, rotation_matrix_flipped, gaze_2d, gaze_3d, gaze_3d_flipped, eye_coords, subject_id) in enumerate(train_dataloader):
            with tf.GradientTape() as tape:
                input_dict = {
                    'face': face_image,
                    'flipped_face': tf.image.flip_left_right(face_image),
                    'lefteye': left_eye_image,
                    'righteye': right_eye_image,
                    'id': tf.constant([0 if s == 'p00' else 1 if s == 'p01' else 2 for s in subject_id], dtype=tf.int32),
                    'rotation_matrix': rotation_matrix,
                    'eye_coords': eye_coords
                }
                outputs = model(input_dict)
                #print("training")
                loss = gaze_loss(gaze_3d, outputs)
            grads = tape.gradient(loss, model.trainable_weights)
            optimizer.apply_gradients(zip(grads, model.trainable_weights))
            running_loss += loss.numpy()
            num_batches += 1
        print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {running_loss/num_batches}")

        val_loss = 0.0
        num_val_batches = 0
        for i, (face_image, left_eye_image, right_eye_image, rotation_matrix, rotation_matrix_flipped, gaze_2d, gaze_3d, gaze_3d_flipped, eye_coords, subject_id) in enumerate(val_dataloader):
            input_dict = {
                'face': face_image,
                'flipped_face': tf.image.flip_left_right(face_image),
                'lefteye': left_eye_image,
                'righteye': right_eye_image,
                'id': tf.constant([0 if s == 'p00' else 1 if s == 'p01' else 2 for s in subject_id], dtype=tf.int32),
                'rotation_matrix': rotation_matrix,
                'eye_coords': eye_coords
            }
            outputs = model(input_dict)
            loss = gaze_loss(gaze_3d, outputs)
            val_loss += loss.numpy()
            num_val_batches += 1
        print(f"Validation Loss: {val_loss/num_val_batches}")

# Implementing the leave-one-out strategy

# TODO
#subjects = ['p00', 'p01', 'p02']
# Due to less compute resources and storage, we are leaving out only one subject
subjects = ['p00']
validation_losses = []

for subject in subjects:
    print(f"Leaving out subject: {subject}")
    train_dataloader = GazeDataset(subject_to_leave_out=subject, batch_size=8, validation=False)
    val_dataloader = GazeDataset(subject_to_leave_out=subject, batch_size=8, validation=True)
    
    # Initialize models and optimizer
    gaze_model = GazeModel()
    gaze_optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
    
    # Train the gaze model
    train_gaze_model(gaze_model, train_dataloader, val_dataloader, gaze_optimizer, num_epochs=2)
    
    # Save the GazeModel weights
    gaze_model.save_weights(f'gaze_model_weights_{subject}')
    print(f"Gaze model weights for subject {subject} saved.")
    
    # Validate the model and store the loss
    val_loss = 0.0
    num_val_batches = 0
    for i, (face_image, left_eye_image, right_eye_image, rotation_matrix, rotation_matrix_flipped, gaze_2d, gaze_3d, gaze_3d_flipped, eye_coords, subject_id) in enumerate(val_dataloader):
        input_dict = {
            'face': face_image,
            'flipped_face': tf.image.flip_left_right(face_image),
            'lefteye': left_eye_image,
            'righteye': right_eye_image,
            'id': tf.constant([0 if s == 'p00' else 1 if s == 'p01' else 2 for s in subject_id], dtype=tf.int32),
            'rotation_matrix': rotation_matrix,
            'eye_coords': eye_coords
        }
        outputs = gaze_model(input_dict)
        #print("validating")
        loss = gaze_loss(gaze_3d, outputs)
        val_loss += loss.numpy()
        num_val_batches += 1
    validation_losses.append(val_loss / num_val_batches)
    print(f"Validation Loss for subject {subject}: {val_loss / num_val_batches}")

# Calculate the average validation loss
average_val_loss = sum(validation_losses) / len(validation_losses)
print(f"Average Validation Loss: {average_val_loss}")


Leaving out subject: p00


I0000 00:00:1720712272.547552      34 device_compiler.h:186] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


Epoch [1/2], Loss: 5.751259233853314
Validation Loss: 0.007881487585604191
Epoch [2/2], Loss: 0.02611466482964655
Validation Loss: 0.004395916781077782
Gaze model weights for subject p00 saved.
Validation Loss for subject p00: 0.004395916781077782
Average Validation Loss: 0.004395916781077782


# IMPLEMENTING THE CALIBRATION MODEL

In [11]:
# this did training for calibratiobn

# Calibration Dataset Class
class CalibrationDataset(tf.data.Dataset):
    def __new__(cls, subject_to_leave_out=None, batch_size=8, validation=False):
        def _generator():
            root_dir = 'processed_data'
            # TODO: Use the full list of subject ids if proper compute resources available
            subjects = ['p00', 'p01', 'p02']
            transform_face = lambda img: preprocess_image(img, (224, 224))
            transform_eye = lambda img: preprocess_image(img, (112, 112))

            for subject in subjects:
                if validation:
                    if subject != subject_to_leave_out:
                        continue
                else:
                    if subject == subject_to_leave_out:
                        continue
                
                person_dir = os.path.join(root_dir, 'Image', subject)
                for image_name in os.listdir(os.path.join(person_dir, 'face')):
                    face_image_path = os.path.join(person_dir, 'face', image_name)
                    left_eye_image_path = os.path.join(person_dir, 'lefteye', image_name)
                    right_eye_image_path = os.path.join(person_dir, 'righteye', image_name)
                    rotation_matrix_path = os.path.join(person_dir, 'rotation_matrix', image_name.replace('.jpg', '.npy'))
                    rotation_matrix_flipped_path = os.path.join(person_dir, 'rotation_matrix_flipped', image_name.replace('.jpg', '.npy'))
                    gaze_2d_path = os.path.join(person_dir, '2d_gaze', image_name.replace('.jpg', '.npy'))
                    gaze_3d_path = os.path.join(person_dir, '3d_gaze', image_name.replace('.jpg', '.npy'))
                    gaze_3d_flipped_path = os.path.join(person_dir, '3d_gaze_flipped', image_name.replace('.jpg', '.npy'))
                    eye_coords_path = os.path.join(person_dir, 'eye_coords', image_name.replace('.jpg', '.npy'))

                    face_image = Image.open(face_image_path).convert('RGB')
                    left_eye_image = Image.open(left_eye_image_path).convert('RGB')
                    right_eye_image = Image.open(right_eye_image_path).convert('RGB')

                    face_image = transform_face(np.array(face_image))
                    left_eye_image = transform_eye(np.array(left_eye_image))
                    right_eye_image = transform_eye(np.array(right_eye_image))

                    rotation_matrix = np.load(rotation_matrix_path)
                    rotation_matrix_flipped = np.load(rotation_matrix_flipped_path)
                    gaze_2d = np.load(gaze_2d_path)
                    gaze_3d = np.load(gaze_3d_path)
                    gaze_3d_flipped = np.load(gaze_3d_flipped_path)
                    eye_coords = np.load(eye_coords_path)

                    yield face_image, left_eye_image, right_eye_image, rotation_matrix, rotation_matrix_flipped, gaze_2d, gaze_3d, gaze_3d_flipped, eye_coords, subject

        return tf.data.Dataset.from_generator(
            _generator,
            output_signature=(
                tf.TensorSpec(shape=(224, 224, 3), dtype=tf.float32),
                tf.TensorSpec(shape=(112, 112, 3), dtype=tf.float32),
                tf.TensorSpec(shape=(112, 112, 3), dtype=tf.float32),
                tf.TensorSpec(shape=(3, 3), dtype=tf.float32),
                tf.TensorSpec(shape=(3, 3), dtype=tf.float32),
                tf.TensorSpec(shape=(2,), dtype=tf.float32),
                tf.TensorSpec(shape=(3,), dtype=tf.float32),
                tf.TensorSpec(shape=(3,), dtype=tf.float32),
                tf.TensorSpec(shape=(6,), dtype=tf.float32),
                tf.TensorSpec(shape=(), dtype=tf.string)
            )
        ).batch(batch_size)

# Define the Calibration Model
class CalibrationModel(tf.keras.Model):
    def __init__(self, gaze_model):
        super(CalibrationModel, self).__init__()
        self.g_face = gaze_model.g_face
        self.g_eye = gaze_model.g_eye
        self.transformer_stack = transformer
        self.flat = Flatten()
        self.MLP1 = tf.keras.Sequential([
            Dense(1280, activation='relu'),
            BatchNormalization(),
        ], name='MLP1')
        self.MLP2 = tf.keras.Sequential([
            Dense(1280, activation='relu'),
            BatchNormalization(),
        ], name='MLP2')
        self.output_layer = Dense(6, name='subject_feature')

    def call(self, input_dict):
        face_features = self.g_face(input_dict['face'])
        flipped_face_features = self.g_face(input_dict['flipped_face'])
        left_features = self.g_eye(input_dict['lefteye'])
        right_features = self.g_eye(input_dict['righteye'])

        face_features = self.flat(face_features)
        flipped_face_features = self.flat(flipped_face_features)
        left_features = self.flat(left_features)
        right_features = self.flat(right_features)

        rot_mat = tf.reshape(input_dict['rotation_matrix'], [tf.shape(input_dict['rotation_matrix'])[0], -1])
        rot_mat_flipped = tf.reshape(input_dict['rotation_matrix_flipped'], [tf.shape(input_dict['rotation_matrix_flipped'])[0], -1])
        eye_coords = tf.reshape(input_dict['eye_coords'], [tf.shape(input_dict['eye_coords'])[0], -1])
        gaze = input_dict['gaze']
        gaze_flipped = input_dict['gaze_flipped']

        total = tf.concat([face_features, flipped_face_features, left_features,
                           right_features, eye_coords, rot_mat, rot_mat_flipped,
                           gaze, gaze_flipped], 1)
        
        total = self.MLP1(total)
        total = tf.expand_dims(total, axis=1)
        total = self.transformer_stack(total)
        total = self.MLP2(tf.squeeze(total, 1))
        final_output = self.output_layer(total)
        return final_output

# Training function for Calibration Model
def train_calibration_model(model, dataloader, gaze_model, optimizer, num_epochs=2):
    for epoch in range(num_epochs):
        running_loss = 0.0
        num_batches = 0
        for i, (face_image, left_eye_image, right_eye_image, rotation_matrix, rotation_matrix_flipped, gaze_2d, gaze_3d, gaze_3d_flipped, eye_coords, subject_id) in enumerate(dataloader):
            with tf.GradientTape() as tape:
                input_dict = {
                    'face': face_image,
                    'flipped_face': tf.image.flip_left_right(face_image),
                    'lefteye': left_eye_image,
                    'righteye': right_eye_image,
                    'id': tf.constant([0 if s == 'p00' else 1 if s == 'p01' else 2 for s in subject_id], dtype=tf.int32),
                    'rotation_matrix': rotation_matrix,
                    'rotation_matrix_flipped': rotation_matrix_flipped,
                    'eye_coords': eye_coords,
                    'gaze': gaze_3d,
                    'gaze_flipped': gaze_3d_flipped
                }
                predicted_embeddings = model(input_dict)
                #print("training")
                # Accuracy loss
                preference_vectors = gaze_model.embedding(input_dict['id'])
                accuracy_loss = tf.reduce_mean(tf.square(predicted_embeddings - preference_vectors))
                
                # Consistency loss
                consistency_loss = embedding_consistency_loss(predicted_embeddings, batch_size=8)
                
                # Total loss
                loss = accuracy_loss + consistency_loss
                
            grads = tape.gradient(loss, model.trainable_weights)
            optimizer.apply_gradients(zip(grads, model.trainable_weights))
            running_loss += loss.numpy()
            num_batches += 1
        
        print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {running_loss/num_batches}")

# Implementing the leave-one-out strategy for Calibration Model
validation_losses = []
# Initialize the transformer encoder
import tensorflow as tf
import tensorflow_models as tfm

transformer = tfm.nlp.models.TransformerEncoder(
    num_layers=6,
    num_attention_heads=4,
    intermediate_size=2048,
    activation='relu',
    dropout_rate=0.0,
    attention_dropout_rate=0.0,
    use_bias=True,
    norm_first=True,
    norm_epsilon=1e-06,
    intermediate_dropout=0.0,
)
# Define custom embedding consistency loss
def embedding_consistency_loss(embeddings, batch_size):
    diff = tf.expand_dims(embeddings, 1) - tf.expand_dims(embeddings, 0)
    diff = tf.reduce_sum(tf.square(diff), axis=-1)
    diff = tf.sqrt(diff + 1e-12)
    consistency_loss = tf.reduce_mean(diff)
    return consistency_loss

# Define angular difference function
def angular_difference(a, b):
    dot_product = tf.reduce_sum(a * b, axis=-1)
    norm_a = tf.norm(a, axis=-1)
    norm_b = tf.norm(b, axis=-1)
    cos_theta = dot_product / (norm_a * norm_b)
    cos_theta = tf.clip_by_value(cos_theta, -1.0, 1.0)
    return tf.acos(cos_theta)

for subject in subjects:
    print(f"Leaving out subject: {subject}")
    train_dataloader = CalibrationDataset(subject_to_leave_out=subject, batch_size=8, validation=False)
    val_dataloader = CalibrationDataset(subject_to_leave_out=subject, batch_size=8, validation=True)
    
    # Initialize models and optimizer
    gaze_model = GazeModel()
    gaze_model.load_weights('gaze_model_weights_p00')  # Load the pre-trained GazeModel weights
    calibration_model = CalibrationModel(gaze_model)
    calibration_optimizer = tf.keras.optimizers.Adam(learning_rate=0.0001)
    
    # Train the calibration model
    train_calibration_model(calibration_model, train_dataloader, gaze_model, calibration_optimizer, num_epochs=2)
    
    # Evaluate the calibration model
    val_loss = 0.0
    num_val_batches = 0
    for i, (face_image, left_eye_image, right_eye_image, rotation_matrix, rotation_matrix_flipped, gaze_2d, gaze_3d, gaze_3d_flipped, eye_coords, subject_id) in enumerate(val_dataloader):
        input_dict = {
            'face': face_image,
            'flipped_face': tf.image.flip_left_right(face_image),
            'lefteye': left_eye_image,
            'righteye': right_eye_image,
            'id': tf.constant([0 if s == 'p00' else 1 if s == 'p01' else 2 for s in subject_id], dtype=tf.int32),
            'rotation_matrix': rotation_matrix,
            'rotation_matrix_flipped': rotation_matrix_flipped,
            'eye_coords': eye_coords,
            'gaze': gaze_3d,
            'gaze_flipped': gaze_3d_flipped
        }
        predicted_embeddings = calibration_model(input_dict)
        #print("validating")
        # Accuracy loss
        preference_vectors = gaze_model.embedding(input_dict['id'])
        accuracy_loss = tf.reduce_mean(tf.square(predicted_embeddings - preference_vectors))
        
        # Consistency loss
        consistency_loss = embedding_consistency_loss(predicted_embeddings, batch_size=8)
        
        # Total loss
        loss = accuracy_loss + consistency_loss
        val_loss += loss.numpy()
        num_val_batches += 1
    
    validation_losses.append(val_loss / num_val_batches)
    print(f"Validation Loss for subject {subject}: {val_loss/num_val_batches}")

# Calculate overall performance
overall_performance = sum(validation_losses) / len(validation_losses)
print(f"Overall Performance: {overall_performance}")

# Save the Calibration Model
calibration_model.save_weights('calibration_model_weights')
print("Calibration model weights saved.")


Leaving out subject: p00
Epoch [1/2], Loss: 0.04863577569834888
Epoch [2/2], Loss: 0.005534778931876644
Validation Loss for subject p00: 0.003976876734445492
Overall Performance: 0.003976876734445492
Calibration model weights saved.


In [12]:
import numpy as np

# Function to introduce noise to images
def add_noise(image, noise_factor=0.1):
    noisy_image = image + noise_factor * np.random.normal(loc=0.0, scale=1.0, size=image.shape)
    noisy_image = np.clip(noisy_image, 0.0, 1.0)
    return noisy_image

# Function to create a corrupted validation set
def create_corrupted_dataset(dataloader, noise_factor=0.1):
    corrupted_data = []
    for i, (face_image, left_eye_image, right_eye_image, rotation_matrix, rotation_matrix_flipped, gaze_2d, gaze_3d, gaze_3d_flipped, eye_coords, subject_id) in enumerate(dataloader):
        corrupted_face_image = add_noise(face_image, noise_factor)
        corrupted_left_eye_image = add_noise(left_eye_image, noise_factor)
        corrupted_right_eye_image = add_noise(right_eye_image, noise_factor)
        corrupted_data.append((corrupted_face_image, corrupted_left_eye_image, corrupted_right_eye_image, rotation_matrix, rotation_matrix_flipped, gaze_2d, gaze_3d, gaze_3d_flipped, eye_coords, subject_id))
    return corrupted_data

# Evaluate the model with corrupted data
def evaluate_with_corrupted_data(model, dataloader, gaze_model, noise_factor=0.1):
    corrupted_data = create_corrupted_dataset(dataloader, noise_factor)
    val_loss = 0.0
    num_val_batches = 0

    for i, (corrupted_face_image, corrupted_left_eye_image, corrupted_right_eye_image, rotation_matrix, rotation_matrix_flipped, gaze_2d, gaze_3d, gaze_3d_flipped, eye_coords, subject_id) in enumerate(corrupted_data):
        input_dict = {
            'face': corrupted_face_image,
            'flipped_face': tf.image.flip_left_right(corrupted_face_image),
            'lefteye': corrupted_left_eye_image,
            'righteye': corrupted_right_eye_image,
            'id': tf.constant([0 if s == 'p00' else 1 if s == 'p01' else 2 for s in subject_id], dtype=tf.int32),
            'rotation_matrix': rotation_matrix,
            'rotation_matrix_flipped': rotation_matrix_flipped,
            'eye_coords': eye_coords,
            'gaze': gaze_3d,
            'gaze_flipped': gaze_3d_flipped
        }
        predicted_embeddings = model(input_dict)
        
        # Accuracy loss
        preference_vectors = gaze_model.embedding(input_dict['id'])
        accuracy_loss = tf.reduce_mean(tf.square(predicted_embeddings - preference_vectors))
        
        # Consistency loss
        consistency_loss = embedding_consistency_loss(predicted_embeddings, batch_size=8)
        
        # Total loss
        loss = accuracy_loss + consistency_loss
        val_loss += loss.numpy()
        num_val_batches += 1
    
    return val_loss / num_val_batches

# Assess robustness with corrupted data
corrupted_performance = evaluate_with_corrupted_data(calibration_model, val_dataloader, gaze_model, noise_factor=0.1)
print(f"Performance with Corrupted Data: {corrupted_performance}")


Performance with Corrupted Data: 0.003977099084605773


#  IMPLEMENTING THE SPAZE MODEL

In [14]:
#### UNCOMMENT THIS CELL IF YOU WANT TO TEST SPAZE MODEL ONLY. 
#### MAKE SURE YOU HAVE LOADED YOUR PRETRAINED GAZE AND CALIBRATION MODEL


# import tensorflow as tf
# from tensorflow.keras.layers import Flatten, Dense, BatchNormalization, Input
# from tensorflow.keras.models import Model
# from PIL import Image

# # Define transformations using TensorFlow
# def preprocess_image(image, target_size):
#     image = tf.image.resize(image, target_size)
#     image = tf.keras.applications.efficientnet_v2.preprocess_input(image)
#     return image

# class GazeDataset(tf.data.Dataset):
#     def __new__(cls, subject_to_leave_out=None, batch_size=8, validation=False):
#         def _generator():
#             root_dir = 'processed_data'
#             subjects = ['p00', 'p01', 'p02']
#             transform_face = lambda img: preprocess_image(img, (224, 224))
#             transform_eye = lambda img: preprocess_image(img, (112, 112))

#             for subject in subjects:
#                 if validation:
#                     if subject != subject_to_leave_out:
#                         continue
#                 else:
#                     if subject == subject_to_leave_out:
#                         continue
                
#                 person_dir = os.path.join(root_dir, 'Image', subject)
#                 for image_name in os.listdir(os.path.join(person_dir, 'face')):
#                     face_image_path = os.path.join(person_dir, 'face', image_name)
#                     left_eye_image_path = os.path.join(person_dir, 'lefteye', image_name)
#                     right_eye_image_path = os.path.join(person_dir, 'righteye', image_name)
#                     rotation_matrix_path = os.path.join(person_dir, 'rotation_matrix', image_name.replace('.jpg', '.npy'))
#                     rotation_matrix_flipped_path = os.path.join(person_dir, 'rotation_matrix_flipped', image_name.replace('.jpg', '.npy'))
#                     gaze_2d_path = os.path.join(person_dir, '2d_gaze', image_name.replace('.jpg', '.npy'))
#                     gaze_3d_path = os.path.join(person_dir, '3d_gaze', image_name.replace('.jpg', '.npy'))
#                     gaze_3d_flipped_path = os.path.join(person_dir, '3d_gaze_flipped', image_name.replace('.jpg', '.npy'))
#                     eye_coords_path = os.path.join(person_dir, 'eye_coords', image_name.replace('.jpg', '.npy'))

#                     face_image = Image.open(face_image_path).convert('RGB')
#                     left_eye_image = Image.open(left_eye_image_path).convert('RGB')
#                     right_eye_image = Image.open(right_eye_image_path).convert('RGB')

#                     face_image = transform_face(np.array(face_image))
#                     left_eye_image = transform_eye(np.array(left_eye_image))
#                     right_eye_image = transform_eye(np.array(right_eye_image))

#                     rotation_matrix = np.load(rotation_matrix_path)
#                     rotation_matrix_flipped = np.load(rotation_matrix_flipped_path)
#                     gaze_2d = np.load(gaze_2d_path)
#                     gaze_3d = np.load(gaze_3d_path)
#                     gaze_3d_flipped = np.load(gaze_3d_flipped_path)
#                     eye_coords = np.load(eye_coords_path)

#                     yield face_image, left_eye_image, right_eye_image, rotation_matrix, rotation_matrix_flipped, gaze_2d, gaze_3d, gaze_3d_flipped, eye_coords, subject

#         return tf.data.Dataset.from_generator(
#             _generator,
#             output_signature=(
#                 tf.TensorSpec(shape=(224, 224, 3), dtype=tf.float32),
#                 tf.TensorSpec(shape=(112, 112, 3), dtype=tf.float32),
#                 tf.TensorSpec(shape=(112, 112, 3), dtype=tf.float32),
#                 tf.TensorSpec(shape=(3, 3), dtype=tf.float32),
#                 tf.TensorSpec(shape=(3, 3), dtype=tf.float32),
#                 tf.TensorSpec(shape=(2,), dtype=tf.float32),
#                 tf.TensorSpec(shape=(3,), dtype=tf.float32),
#                 tf.TensorSpec(shape=(3,), dtype=tf.float32),
#                 tf.TensorSpec(shape=(6,), dtype=tf.float32),
#                 tf.TensorSpec(shape=(), dtype=tf.string)
#             )
#         ).batch(batch_size)
# # Define custom Huber loss function
# def custom_huber_loss(y_true, y_pred, delta=1.5):
#     error = y_true - y_pred
#     is_small_error = tf.abs(error) <= delta

#     small_error_loss = tf.square(error) / 2
#     big_error_loss = delta * (tf.abs(error) - delta / 2)

#     return tf.where(is_small_error, small_error_loss, big_error_loss)

# # Calculate gaze loss as the average of Huber losses
# def gaze_loss(y_true, y_pred, delta=1.5):
#     huber_losses = custom_huber_loss(y_true, y_pred, delta)
#     return tf.reduce_mean(huber_losses)


In [13]:
import tensorflow as tf
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout, BatchNormalization
from tensorflow.keras.models import Model, Sequential

# Define the SPAZE Model
class SPAZEModel(tf.keras.Model):
    def __init__(self):
        super(SPAZEModel, self).__init__()
        self.conv1 = Conv2D(64, (3, 3), activation='relu', padding='same')
        self.pool1 = MaxPooling2D((2, 2))
        self.conv2 = Conv2D(128, (3, 3), activation='relu', padding='same')
        self.pool2 = MaxPooling2D((2, 2))
        self.conv3 = Conv2D(256, (3, 3), activation='relu', padding='same')
        self.pool3 = MaxPooling2D((2, 2))
        self.flatten = Flatten()
        self.fc1 = Dense(512, activation='relu')
        self.dropout = Dropout(0.5)
        self.fc2 = Dense(2, name='gaze_point')

    def call(self, inputs):
        x = self.conv1(inputs)
        x = self.pool1(x)
        x = self.conv2(x)
        x = self.pool2(x)
        x = self.conv3(x)
        x = self.pool3(x)
        x = self.flatten(x)
        x = self.fc1(x)
        x = self.dropout(x)
        x = self.fc2(x)
        return x


In [14]:

# this did training for spaze
# Define the training function for SPAZE Model
def train_spaze_model(model, dataloader, optimizer, num_epochs=2):
    for epoch in range(num_epochs):
        running_loss = 0.0
        num_batches = 0
        for i, (face_image, left_eye_image, right_eye_image, rotation_matrix, rotation_matrix_flipped, gaze_2d, gaze_3d, gaze_3d_flipped, eye_coords, subject_id) in enumerate(dataloader):
            with tf.GradientTape() as tape:
                outputs = model(face_image)
                
                # Debugging shape mismatch issue
                #print(f"gaze_2d shape: {gaze_2d.shape}")
                #print(f"outputs shape: {outputs.shape}")
                
                # Ensure gaze_2d and outputs have the same shape
                if gaze_2d.shape != outputs.shape:
                    gaze_2d = tf.reshape(gaze_2d, outputs.shape)
                
                loss = gaze_loss(gaze_2d, outputs)
            grads = tape.gradient(loss, model.trainable_weights)
            optimizer.apply_gradients(zip(grads, model.trainable_weights))
            running_loss += loss.numpy()
            num_batches += 1
        print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {running_loss/num_batches}")

# Implementing the leave-one-out strategy for SPAZE model

# TODO: Due to less compute resources , we considered only one subject for leave-out strategy
subjects = ['p00']
spaze_results = []

for subject in subjects:
    print(f"Leaving out subject: {subject}")
    train_dataloader = GazeDataset(subject_to_leave_out=subject, batch_size=8)
    val_dataloader = GazeDataset(subject_to_leave_out=subject, batch_size=8)
    
    # Initialize SPAZE model and optimizer
    spaze_model = SPAZEModel()
    spaze_optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
    
    # Train the SPAZE model
    train_spaze_model(spaze_model, train_dataloader, spaze_optimizer, num_epochs=2)
    
    # Evaluate the SPAZE model on clean data
    val_loss_clean = 0.0
    num_val_batches_clean = 0
    for i, (face_image, left_eye_image, right_eye_image, rotation_matrix, rotation_matrix_flipped, gaze_2d, gaze_3d, gaze_3d_flipped, eye_coords, subject_id) in enumerate(val_dataloader):
        outputs = spaze_model(face_image)
        loss = gaze_loss(gaze_2d, outputs)
        val_loss_clean += loss.numpy()
        num_val_batches_clean += 1
    avg_val_loss_clean = val_loss_clean / num_val_batches_clean
    print(f"Validation Loss on clean data for subject {subject}: {avg_val_loss_clean}")
    
    # Evaluate the SPAZE model on corrupted data
    val_loss_corrupted = 0.0
    num_val_batches_corrupted = 0
    for i, (face_image, left_eye_image, right_eye_image, rotation_matrix, rotation_matrix_flipped, gaze_2d, gaze_3d, gaze_3d_flipped, eye_coords, subject_id) in enumerate(val_dataloader):
        corrupted_face_image = tf.image.random_brightness(face_image, max_delta=0.5)  # Example corruption
        outputs = spaze_model(corrupted_face_image)
        loss = gaze_loss(gaze_2d, outputs)
        val_loss_corrupted += loss.numpy()
        num_val_batches_corrupted += 1
    avg_val_loss_corrupted = val_loss_corrupted / num_val_batches_corrupted
    print(f"Validation Loss on corrupted data for subject {subject}: {avg_val_loss_corrupted}")
    
    # Collect results
    results = {
        'subject': subject,
        'spaze_clean_loss': avg_val_loss_clean,
        'spaze_corrupted_loss': avg_val_loss_corrupted
    }
    spaze_results.append(results)


Leaving out subject: p00
Epoch [1/2], Loss: 15.74140855112571
Epoch [2/2], Loss: 0.00545260396009932
Validation Loss on clean data for subject p00: 0.006276123268995434
Validation Loss on corrupted data for subject p00: 0.0062777507392068705


In [19]:
#import pandas as pd
# Summarize and visualize SPAZE results
#df_spaze_results = pd.DataFrame(spaze_results)
#print(df_spaze_results)

  subject  spaze_clean_loss  spaze_corrupted_loss
0     p00          0.007282              0.007284


# COMPARING PTGE AND SPAZE MODELS

In [20]:
import tensorflow as tf
import numpy as np
import pandas as pd

# Define the evaluation function
def evaluate_model(model, dataloader, corruption=None, model_type='ptge'):
    total_loss = 0.0
    num_batches = 0
    results = []

    for i, (face_image, left_eye_image, right_eye_image, rotation_matrix, rotation_matrix_flipped, gaze_2d, gaze_3d, gaze_3d_flipped, eye_coords, subject_id) in enumerate(dataloader):
        if corruption == 'noise':
            face_image += tf.random.normal(face_image.shape, mean=0.0, stddev=0.1)
            left_eye_image += tf.random.normal(left_eye_image.shape, mean=0.0, stddev=0.1)
            right_eye_image += tf.random.normal(right_eye_image.shape, mean=0.0, stddev=0.1)
        elif corruption == 'blur':
            face_image = tf.nn.conv2d(face_image, tf.random.normal((3, 3, 3, 3), mean=0.0, stddev=1.0), strides=[1, 1, 1, 1], padding='SAME')
            left_eye_image = tf.nn.conv2d(left_eye_image, tf.random.normal((3, 3, 3, 3), mean=0.0, stddev=1.0), strides=[1, 1, 1, 1], padding='SAME')
            right_eye_image = tf.nn.conv2d(right_eye_image, tf.random.normal((3, 3, 3, 3), mean=0.0, stddev=1.0), strides=[1, 1, 1, 1], padding='SAME')
        # Add other types of corruption as needed

        subject_indices = [int(s.decode().split('p')[1]) for s in subject_id.numpy()]

        input_dict = {
            'eye_coords': tf.convert_to_tensor(eye_coords, dtype=tf.float32),
            'face': tf.convert_to_tensor(face_image, dtype=tf.float32),
            'flipped_face': tf.convert_to_tensor(tf.image.flip_left_right(face_image), dtype=tf.float32),
            'id': tf.convert_to_tensor(subject_indices, dtype=tf.int32),
            'lefteye': tf.convert_to_tensor(left_eye_image, dtype=tf.float32),
            'righteye': tf.convert_to_tensor(right_eye_image, dtype=tf.float32),
            'rotation_matrix': tf.convert_to_tensor(rotation_matrix, dtype=tf.float32),
            'rotation_matrix_flipped': tf.convert_to_tensor(rotation_matrix_flipped, dtype=tf.float32)
        }

        #print(f"Batch {i} - Input Shapes and Types:")
        #for key, value in input_dict.items():
         #   print(f"{key}: shape={value.shape}, dtype={value.dtype}")

        if model_type == 'ptge':
            # First, get the initial gaze estimation from the Gaze Model
            initial_gaze_estimation = model['gaze_model'](input_dict)

            #print(f"Initial gaze estimation shape: {initial_gaze_estimation.shape}")

            # Now, use the Calibration Model to refine the gaze estimation
            calibration_input_dict = input_dict.copy()
            calibration_input_dict['gaze'] = initial_gaze_estimation
            calibration_input_dict['gaze_flipped'] = initial_gaze_estimation  # No flipping, use as is

            refined_gaze_estimation = model['calibration_model'](calibration_input_dict)

            # Ensure the refined_gaze_estimation shape matches the gaze_3d shape
            refined_gaze_estimation = refined_gaze_estimation[:, :3]  # Only take the first 3 columns

            #print(f"Refined gaze estimation shape: {refined_gaze_estimation.shape}")

            # Calculate loss
            loss = gaze_loss(gaze_3d, refined_gaze_estimation)
            results.append((gaze_3d.numpy(), refined_gaze_estimation.numpy()))

        elif model_type == 'spaze':
            # Get the gaze estimation from the SPAZE Model
            spaze_gaze_estimation = model['spaze_model'](face_image, training=False)

            #print(f"SPAZE gaze estimation shape: {spaze_gaze_estimation.shape}")

            # Calculate loss
            loss = gaze_loss(gaze_2d, spaze_gaze_estimation)
            results.append((gaze_2d.numpy(), spaze_gaze_estimation.numpy()))

        total_loss += loss.numpy()
        num_batches += 1

    average_loss = total_loss / num_batches
    return average_loss, results

# Load Pretrained Models in SavedModel format
#gaze_model = tf.keras.models.load_model('gaze_model_1', compile=False)
#calibration_model = tf.keras.models.load_model('calibration_model', compile=False)

# Initialize the SPAZE model
#spaze_model = SPAZEModel()

# Prepare the dataframes for results
results = {
    'subject': [],
    'ptge_clean_loss': [],
    'ptge_corrupted_loss_noise': [],
    'ptge_corrupted_loss_blur': [],
    'spaze_clean_loss': [],
    'spaze_corrupted_loss_noise': [],
    'spaze_corrupted_loss_blur': [],
}

#subjects = ['p00', 'p01', 'p02']
# TODO: Due to less compute resources , we considered only one subject for leave-out strategy
subjects = ['p00']
for subject in subjects:
    print(f"Evaluating for subject: {subject}")
    val_dataloader = GazeDataset(subject_to_leave_out=subject, batch_size=8)

    # Evaluate PTGE model on clean data
    ptge_clean_loss, _ = evaluate_model({'gaze_model': gaze_model, 'calibration_model': calibration_model}, val_dataloader, model_type='ptge')

    # Evaluate PTGE model on corrupted data (noise)
    ptge_corrupted_loss_noise, _ = evaluate_model({'gaze_model': gaze_model, 'calibration_model': calibration_model}, val_dataloader, corruption='noise', model_type='ptge')

    # Evaluate PTGE model on corrupted data (blur)
    ptge_corrupted_loss_blur, _ = evaluate_model({'gaze_model': gaze_model, 'calibration_model': calibration_model}, val_dataloader, corruption='blur', model_type='ptge')

    # Evaluate SPAZE model on clean data
    spaze_clean_loss, _ = evaluate_model({'spaze_model': spaze_model}, val_dataloader, model_type='spaze')

    # Evaluate SPAZE model on corrupted data (noise)
    spaze_corrupted_loss_noise, _ = evaluate_model({'spaze_model': spaze_model}, val_dataloader, corruption='noise', model_type='spaze')

    # Evaluate SPAZE model on corrupted data (blur)
    spaze_corrupted_loss_blur, _ = evaluate_model({'spaze_model': spaze_model}, val_dataloader, corruption='blur', model_type='spaze')

    results['subject'].append(subject)
    results['ptge_clean_loss'].append(ptge_clean_loss)
    results['ptge_corrupted_loss_noise'].append(ptge_corrupted_loss_noise)
    results['ptge_corrupted_loss_blur'].append(ptge_corrupted_loss_blur)
    results['spaze_clean_loss'].append(spaze_clean_loss)
    results['spaze_corrupted_loss_noise'].append(spaze_corrupted_loss_noise)
    results['spaze_corrupted_loss_blur'].append(spaze_corrupted_loss_blur)

# Convert the results to a DataFrame
df_results = pd.DataFrame(results)
df_results.to_csv('ptge_spaze_comparison_results.csv', index=False)
print("Results saved to 'ptge_spaze_comparison_results.csv'.")


Evaluating for subject: p00
Results saved to 'ptge_spaze_comparison_results.csv'.


# Plotting the errors for PTGE and SPAZE models for comparison

In [30]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

In [41]:
# Convert to DataFrame
df_results = pd.read_csv("/kaggle/working/ptge_spaze_comparison_results.csv")

#df_results = pd.read_csv("/kaggle/working/ptge_spaze_comparison_results.csv")
# Set the subject as the index (if not already set)
if 'subject' in df_results.columns:
    df_results.set_index('subject', inplace=True)

# Plotting the results
plt.figure(figsize=(14, 7))

# Plot each model's losses
plt.plot(df_results.index, df_results['ptge_clean_loss'], marker='o', label='PTGE Clean Loss', linestyle='--')
plt.plot(df_results.index, df_results['ptge_corrupted_loss_noise'], marker='o', label='PTGE Corrupted Loss (Noise)', linestyle='--')
plt.plot(df_results.index, df_results['ptge_corrupted_loss_blur'], marker='o', label='PTGE Corrupted Loss (Blur)', linestyle='--')

plt.plot(df_results.index, df_results['spaze_clean_loss'], marker='o', label='SPAZE Clean Loss', linestyle='-')
plt.plot(df_results.index, df_results['spaze_corrupted_loss_noise'], marker='o', label='SPAZE Corrupted Loss (Noise)', linestyle='-')
plt.plot(df_results.index, df_results['spaze_corrupted_loss_blur'], marker='o', label='SPAZE Corrupted Loss (Blur)', linestyle='-')

plt.title('Comparison of Losses for PTGE and SPAZE Models')
plt.xlabel('Subject')
plt.ylabel('Loss')
plt.legend()
plt.grid(True)

# Save the plot to a file to ensure it's created
plt.savefig('comparison_plot-ptge-spaze.png')
