### Import Libraries and Mount Google Drive

In [None]:
import json
import pandas as pd
import numpy as np
import tensorflow as tf
import tensorflow_hub as hub
from google.colab import drive
import cv2
from transformers import BertTokenizer, TFBertForQuestionAnswering

# Mount Google Drive at /content/drive; every dataset path below lives under this mount point.
drive.mount('/content/drive')


### Load Data

In [16]:
# Root folder of the MSVD-QA data on the mounted Drive; the video folder and
# the question/answer JSON files are all addressed relative to it.
data_path = '/content/drive/MyDrive/MSVD-QA/'
video_path = f'{data_path}video/'
train_qa_path = f'{data_path}train_qa.json'
clarify_train_qa_path = f'{data_path}clarify_train_qa.json'
val_qa_path = f'{data_path}val_qa.json'

def load_json_data(path):
    """Read a JSON file and return the decoded Python object.

    Args:
        path: Location of the JSON file on disk.

    Returns:
        Whatever ``json.load`` produces for the file's top-level value.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the content is not valid JSON.
    """
    # JSON interchange text is UTF-8; naming the encoding keeps the result
    # independent of the platform's locale default.
    with open(path, 'r', encoding='utf-8') as file:
        return json.load(file)

# Read the three QA splits (train, clarification-augmented train, validation).
train_qa, clarify_train_qa, val_qa = (
    load_json_data(split_path)
    for split_path in (train_qa_path, clarify_train_qa_path, val_qa_path)
)

# Wrap each split in a DataFrame for row-wise access later on.
train_df, clarify_train_df, val_df = (
    pd.DataFrame(records) for records in (train_qa, clarify_train_qa, val_qa)
)

# Text file that is parsed into the video-id -> video-name lookup.
name_mapping_path = data_path + 'youtube_mapping.txt'

def load_name_mapping(path):
    """Build a ``video_id -> video_name`` lookup from a two-column text file.

    Every line is split on whitespace. Lines that do not yield exactly two
    fields are ignored; otherwise the first field is the video name and the
    second is the video id. When an id appears more than once, the last
    occurrence wins.
    """
    with open(path, 'r') as handle:
        rows = (raw.strip().split() for raw in handle)
        return {fields[1]: fields[0] for fields in rows if len(fields) == 2}

# video id -> video file name lookup, read once here and consulted by load_video_frames.
name_mapping = load_name_mapping(name_mapping_path)


### Load Video Frames and Extract Features

In [26]:
# Load video frames using name mapping
def load_video_frames(video_id, max_frames=120):
    """Decode up to ``max_frames`` leading frames of a video as RGB floats.

    Args:
        video_id: Dataset video id; ``'vid' + str(video_id)`` is looked up in
            ``name_mapping`` to find the ``.avi`` file under ``video_path``.
        max_frames: Upper bound on the number of frames read from the start
            of the file.

    Returns:
        ``float32`` array of shape ``(num_frames, 224, 224, 3)`` with values
        in ``[0, 1]`` and channels in RGB order.

    Raises:
        ValueError: If ``video_id`` has no entry in ``name_mapping``.
        IOError: If the file cannot be opened, or if frames were requested
            but none could be decoded.
    """
    video_name = name_mapping.get('vid' + str(video_id))
    if not video_name:
        raise ValueError(f"No video found for video_id: {video_id}")

    video_full_path = video_path + video_name + '.avi'
    cap = cv2.VideoCapture(video_full_path)
    if not cap.isOpened():
        raise IOError(f"Cannot open video file: {video_full_path}")

    frames = []
    try:
        while len(frames) < max_frames:
            ret, frame = cap.read()
            if not ret:
                break
            # OpenCV decodes to BGR channel order; the video model is fed RGB.
            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            frame = cv2.resize(frame, (224, 224))  # Resize to model's input size
            frames.append(frame.astype('float32') / 255.0)  # Scale to [0, 1]
    finally:
        # Always release the decoder handle, even if a frame operation fails.
        cap.release()

    # An opened file that yields no frames would otherwise produce a shapeless
    # empty array that only fails later, inside the video model.
    if max_frames > 0 and not frames:
        raise IOError(f"No frames could be read from video file: {video_full_path}")
    return np.array(frames)

# TensorFlow Hub model
# The I3D Kinetics-400 module is loaded from the TF Hub URL below and wrapped as
# a Keras layer; extract_features calls it on batches of frames.
model_url = 'https://tfhub.dev/deepmind/i3d-kinetics-400/1'
video_model = hub.KerasLayer(model_url)

def extract_features(frames):
    """Run the TF Hub video model on one clip and return its output.

    ``frames`` is treated as a single clip: a leading batch axis of size 1 is
    added, so ``(num_frames, height, width, channels)`` becomes
    ``(1, num_frames, height, width, channels)`` before the model is called.
    """
    clip_batch = np.asanyarray(frames)[np.newaxis, ...]
    return video_model(clip_batch)


### Load BERT Model

In [None]:
# Load BERT for QA
# The tokenizer and the TF question-answering model are loaded from the same
# SQuAD-finetuned BERT-large checkpoint name, so both use one vocabulary.
qa_tokenizer = BertTokenizer.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')
qa_model = TFBertForQuestionAnswering.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')


### Precompute and Cache Video Features

In [None]:
# Run the video model once per distinct video id in the clarification training
# set and keep the result, so rows that share a video reuse the same features.
video_features_cache = {}

for vid in clarify_train_df['video_id'].unique():
    print(vid)
    video_features_cache[vid] = extract_features(load_video_frames(vid))


### Prepare Data for Training


In [None]:
# Prepare data for training
def _encode_text(text, max_length=128):
    """Tokenize one string into fixed-length BERT inputs (TF tensors).

    Uses ``padding='max_length'``, the supported replacement for the
    deprecated ``pad_to_max_length=True`` flag.
    """
    return qa_tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=max_length,
        truncation=True,
        padding='max_length',
        return_attention_mask=True,
        return_tensors='tf',
    )


# The classifier is trained with sparse categorical cross-entropy, which needs
# integer class ids rather than raw answer values. One id is assigned per
# distinct answer (the same count used for the classifier's output layer);
# sorting keeps the assignment stable across runs.
answer_to_index = {
    answer: index
    for index, answer in enumerate(sorted(clarify_train_df['answer'].unique(), key=str))
}

X_train = []
y_train = []

for _, row in clarify_train_df.iterrows():
    video_id = row['video_id']
    question = row['question']
    answer = row['answer']
    clarifications = row['clarifications']

    # Retrieve precomputed video features
    video_features = video_features_cache[video_id]
    video_features = tf.reshape(video_features, [1, -1])  # Reshape to (1, num_features)

    # One encoded sequence for the question, followed by one per clarification
    # ("clarifying question" + " " + "clarifying answer").
    texts = [question] + [
        clarification['clarifying_question'] + " " + clarification['clarifying_answer']
        for clarification in clarifications
    ]
    encoded = [_encode_text(text) for text in texts]
    input_ids = tf.concat([item['input_ids'] for item in encoded], axis=0)
    attention_masks = tf.concat([item['attention_mask'] for item in encoded], axis=0)

    # The first output of the QA model, averaged over all encoded sequences of
    # this row, serves as the text representation.
    text_features = qa_model([input_ids, attention_masks])[0]
    text_features = tf.reduce_mean(text_features, axis=0)
    text_features = tf.expand_dims(text_features, axis=0)

    # Ensure video and text features have compatible dimensions
    print("Video features shape:", video_features.shape)  # Expected shape: (1, num_features)
    print("Text features shape:", text_features.shape)    # Expected shape: (1, 128)

    # Kept as a (1, num_features) tensor: the model definition reads its width.
    combined_features = tf.concat([video_features, text_features], axis=1)

    # Store a flat (num_features,) row so X_train stacks to
    # (num_samples, num_features) instead of (num_samples, 1, num_features).
    X_train.append(combined_features.numpy()[0])
    y_train.append(answer_to_index[answer])

X_train = np.array(X_train)
y_train = np.array(y_train)


### Define and Compile Model

In [29]:
# Feed-forward classifier over the concatenated video + text feature vector:
# one hidden ReLU layer, dropout, then a softmax with one unit per distinct
# training answer.
feature_dim = combined_features.shape[1]
answer_count = len(clarify_train_df['answer'].unique())

model = tf.keras.Sequential()
model.add(tf.keras.layers.Dense(512, activation='relu', input_shape=(feature_dim,)))
model.add(tf.keras.layers.Dropout(0.5))
model.add(tf.keras.layers.Dense(answer_count, activation='softmax'))

model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)


### Train the Model

In [None]:

# Train for 10 epochs with batches of 32 samples; 20% of the data is held out for validation.
model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.2)

### Test Model

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Load the test data
# Built from data_path like the train/val paths, so the dataset location is
# configured in one place (the resulting value is unchanged).
test_qa_path = data_path + 'test_qa.json'

def load_json_data(path):
    """Read a JSON file and return the decoded Python object.

    Args:
        path: Location of the JSON file on disk.

    Returns:
        Whatever ``json.load`` produces for the file's top-level value.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the content is not valid JSON.
    """
    # JSON interchange text is UTF-8; naming the encoding keeps the result
    # independent of the platform's locale default.
    with open(path, 'r', encoding='utf-8') as file:
        return json.load(file)

# Load the test QA data
test_qa = load_json_data(test_qa_path)
test_df = pd.DataFrame(test_qa)

# Load the BERT model and tokenizer
# NOTE(review): this is the same checkpoint name that is loaded earlier in the
# notebook, so running both cells instantiates the tokenizer and model twice —
# confirm the reload is intended.
qa_tokenizer = BertTokenizer.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')
qa_model = TFBertForQuestionAnswering.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')

# Define a function to calculate confidence score
def get_confidence_score(logits):
    """Return the highest softmax probability along the last axis.

    ``logits`` are normalised with a softmax over the final axis, and the
    maximum of each resulting distribution is returned as a NumPy value.
    """
    return tf.reduce_max(tf.nn.softmax(logits, axis=-1), axis=-1).numpy()

# Define a function to generate clarifying question
def generate_clarifying_question(question, context):
    """Extract a clarifying-question span from a prompt using the QA model.

    A prompt is built from ``question`` and ``context``, tokenized to a fixed
    length of 128 tokens and passed to ``qa_model``. The tokens between the
    highest-scoring start and end positions (inclusive) are converted back to
    text.

    Args:
        question: The original question text.
        context: Extra context inserted into the prompt (may be empty).

    Returns:
        The selected span as a string; empty when the predicted end position
        lies before the predicted start position.
    """
    input_text = f"Q: {question} Context: {context} What is unclear?"
    encoded_dict = qa_tokenizer.encode_plus(
        input_text,
        add_special_tokens=True,
        max_length=128,
        truncation=True,
        padding='max_length',  # replaces the deprecated pad_to_max_length=True
        return_attention_mask=True,
        return_tensors='tf',
    )
    input_ids = encoded_dict['input_ids']
    attention_mask = encoded_dict['attention_mask']

    outputs = qa_model(input_ids, attention_mask=attention_mask)
    start_idx = int(tf.argmax(outputs.start_logits, axis=1).numpy()[0])
    end_idx = int(tf.argmax(outputs.end_logits, axis=1).numpy()[0])

    # Hand the tokenizer plain Python ints rather than a tensor slice.
    span_ids = input_ids[0][start_idx:end_idx + 1].numpy().tolist()
    span_tokens = qa_tokenizer.convert_ids_to_tokens(span_ids)
    return qa_tokenizer.convert_tokens_to_string(span_tokens)

# Define a function to test the model
def _encode_for_bert(text, max_length=128):
    """Tokenize one string into fixed-length BERT inputs (TF tensors)."""
    return qa_tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=max_length,
        truncation=True,
        padding='max_length',  # replaces the deprecated pad_to_max_length=True
        return_attention_mask=True,
        return_tensors='tf',
    )


def _mean_text_features(input_ids, attention_mask):
    """Average the QA model's first output over the batch axis, keeping a batch dim of 1."""
    text_features = qa_model([input_ids, attention_mask])[0]
    return tf.expand_dims(tf.reduce_mean(text_features, axis=0), axis=0)


def test_ivqa_model(test_df, video_features_cache, model, threshold=0.8):
    """Evaluate the classifier row by row, asking for clarification when unsure.

    For each row the cached video features and the BERT-derived question
    features are concatenated and classified. When the top class probability
    is below ``threshold``, a clarifying question is printed, the answer is
    read from standard input, and the prediction is recomputed with the
    clarification text included.

    Args:
        test_df: DataFrame with ``video_id``, ``question`` and ``answer`` columns.
        video_features_cache: Mapping from video id to video-model features.
            Missing ids are computed on demand and added to this mapping.
        model: Classifier whose output is a probability distribution over
            classes (i.e. it already ends in a softmax layer).
        threshold: Minimum top-class probability needed to skip clarification.

    Returns:
        ``(y_true, y_pred, confidence_scores)``: the raw ``answer`` values, the
        predicted class indices, and the pre-clarification confidence of each
        row (a NumPy array with one element per row entry).
        NOTE(review): ``y_pred`` holds class indices while ``y_true`` holds
        whatever ``test_df['answer']`` contains — confirm both use the same
        label encoding before comparing them.
    """
    y_true = []
    y_pred = []
    confidence_scores = []

    for _, row in test_df.iterrows():
        video_id = row['video_id']
        question = row['question']
        true_answer = row['answer']

        # The cache may have been filled from a different split; compute and
        # store features for unseen videos instead of failing with KeyError.
        if video_id not in video_features_cache:
            video_features_cache[video_id] = extract_features(load_video_frames(video_id))
        video_features = tf.reshape(video_features_cache[video_id], [1, -1])  # (1, num_features)

        # Encode the question
        encoded = _encode_for_bert(question)
        input_ids = encoded['input_ids']
        attention_mask = encoded['attention_mask']

        combined_features = tf.concat(
            [video_features, _mean_text_features(input_ids, attention_mask)], axis=1
        )

        # The model output is already a probability distribution. Applying a
        # second softmax would flatten it and push the confidence below the
        # threshold for every row, so take the maximum directly.
        probs = model(combined_features)
        confidence_score = tf.reduce_max(probs, axis=-1).numpy()

        if confidence_score < threshold:
            # Generate clarifying question
            clarifying_question = generate_clarifying_question(question, "")
            print(f"Clarifying question: {clarifying_question}")

            # Get clarifying answer from the user
            clarifying_answer = input("Please provide the clarifying answer: ")

            # Encode the clarifying question and answer as a second sequence
            clarifying_encoded = _encode_for_bert(clarifying_question + " " + clarifying_answer)
            input_ids = tf.concat([input_ids, clarifying_encoded['input_ids']], axis=0)
            attention_mask = tf.concat(
                [attention_mask, clarifying_encoded['attention_mask']], axis=0
            )

            combined_features = tf.concat(
                [video_features, _mean_text_features(input_ids, attention_mask)], axis=1
            )
            # Only re-run the classifier when its input actually changed.
            probs = model(combined_features)

        # Get final model prediction
        predicted_answer = tf.argmax(probs, axis=-1).numpy()[0]

        y_true.append(true_answer)
        y_pred.append(predicted_answer)
        confidence_scores.append(confidence_score)

    return y_true, y_pred, confidence_scores

# Run the evaluation over the test set; `video_features_cache` and `model`
# must already exist from the earlier cells.
# NOTE(review): y_true holds the raw `answer` values from test_df, while y_pred
# holds argmax class indices — confirm both use the same label encoding before
# relying on these metrics.
y_true, y_pred, confidence_scores = test_ivqa_model(test_df, video_features_cache, model)

# Accuracy plus weighted-average precision / recall / F1.
accuracy = accuracy_score(y_true, y_pred)
precision, recall, f1 = (
    metric(y_true, y_pred, average='weighted')
    for metric in (precision_score, recall_score, f1_score)
)

for label, value in (
    ("Accuracy", accuracy),
    ("Precision", precision),
    ("Recall", recall),
    ("F1 Score", f1),
):
    print(f"{label}: {value}")
