# Image Feature Extraction

In [None]:
import tensorflow as tf
import os
import numpy as np
import json
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from keras.layers import Input, Embedding, LSTM, Dense, Dropout, concatenate
from keras.models import Model
from keras.utils import to_categorical
from tensorflow.keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import re
import pandas as pd
import pickle
from tensorflow.keras.callbacks import ModelCheckpoint
import matplotlib.pyplot as plt
from transformers import BertTokenizer, BertModel
import tensorflow as tf
import os
import numpy as np
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from keras.models import Model
from keras.applications.xception import Xception, preprocess_input
from tensorflow import keras
from tensorflow.keras import layers
from keras.utils import load_img, img_to_array
from tqdm import tqdm
import re
import pickle

In [None]:
import numpy as np
from transformers import AutoFeatureExtractor, DeiTForImageClassificationWithTeacher
from PIL import Image
import torch
import os
from tqdm import tqdm

# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the DeiT preprocessing pipeline and the distilled classification checkpoint.
feature_extractor = AutoFeatureExtractor.from_pretrained('facebook/deit-base-distilled-patch16-224')
model = DeiTForImageClassificationWithTeacher.from_pretrained('facebook/deit-base-distilled-patch16-224')
model.eval().to(device)

image_dir = '/kaggle/input/visual-question-answering/val2014/val2014'

# os.listdir() returns entries in arbitrary order. A later cell pairs row i of the
# features with generator.filenames[i], so iterate in sorted order to keep the two
# aligned.
# NOTE(review): this assumes the Keras directory iterator lists files in sorted
# order and that the folder contains only images it accepts -- confirm.
image_filenames = sorted(os.listdir(image_dir))

train_features = []
for filename in tqdm(image_filenames):
    image_path = os.path.join(image_dir, filename)
    # Close the file handle promptly; convert() returns an independent RGB copy.
    with Image.open(image_path) as raw_img:
        img = raw_img.convert("RGB")

    # Preprocess the image and move the resulting tensors to the compute device.
    inputs = feature_extractor(images=img, return_tensors="pt").to(device)

    # Forward pass without gradient tracking; the classifier logits are used as the
    # feature vector, falling back to last_hidden_state if logits is None.
    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits
        features = logits if logits is not None else outputs.last_hidden_state
    train_features.append(features.cpu().numpy())  # back to CPU as a numpy array

if not train_features:
    raise RuntimeError(f"No images found in {image_dir!r}; nothing to extract.")

# Every entry carries a leading batch axis of size 1. Concatenating along that axis
# yields one row per image and, unlike squeeze(), keeps the sample axis even when
# only a single image was processed.
train_features = np.concatenate(train_features, axis=0)

print(train_features.shape)  # (number_of_images, feature_dim)


In [None]:
# One entry per leading-axis row of train_features.
tp = [feature_row for feature_row in train_features]

In [None]:
# Sanity check: display the length of the first extracted feature row.
len(tp[0])

In [None]:
# Dataset root, working directory, and the derived image folder / output file.
data_dir = "/kaggle/input/visual-question-answering/"
output_dir = "/kaggle/working/"
image_dir = os.path.join(data_dir, "val2014")
output_file = os.path.join(output_dir, "val_features.npy")

# Directory iterator over the images: no labels (class_mode=None) and no shuffling,
# so generator.filenames keeps a fixed order that later cells use for the image ids.
target_size = (299, 299)
datagen = ImageDataGenerator(preprocessing_function=preprocess_input)
flow_options = dict(
    target_size=target_size,
    batch_size=32,
    class_mode=None,
    shuffle=False,
)
generator = datagen.flow_from_directory(image_dir, **flow_options)

In [None]:
# Flatten the per-image rows into one array, then reshape it so there is exactly
# one row per file listed by the directory iterator.
num_listed_images = len(generator.filenames)
tp = np.concatenate(tp).reshape((num_listed_images, -1))

# Persist the feature matrix as a .npy file.
np.save(output_file, tp)

In [None]:
# Map each image id to its feature row. The id is the first run of five or more
# digits found in the file name reported by the directory iterator.
id_pattern = re.compile("[0-9][0-9][0-9][0-9][0-9]+")
img_ids = np.array([int(id_pattern.search(gen).group()) for gen in generator.filenames])
image_features = {img_ids[row]: tp[row] for row in range(len(img_ids))}

In [None]:
# Pickle the {image_id: feature_row} dictionary to diet_val_image_features.pkl
with open('diet_val_image_features.pkl', mode='wb') as features_file:
    pickle.dump(image_features, features_file)
    print('dictionary saved successfully to file')

# Question Encoding

In [None]:
!pip install sentence-transformers

In [None]:
from sentence_transformers import SentenceTransformer

# Load the BERT-based model
# NOTE(review): this rebinds the global name `model`, which an earlier cell uses
# for the DeiT image model.
model = SentenceTransformer('bert-base-uncased')

# List of questions
# NOTE(review): `questions` is only assigned in a later cell ("Extract
# questions,answers,image_ids"), so on a fresh kernel run top-to-bottom this cell
# raises NameError. The commented line below is a small stand-in example.
#questions = ["What is the capital of France?", "Who wrote Harry Potter?", "What is the meaning of life?"]

# Encode the questions (one embedding per entry of `questions`)
question_embeddings = model.encode(questions)

In [None]:
# Sanity check: display the length of the first question embedding.
len(question_embeddings[0])

In [None]:
# Pickle the question embeddings into the working directory.
with open('question_embeddings.pkl', mode='wb') as embeddings_file:
    pickle.dump(question_embeddings, embeddings_file)
    print('Question_Embeddings save successfully to file')

In [None]:
# Load the precomputed question embeddings from the attached Kaggle dataset.
# NOTE(review): pickle.load executes arbitrary code from the file -- only load
# pickles from a source you trust.
embeddings_path = '/kaggle/input/bert-question-embeddings/question_embeddings.pkl'
with open(embeddings_path, mode='rb') as embeddings_file:
    question_embeddings = pickle.load(embeddings_file)
    print('successful')

successful


# Read All Files

In [None]:
train_file_questions = '/kaggle/input/visual-question-answering/v2_Questions_Train_mscoco/v2_OpenEnded_mscoco_train2014_questions.json'
train_file_annotations = '/kaggle/input/visual-question-answering/v2_Annotations_Train_mscoco/v2_mscoco_train2014_annotations.json'
val_file_questions = '/kaggle/input/visual-question-answering/v2_Questions_Val_mscoco/v2_OpenEnded_mscoco_val2014_questions.json'
val_file_annotations = '/kaggle/input/visual-question-answering/v2_Annotations_Val_mscoco/v2_mscoco_val2014_annotations.json'


def _load_json_field(path, field):
    """Parse the JSON file at `path` and return its top-level `field` entry."""
    # The `with` block closes the file on exit; no explicit close() is needed.
    with open(path, 'r') as json_file:
        return json.load(json_file)[field]


def _load_pickle(path):
    """Unpickle and return the object stored at `path` (trusted files only)."""
    with open(path, 'rb') as pickle_file:
        loaded = pickle.load(pickle_file)
        print('successful')
    return loaded


train_questions = _load_json_field(train_file_questions, 'questions')
train_annotations = _load_json_field(train_file_annotations, 'annotations')
val_questions = _load_json_field(val_file_questions, 'questions')
val_annotations = _load_json_field(val_file_annotations, 'annotations')

#### read all train and validate image features with IDs from the pkl files
train_imgs_features = _load_pickle('/kaggle/input/deit-image-features/diet_train_image_features.pkl')
val_imgs_features = _load_pickle('/kaggle/input/deit-image-features/diet_val_image_features.pkl')

#### append validate to train features
print("Length of train_imgs_features:", len(train_imgs_features))
print("Length of val_imgs_features:", len(val_imgs_features))

# Merge the validation entries into the training dictionary in place; from here on
# train_imgs_features holds the features of both splits.
train_imgs_features.update(val_imgs_features)

print(len(train_imgs_features))

#### append validate questions and answers to train questions and answers
# Extended in place: train_questions / train_annotations now cover train + val.
train_questions.extend(val_questions)
train_annotations.extend(val_annotations)

successful
successful
Length of train_imgs_features: 82783
Length of val_imgs_features: 40504
123287


# Extract questions, answers and image IDs

In [None]:
# Extract the question text, its answer and the id of the image it refers to.
#
# Answers are looked up by question_id instead of by list position, so a question
# can never be paired with the annotation of a different question if the two lists
# are ordered differently. The order of `questions` still follows train_questions.
# NOTE(review): relies on every question and annotation record carrying a
# 'question_id' key, as in the VQA v2 JSON files -- confirm for other datasets.
answer_by_question_id = {
    annotation['question_id']: annotation['multiple_choice_answer']
    for annotation in train_annotations
}

questions = []
answers = []
features_id = []

for question_record in train_questions:
    questions.append(question_record['question'])
    # Raises KeyError if a question has no matching annotation.
    answers.append(answer_by_question_id[question_record['question_id']])
    features_id.append(question_record["image_id"])
print("Length of features_id:", len(features_id))
print("Maximum index in features_id:", max(features_id))

Length of features_id: 658111
Maximum index in features_id: 581929


# Answers to one-hot-encoding

In [None]:
# Word-level tokenization of the answers.
# NOTE(review): num_classes and padded_answers are not used by the label encoding
# further down in this cell; they are kept in case other cells rely on them.
answers_tokenizer = Tokenizer()
answers_tokenizer.fit_on_texts(answers)
answer_word_index = answers_tokenizer.word_index
num_classes = len(answer_word_index)
answer_sequences = answers_tokenizer.texts_to_sequences(answers)

# Pad the answer sequences to ensure they all have the same length
max_answer_length = max(len(seq) for seq in answer_sequences)
padded_answers = pad_sequences(answer_sequences, maxlen=max_answer_length)

# Map every distinct answer string to an integer class label.
# The answers are sorted so the mapping is identical on every run: iterating a
# plain set of strings can change order between interpreter sessions, which would
# silently reassign class indices and make label_map disagree with any model
# trained in an earlier session.
unique_answers = sorted(set(answers))
label_map = {answer: i for i, answer in enumerate(unique_answers)}
with open('label_map_Full.pkl', 'wb') as fp:
    pickle.dump(label_map, fp)
    print('label_map save successfully to file')

# Convert the answers to integer labels and then to one-hot vectors
labels = [label_map[answer] for answer in answers]
one_hot_answers = to_categorical(labels, num_classes=len(unique_answers))

#### shape of the dataset

print(len(features_id))
print(question_embeddings.shape)
print(one_hot_answers.shape)

label_map save successfully to file
658111
(658111, 768)
(658111, 29332)


# Train, Test and Validation Split

In [None]:
### Hold out 20% of the samples, then split that hold-out evenly into test and
### validation (roughly 80% train / 10% test / 10% validation overall).
#### Rows are popped out of the existing lists to avoid holding a second full copy.
#### NOTE: this cell mutates features_id in place, so it must be run exactly once
#### after the extraction cell; re-running it shrinks the training set again.

SPLIT_SEED = 42  # fixed seed so the split is reproducible across runs
split_rng = np.random.default_rng(SPLIT_SEED)


def _pop_rows(rows, descending_indices):
    """Remove rows[i] for each index (given in descending order); return the removed rows."""
    # Descending order guarantees that a pop never shifts a position still to be
    # popped, provided the indices are unique.
    return [rows.pop(i) for i in descending_indices]


# Sample WITHOUT replacement: duplicate indices would pop rows that were never
# selected and can run past the end of the shrinking list.
split_indices = split_rng.choice(
    len(features_id), size=int(len(features_id) * 0.2), replace=False
)
split_indices = sorted(split_indices, reverse=True)

# The same indices are applied to all three parallel lists to keep them aligned.
padded_sequences = list(question_embeddings)
test_padded_sequences = _pop_rows(padded_sequences, split_indices)

one_hot_answers = list(one_hot_answers)
test_one_hot_answers = _pop_rows(one_hot_answers, split_indices)

test_features_id = _pop_rows(features_id, split_indices)

### Move half of the hold-out set into the validation set.
split_indices = split_rng.choice(
    len(test_features_id), size=int(len(test_features_id) * 0.5), replace=False
)
split_indices = sorted(split_indices, reverse=True)

val_padded_sequences = _pop_rows(test_padded_sequences, split_indices)
val_one_hot_answers = _pop_rows(test_one_hot_answers, split_indices)
val_features_id = _pop_rows(test_features_id, split_indices)
print("Length of val_features_id:", len(val_features_id))
print("Maximum index in val_features_id:", max(val_features_id))

Length of val_features_id: 65811
Maximum index in val_features_id: 581929


# Architecture

In [None]:
# Define the input layers: a 768-dim question vector and a 1000-dim image vector
question_input = Input(shape=(768, ), name='question_input')
image_input = Input(shape=(1000, ), name='image_input')

# Define the dense layer (followed by dropout) for the image features
image_dense = Dense(units=256, activation='relu', name='image_dense')(image_input)
image_dense = Dropout(0.2, name='image_dropout')(image_dense)

# Concatenate the question input with the image input.
# NOTE(review): the concatenation takes the raw `image_input`, not `image_dense`,
# so the dense + dropout branch defined above does not feed into `output`.
# Confirm whether `image_dense` was intended here.
concatenated = concatenate([question_input, image_input], name='concatenated')

# Stack of fully connected layers applied to the concatenated vector
dense_cnc = Dense(units=512, activation='relu', name='dens_conc')(concatenated)
dense_cnc2 = Dense(units=256, activation='relu', name='dens_conc2')(dense_cnc)
dense_cnc3 = Dense(units=512, activation='relu', name='dens_conc3')(dense_cnc2)
dense_cnc4 = Dense(units=256, activation='relu', name='dens_conc4')(dense_cnc3)
# Define the output layer for the classification: one softmax unit per unique answer
# NOTE(review): no Model(...) is built from these tensors in this cell; the next
# cell loads a saved model from disk instead.
output = Dense(units=len(unique_answers), activation='softmax', name='output')(dense_cnc4)

In [None]:
# Restore the saved network from disk and re-compile it with SGD + momentum.
saved_model_path = "/kaggle/input/deit_bert/tensorflow2/version-1/1/Deit_Bert.h5"
model = tf.keras.models.load_model(saved_model_path)

sgd_optimizer = tf.keras.optimizers.SGD(learning_rate=0.001, momentum=0.9)
model.compile(
    loss='categorical_crossentropy',
    optimizer=sgd_optimizer,
    metrics=['accuracy'],
)
model.summary()

# Data Generator for Training

In [None]:
def data_generator(image_features, padded_questions, labels, batch_size, feature_lookup=None):
    """Yield ((questions, image_features), labels) batches forever.

    Args:
        image_features: sequence of image ids, one per sample; each id is used as
            a key into `feature_lookup`.
        padded_questions: sequence of question vectors, parallel to `labels`.
        labels: sequence of label vectors, one per sample.
        batch_size: number of samples per batch; must be positive.
        feature_lookup: mapping from image id to feature vector. Defaults to the
            notebook-level `train_imgs_features` dictionary.

    Yields:
        A tuple `((question_batch, image_batch), label_batch)` of float32 tensors.
        Only full batches are produced: the trailing `len(labels) % batch_size`
        samples are never yielded. After the last full batch the generator starts
        again from the first sample.

    Raises:
        ValueError: on first iteration, if `batch_size` is not positive, if there
            are fewer samples than one batch (the loop would otherwise spin
            forever without yielding), or if `image_features` /
            `padded_questions` are too short to fill the batches.
    """
    if batch_size <= 0:
        raise ValueError(f"batch_size must be positive, got {batch_size}")

    num_samples = len(labels)
    steps_per_epoch = num_samples // batch_size
    if steps_per_epoch == 0:
        raise ValueError(
            f"Need at least batch_size={batch_size} samples to form a batch, got {num_samples}"
        )

    # Fail early instead of silently emitting image/question batches whose sizes
    # disagree with the label batch.
    samples_needed = steps_per_epoch * batch_size
    if min(len(image_features), len(padded_questions)) < samples_needed:
        raise ValueError(
            "image_features and padded_questions must each provide at least "
            f"{samples_needed} entries to match labels"
        )

    if feature_lookup is None:
        feature_lookup = train_imgs_features

    while True:
        for step in range(steps_per_epoch):
            start = step * batch_size
            stop = start + batch_size
            # Resolve the image ids of this batch to their feature vectors.
            batch_image_features = [
                feature_lookup[image_id] for image_id in image_features[start:stop]
            ]
            yield (
                (
                    tf.convert_to_tensor(padded_questions[start:stop], dtype=tf.float32),
                    tf.convert_to_tensor(batch_image_features, dtype=tf.float32),
                ),
                tf.convert_to_tensor(labels[start:stop], dtype=tf.float32),
            )

Using one quarter (1/4) of the original data

In [None]:
# Keep only the first quarter of the training lists.
split = len(features_id) // 4
fd, ps, oha = features_id[:split], padded_sequences[:split], one_hot_answers[:split]

# Same reduction for the validation lists.
size = len(val_features_id)
val_split = size // 4
vfd, vps, voha = (
    val_features_id[:val_split],
    val_padded_sequences[:val_split],
    val_one_hot_answers[:val_split],
)
len(voha)

16452

# Training Loop

In [None]:
batch_size = 64
steps_per_epoch = len(oha) // batch_size

# Initialize ModelCheckpoint: keep only the weights with the best validation accuracy.
checkpoint = ModelCheckpoint('dino_lstm.keras', monitor='val_accuracy', save_best_only=True)

# Endless batch streams for the training and validation subsets.
train_batches = data_generator(fd, ps, oha, batch_size)
val_batches = data_generator(vfd, vps, voha, batch_size)

history = model.fit(
    train_batches,
    steps_per_epoch=steps_per_epoch,
    epochs=10,
    validation_data=val_batches,
    validation_steps=len(vfd) // batch_size,
    callbacks=[checkpoint],
)

# Model Save

In [None]:
# Save the trained model to Deit_Bert_1.h5 in the current working directory
model.save("Deit_Bert_1.h5")

In [None]:
# Per-epoch metrics recorded by model.fit, as a plain dictionary.
history_dict = history.history

# Tabulate the metrics and write them to CSV without the index column.
history_df = pd.DataFrame(data=history_dict)
history_df.to_csv(path_or_buf='training_history_deit_bert.csv', index=False)

# Show the table as the cell output.
history_df

# Testing

In [None]:
# Endless generator over the test split that yields one sample per step (batch size 1).
a = data_generator(test_features_id,test_padded_sequences,test_one_hot_answers,1)

# Prediction on Test Data

In [None]:
# Predict every test sample exactly once. The count is derived from the test set
# itself instead of a hard-coded number, because the generator `a` never stops on
# its own and a stale constant would either skip samples or wrap around.
num_test_samples = len(test_one_hot_answers)

p = []  # predicted class index per test sample
t = []  # true class index per test sample
for _ in range(num_test_samples):
    # A separate name is used for the batch labels so the notebook-level `labels`
    # list from the answer-encoding cell is not overwritten.
    inp, true_one_hot = next(a)
    # verbose=0 suppresses the per-call progress bar (one line per sample otherwise).
    pred = model.predict(inp, verbose=0)
    # Each batch holds a single sample; store plain integers so that `t` and `p`
    # are flat label lists.
    t.append(int(np.argmax(true_one_hot, axis=1)[0]))
    p.append(int(np.argmax(pred, axis=1)[0]))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 862ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step


I0000 00:00:1716014703.310130     133 device_compiler.h:186] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21

In [None]:
from sklearn.metrics import classification_report
# Per-class precision / recall / F1 computed from the true labels `t` and the
# predicted labels `p` collected by the prediction loop.
report = classification_report(t, p)
print(report)

              precision    recall  f1-score   support

           1       0.00      0.00      0.00         1
           2       0.00      0.00      0.00         0
          34       1.00      0.50      0.67         2
          35       0.00      0.00      0.00         0
          37       0.00      0.00      0.00         1
          38       0.00      0.00      0.00         0
         112       0.00      0.00      0.00         1
         113       0.00      0.00      0.00         0
         133       1.00      0.33      0.50         3
         134       0.00      0.00      0.00         0
         157       1.00      0.15      0.27        26
         158       0.00      0.00      0.00         0
         160       0.00      0.00      0.00         1
         161       0.00      0.00      0.00         0
         169       0.00      0.00      0.00         1
         170       0.00      0.00      0.00         0
         202       0.00      0.00      0.00         1
         203       0.00    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
import pandas as pd

# Hand-entered summary rows (macro and weighted averages), stored column by column.
# NOTE(review): these numbers are typed in rather than computed from `report` --
# confirm they match the classification report above.
data = dict([
    ("metrics", ["macro avg", "weighted avg"]),
    ("precision", [0.17, 0.80]),
    ("recall", [0.11, 0.20]),
    ("f1-score", [0.12, 0.30]),
    ("support", [6581, 6581]),
])

# Tabulate the summary and show it as the cell output.
df = pd.DataFrame.from_dict(data)

df

Unnamed: 0,metrics,precision,recall,f1-score,support
0,macro avg,0.17,0.11,0.12,6581
1,weighted avg,0.8,0.2,0.3,6581
