In [1]:
import os
os.environ["HF_TOKEN"] = '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'

In [1]:
import tensorflow as tf
from tensorflow.keras import layers, Model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np
import pandas as pd
import cv2
import os

# Constants
MAX_VOCAB_SIZE = 10000
MAX_SEQUENCE_LENGTH = 20
EMBEDDING_DIM = 256
LSTM_UNITS = 256
NUM_HEADS = 8
FF_DIM = 512
BATCH_SIZE = 64
IMG_SIZE = 224

In [2]:
class DataGenerator(tf.keras.utils.Sequence):
    def __init__(self, dataframe, tokenizer, img_dir, batch_size=BATCH_SIZE, shuffle=True):
        self.df = dataframe
        self.batch_size = batch_size
        self.tokenizer = tokenizer
        self.img_dir = img_dir
        self.shuffle = shuffle
        self.indexes = np.arange(len(self.df))
        if self.shuffle:
            np.random.shuffle(self.indexes)

    def __len__(self):
        return int(np.ceil(len(self.df) / float(self.batch_size)))
    
    # def __getitem__(self, idx):
    #     batch_indexes = self.indexes[idx * self.batch_size:(idx + 1) * self.batch_size]
    #     batch_df = self.df.iloc[batch_indexes]

    #     X_image = self.load_images(batch_df['image_id'])
    #     X_question = self.tokenizer.texts_to_sequences(batch_df['question_preprocessed'])
    #     X_question = pad_sequences(X_question, maxlen=MAX_SEQUENCE_LENGTH)
    #     y = self.tokenizer.texts_to_sequences(batch_df['answer_preprocessed'])
    #     y = pad_sequences(y, maxlen=1)  # 答えは単一のトークン
    #     y = y.reshape(-1)  # (batch_size,) の形状に変更

    #     return [X_image, X_question], y


    def __getitem__(self, idx):
        batch_indexes = self.indexes[idx * self.batch_size:(idx + 1) * self.batch_size]
        batch_df = self.df.iloc[batch_indexes]

        X_image = self.load_images(batch_df['image_id'])
        X_question = self.tokenizer.texts_to_sequences(batch_df['question_preprocessed'])
        X_question = pad_sequences(X_question, maxlen=MAX_SEQUENCE_LENGTH)
        y = self.tokenizer.texts_to_sequences(batch_df['answer_preprocessed'])
        y = pad_sequences(y, maxlen=MAX_SEQUENCE_LENGTH)
        
        # Convert y to sparse categorical format (use the first token as the answer)
        y = y[:, 0]

        return [X_image, X_question], y

    def on_epoch_end(self):
        if self.shuffle:
            np.random.shuffle(self.indexes)

    def load_images(self, image_ids):
        images = []
        for img_id in image_ids:
            path = os.path.join(self.img_dir, f"{img_id}")
            img = cv2.imread(path)
            img = cv2.resize(img, (IMG_SIZE, IMG_SIZE))
            img = np.array(img) / 255.0
            images.append(img)
        return np.array(images)

In [3]:
class TransformerDecoderLayer(layers.Layer):
    def __init__(self, d_model, num_heads, dff, rate=0.1):
        super(TransformerDecoderLayer, self).__init__()

        self.mha1 = layers.MultiHeadAttention(num_heads, d_model)
        self.mha2 = layers.MultiHeadAttention(num_heads, d_model)

        self.ffn = tf.keras.Sequential([
            layers.Dense(dff, activation='relu'),
            layers.Dense(d_model)
        ])

        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm3 = layers.LayerNormalization(epsilon=1e-6)

        self.dropout1 = layers.Dropout(rate)
        self.dropout2 = layers.Dropout(rate)
        self.dropout3 = layers.Dropout(rate)

    def call(self, x, enc_output, training, look_ahead_mask=None, padding_mask=None):
        attn1 = self.mha1(query=x, key=x, value=x, attention_mask=look_ahead_mask)
        attn1 = self.dropout1(attn1, training=training)
        out1 = self.layernorm1(attn1 + x)

        attn2 = self.mha2(query=out1, key=enc_output, value=enc_output, attention_mask=padding_mask)
        attn2 = self.dropout2(attn2, training=training)
        out2 = self.layernorm2(attn2 + out1)

        ffn_output = self.ffn(out2)
        ffn_output = self.dropout3(ffn_output, training=training)
        out3 = self.layernorm3(ffn_output + out2)

        return out3

In [4]:
#1kome
class VQAModel(Model):
    def __init__(self, vocab_size, max_length):
        super(VQAModel, self).__init__()
        self.image_model = tf.keras.applications.ResNet50(include_top=False, weights='imagenet')
        self.image_model.trainable = False
        self.image_dense = layers.Dense(EMBEDDING_DIM, activation='relu')
        
        self.embedding = layers.Embedding(vocab_size, EMBEDDING_DIM)
        self.lstm = layers.LSTM(LSTM_UNITS, return_sequences=True)
        
        self.decoder_layer = TransformerDecoderLayer(EMBEDDING_DIM, NUM_HEADS, FF_DIM)
        self.final_dense = layers.Dense(EMBEDDING_DIM, activation='relu')
        self.output_layer = layers.Dense(vocab_size, activation='softmax')

    def call(self, inputs):
        image, question = inputs
        
        # Image encoding
        image_features = self.image_model(image)
        image_features = layers.GlobalAveragePooling2D()(image_features)
        image_features = self.image_dense(image_features)
        image_features = tf.expand_dims(image_features, 1)
        
        # Question encoding
        embedded_question = self.embedding(question)
        question_features = self.lstm(embedded_question)
        
        # Combine image and question features
        decoder_input = tf.concat([image_features, question_features], axis=1)
        
        # Transformer decoder
        decoder_output = self.decoder_layer(decoder_input, decoder_input, training=True)
        
        # Final processing
        output = self.final_dense(decoder_output)
        output = tf.reduce_mean(output, axis=1)  # Global average pooling
        output = self.output_layer(output)
        
        return output

In [35]:
#2kome
class VQAModel(Model):
    def __init__(self, vocab_size, max_length):
        super(VQAModel, self).__init__()
        self.image_model = tf.keras.applications.ResNet50(include_top=False, weights='imagenet')
        self.image_model.trainable = False  # 必要に応じてTrue に設定
        self.image_dense = layers.Dense(EMBEDDING_DIM, activation='relu')
        
        self.embedding = layers.Embedding(vocab_size, EMBEDDING_DIM)
        self.lstm = layers.Bidirectional(layers.LSTM(LSTM_UNITS, return_sequences=True))
        
        self.decoder_layers = [TransformerDecoderLayer(EMBEDDING_DIM, NUM_HEADS, FF_DIM) for _ in range(3)]
        self.final_attention = layers.MultiHeadAttention(NUM_HEADS, EMBEDDING_DIM)
        self.final_dense = layers.Dense(EMBEDDING_DIM, activation='relu')
        self.output_layer = layers.Dense(vocab_size, activation='softmax')

    def call(self, inputs):
        image, question = inputs
        
        # Image encoding
        image_features = self.image_model(image)
        image_features = layers.GlobalAveragePooling2D()(image_features)
        image_features = self.image_dense(image_features)
        image_features = tf.expand_dims(image_features, 1)
        
        # Question encoding
        embedded_question = self.embedding(question)
        question_features = self.lstm(embedded_question)
        
        # Combine image and question features
        decoder_input = tf.concat([image_features, question_features], axis=1)
        
        # Multiple Transformer decoder layers
        for decoder_layer in self.decoder_layers:
            decoder_input = decoder_layer(decoder_input, decoder_input, training=True)
        
        # Final attention and processing
        attention_output = self.final_attention(decoder_input, decoder_input, decoder_input)
        output = self.final_dense(attention_output)
        output = tf.reduce_mean(output, axis=1)  # Global average pooling
        output = self.output_layer(output)
        
        return output

In [5]:
# Prepare data
data_df_k1000 = pd.read_csv("./0Data/mscoco_train2014_preprocessed_k1000.csv")
X = data_df_k1000[['image_id', 'question_preprocessed', 'answer_preprocessed']]
y = data_df_k1000['answer_preprocessed']

In [6]:
# Tokenize text
tokenizer = Tokenizer(num_words=MAX_VOCAB_SIZE, oov_token="<OOV>")
tokenizer.fit_on_texts(X['question_preprocessed'].tolist() + y.tolist())

In [7]:
# Create data generators
img_dir = "./0Data/MSCOCO/"  # MSCOCOの画像が保存されているディレクトリパスを指定してください
train_generator = DataGenerator(X, tokenizer, img_dir)

In [37]:
from sklearn.model_selection import train_test_split
# perform train validation & test split on the dataset
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.10, stratify=y, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size=0.10, stratify=y_train, random_state=42)


#X_train,y_train = pickle.load(open('./0Data/train_1129.pkl', 'rb'))
#X_val,y_val = pickle.load(open('./0Data/val_1129.pkl', 'rb'))
#X_test,y_test = pickle.load(open('./0Data/test_1129.pkl', 'rb'))



print(X_train.shape, y_train.shape)
print(X_val.shape, y_val.shape)
print(X_test.shape, y_test.shape)

(314499, 3) (314499,)
(38828, 3) (38828,)
(34945, 3) (34945,)


In [8]:
# Create and compile model
model = VQAModel(MAX_VOCAB_SIZE, MAX_SEQUENCE_LENGTH)
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-4)
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False)

model.compile(optimizer=optimizer, loss=loss_fn, metrics=['accuracy'])

In [23]:
# Train model
history = model.fit(train_generator, epochs=10)

Epoch 1/10


2024-10-16 17:29:12.591345: I external/local_xla/xla/service/service.cc:168] XLA service 0x5614e87ec460 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2024-10-16 17:29:12.591434: I external/local_xla/xla/service/service.cc:176]   StreamExecutor device (0): NVIDIA A100-SXM4-40GB, Compute Capability 8.0
2024-10-16 17:29:12.591450: I external/local_xla/xla/service/service.cc:176]   StreamExecutor device (1): NVIDIA A100-SXM4-40GB, Compute Capability 8.0
2024-10-16 17:29:12.591462: I external/local_xla/xla/service/service.cc:176]   StreamExecutor device (2): NVIDIA A100-SXM4-40GB, Compute Capability 8.0
2024-10-16 17:29:12.591473: I external/local_xla/xla/service/service.cc:176]   StreamExecutor device (3): NVIDIA A100-SXM4-40GB, Compute Capability 8.0
2024-10-16 17:29:12.674030: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:269] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
I0000 00:00:172

Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [27]:
model.save('./0Data/model/gpu/model_t51016', save_format="tf")

INFO:tensorflow:Assets written to: ./0Data/model/gpu/model_t51016/assets


INFO:tensorflow:Assets written to: ./0Data/model/gpu/model_t51016/assets


In [28]:
model.save_weights('./0Data/model/gpu/model_t51016_weights.h5')

In [14]:
def generate_answer(image_id, question):
    image_path = os.path.join(img_dir, f"{image_id}")
    img = cv2.imread(image_path)
    img = cv2.resize(img, (IMG_SIZE, IMG_SIZE))
    img = np.array(img) / 255.0
    img = np.expand_dims(img, axis=0)
    
    question_seq = tokenizer.texts_to_sequences([question])
    question_padded = pad_sequences(question_seq, maxlen=MAX_SEQUENCE_LENGTH)
    
    predictions = model.predict([img, question_padded])
    predicted_seq = np.argmax(predictions, axis=-1)  # axis=-1 を追加
    
    # デバッグ用の出力
    print("predictions shape:", predictions.shape)
    print("predicted_seq shape:", predicted_seq.shape)
    
    # predicted_seq が2次元の場合、最初の（そして唯一の）シーケンスを取得
    if len(predicted_seq.shape) == 2:
        predicted_seq = predicted_seq[0]
    
    # predicted_seq をリストに変換
    predicted_seq_list = predicted_seq.tolist()
    
    predicted_answer = tokenizer.sequences_to_texts([predicted_seq_list])[0]
    
    return predicted_answer

# 使用例
sample_image_id = X['image_id'].iloc[2]
sample_question = X['question_preprocessed'].iloc[2]
generated_answer = generate_answer(sample_image_id, sample_question)
print(f"Question: {sample_question}")
print(f"Generated Answer: {generated_answer}")

predictions shape: (1, 10000)
predicted_seq shape: (1,)
Question: is the sky blue
Generated Answer: gym


In [15]:
def generate_answer(image_id, question):
    image_path = os.path.join(img_dir, f"{image_id}")
    img = cv2.imread(image_path)
    img = cv2.resize(img, (IMG_SIZE, IMG_SIZE))
    img = np.array(img) / 255.0
    img = np.expand_dims(img, axis=0)
    
    question_seq = tokenizer.texts_to_sequences([question])
    question_padded = pad_sequences(question_seq, maxlen=MAX_SEQUENCE_LENGTH)
    
    predictions = model.predict([img, question_padded])
    predicted_seq = np.argmax(predictions, axis=-1)
    
    print("predictions shape:", predictions.shape)
    print("predicted_seq shape:", predicted_seq.shape)
    print("predicted_seq:", predicted_seq)
    
    # Top 5の予測結果を表示
    top_5_indices = np.argsort(predictions[0])[-5:][::-1]
    print("Top 5 predictions:")
    for idx in top_5_indices:
        word = tokenizer.index_word.get(idx, "<UNK>")
        prob = predictions[0][idx]
        print(f"  {word}: {prob:.4f}")
    
    predicted_seq_list = predicted_seq.tolist()
    predicted_answer = tokenizer.sequences_to_texts([predicted_seq_list])[0]
    
    return predicted_answer

# 使用例
sample_image_id = X['image_id'].iloc[0]
sample_question = X['question_preprocessed'].iloc[0]
generated_answer = generate_answer(sample_image_id, sample_question)
print(f"Question: {sample_question}")
print(f"Generated Answer: {generated_answer}")

predictions shape: (1, 10000)
predicted_seq shape: (1,)
predicted_seq: [3047]
Top 5 predictions:
  gym: 0.0002
  grumpy: 0.0002
  televisions: 0.0002
  repeat: 0.0002
  snows: 0.0002
Question: is this man a professional baseball player
Generated Answer: gym


In [40]:
def generate_answer(image_id, question):
    image_path = os.path.join(img_dir, f"{image_id}")
    img = cv2.imread(image_path)
    img = cv2.resize(img, (IMG_SIZE, IMG_SIZE))
    img = np.array(img) / 255.0
    img = np.expand_dims(img, axis=0)
    
    question_seq = tokenizer.texts_to_sequences([question])
    question_padded = pad_sequences(question_seq, maxlen=MAX_SEQUENCE_LENGTH)
    
    predictions = model.predict([img, question_padded])
    predicted_token = np.argmax(predictions[0])
    
    print("predictions shape:", predictions.shape)
    print("predicted_token:", predicted_token)
    
    # Top 5の予測結果を表示
    top_5_indices = np.argsort(predictions[0])[-5:][::-1]
    print("Top 5 predictions:")
    for idx in top_5_indices:
        word = tokenizer.index_word.get(idx, "<UNK>")
        prob = predictions[0][idx]
        print(f"  {word}: {prob:.4f}")
    
    predicted_answer = tokenizer.index_word.get(predicted_token, "<UNK>")
    
    return predicted_answer

In [41]:
# 使用例
sample_image_id = X['image_id'].iloc[0]
sample_question = X['question_preprocessed'].iloc[0]
generated_answer = generate_answer(sample_image_id, sample_question)
print(f"Question: {sample_question}")
print(f"Generated Answer: {generated_answer}")

ValueError: in user code:

    File "/data/t32303m/anaconda3/envs/KIBU3/lib/python3.9/site-packages/keras/src/engine/training.py", line 2440, in predict_function  *
        return step_function(self, iterator)
    File "/data/t32303m/anaconda3/envs/KIBU3/lib/python3.9/site-packages/keras/src/engine/training.py", line 2425, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "/data/t32303m/anaconda3/envs/KIBU3/lib/python3.9/site-packages/keras/src/engine/training.py", line 2413, in run_step  **
        outputs = model.predict_step(data)
    File "/data/t32303m/anaconda3/envs/KIBU3/lib/python3.9/site-packages/keras/src/engine/training.py", line 2381, in predict_step
        return self(x, training=False)
    File "/data/t32303m/anaconda3/envs/KIBU3/lib/python3.9/site-packages/keras/src/utils/traceback_utils.py", line 70, in error_handler
        raise e.with_traceback(filtered_tb) from None
    File "/tmp/__autograph_generated_filegpfkik1x.py", line 17, in tf__call
        decoder_input = ag__.converted_call(ag__.ld(tf).concat, ([ag__.ld(image_features), ag__.ld(question_features)],), dict(axis=1), fscope)

    ValueError: Exception encountered when calling layer 'vqa_model_4' (type VQAModel).
    
    in user code:
    
        File "/tmp/ipykernel_110627/3008830667.py", line 31, in call  *
            decoder_input = tf.concat([image_features, question_features], axis=1)
    
        ValueError: Dimension 0 in both shapes must be equal, but are 256 and 512. Shapes are [256] and [512]. for '{{node vqa_model_4/concat}} = ConcatV2[N=2, T=DT_FLOAT, Tidx=DT_INT32](vqa_model_4/ExpandDims, vqa_model_4/bidirectional/concat, vqa_model_4/concat/axis)' with input shapes: [?,1,256], [?,50,512], [] and with computed input tensors: input[2] = <1>.
    
    
    Call arguments received by layer 'vqa_model_4' (type VQAModel):
      • inputs=('tf.Tensor(shape=(None, 224, 224, 3), dtype=float32)', 'tf.Tensor(shape=(None, 50), dtype=int32)')


In [32]:
print("Tokenizer word_index:", list(tokenizer.word_index.items())[:10])
print("Tokenizer index_word:", list(tokenizer.index_word.items())[:10])

Tokenizer word_index: [('<OOV>', 1), ('the', 2), ('is', 3), ('what', 4), ('are', 5), ('yes', 6), ('no', 7), ('this', 8), ('in', 9), ('a', 10)]
Tokenizer index_word: [(1, '<OOV>'), (2, 'the'), (3, 'is'), (4, 'what'), (5, 'are'), (6, 'yes'), (7, 'no'), (8, 'this'), (9, 'in'), (10, 'a')]


In [33]:
print(model.layers[-1].get_config())

{'name': 'dense_16', 'trainable': True, 'dtype': 'float32', 'units': 10000, 'activation': 'softmax', 'use_bias': True, 'kernel_initializer': {'module': 'keras.initializers', 'class_name': 'GlorotUniform', 'config': {'seed': None}, 'registered_name': None}, 'bias_initializer': {'module': 'keras.initializers', 'class_name': 'Zeros', 'config': {}, 'registered_name': None}, 'kernel_regularizer': None, 'bias_regularizer': None, 'activity_regularizer': None, 'kernel_constraint': None, 'bias_constraint': None}
