In [1]:
import tensorflow as tf
from transformers import BertTokenizer, TFBertForSequenceClassification
from tensorflow.keras.applications import ResNet50V2
from tensorflow.keras.layers import Input, Dense, GlobalAveragePooling2D, Concatenate, Dropout
from tensorflow.keras.models import Model
import numpy as np
from PIL import Image
import pandas as pd
import os




In [2]:
bert_model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(bert_model_name)
bert_model = TFBertForSequenceClassification.from_pretrained(bert_model_name, num_labels=2)




All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [3]:
resnet_model = ResNet50V2(weights='imagenet', include_top=False)




In [4]:
label = pd.read_csv('devset_images_gt.csv') 
# Preview the first 5 lines of the loaded data 
label_array = label.values


In [5]:
folder_path = "C:/Users/minhd/FPTU lab/DPL302m/Kaggle/2024-sum-dpl-302-m/devset_images/devset_images"
def load_and_preprocess_image(image_path, target_size=(256, 256)):
    image = Image.open(image_path)
    image = image.resize(target_size)
    image = np.array(image)
    if image.shape[-1] == 4:  # Nếu ảnh có kênh alpha (RGBA), chuyển về RGB
        image = image[:, :, :3]
    image = image / 255.0  # Chuẩn hóa ảnh về khoảng [0, 1]
    return image

# Đọc tất cả các ảnh trong folder
images = []
Y_train = []

# List of allowed file extensions
allowed_extensions = ['.jpg', '.png', '.gif']

# Giả sử label_array là một mảng các tên file và nhãn tương ứng
# Cập nhật lại danh sách các file ảnh với các định dạng khác nhau
for filename in os.listdir(folder_path):
    # Check if the file has one of the allowed extensions
    if os.path.splitext(filename)[1].lower() in allowed_extensions:
        # Extract the base name without the extension to match with label_array
        basename = os.path.splitext(filename)[0]
        # Find the corresponding label for the current image file
        for i in label_array:
            if basename == str(i[0]):
                image_path = os.path.join(folder_path, filename)
                image = load_and_preprocess_image(image_path)
                images.append(image)
                Y_train.append(i[1])
                break  # Break the inner loop once the label is found

In [88]:
images = np.array(images)
Y_train = np.array(Y_train)

In [7]:
# Đọc trực tiếp file JSON thành DataFrame
df = pd.read_json('devset_images_metadata.json')

# Nếu cần chuyển đổi từ cột chứa danh sách các đối tượng thành các cột DataFrame
df = pd.json_normalize(df['images'])

# Hiển thị DataFrame
df.head()

Unnamed: 0,description,user_tags,title,license_name,user_nsid,image_extension_original,longitude,image_id,license_url,date_uploaded,date_taken,latitude,image_url,user_nickname,capture_device
0,,"[2009 road trip, obrero road trip]",Biltmore Estate,Attribution-NonCommercial-NoDerivs License,95156977@N00,jpg,,3519864665,http://creativecommons.org/licenses/by-nc-nd/2.0/,1242004112,2009-05-10 08:27:33.0,,http://www.flickr.com/photos/95156977@N00/3519...,5 Flip-Flops (Earl),Canon EOS DIGITAL REBEL XT
1,,"[daulatabad, daulatabad fort, ellora, road trip]",Chand Minar,Attribution-ShareAlike License,24574470@N00,jpg,75.200386,4896119055,http://creativecommons.org/licenses/by-sa/2.0/,1281931224,2010-08-14 13:35:10.0,19.939383,http://www.flickr.com/photos/24574470@N00/4896...,sankarshan,NIKON CORPORATION NIKON D90
2,"After the flood, the boarded up stores bear up...","[cedarrapids, createsouthroadtrip2009, disaste...",Uplifting Graffiti,Attribution License,73451168@N00,jpg,,3468473862,http://creativecommons.org/licenses/by/2.0/,1240493762,2009-04-21 18:07:56.0,,http://www.flickr.com/photos/73451168@N00/3468...,J Wynia,Panasonic DMC-TZ5
3,,"[cork, enchente, flood, ireland, irlanda]",DSCF6487,Attribution-NonCommercial-NoDerivs License,12947023@N00,jpg,-8.621177,4120853942,http://creativecommons.org/licenses/by-nc-nd/2.0/,1258754762,2009-11-20 15:16:40.0,51.889603,http://www.flickr.com/photos/12947023@N00/4120...,guileite,FUJIFILM FinePix S6000fd
4,,"[athens georgia, brown, current, flood, mud, r...",Oconoe river - flooded,Attribution License,60704492@N00,jpg,-83.368265,4436083254,http://creativecommons.org/licenses/by/2.0/,1268676971,2010-03-13 15:14:04.0,33.949149,http://www.flickr.com/photos/60704492@N00/4436...,The_Gut,Canon PowerShot SX10 IS


In [8]:
label.rename(columns = {'id': 'image_id', 'label': 'train_y'}, inplace = True)
label.head()

Unnamed: 0,image_id,train_y
0,3519864665,0
1,4896119055,0
2,3468473862,0
3,4120853942,0
4,4436083254,0


In [14]:
data = pd.concat([df, label], axis = 1)
data.head()

Unnamed: 0,description,user_tags,title,license_name,user_nsid,image_extension_original,longitude,image_id,license_url,date_uploaded,date_taken,latitude,image_url,user_nickname,capture_device,image_id.1,train_y
0,,"[2009 road trip, obrero road trip]",Biltmore Estate,Attribution-NonCommercial-NoDerivs License,95156977@N00,jpg,,3519864665,http://creativecommons.org/licenses/by-nc-nd/2.0/,1242004112,2009-05-10 08:27:33.0,,http://www.flickr.com/photos/95156977@N00/3519...,5 Flip-Flops (Earl),Canon EOS DIGITAL REBEL XT,3519864665,0
1,,"[daulatabad, daulatabad fort, ellora, road trip]",Chand Minar,Attribution-ShareAlike License,24574470@N00,jpg,75.200386,4896119055,http://creativecommons.org/licenses/by-sa/2.0/,1281931224,2010-08-14 13:35:10.0,19.939383,http://www.flickr.com/photos/24574470@N00/4896...,sankarshan,NIKON CORPORATION NIKON D90,4896119055,0
2,"After the flood, the boarded up stores bear up...","[cedarrapids, createsouthroadtrip2009, disaste...",Uplifting Graffiti,Attribution License,73451168@N00,jpg,,3468473862,http://creativecommons.org/licenses/by/2.0/,1240493762,2009-04-21 18:07:56.0,,http://www.flickr.com/photos/73451168@N00/3468...,J Wynia,Panasonic DMC-TZ5,3468473862,0
3,,"[cork, enchente, flood, ireland, irlanda]",DSCF6487,Attribution-NonCommercial-NoDerivs License,12947023@N00,jpg,-8.621177,4120853942,http://creativecommons.org/licenses/by-nc-nd/2.0/,1258754762,2009-11-20 15:16:40.0,51.889603,http://www.flickr.com/photos/12947023@N00/4120...,guileite,FUJIFILM FinePix S6000fd,4120853942,0
4,,"[athens georgia, brown, current, flood, mud, r...",Oconoe river - flooded,Attribution License,60704492@N00,jpg,-83.368265,4436083254,http://creativecommons.org/licenses/by/2.0/,1268676971,2010-03-13 15:14:04.0,33.949149,http://www.flickr.com/photos/60704492@N00/4436...,The_Gut,Canon PowerShot SX10 IS,4436083254,0


In [26]:
def preprocess_user_tags(tags):
    if isinstance(tags, list):
        return ' '.join(tags)
    elif pd.isnull(tags):
        return '[NULL]'
    else:
        return tags

data['user_tags'] = data['user_tags'].apply(preprocess_user_tags)

In [28]:
data['text'] = data[['description', 'user_tags','title']].apply(lambda x: ' | '.join(x.dropna()), axis=1)
data['text']

0       2009 road trip obrero road trip | Biltmore Estate
1       daulatabad daulatabad fort ellora road trip | ...
2       After the flood, the boarded up stores bear up...
3          cork enchente flood ireland irlanda | DSCF6487
4       athens georgia brown current flood mud river s...
                              ...                        
5275    550d camino canon canoneos550d canoneoskissx4 ...
5276    albany, ny flood walk water | Albany's Corning...
5277                al the waters in pike road | IMG_4989
5278    2013 Fair Flood | 2013 county fair flood linn ...
5279    Alcatraz trip, San Francisco |  | Prison building
Name: text, Length: 5280, dtype: object

In [32]:
texts = data['text'].tolist()
labels = data['train_y'].tolist()

In [34]:
def encode_texts(texts, tokenizer, max_length):
    return tokenizer(texts, max_length=max_length, truncation=True, padding='max_length', return_tensors='tf')


In [136]:
encoded_inputs = encode_texts(texts, tokenizer, max_length=128)
dataset = tf.data.Dataset.from_tensor_slices((dict(encoded_inputs), labels))

{'input_ids': <tf.Tensor: shape=(5280, 128), dtype=int32, numpy=
array([[  101,  2268,  2346, ...,     0,     0,     0],
       [  101,  4830, 18060, ...,     0,     0,     0],
       [  101,  2044,  1996, ...,     0,     0,     0],
       ...,
       [  101,  2632,  1996, ...,     0,     0,     0],
       [  101,  2286,  4189, ...,     0,     0,     0],
       [  101,  2632, 11266, ...,     0,     0,     0]])>, 'token_type_ids': <tf.Tensor: shape=(5280, 128), dtype=int32, numpy=
array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])>, 'attention_mask': <tf.Tensor: shape=(5280, 128), dtype=int32, numpy=
array([[1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       ...,
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0]])>}

In [142]:
# Define input layers for both text and image inputs
text_input = Input(shape=(None, ), dtype=tf.int32, name='text_input')
image_input = Input(shape=(256, 256, 3), name='image_input')

In [144]:
# BERT model for text classification
bert_outputs = bert_model(text_input)  # Extract the last hidden state
# Extract the last hidden state
# BERT model for text classification
bert_logits = bert_outputs.logits
 # Take the CLS token output

In [121]:
# ResNet50V2 model for image classification
resnet_features = resnet_model(image_input)
resnet_features = GlobalAveragePooling2D()(resnet_features)

In [124]:
# Combine both models' outputs
combined = Concatenate()([bert_logits, resnet_features])
combined = Dropout(0.3)(combined)
combined = Dense(128, activation='relu')(combined)
combined = Dropout(0.3)(combined)
output = Dense(1, activation='sigmoid')(combined)

In [126]:
# Define the combined model
combined_model = Model(inputs=[text_input, image_input], outputs=output)


In [128]:
# Compile the model
combined_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [130]:
# Summary of the combined model
combined_model.summary()

Model: "model_3"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 image_input (InputLayer)    [(None, 256, 256, 3)]        0         []                            
                                                                                                  
 text_input (InputLayer)     [(None, None)]               0         []                            
                                                                                                  
 resnet50v2 (Functional)     (None, None, None, 2048)     2356480   ['image_input[0][0]']         
                                                          0                                       
                                                                                                  
 tf_bert_for_sequence_class  TFSequenceClassifierOutput   1094837   ['text_input[0][0]']    

In [148]:
combined_model.fit([dataset, images], epochs= 2, batch_size=32)

ValueError: Failed to find data adapter that can handle input: (<class 'list'> containing values of types {"<class 'numpy.ndarray'>", "<class 'tensorflow.python.data.ops.batch_op._BatchDataset'>"}), <class 'NoneType'>

In [None]:
def preprocess_text(text):
    tokens = tokenizer.encode_plus(
        text,
        max_length=128,
        truncation=True,
        padding='max_length',
        add_special_tokens=True,
        return_tensors='tf'
    )
    return tokens['input_ids'][0]  # Return tensor without batch dimension

In [None]:
def predict(text, image_path):
    # Preprocess inputs
    text_input_ids = preprocess_text(text)
    image_input_array = preprocess_image(image_path)
    
    # Add batch dimension to inputs
    text_input_ids = np.expand_dims(text_input_ids, axis=0)
    image_input_array = np.expand_dims(image_input_array, axis=0)
    
    # Make prediction
    prediction = combined_model.predict([text_input_ids, image_input_array])
    
    # Interpret result
    result = 'Flood' if prediction[0] > 0.5 else 'No Flood'
    return result

In [None]:
result = predict(text_example, image_example_path)
print(result)