In [137]:
import torch
from transformers import BertTokenizer, BertModel
from torchvision import models, transforms
from PIL import Image
import numpy as np
import os

In [139]:
import pandas as pd
import tensorflow as tf
from transformers import BertTokenizer, TFBertForSequenceClassification
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from tensorflow.keras.metrics import SparseCategoricalAccuracy
from sklearn.model_selection import train_test_split
from nltk.tokenize import sent_tokenize,word_tokenize
from nltk.corpus import stopwords
from string import punctuation
from string import punctuation
from nltk.corpus import stopwords, brown
import re
from nltk.stem import WordNetLemmatizer
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve
from sklearn.metrics import auc
import seaborn as sns
import matplotlib.pyplot as plt
import os
from transformers import BertTokenizer
import nltk
import re
import nltk
import pandas as pd
import numpy as np
import warnings
from sklearn.cluster import KMeans
from sklearn.model_selection import KFold
from sklearn.feature_extraction.text import CountVectorizer

In [141]:
# Tokenizer that matches the pre-trained BERT checkpoint used for the text branch
bert_model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(bert_model_name)

# Pre-trained torchvision ResNet50 with its last child module removed
resnet_model = torch.nn.Sequential(
    *list(models.resnet50(pretrained=True).children())[:-1]
)

# Function to preprocess text
def preprocess_text(text, tokenizer, max_length=128):
    """Encode ``text`` as a fixed-length tensor of token ids.

    ``tokenizer.encode_plus`` is asked to add special tokens, truncate and pad
    to ``max_length`` and return PyTorch tensors. The first row of the returned
    ``input_ids`` is handed back, i.e. the ids without the batch dimension.
    """
    encode_options = {
        'max_length': max_length,
        'truncation': True,
        'padding': 'max_length',
        'add_special_tokens': True,
        'return_tensors': 'pt',  # ask for PyTorch tensors
    }
    encoded = tokenizer.encode_plus(text, **encode_options)
    input_ids = encoded['input_ids']
    return input_ids[0]

from PIL import Image, ImageSequence
import torchvision.transforms as transforms

def preprocess_image(image_path, target_size=(256, 256)):
    """Load an image file and turn it into a normalized 3-channel tensor.

    Args:
        image_path: Path of the image file to open with PIL.
        target_size: Size passed to ``transforms.Resize``.

    Returns:
        The tensor produced by ``ToTensor`` followed by ``Normalize`` with the
        three-channel mean/std below. No batch dimension is added.
    """
    # The context manager closes the underlying file once the pixels are read.
    with Image.open(image_path) as source:
        # convert('RGB') guarantees exactly three channels, so grayscale, RGBA
        # and palette images no longer break the 3-channel Normalize below.
        # A freshly opened multi-frame image (e.g. an animated GIF) is
        # positioned on its first frame, so this also yields the first frame
        # without copying every frame.
        image = source.convert('RGB')

    transform = transforms.Compose([
        transforms.Resize(target_size),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ])
    return transform(image)




In [142]:
import torch.nn as nn

class CombinedModel(nn.Module):
    """Binary classifier that fuses a BERT text encoding with ResNet50 image features.

    The CLS-position hidden state from BERT is concatenated with the pooled,
    flattened ResNet50 features, passed through dropout -> Linear(…, 128) ->
    ReLU -> dropout -> Linear(128, 1) and squashed with a sigmoid.
    """

    def __init__(self, bert_model_name='bert-base-uncased'):
        super().__init__()
        self.bert = BertModel.from_pretrained(bert_model_name)

        resnet_model = models.resnet50(pretrained=True)
        self.resnet = nn.Sequential(
            *list(resnet_model.children())[:-2],  # Drop the last two child modules of ResNet50
            nn.AdaptiveAvgPool2d((1, 1)),         # Pool every feature map down to 1x1
            nn.Flatten()                          # Flatten to (batch_size, features)
        )

        bert_hidden_size = self.bert.config.hidden_size
        resnet_feature_size = 2048  # Fixed feature size for ResNet50

        self.fc1 = nn.Linear(bert_hidden_size + resnet_feature_size, 128)
        self.fc2 = nn.Linear(128, 1)
        self.dropout = nn.Dropout(0.3)

    def forward(self, text_input, image_input, attention_mask=None):
        """Return one sigmoid probability per sample, shaped (batch_size, 1).

        Args:
            text_input: Token-id tensor fed to BERT.
            image_input: Image tensor fed to the ResNet feature extractor.
            attention_mask: Optional mask for BERT. When omitted it is derived
                from ``text_input`` by masking positions equal to the model's
                configured pad token id, so padding added by the tokenizer is
                not attended to.
        """
        if attention_mask is None:
            pad_token_id = self.bert.config.pad_token_id
            if pad_token_id is not None:
                attention_mask = (text_input != pad_token_id).long()

        text_outputs = self.bert(text_input, attention_mask=attention_mask)
        text_cls_token = text_outputs.last_hidden_state[:, 0, :]  # CLS token

        image_features = self.resnet(image_input)

        combined_features = torch.cat((text_cls_token, image_features), dim=1)
        combined_features = self.dropout(combined_features)
        combined_features = torch.relu(self.fc1(combined_features))
        combined_features = self.dropout(combined_features)
        output = torch.sigmoid(self.fc2(combined_features))

        return output
    
# Function to predict using the combined model
def predict(texts, image_paths, tokenizer, model, threshold=0.9, batch_size=32):
    """Predict a binary label for every (text, image) pair.

    Args:
        texts: Sequence of input texts.
        image_paths: Sequence of image file paths, one per text.
        tokenizer: Tokenizer forwarded to ``preprocess_text``.
        model: Model called as ``model(text_ids, image_tensors)``; it is put in
            eval mode and moved to CUDA when available, otherwise to the CPU.
        threshold: A sample is labelled 1 when its model output is >= this value.
        batch_size: Number of samples preprocessed and scored per forward pass,
            so large inputs are not loaded into memory as a single batch.

    Returns:
        A list of 0/1 ints with one entry per input pair. This is always a
        list, including for a single pair, and is empty for empty input.

    Raises:
        ValueError: If ``texts`` and ``image_paths`` differ in length.
    """
    texts = list(texts)
    image_paths = list(image_paths)
    if len(texts) != len(image_paths):
        raise ValueError(
            f"texts and image_paths must have the same length "
            f"(got {len(texts)} and {len(image_paths)})"
        )

    # Set model to evaluation mode
    model.eval()

    if not texts:
        return []

    # Put the model on the device the inputs will be moved to
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = model.to(device)

    binary_predictions = []
    with torch.no_grad():
        for start in range(0, len(texts), batch_size):
            stop = start + batch_size

            # Preprocess only the current slice of inputs
            text_input_ids = torch.stack(
                [preprocess_text(text, tokenizer) for text in texts[start:stop]]
            ).to(device, dtype=torch.int64)
            image_input_tensors = torch.stack(
                [preprocess_image(image_path) for image_path in image_paths[start:stop]]
            ).to(device, dtype=torch.float)

            outputs = model(text_input_ids, image_input_tensors)

            # reshape(-1) keeps a 1-D result even for one sample, so tolist()
            # always yields a list rather than a bare int.
            binary_predictions.extend((outputs >= threshold).int().reshape(-1).tolist())

    return binary_predictions

In [143]:
from torch.utils.data import Dataset, DataLoader

class CustomDataset(Dataset):
    """Dataset yielding ``(token_ids, image_tensor, label)`` triples.

    Texts are encoded with ``preprocess_text`` and images are loaded with
    ``preprocess_image`` each time an item is requested. The dataset length is
    the number of labels.
    """

    def __init__(self, texts, image_paths, labels, tokenizer, max_length=128, target_size=(256, 256)):
        self.texts, self.image_paths, self.labels = texts, image_paths, labels
        self.tokenizer = tokenizer
        self.max_length, self.target_size = max_length, target_size

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        # Look up all three raw fields first, then run the preprocessing.
        raw_text, raw_image_path, target = self.texts[idx], self.image_paths[idx], self.labels[idx]

        encoded_text = preprocess_text(raw_text, self.tokenizer, self.max_length)
        image_tensor = preprocess_image(raw_image_path, self.target_size)

        return encoded_text, image_tensor, target
    
    

In [144]:
# Full path of every file in the dev-set image folder.
# NOTE(review): absolute, machine-specific path; consider making it configurable.
folder_path = "C:/Users/minhd/FPTU lab/DPL302m/Kaggle/2024-sum-dpl-302-m/devset_images/devset_images"
# os.listdir() returns entries in arbitrary order, so sort to make the list
# reproducible across runs and machines.
# NOTE(review): this order is still not tied to the metadata/label rows; confirm
# the images are matched to rows by image_id before they are paired with texts.
image_path = [folder_path + "/" + filename for filename in sorted(os.listdir(folder_path))]

In [148]:
import pandas as pd

# Read the JSON metadata file and flatten the records stored under its
# 'images' column into a DataFrame with one column per field
df = pd.json_normalize(pd.read_json('devset_images_metadata.json')['images'])

# Show the first rows of the DataFrame
df.head()


Unnamed: 0,description,user_tags,title,license_name,user_nsid,image_extension_original,longitude,image_id,license_url,date_uploaded,date_taken,latitude,image_url,user_nickname,capture_device
0,,"[2009 road trip, obrero road trip]",Biltmore Estate,Attribution-NonCommercial-NoDerivs License,95156977@N00,jpg,,3519864665,http://creativecommons.org/licenses/by-nc-nd/2.0/,1242004112,2009-05-10 08:27:33.0,,http://www.flickr.com/photos/95156977@N00/3519...,5 Flip-Flops (Earl),Canon EOS DIGITAL REBEL XT
1,,"[daulatabad, daulatabad fort, ellora, road trip]",Chand Minar,Attribution-ShareAlike License,24574470@N00,jpg,75.200386,4896119055,http://creativecommons.org/licenses/by-sa/2.0/,1281931224,2010-08-14 13:35:10.0,19.939383,http://www.flickr.com/photos/24574470@N00/4896...,sankarshan,NIKON CORPORATION NIKON D90
2,"After the flood, the boarded up stores bear up...","[cedarrapids, createsouthroadtrip2009, disaste...",Uplifting Graffiti,Attribution License,73451168@N00,jpg,,3468473862,http://creativecommons.org/licenses/by/2.0/,1240493762,2009-04-21 18:07:56.0,,http://www.flickr.com/photos/73451168@N00/3468...,J Wynia,Panasonic DMC-TZ5
3,,"[cork, enchente, flood, ireland, irlanda]",DSCF6487,Attribution-NonCommercial-NoDerivs License,12947023@N00,jpg,-8.621177,4120853942,http://creativecommons.org/licenses/by-nc-nd/2.0/,1258754762,2009-11-20 15:16:40.0,51.889603,http://www.flickr.com/photos/12947023@N00/4120...,guileite,FUJIFILM FinePix S6000fd
4,,"[athens georgia, brown, current, flood, mud, r...",Oconoe river - flooded,Attribution License,60704492@N00,jpg,-83.368265,4436083254,http://creativecommons.org/licenses/by/2.0/,1268676971,2010-03-13 15:14:04.0,33.949149,http://www.flickr.com/photos/60704492@N00/4436...,The_Gut,Canon PowerShot SX10 IS


In [150]:
# Ground-truth labels, renamed so the key column matches the metadata frame.
train_label = pd.read_csv('devset_images_gt.csv').rename(
    columns={'id': 'image_id', 'label': 'train_y'}
)

# Attach each label to its metadata row by image_id. A positional
# concat(axis=1) would pair rows purely by index and leave two 'image_id'
# columns. Keys are compared as strings so an int/str dtype difference between
# the JSON and the CSV cannot break the join. how='left' keeps every metadata
# row in its original order; validate guards against duplicated label ids
# silently multiplying rows.
data = df.assign(image_id=df['image_id'].astype(str)).merge(
    train_label.assign(image_id=train_label['image_id'].astype(str)),
    on='image_id',
    how='left',
    validate='many_to_one',
)

In [152]:
def preprocess_user_tags(tags):
    """Normalize a ``user_tags`` cell to a single value.

    A list is joined with single spaces, a null value becomes the literal
    ``'[NULL]'``, and anything else is returned unchanged.
    """
    if isinstance(tags, list):
        return ' '.join(tags)
    return '[NULL]' if pd.isnull(tags) else tags

# Replace every user_tags cell with its normalized form
data['user_tags'] = data['user_tags'].map(preprocess_user_tags)

In [154]:
# Join the non-missing description, tags and title of each row with ' | '
data['text'] = data[['description', 'user_tags', 'title']].apply(
    lambda row: ' | '.join(row.dropna()),
    axis=1,
)

In [183]:
# Plain Python lists of the combined texts and their training labels
texts, labels = data['text'].tolist(), data['train_y'].tolist()

In [185]:
from autocorrect import Speller
from nltk.tokenize import word_tokenize
import nltk
import re

In [189]:
# Download the NLTK 'punkt' and 'stopwords' packages
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\minhd\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\minhd\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [193]:
# Pair every metadata row with its own image. `image_path` comes from a
# directory listing, whose order is unrelated to the row order of `df`.
# NOTE(review): assumes image files are named "<image_id>.<ext>"; confirm
# against the image folder.
_path_by_stem = {os.path.splitext(os.path.basename(p))[0]: p for p in image_path}
_row_ids = [str(image_id) for image_id in df['image_id']]
if all(image_id in _path_by_stem for image_id in _row_ids):
    train_image_paths = [_path_by_stem[image_id] for image_id in _row_ids]
else:
    warnings.warn(
        "Could not match every image_id to a file name; falling back to "
        "directory order, so images may not correspond to their texts/labels."
    )
    train_image_paths = list(image_path)

# CustomDataset takes its length from the labels only, so a mismatch would
# otherwise surface late (IndexError) or never (silently ignored files).
if not (len(texts) == len(train_image_paths) == len(labels)):
    raise ValueError(
        f"Mismatched inputs: {len(texts)} texts, "
        f"{len(train_image_paths)} images, {len(labels)} labels"
    )

dataset = CustomDataset(texts, train_image_paths, labels, tokenizer)
# Shuffle so each training epoch sees the samples in a different order.
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

In [195]:
# Initialize the model, loss function, and optimizer
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = CombinedModel().to(device)  # move first so the optimizer tracks the on-device parameters
criterion = nn.BCELoss()            # the model already applies a sigmoid
optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)

# Training loop with loss and accuracy calculation
num_epochs = 10
model.train()

for epoch in range(num_epochs):
    epoch_loss = 0.0
    correct = 0
    total = 0

    # `batch_labels` (not `labels`) so the module-level `labels` list is not overwritten
    for text_inputs, image_inputs, batch_labels in dataloader:
        text_inputs = text_inputs.to(device)
        image_inputs = image_inputs.to(device)
        # Flatten to (batch,) floats: BCELoss needs float targets shaped like the outputs
        batch_labels = batch_labels.to(device).float().reshape(-1)

        # Zero the parameter gradients
        optimizer.zero_grad()

        # Forward pass; flatten (batch, 1) -> (batch,) so the comparisons below
        # are element-wise instead of broadcasting to (batch, batch)
        outputs = model(text_inputs, image_inputs).reshape(-1)

        # Backward pass and parameter update
        loss = criterion(outputs, batch_labels)
        loss.backward()
        optimizer.step()

        # Accumulate loss (weighted by batch size) and accuracy counts
        batch_size = batch_labels.size(0)
        epoch_loss += loss.item() * batch_size
        predicted = (outputs >= 0.5).float()
        total += batch_size
        correct += (predicted == batch_labels).sum().item()

    # max(total, 1) avoids a ZeroDivisionError when the loader is empty
    mean_loss = epoch_loss / max(total, 1)
    accuracy = 100 * correct / max(total, 1)
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {mean_loss:.4f}, Accuracy: {accuracy:.2f}%')

print("Training complete.")




KeyboardInterrupt: 

In [None]:
test_df = pd.read_csv('test.csv')

# Build the test texts the same way as the training texts: normalize the
# user_tags column first (nulls become '[NULL]'), then join the non-missing
# description, tags and title with ' | '.
test_df['user_tags'] = test_df['user_tags'].apply(preprocess_user_tags)
test_df['text'] = test_df[['description', 'user_tags', 'title']].apply(lambda x: ' | '.join(x.dropna()), axis=1)

# NOTE(review): the previous version also passed every text through
# clean_text(), which is not defined in the visible notebook and is not applied
# to the training texts. It is dropped here so train and test inputs are
# prepared identically; reinstate it on both sides if cleaning is wanted.
test_texts = test_df['text'].to_list()

In [None]:
# NOTE(review): absolute, machine-specific path; consider making it configurable.
test_folder_path = "C:/Users/minhd/FPTU lab/DPL302m/Kaggle/2024-sum-dpl-302-m/testset_images/testset_images"

# os.listdir() returns entries in arbitrary order; sort for reproducibility.
_test_files = sorted(os.listdir(test_folder_path))

# The predictions are later paired with test_df['image_id'] row by row, so put
# the image paths in the same order as test_df.
# NOTE(review): assumes image files are named "<image_id>.<ext>"; confirm
# against the image folder.
_test_path_by_stem = {
    os.path.splitext(filename)[0]: test_folder_path + "/" + filename
    for filename in _test_files
}
_test_ids = [str(image_id) for image_id in test_df['image_id']]
if all(image_id in _test_path_by_stem for image_id in _test_ids):
    test_image_path = [_test_path_by_stem[image_id] for image_id in _test_ids]
else:
    warnings.warn(
        "Could not match every test image_id to a file name; falling back to "
        "directory order, so predictions may not line up with test_df rows."
    )
    test_image_path = [test_folder_path + "/" + filename for filename in _test_files]

In [None]:
# Score every test (text, image) pair with the trained model and print the labels
predictions = predict(
    texts=test_texts,
    image_paths=test_image_path,
    tokenizer=tokenizer,
    model=model,
)
print(predictions)

In [123]:
# Submission frame: the test image ids in an 'id' column next to their predicted 'label'
submit = pd.DataFrame({'id': test_df['image_id']}).assign(label=predictions)
submit.head()

Unnamed: 0,id,label
0,3483809003,0
1,3712805295,0
2,379845620,0
3,7343264988,1
4,3843337492,1


In [124]:
# Save the submission as CSV, leaving out the DataFrame index
results_csv_path = 'Allin.csv'
submit.to_csv(path_or_buf=results_csv_path, index=False)