In [1]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Embedding, GlobalMaxPooling1D, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from gensim.models import Word2Vec





In [2]:
import pandas as pd

# Đọc trực tiếp file JSON thành DataFrame
df = pd.read_json('devset_images_metadata.json')

# Nếu cần chuyển đổi từ cột chứa danh sách các đối tượng thành các cột DataFrame
df = pd.json_normalize(df['images'])

# Hiển thị DataFrame
df.head()


Unnamed: 0,description,user_tags,title,license_name,user_nsid,image_extension_original,longitude,image_id,license_url,date_uploaded,date_taken,latitude,image_url,user_nickname,capture_device
0,,"[2009 road trip, obrero road trip]",Biltmore Estate,Attribution-NonCommercial-NoDerivs License,95156977@N00,jpg,,3519864665,http://creativecommons.org/licenses/by-nc-nd/2.0/,1242004112,2009-05-10 08:27:33.0,,http://www.flickr.com/photos/95156977@N00/3519...,5 Flip-Flops (Earl),Canon EOS DIGITAL REBEL XT
1,,"[daulatabad, daulatabad fort, ellora, road trip]",Chand Minar,Attribution-ShareAlike License,24574470@N00,jpg,75.200386,4896119055,http://creativecommons.org/licenses/by-sa/2.0/,1281931224,2010-08-14 13:35:10.0,19.939383,http://www.flickr.com/photos/24574470@N00/4896...,sankarshan,NIKON CORPORATION NIKON D90
2,"After the flood, the boarded up stores bear up...","[cedarrapids, createsouthroadtrip2009, disaste...",Uplifting Graffiti,Attribution License,73451168@N00,jpg,,3468473862,http://creativecommons.org/licenses/by/2.0/,1240493762,2009-04-21 18:07:56.0,,http://www.flickr.com/photos/73451168@N00/3468...,J Wynia,Panasonic DMC-TZ5
3,,"[cork, enchente, flood, ireland, irlanda]",DSCF6487,Attribution-NonCommercial-NoDerivs License,12947023@N00,jpg,-8.621177,4120853942,http://creativecommons.org/licenses/by-nc-nd/2.0/,1258754762,2009-11-20 15:16:40.0,51.889603,http://www.flickr.com/photos/12947023@N00/4120...,guileite,FUJIFILM FinePix S6000fd
4,,"[athens georgia, brown, current, flood, mud, r...",Oconoe river - flooded,Attribution License,60704492@N00,jpg,-83.368265,4436083254,http://creativecommons.org/licenses/by/2.0/,1268676971,2010-03-13 15:14:04.0,33.949149,http://www.flickr.com/photos/60704492@N00/4436...,The_Gut,Canon PowerShot SX10 IS


In [3]:
train_label = pd.read_csv('devset_images_gt.csv')
train_label.head()

Unnamed: 0,id,label
0,3519864665,0
1,4896119055,0
2,3468473862,0
3,4120853942,0
4,4436083254,0


In [4]:
train_label.rename(columns = {'id': 'image_id', 'label': 'train_y'}, inplace = True)
train_label.head()

Unnamed: 0,image_id,train_y
0,3519864665,0
1,4896119055,0
2,3468473862,0
3,4120853942,0
4,4436083254,0


In [5]:
data = pd.concat([df, train_label], axis = 1)
data.head()

Unnamed: 0,description,user_tags,title,license_name,user_nsid,image_extension_original,longitude,image_id,license_url,date_uploaded,date_taken,latitude,image_url,user_nickname,capture_device,image_id.1,train_y
0,,"[2009 road trip, obrero road trip]",Biltmore Estate,Attribution-NonCommercial-NoDerivs License,95156977@N00,jpg,,3519864665,http://creativecommons.org/licenses/by-nc-nd/2.0/,1242004112,2009-05-10 08:27:33.0,,http://www.flickr.com/photos/95156977@N00/3519...,5 Flip-Flops (Earl),Canon EOS DIGITAL REBEL XT,3519864665,0
1,,"[daulatabad, daulatabad fort, ellora, road trip]",Chand Minar,Attribution-ShareAlike License,24574470@N00,jpg,75.200386,4896119055,http://creativecommons.org/licenses/by-sa/2.0/,1281931224,2010-08-14 13:35:10.0,19.939383,http://www.flickr.com/photos/24574470@N00/4896...,sankarshan,NIKON CORPORATION NIKON D90,4896119055,0
2,"After the flood, the boarded up stores bear up...","[cedarrapids, createsouthroadtrip2009, disaste...",Uplifting Graffiti,Attribution License,73451168@N00,jpg,,3468473862,http://creativecommons.org/licenses/by/2.0/,1240493762,2009-04-21 18:07:56.0,,http://www.flickr.com/photos/73451168@N00/3468...,J Wynia,Panasonic DMC-TZ5,3468473862,0
3,,"[cork, enchente, flood, ireland, irlanda]",DSCF6487,Attribution-NonCommercial-NoDerivs License,12947023@N00,jpg,-8.621177,4120853942,http://creativecommons.org/licenses/by-nc-nd/2.0/,1258754762,2009-11-20 15:16:40.0,51.889603,http://www.flickr.com/photos/12947023@N00/4120...,guileite,FUJIFILM FinePix S6000fd,4120853942,0
4,,"[athens georgia, brown, current, flood, mud, r...",Oconoe river - flooded,Attribution License,60704492@N00,jpg,-83.368265,4436083254,http://creativecommons.org/licenses/by/2.0/,1268676971,2010-03-13 15:14:04.0,33.949149,http://www.flickr.com/photos/60704492@N00/4436...,The_Gut,Canon PowerShot SX10 IS,4436083254,0


In [6]:
def preprocess_user_tags(tags):
    if isinstance(tags, list):
        return ' '.join(tags)
    elif pd.isnull(tags):
        return '[NULL]'
    else:
        return tags

data['user_tags'] = data['user_tags'].apply(preprocess_user_tags)

In [7]:
data['text'] = data[['description', 'user_tags','title']].apply(lambda x: ' | '.join(x.dropna()), axis=1)
data['text']

0       2009 road trip obrero road trip | Biltmore Estate
1       daulatabad daulatabad fort ellora road trip | ...
2       After the flood, the boarded up stores bear up...
3          cork enchente flood ireland irlanda | DSCF6487
4       athens georgia brown current flood mud river s...
                              ...                        
5275    550d camino canon canoneos550d canoneoskissx4 ...
5276    albany, ny flood walk water | Albany's Corning...
5277                al the waters in pike road | IMG_4989
5278    2013 Fair Flood | 2013 county fair flood linn ...
5279    Alcatraz trip, San Francisco |  | Prison building
Name: text, Length: 5280, dtype: object

In [8]:
texts = data['text'].tolist()
labels = data['train_y'].tolist()  # Replace 'label_column_name' with the actual column name

In [9]:
# Hyperparameters
vocab_size = 10000
embedding_dim = 100
max_length = 20
padding_type = 'post'
truncating_type = 'post'

# Tokenize the text
tokenizer = Tokenizer(num_words=vocab_size, oov_token="<OOV>")
tokenizer.fit_on_texts(texts)
word_index = tokenizer.word_index

# Convert texts to sequences
sequences = tokenizer.texts_to_sequences(texts)
padded_sequences = pad_sequences(sequences, maxlen=max_length, padding=padding_type, truncating=truncating_type)


In [10]:
# Train Word2Vec model
w2v_model = Word2Vec(sentences=[text.split() for text in texts], vector_size=embedding_dim, window=5, min_count=1, workers=4)
w2v_model.save("word2vec.model")

# Create an embedding matrix
embedding_matrix = np.zeros((vocab_size, embedding_dim))
for word, i in word_index.items():
    if i < vocab_size:
        try:
            embedding_vector = w2v_model.wv[word]
            embedding_matrix[i] = embedding_vector
        except KeyError:
            continue


In [11]:
# Build the model
model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=embedding_dim, weights=[embedding_matrix], input_length=max_length, trainable=False),
    GlobalMaxPooling1D(),
    Dense(512, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid')
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.summary()



Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 20, 100)           1000000   
                                                                 
 global_max_pooling1d (Glob  (None, 100)               0         
 alMaxPooling1D)                                                 
                                                                 
 dense (Dense)               (None, 512)               51712     
                                                                 
 dropout (Dropout)           (None, 512)               0         
                                                                 
 dense_1 (Dense)             (None, 1)                 513       
                                                                 
Total params: 1052225 (4.01 MB)
Trainable params: 52225 (204.00 KB)
Non-trainable params: 1000000 (3.81 MB)
____________

In [12]:
# Convert labels to numpy array
labels = np.array(labels)

# Train the model
history = model.fit(padded_sequences, labels, epochs=10, validation_split=0.2)

Epoch 1/10


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [37]:
def preprocess_text(text, tokenizer, max_length, padding_type='post', truncating_type='post'):
    # Convert text to sequences
    sequences = tokenizer.texts_to_sequences([text])
    # Pad the sequences
    padded_sequence = pad_sequences(sequences, maxlen=max_length, padding=padding_type, truncating=truncating_type)
    return padded_sequence


def predict(text, model, tokenizer, max_length):
    # Preprocess the text
    padded_sequence = preprocess_text(text, tokenizer, max_length)
    # Make prediction
    prediction = model.predict(padded_sequence)
    # Convert probability to class label (0 or 1)
    return prediction

In [39]:
test_df = pd.read_csv('test.csv')
test_df

Unnamed: 0,image_id,title,description,user_tags
0,3483809003,"Flooded Parking Lot At Emily Fowler Library, A...",Denton Creek overflows its banks and floods Oa...,"project, slis 5715, spring 2009"
1,3712805295,L'arc de Barà / The roman arch of Barà,Sembla que fou dedicat a August entorn l'any 1...,"arc, arc_de_berà, arch, archaeology, arco, arq..."
2,379845620,Highest point over the sea level that is reach...,,
3,7343264988,Lagos after the rains,"After heavy rain, Lagos (Nigeria) was still fl...","africa, lagos, nigeria"
4,3843337492,flooded Corley Ave,also a local black out due to the tree branch ...,"flood, storm, toronto"
...,...,...,...,...
1315,6452132743,Landscapes and cityscapes: daily dose of ugliness,You can find more like this in my Landscapes a...,"bouw, bus station, commuting, construction, da..."
1316,244899140,A cool looking building the the Parque Central,,"guatemala, xela"
1317,3073018258,Just Plain Wet,I liked the reflection of the trees on the wet...,"driving, fall, me, november, photographing, ra..."
1318,49525361,... Palmer,Looking up toward Magnolia. Our neighbor's hou...,"hurricane katrina, new orleans, post katrina"


In [41]:
test_df['text'] = test_df[['description', 'user_tags','title']].apply(lambda x: ' | '.join(x.dropna()), axis=1)

In [42]:
test_df['text'] = test_df['text'].apply(preprocess_user_tags)
test_texts = test_df['text'].to_list()

In [45]:
predicted_classes = predict(test_texts, model, tokenizer, max_length)
print(predicted_classes)

[[0.44359463]]
