In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Folder of the Planet dataset that the rest of the notebook joins file and
# directory names onto.
path = "../input/planets-dataset/planet/planet"
# Last expression of the cell: displays the entries found in that folder.
os.listdir(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

['sample_submission.csv', 'test-jpg', 'train_classes.csv', 'train-jpg']

In [2]:
import os
import cv2
import pandas as pd
import numpy as np
import gc
import matplotlib.pyplot as plt
from matplotlib.image import imread
from sklearn.metrics import fbeta_score
from tqdm import tqdm
import tensorflow as tf
from keras import optimizers
from tensorflow.keras.models import Sequential 
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.layers import Input , Dense , Dropout , Flatten,\
Conv2D,MaxPooling2D , BatchNormalization
from keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau, History
import tensorflow_addons as tfa

In [3]:
# Locations of the label CSV, the sample submission and the two image folders.
train_paths = os.path.join(path,"train_classes.csv")
train_dir = os.path.join(path,'train-jpg')
# Keep the file path under its own name: `submission_df` should only ever
# refer to the loaded DataFrame, never to a path string.
submission_path = os.path.join(path,'sample_submission.csv')
test_dir = os.path.join(path,'test-jpg')

train_df = pd.read_csv(train_paths)
submission_df = pd.read_csv(submission_path)
train_df.head()

Unnamed: 0,image_name,tags
0,train_0,haze primary
1,train_1,agriculture clear primary water
2,train_2,clear primary
3,train_3,clear primary
4,train_4,agriculture clear habitation primary road


In [4]:
def data_preprocess(data):
    """One-hot encode the ``tags`` column and add ``.jpg`` to the image names.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``tags`` column of space-separated label strings and an
        ``image_name`` column of strings.

    Returns
    -------
    pandas.DataFrame
        A copy of ``data`` with one 0/1 column per distinct tag, appended in
        order of first appearance, and with every ``image_name`` ending in
        ``.jpg``. The input frame is not modified, and calling the function on
        its own output leaves the result unchanged, so the calling cell can be
        re-run safely.
    """
    # Work on a copy so the caller's frame keeps its raw contents.
    data = data.copy()

    # Split every row once and reuse the token lists for both passes below.
    tag_lists = data['tags'].apply(lambda text: text.split())

    # dict keys keep insertion order, so this yields the distinct tags in
    # first-seen order with constant-time membership checks.
    set_labels = list(dict.fromkeys(tag for tags in tag_lists for tag in tags))

    # Encoding classes (targets): exact token match, so e.g. "cloudy" is not
    # triggered by "partly_cloudy".
    for set_label in set_labels:
        data[set_label] = tag_lists.apply(
            lambda tags, wanted=set_label: 1 if wanted in tags else 0
        )

    # Add the file extension needed to locate the images, but only where it is
    # still missing (avoids "name.jpg.jpg" on repeated calls).
    names = data['image_name']
    data['image_name'] = names.where(names.str.endswith('.jpg'), names + '.jpg')
    return data


In [5]:
# Expand the space-separated tags into one 0/1 column per label and give the
# image names their ".jpg" extension.
df_train = data_preprocess(train_df)

def img_generator(training=True,batch_size=128,target_size=(128,128)):
    """Create Keras image iterators for the training or the test images.

    Parameters
    ----------
    training : bool
        When True, return a ``(train_gen, val_gen)`` pair built from the
        notebook-level ``df_train`` / ``train_dir``. When False, return a
        single unshuffled, label-free iterator built from the notebook-level
        ``test_df`` / ``test_dir`` (``test_df`` must exist before the call).
    batch_size : int
        Number of images per batch.
    target_size : tuple of int
        Size the images are resized to when loaded.

    Returns
    -------
    tuple or iterator
        ``(train_gen, val_gen)`` when ``training`` is True, otherwise
        ``test_gen``.
    """
    if training:
        # Pixel values are scaled to [0, 1]; validation_split=0.2 reserves
        # 20 percent of the rows for the "validation" subset.
        train_generator = ImageDataGenerator(rescale = 1./255, validation_split = 0.2)

        # Both subsets share every setting except `subset`. The targets are
        # all columns after image_name and tags.
        shared_kwargs = dict(dataframe=df_train,
                             directory=train_dir,
                             x_col="image_name",
                             y_col=df_train.columns.to_list()[2:],
                             batch_size=batch_size,
                             seed=42,
                             shuffle=True,
                             class_mode="raw",
                             target_size=target_size)
        train_gen = train_generator.flow_from_dataframe(subset="training", **shared_kwargs)
        val_gen = train_generator.flow_from_dataframe(subset="validation", **shared_kwargs)
        return train_gen, val_gen

    test_generator = ImageDataGenerator(rescale = 1./255)
    # The notebook's test frame is named `test_df`; the previous reference to
    # `df_test` raised NameError. shuffle=False keeps predictions in file order.
    test_gen = test_generator.flow_from_dataframe(dataframe=test_df,
                                                  directory=test_dir,
                                                  x_col="image_name",
                                                  y_col=None,
                                                  batch_size=batch_size,
                                                  shuffle=False,
                                                  class_mode=None,
                                                  target_size=target_size)
    return test_gen

In [6]:
# Build the training and validation iterators with the function's defaults
# (batch size 128, images resized to 128x128).
train, val = img_generator()

Found 32384 validated image filenames.
Found 8095 validated image filenames.


Model architecture and hyperparameters

In [7]:
model = Sequential()

# Input layer: batch normalisation applied directly to the 128x128 RGB input.
model.add(BatchNormalization(input_shape=(128, 128, 3)))

# Four convolutional stages of increasing width. Each stage is a 'same'-padded
# 3x3 convolution, an unpadded 3x3 convolution, 2x2 max pooling and 20% dropout.
for n_filters in (32, 64, 128, 256):
    model.add(Conv2D(n_filters, kernel_size=(3, 3), padding='same', activation='relu'))
    model.add(Conv2D(n_filters, kernel_size=(3, 3), activation='relu'))
    model.add(MaxPooling2D(pool_size=(2, 2)))
    model.add(Dropout(0.2))

# Classifier head: flatten, one 512-unit hidden layer with 50% dropout.
model.add(Flatten())
model.add(Dense(512, activation='relu'))
model.add(Dropout(0.5))

# Output layer: 17 independent sigmoid units (multi-label prediction).
model.add(Dense(17, activation='sigmoid'))

2023-01-16 18:28:11.278002: I tensorflow/core/common_runtime/process_util.cc:146] Creating new thread pool with default inter op setting: 2. Tune using inter_op_parallelism_threads for best performance.


In [8]:
# Number of full batches per epoch for each iterator. Floor division means a
# trailing partial batch is not counted.
train_steps = train.n // train.batch_size
# Use the validation iterator's own batch size rather than the training one,
# so the count stays right if the two are ever configured differently.
val_steps = val.n // val.batch_size

In [9]:
def fbeta_score(y_true, y_pred, beta = 2, epsilon = 1e-4):
    """Per-sample F-beta score for multi-label predictions thresholded at 0.5.

    Note: this definition replaces the ``fbeta_score`` name imported from
    ``sklearn.metrics`` earlier in the notebook.

    Parameters
    ----------
    y_true : tensor
        Ground-truth labels; cast to float32.
    y_pred : tensor
        Predicted scores; values strictly greater than 0.5 count as positive.
    beta : number
        Weight of recall relative to precision.
    epsilon : float
        Small constant added to each denominator to avoid division by zero.

    Returns
    -------
    tensor
        One F-beta value per row (sums are taken along axis 1).
    """
    truth = tf.cast(y_true, tf.float32)
    scores = tf.cast(y_pred, tf.float32)
    hard_pred = tf.cast(tf.greater(scores, tf.constant(0.5)), tf.float32)

    # Per-row confusion counts.
    true_pos = tf.reduce_sum(truth * hard_pred, axis = 1)
    false_pos = tf.reduce_sum(hard_pred, axis = 1) - true_pos
    false_neg = tf.reduce_sum(truth, axis = 1) - true_pos

    precision = true_pos / (true_pos + false_pos + epsilon)
    recall = true_pos / (true_pos + false_neg + epsilon)

    b2 = beta**2
    numerator = (1 + b2) * precision * recall
    denominator = b2 * precision + recall + epsilon
    return numerator / denominator

def accuracy(y_true, y_pred, epsilon = 1e-4):
    """Per-sample label accuracy for multi-label predictions thresholded at 0.5.

    Parameters
    ----------
    y_true : tensor
        Ground-truth labels; cast to float32.
    y_pred : tensor
        Predicted scores; values strictly greater than 0.5 count as positive.
    epsilon : float
        Small constant added to the denominator to avoid division by zero.

    Returns
    -------
    tensor
        ``(tp + tn) / (tp + tn + fp + fn + epsilon)`` for each row (sums are
        taken along axis 1).
    """
    truth = tf.cast(y_true, tf.float32)
    hard_pred = tf.cast(tf.greater(tf.cast(y_pred, tf.float32), tf.constant(0.5)), tf.float32)

    # Per-row confusion counts.
    true_pos = tf.reduce_sum(truth * hard_pred, axis = 1)
    false_pos = tf.reduce_sum(hard_pred, axis = 1) - true_pos
    false_neg = tf.reduce_sum(truth, axis = 1) - true_pos

    # True negatives: positions where both the label and the prediction are off.
    truth_off = tf.cast(tf.logical_not(tf.cast(truth, tf.bool)), tf.float32)
    pred_off = tf.cast(tf.logical_not(tf.cast(hard_pred, tf.bool)), tf.float32)
    true_neg = tf.reduce_sum(truth_off * pred_off, axis = 1)

    correct = true_pos + true_neg
    return correct / (correct + false_pos + false_neg + epsilon)

In [10]:
# Early stopping: end training once val_loss has failed to improve for two
# consecutive epochs, and roll the model back to its best weights.
callback = EarlyStopping(monitor="val_loss",
                         patience=2,
                         verbose=0,
                         baseline=None,
                         restore_best_weights=True)

# Multi-label setup: binary cross-entropy loss with the two custom metrics
# defined in this notebook.
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=[fbeta_score,accuracy])

In [11]:
# Train for at most 5 epochs, validating after each one; the early-stopping
# callback may end the run sooner.
history = model.fit(
    x=train,
    validation_data=val,
    epochs=5,
    steps_per_epoch=train_steps,
    validation_steps=val_steps,
    callbacks=[callback],
)

2023-01-16 18:28:12.573791: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [12]:
# Classifying images
# The sample submission lists every image to predict. Its first 40669 rows are
# taken here as the names for the test-jpg folder; the remaining rows are
# handled separately further down.
submission_df = pd.read_csv(os.path.join(path,'sample_submission.csv'))
test_df = submission_df.iloc[:40669][['image_name']].reset_index(drop=True)

# Build the filenames from test_df's own column instead of from the full
# submission_df, so the result does not rely on implicit index alignment.
test_df['image_name'] = test_df['image_name'].apply(lambda x: x+'.jpg')
test_df.head()

Unnamed: 0,image_name
0,test_0.jpg
1,test_1.jpg
2,test_2.jpg
3,test_3.jpg
4,test_4.jpg


In [13]:
# Test-time iterator: only rescaling, no labels, and no shuffling so that the
# predictions stay in the same order as test_gen.filenames.
test_generator = ImageDataGenerator(rescale = 1./255)

test_gen = test_generator.flow_from_dataframe(dataframe=test_df,
                                            directory = os.path.join(path,'test-jpg'),  
                                            x_col="image_name", 
                                            y_col=None, 
                                            batch_size=67,
                                            shuffle=False,
                                            class_mode=None, 
                                            target_size=(128,128))

# Ceiling division: count a trailing partial batch too, so every image gets a
# prediction even if the batch size does not divide the number of files.
# (Identical to the floor value whenever the division is exact.)
test_size = -(-test_gen.n // test_gen.batch_size)

Found 40669 validated image filenames.


In [14]:
# Score the test images batch by batch; rows of `pred` follow the iterator's
# (unshuffled) file order.
pred = model.predict(
    test_gen,
    steps=test_size,
    verbose=1,
)



In [15]:
# Distinct tag names in order of first appearance. dict keys keep insertion
# order, which avoids the repeated list-membership scans.
labels = df_train['tags'].to_numpy()
unique_labels = list(dict.fromkeys(tag for label in labels for tag in label.split()))

# The model's output columns follow the y_col order used for training
# (every df_train column after image_name and tags). Fail loudly if the names
# used to decode predictions are not in exactly that order.
assert unique_labels == df_train.columns.to_list()[2:], \
    "label order differs from the training target columns"

In [16]:
# Turn each row of predicted scores into a space-separated tag string: every
# label whose score is at least 0.2 is kept.
label_lookup = np.array(unique_labels)
test_result = pd.DataFrame(pred).apply(
    lambda row: ' '.join(label_lookup[row >= 0.2]),
    axis=1,
)

# Pair the tag strings with the filenames in the iterator's order.
test_names = test_gen.filenames
test_result_df = pd.DataFrame({'image_name': test_names, 'tags': test_result})
test_result_df.head()

Unnamed: 0,image_name,tags
0,test_0.jpg,primary clear
1,test_1.jpg,primary clear
2,test_2.jpg,primary partly_cloudy
3,test_3.jpg,primary clear
4,test_4.jpg,primary partly_cloudy


In [17]:
# Rows from position 40669 onwards of the sample submission, re-indexed from 0
# and reduced to the image_name column.
additional_df = submission_df.iloc[40669:][['image_name']].reset_index(drop=True)

In [18]:
# Append the extension only where it is still missing, so re-running this cell
# does not turn "name.jpg" into "name.jpg.jpg".
additional_image_names = additional_df['image_name'].astype(str)
additional_df['image_name'] = additional_image_names.where(
    additional_image_names.str.endswith('.jpg'),
    additional_image_names + '.jpg',
)
additional_df.head()

Unnamed: 0,image_name
0,file_0.jpg
1,file_1.jpg
2,file_10.jpg
3,file_100.jpg
4,file_1000.jpg


In [19]:
# Iterator over the additional test images, reusing the rescale-only
# `test_generator`; unshuffled so predictions match additional_gen.filenames.
additional_gen = test_generator.flow_from_dataframe(dataframe=additional_df,
                                                  directory="../input/planets-dataset/test-jpg-additional/test-jpg-additional", 
                                                  x_col='image_name',
                                                  y_col = None,
                                                  batch_size=62,
                                                  shuffle=False,
                                                  class_mode=None, 
                                                  target_size=(128, 128))


# Ceiling division: count a trailing partial batch too, so every image gets a
# prediction even if the batch size does not divide the number of files.
# (Identical to the floor value whenever the division is exact.)
additional_step = -(-additional_gen.n // additional_gen.batch_size)

Found 20522 validated image filenames.


In [20]:
# Score the additional images batch by batch; rows of `pred_additional` follow
# the iterator's (unshuffled) file order.
pred_additional = model.predict(
    additional_gen,
    steps=additional_step,
    verbose=1,
)



In [21]:
# Turn each row of predicted scores into a space-separated tag string: every
# label whose score is at least 0.2 is kept.
label_lookup = np.array(unique_labels)
additional_result = pd.DataFrame(pred_additional).apply(
    lambda row: ' '.join(label_lookup[row >= 0.2]),
    axis=1,
)

# Pair the tag strings with the filenames in the iterator's order.
additional_names = additional_gen.filenames
additional_result_df = pd.DataFrame({'image_name': additional_names, 'tags': additional_result})
additional_result_df.head()

Unnamed: 0,image_name,tags
0,file_0.jpg,primary clear
1,file_1.jpg,primary agriculture clear habitation road cult...
2,file_10.jpg,haze primary agriculture clear water road
3,file_100.jpg,primary agriculture clear water cultivation
4,file_1000.jpg,primary clear


In [22]:
# Stack the two result tables (test images first, additional images second)
# and renumber the rows from 0.
final_df = pd.concat([test_result_df, additional_result_df], ignore_index=True)
final_df

Unnamed: 0,image_name,tags
0,test_0.jpg,primary clear
1,test_1.jpg,primary clear
2,test_2.jpg,primary partly_cloudy
3,test_3.jpg,primary clear
4,test_4.jpg,primary partly_cloudy
...,...,...
61186,file_9995.jpg,primary cloudy
61187,file_9996.jpg,primary agriculture clear water road
61188,file_9997.jpg,primary clear
61189,file_9998.jpg,haze primary cloudy


In [23]:
# Remove the ".jpg" text again so the names are back to the extension-free
# form (literal replacement, not a regular expression).
final_df['image_name'] = final_df['image_name'].str.replace('.jpg', '', regex=False)
final_df

Unnamed: 0,image_name,tags
0,test_0,primary clear
1,test_1,primary clear
2,test_2,primary partly_cloudy
3,test_3,primary clear
4,test_4,primary partly_cloudy
...,...,...
61186,file_9995,primary cloudy
61187,file_9996,primary agriculture clear water road
61188,file_9997,primary clear
61189,file_9998,haze primary cloudy


In [24]:
# Write the predictions to submission.csv in the working directory, without
# the row index column.
final_df.to_csv('submission.csv', index=False)