In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

#import numpy as np # linear algebra
#import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

#import os
#for dirname, _, filenames in os.walk('/kaggle/input'):
#    for filename in filenames:
#        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [1]:
import tensorflow as tf
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from keras import backend as K
from keras.models import Sequential
from keras import layers
from keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.models import Model
from keras.callbacks import EarlyStopping, TensorBoard, ModelCheckpoint
from tensorflow.keras.optimizers import Adam, SGD
from PIL import Image
from sklearn.metrics import fbeta_score
from sklearn.model_selection import train_test_split
import cv2
import os
from tqdm import tqdm

In [2]:
# Root folder of the dataset; every path below is a direct child of it.
file_dir = "../input/planets-dataset/planet/planet/"

# Training images, test images, training labels CSV and sample submission CSV.
train_images_dir, test_image_dir, class_dir, submission = (
    os.path.join(file_dir, entry)
    for entry in ("train-jpg", "test-jpg", "train_classes.csv", "sample_submission.csv")
)

In [3]:
# Read the training labels first, then the sample-submission template.
data, submission_data = (
    pd.read_csv(csv_path) for csv_path in (class_dir, submission)
)


In [4]:
"""
Split the joined weather labels into individual labels

"""
# Every distinct tag, kept in the order it is first encountered while scanning
# the space-separated `tags` strings row by row (dict keys preserve insertion order).
list_of_label = list(dict.fromkeys(
    tag
    for joined_tags in data.tags.to_numpy()
    for tag in joined_tags.split(' ')
))

print(list_of_label)

# Position -> tag lookup covering the whole vocabulary.
range_of_labels = range(len(list_of_label))
dictionary_of_labels = dict(enumerate(list_of_label))
dictionary_of_labels

['haze', 'primary', 'agriculture', 'clear', 'water', 'habitation', 'road', 'cultivation', 'slash_burn', 'cloudy', 'partly_cloudy', 'conventional_mine', 'bare_ground', 'artisinal_mine', 'blooming', 'selective_logging', 'blow_down']


{0: 'haze',
 1: 'primary',
 2: 'agriculture',
 3: 'clear',
 4: 'water',
 5: 'habitation',
 6: 'road',
 7: 'cultivation',
 8: 'slash_burn',
 9: 'cloudy',
 10: 'partly_cloudy',
 11: 'conventional_mine',
 12: 'bare_ground',
 13: 'artisinal_mine',
 14: 'blooming',
 15: 'selective_logging',
 16: 'blow_down'}

In [5]:
def dummy_data(inp, target=None):
    """Return 1 when a tag occurs in a whitespace-separated tag string, else 0.

    Parameters
    ----------
    inp : str
        The tags of one image, separated by whitespace.
    target : str, optional
        The tag to look for. When omitted, the function falls back to the
        module-level ``label`` variable, which is what the one-argument form
        has always done; prefer passing the tag explicitly so the result does
        not depend on whatever ``label`` currently holds.

    Returns
    -------
    int
        1 if the tag is one of the tokens of ``inp``, otherwise 0.
    """
    wanted = label if target is None else target
    return 1 if wanted in inp.split() else 0


# Add one 0/1 indicator column per tag. The tag is handed to dummy_data as a
# keyword argument (Series.apply forwards extra keyword arguments to the function)
# instead of being read from the loop variable through global scope.
for label in list_of_label:
    data[label] = data['tags'].apply(dummy_data, target=label)


data.head(3)


Unnamed: 0,image_name,tags,haze,primary,agriculture,clear,water,habitation,road,cultivation,slash_burn,cloudy,partly_cloudy,conventional_mine,bare_ground,artisinal_mine,blooming,selective_logging,blow_down
0,train_0,haze primary,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,train_1,agriculture clear primary water,0,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0
2,train_2,clear primary,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0


In [6]:
# Everything after the first two columns (image_name, tags) is a label indicator;
# these column names are what the generators use as targets.
data_columns = data.columns[2:].tolist()

In [7]:
# Append the ".jpg" extension to each image name so it matches the files on disk.
# The endswith guard makes the cell safe to re-run: without it, a second
# execution would turn "train_0.jpg" into "train_0.jpg.jpg".
data['image_name'] = data['image_name'].apply(
    lambda image: image if image.endswith('.jpg') else f'{image}.jpg'
)
data.head(3)

Unnamed: 0,image_name,tags,haze,primary,agriculture,clear,water,habitation,road,cultivation,slash_burn,cloudy,partly_cloudy,conventional_mine,bare_ground,artisinal_mine,blooming,selective_logging,blow_down
0,train_0.jpg,haze primary,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,train_1.jpg,agriculture clear primary water,0,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0
2,train_2.jpg,clear primary,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0


In [8]:
# Run a full garbage-collection pass before the image generators are built.
# gc.collect() is the cell's last expression, so its integer return value is displayed.
import gc
gc.collect()

69

In [9]:
# Multiply pixel values by 1/255 and reserve 25% of the rows for validation.
imagesgenerator = ImageDataGenerator(rescale = 1./255, validation_split = 0.25)

# Options common to both subsets; only `subset` differs between the two calls.
shared_flow_options = dict(
    dataframe=data,
    directory=train_images_dir,
    x_col="image_name",
    y_col=data_columns,
    batch_size=128,
    seed=42,
    shuffle=True,
    class_mode="raw",
    target_size=(128, 128),
)

# Training subset (the 75% of rows not held out by validation_split).
training_data = imagesgenerator.flow_from_dataframe(subset="training", **shared_flow_options)

# Validation subset (the 25% of rows held out by validation_split).
validation_data = imagesgenerator.flow_from_dataframe(subset="validation", **shared_flow_options)

Found 30360 validated image filenames.
Found 10119 validated image filenames.


In [11]:
# F-beta metric; with the default beta=2 this is the F2 score, despite the name.
def f1_score(y_true, y_pred, beta=2, epsilon=1e-4):
    """Compute an F-beta score for each row of a batch.

    Both inputs are cast to float32. ``y_pred`` is binarised with a strict
    ``> 0.5`` test, and true/false positives and false negatives are summed
    along axis 1. ``epsilon`` is added to every denominator so that rows with
    no positives do not divide by zero.

    Args:
        y_true: Ground-truth indicators, one row per sample.
        y_pred: Predicted scores with the same layout as ``y_true``.
        beta: Weight of recall relative to precision (default 2).
        epsilon: Small constant added to each denominator.

    Returns:
        A tensor holding one F-beta value per row.
    """
    targets = tf.cast(y_true, tf.float32)
    scores = tf.cast(y_pred, tf.float32)
    decisions = tf.cast(tf.greater(scores, tf.constant(0.5)), tf.float32)

    tp = tf.reduce_sum(targets * decisions, axis=1)
    fp = tf.reduce_sum(decisions, axis=1) - tp
    fn = tf.reduce_sum(targets, axis=1) - tp

    precision = tp / (tp + fp + epsilon)
    recall = tp / (tp + fn + epsilon)

    beta_sq = beta ** 2
    numerator = (1 + beta_sq) * precision * recall
    denominator = beta_sq * precision + recall + epsilon
    return numerator / denominator


# Convolutional classifier for 128x128 RGB inputs, declared as a single layer list.
model = Sequential([
    # Normalise the raw input channels before the first convolution.
    layers.BatchNormalization(input_shape=(128, 128, 3)),

    # Stage 1: two 32-filter convolutions, 2x2 pooling, light dropout.
    layers.Conv2D(32, (3, 3), activation="relu", padding='same'),
    layers.Conv2D(32, (3, 3), activation="relu"),
    layers.MaxPooling2D((2, 2)),
    layers.Dropout(0.1),

    # Stage 2: two 64-filter convolutions, 2x2 pooling, light dropout.
    layers.Conv2D(64, (3, 3), activation="relu", padding='same'),
    layers.Conv2D(64, (3, 3), activation="relu"),
    layers.MaxPooling2D((2, 2)),
    layers.Dropout(0.1),

    # Stage 3: two 128-filter convolutions and dropout (no pooling in this stage).
    layers.Conv2D(128, (3, 3), activation="relu", padding='same'),
    layers.Conv2D(128, (3, 3), activation="relu"),
    layers.Dropout(0.1),

    # Classifier head: flatten, one hidden dense layer, then 17 independent
    # sigmoid outputs (one per label).
    layers.Flatten(),
    layers.Dense(256, activation="relu"),
    layers.Dropout(0.3),
    layers.Dense(17, activation="sigmoid"),
])

# Print the architecture, then compile with binary cross-entropy and track
# the custom f1_score metric alongside accuracy.
model.summary()
model.compile(optimizer="adam", loss="binary_crossentropy", metrics=[f1_score,"accuracy"])


Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
batch_normalization_1 (Batch (None, 128, 128, 3)       12        
_________________________________________________________________
conv2d_6 (Conv2D)            (None, 128, 128, 32)      896       
_________________________________________________________________
conv2d_7 (Conv2D)            (None, 126, 126, 32)      9248      
_________________________________________________________________
max_pooling2d_2 (MaxPooling2 (None, 63, 63, 32)        0         
_________________________________________________________________
dropout_4 (Dropout)          (None, 63, 63, 32)        0         
_________________________________________________________________
conv2d_8 (Conv2D)            (None, 63, 63, 64)        18496     
_________________________________________________________________
conv2d_9 (Conv2D)            (None, 61, 61, 64)       

In [12]:
# Number of whole batches each generator yields per epoch. Floor division
# means a trailing partial batch is not counted.
trainingsteps, validationsteps = (
    flow.n // flow.batch_size for flow in (training_data, validation_data)
)

model.fit(
    x=training_data,
    steps_per_epoch=trainingsteps,
    validation_data=validation_data,
    validation_steps=validationsteps,
    epochs=15,
)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<keras.callbacks.History at 0x7d153bdcc310>

In [13]:
# First 40669 rows of the sample submission, kept as a one-column frame with a fresh 0-based index.
test_data = submission_data[['image_name']].iloc[:40669].reset_index(drop=True)

In [14]:
# Add the ".jpg" extension to the test image names. This works on test_data's
# own column, the same way the `additional` frame is handled, rather than on the
# full submission_data column with the extra rows discarded by index alignment.
# The endswith guard keeps the cell idempotent when it is re-run.
test_data['image_name'] = test_data['image_name'].apply(
    lambda x: x if x.endswith('.jpg') else '{}.jpg'.format(x)
)
test_data.head()

Unnamed: 0,image_name
0,test_0.jpg
1,test_1.jpg
2,test_2.jpg
3,test_3.jpg
4,test_4.jpg


In [15]:
# Test-time pipeline: rescale only, no labels, and no shuffling so that the
# prediction order matches testgenerator.filenames.
testdatagenerator = ImageDataGenerator(rescale = 1./255)

testgenerator = testdatagenerator.flow_from_dataframe(dataframe=test_data,
                                            directory = test_image_dir,  
                                            x_col="image_name", 
                                            y_col=None, 
                                            batch_size=67,
                                            shuffle=False,
                                            class_mode=None, 
                                            target_size=(128,128))

# Ceiling division: include the final partial batch so every image gets a
# prediction. Floor division would silently drop the tail whenever batch_size
# does not divide the image count exactly, leaving fewer predictions than filenames.
size_of_test = (testgenerator.n + testgenerator.batch_size - 1) // testgenerator.batch_size

Found 40669 validated image filenames.


In [16]:
# Score the test images, running exactly size_of_test batches.
predictions = model.predict(
    testgenerator,
    steps=size_of_test,
    verbose=1,
)



In [17]:
# Rows from position 40669 onwards of the sample submission, re-indexed from 0.
additional = submission_data[['image_name']].iloc[40669:].reset_index(drop=True)

# Append the file extension to every name.
additional['image_name'] = additional['image_name'].map('{}.jpg'.format)
additional.head()

Unnamed: 0,image_name
0,file_0.jpg
1,file_1.jpg
2,file_10.jpg
3,file_100.jpg
4,file_1000.jpg


In [18]:
# Same test-time pipeline as for the main test set, pointed at the additional
# image folder: rescale only, no labels, and no shuffling so that the prediction
# order matches additionalgen.filenames.
additionalgenerator = ImageDataGenerator(rescale = 1./255)
additionalgen = additionalgenerator.flow_from_dataframe(dataframe=additional,
                                            directory = "../input/planets-dataset/test-jpg-additional/test-jpg-additional",  
                                            x_col="image_name", 
                                            y_col=None, 
                                            batch_size=62,
                                            shuffle=False,
                                            class_mode=None, 
                                            target_size=(128,128))

# Ceiling division: include the final partial batch so every image gets a
# prediction. Floor division would silently drop the tail whenever batch_size
# does not divide the image count exactly, leaving fewer predictions than filenames.
size_of_additional = (additionalgen.n + additionalgen.batch_size - 1) // additionalgen.batch_size

Found 20522 validated image filenames.


In [19]:
# Score the additional images, running exactly size_of_additional batches.
additional_predictions = model.predict(
    additionalgen,
    steps=size_of_additional,
    verbose=1,
)



In [20]:
# Minimum predicted probability for a label to be written into the submission.
TAG_THRESHOLD = 0.2


def _probabilities_to_tags(probabilities, labels, threshold=TAG_THRESHOLD):
    """Turn each row of probabilities into a space-joined string of labels.

    Parameters
    ----------
    probabilities : array-like
        One row per image and one column per label, in the same order as ``labels``.
    labels : sequence of str
        Label names, positionally matched to the probability columns.
    threshold : float
        A label is kept when its probability is ``>= threshold``.

    Returns
    -------
    pandas.Series
        One tag string per input row, indexed 0..n-1.
    """
    # Built once here instead of once per row.
    label_array = np.asarray(labels)
    rows = np.asarray(probabilities)
    return pd.Series(
        [' '.join(label_array[row >= threshold]) for row in rows],
        dtype=object,
    )


def _submission_frame(filenames, tags):
    """Pair image names (file extension removed) with their tag strings.

    Raises
    ------
    ValueError
        If the number of filenames differs from the number of tag strings,
        which means some images were never predicted.
    """
    if len(filenames) != len(tags):
        raise ValueError(
            'got {} filenames but {} predictions'.format(len(filenames), len(tags))
        )
    # splitext removes whatever extension is present rather than assuming 4 characters.
    image_names = [os.path.splitext(name)[0] for name in filenames]
    return pd.DataFrame({'image_name': image_names, 'tags': tags})


# Main test set.
test_names = testgenerator.filenames
test_result = _probabilities_to_tags(predictions, list_of_label)
test_dataframe = _submission_frame(test_names, test_result)

# Additional test set.
additional_names = additionalgen.filenames
additional_result = _probabilities_to_tags(additional_predictions, list_of_label)
additional_dataframe = _submission_frame(additional_names, additional_result)


In [21]:
# Stack the two prediction tables; ignore_index gives the result a fresh 0..n-1 index.
result_dataframe = pd.concat(
    [test_dataframe, additional_dataframe],
    ignore_index=True,
)

result_dataframe

Unnamed: 0,image_name,tags
0,test_0,primary clear
1,test_1,primary clear partly_cloudy
2,test_2,primary partly_cloudy
3,test_3,primary clear
4,test_4,primary partly_cloudy
...,...,...
61186,file_9995,primary cloudy partly_cloudy
61187,file_9996,primary clear water
61188,file_9997,primary clear water
61189,file_9998,cloudy


In [22]:
# Write the combined predictions to the submission file, omitting the DataFrame index.
result_dataframe.to_csv('my_submission_main.csv', index=False)