In [1]:
# Import the required libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow import keras

In [4]:
# Kaggle API
import os
os.environ['KAGGLE_USERNAME'] = 'kambleabhijeet'
os.environ['KAGGLE_KEY'] = 'ea830af5f56bdcefde51c1f9b34db965'

In [5]:
# Downloading the dataset directly from Kaggle on the Colab environment 
!kaggle datasets download -d nikitarom/planets-dataset

Downloading planets-dataset.zip to /content
100% 1.49G/1.50G [00:39<00:00, 37.0MB/s]
100% 1.50G/1.50G [00:39<00:00, 40.6MB/s]


In [None]:
# Unzipping the dataset
! unzip /content/planets-dataset.zip

In [7]:
free_gpu_cache()

Initial GPU Usage
| ID | GPU | MEM |
------------------
|  0 |  0% |  1% |
GPU Usage after emptying the cache
| ID | GPU | MEM |
------------------
|  0 |  4% |  1% |


In [8]:
# Import the dataset
train_classes = pd.read_csv("/content/planet/planet/train_classes.csv")
sample_sub = pd.read_csv("/content/planet/planet/sample_submission.csv")
train_classes.head()

Unnamed: 0,image_name,tags
0,train_0,haze primary
1,train_1,agriculture clear primary water
2,train_2,clear primary
3,train_3,clear primary
4,train_4,agriculture clear habitation primary road


In [9]:
# Create dictionary list for converting labels to numerical classes

label_list = {'agriculture': 14,
 'artisinal_mine': 5,
 'bare_ground': 1,
 'blooming': 3,
 'blow_down': 0,
 'clear': 10,
 'cloudy': 16,
 'conventional_mine': 2,
 'cultivation': 4,
 'habitation': 9,
 'haze': 6,
 'partly_cloudy': 13,
 'primary': 7,
 'road': 11,
 'selective_logging': 12,
 'slash_burn': 8,
 'water': 15}

In [10]:
# Counts the labels
counts = {}
tags = train_classes['tags'].map(lambda x: x.split(' '))
for labels in tags.values:
    for label in labels:
        counts[label] = counts[label] + 1  if label in counts else 0      

In [11]:
# Load dataset 
all_labels = tags.values
labels = list(set([y for x in all_labels for y in x]))

def load_data(train_classes, labels, resize):
    x_train = []
    y_train = []

    label_map = {l: i for i, l in enumerate(labels)}
    inv_label_map = {i: l for l, i in label_map.items()}

    for f, tags in train_classes.values:
        img = cv2.imread('/content/planet/planet/train-jpg/{}.jpg'.format(f)) 
        targets = np.zeros(17)
        for t in tags.split(' '):
            targets[label_map[t]] = 1 

        x_train.append(cv2.resize(img,resize))
        y_train.append(targets)
                
    y_train = np.array(y_train, np.uint8)
    x_train = np.array(x_train, np.float16) / 255.0

    return x_train, y_train

In [12]:
# Reshape the data
import cv2
from PIL import Image
from skimage import io
from tensorflow.keras.preprocessing.image import load_img, img_to_array

x, y = load_data(train_classes, labels,resize=(64,64))
print(x.shape)
print(y.shape)

(40479, 64, 64, 3)
(40479, 17)


In [13]:
free_gpu_cache()

Initial GPU Usage
| ID | GPU | MEM |
------------------
|  0 |  0% |  1% |
GPU Usage after emptying the cache
| ID | GPU | MEM |
------------------
|  0 |  4% |  1% |


In [14]:
# check shape of image data
print(train_classes.shape)
print(sample_sub.shape)

(40479, 2)
(61191, 2)


In [15]:
# Split the dataset into x_train, x_val, y_train and y_val
from sklearn.model_selection import train_test_split
x_train, x_val, y_train, y_val = train_test_split(x,y, test_size=0.2, random_state = 1)
print(x_train.shape)
print(x_val.shape)
print(y_train.shape)
print(y_val.shape)

(32383, 64, 64, 3)
(8096, 64, 64, 3)
(32383, 17)
(8096, 17)


In [16]:
# Labels are given as tags in a single dataframe series
from sklearn.preprocessing import MultiLabelBinarizer
biner = MultiLabelBinarizer()
tags = train_classes['tags'].str.split()
y = biner.fit_transform(tags)

labels = biner.classes_
print('Number of labels: ', len(labels))
print('\n')
print(labels)

Number of labels:  17


['agriculture' 'artisinal_mine' 'bare_ground' 'blooming' 'blow_down'
 'clear' 'cloudy' 'conventional_mine' 'cultivation' 'habitation' 'haze'
 'partly_cloudy' 'primary' 'road' 'selective_logging' 'slash_burn' 'water']


In [17]:
# Import the layers to build a model and also pre-trained models
from keras import backend as K
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten, BatchNormalization
from keras.layers import Conv2D, MaxPool2D
from tensorflow.keras.applications.vgg19 import VGG19
from tensorflow.keras.optimizers import Adam

In [18]:
def fbeta_score_K(y_true, y_pred):
    beta_squared = 4

    tp = K.sum(y_true * y_pred) + K.epsilon()
    fp = K.sum(y_pred) - tp
    fn = K.sum(y_true) - tp

    precision = tp / (tp + fp)
    recall = tp / (tp + fn)

    result = (beta_squared + 1) * (precision * recall) / (beta_squared * precision + recall + K.epsilon())
    return result

In [20]:
# To train our model we using pre-trained model VGG19
def build_model():
    base_model = VGG19(include_top=False, weights='imagenet', input_shape=(64, 64, 3))
    model = Sequential()
    model.add(BatchNormalization(input_shape=(64, 64, 3)))
    model.add(base_model)
    model.add(Flatten())
    model.add(Dense(17, activation='sigmoid'))
    opt = Adam(lr=1e-4)
    model.compile(loss='binary_crossentropy', optimizer=opt, metrics=[fbeta_score_K])
    return model

In [21]:
# Fit the model
model = build_model() 
model.fit(x_train, y_train, batch_size=64, epochs=10, verbose=1, validation_data=(x_val, y_val))

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/vgg19/vgg19_weights_tf_dim_ordering_tf_kernels_notop.h5


  "The `lr` argument is deprecated, use `learning_rate` instead.")


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7fc9b2d192d0>

In [22]:
# Evaluate the model on x_val dataset
from sklearn.metrics import fbeta_score
y_pred = model.predict(x_val, batch_size=64)
score = fbeta_score(y_val, np.array(y_pred) > 0.2, beta=2, average='samples')

print("Test score (f1): {}".format(score))
print("Error: {}".format(100-score*100))

Test score (f1): 0.9061942662077559
Error: 9.380573379224415


In [23]:
test_loss, test_accuracy = model.evaluate(x_val, y_val)
print('Test loss: {}'.format(test_loss))
print('Test accuracy: {}'.format(test_accuracy))

Test loss: 0.12373435497283936
Test accuracy: 0.8419274687767029


In [42]:
# Now, we'll test the model on Test data
from tqdm import tqdm
X_test=[]

for img, label in tqdm(sample_sub[:40669].values, miniters = 1000):
  X_test.append(cv2.resize(cv2.imread('/content/planet/planet/test-jpg/{}.jpg'.format(img)), (64,64)))

for img, label in tqdm(sample_sub[40669:].values, miniters = 1000):
  X_test.append(cv2.resize(cv2.imread('/content/test-jpg-additional/test-jpg-additional/{}.jpg'.format(img)), (64,64)))

x_test = np.array(X_test, np.float16)/255

100%|██████████| 40669/40669 [00:54<00:00, 742.66it/s]
100%|██████████| 20522/20522 [00:30<00:00, 668.89it/s]


In [43]:
x_test.shape

(61191, 64, 64, 3)

In [44]:
test_pred = model.predict(x_test, batch_size = 64)

In [45]:
prediction = pd.DataFrame(test_pred, columns= labels)

In [46]:
final_prediction = []

for i in tqdm(range(prediction.shape[0]), miniters=1000):
    a = prediction.loc[[i]]
    a = a.apply(lambda x:x>0.2, axis =1)
    a = a.transpose()
    a = a.loc[a[i]==True]
    ' '.join(list(a.index))
    final_prediction.append(' '.join(list(a.index)))

100%|██████████| 61191/61191 [02:49<00:00, 361.96it/s]


In [47]:
sample_sub['tags'] = final_prediction
sample_sub.to_csv('My_submission.csv', index = False)
print("File saved successfully!")

File saved successfully!
