# Planet: Understanding the Amazon deforestation from Space challenge

In [None]:
# Import required libraries
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from subprocess import check_output
import matplotlib.pyplot as plt
from scipy.stats import bernoulli
import seaborn as sns
import cv2
from glob import glob # handles pathnames 
%matplotlib inline
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls
from sklearn.manifold import TSNE
from matplotlib.offsetbox import OffsetImage, AnnotationBbox
from skimage import io
import tifffile

import os
import gc
import keras as k
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten
from keras.layers import Conv2D, MaxPooling2D

import cv2
from tqdm import tqdm
from sklearn.metrics import fbeta_score

In [None]:
# Show which files are available in the input directory
input_listing = check_output(["ls", "../input"])
print(input_listing.decode("utf8"))

In [None]:
# Load the training labels table and preview its first ten rows
train_csv_path = '../input/train_v2.csv'
labels_df = pd.read_csv(train_csv_path)
labels_df.head(n=10)

In [None]:
# Collect every tag occurrence and the set of distinct tags
from itertools import chain

# One entry per (image, tag) pair: split each space-separated tag string
labels_list = [
    tag
    for tag_string in labels_df['tags'].values
    for tag in tag_string.split(" ")
]
uniq_labels = set(labels_list)

# Total tag occurrences vs. number of distinct tags
num_labels = len(labels_list)
num_uniq_labels = len(uniq_labels)

In [None]:
# Tag frequencies, most common first (value_counts sorts by count)
labels_s = pd.Series(labels_list).value_counts()

# Horizontal bar chart: one bar per tag, bar length = number of occurrences
fig, ax = plt.subplots(figsize=(16, 8))
sns.barplot(x=labels_s, y=labels_s.index, orient='h', ax=ax)
plt.show()

In [None]:
# Preprocess labels
# x_train / y_train are appended to by the image-loading loop further down;
# x_test is only initialised here.
x_train = []
x_test = []
y_train = []

df_train = pd.read_csv('../input/train_v2.csv')


def flatten(l):
    """Return a flat list holding the items of every sub-list in *l*."""
    return [item for sublist in l for item in sublist]


# Sort the distinct tags so that every tag is assigned the same index on each
# run; iterating a bare set of strings yields an order that may differ between
# interpreter sessions, which would make the target columns irreproducible.
labels = sorted(set(flatten([l.split(' ') for l in df_train['tags'].values])))

# Tag name -> column index used when building the multi-hot target vectors
label_map = {l: i for i, l in enumerate(labels)}

In [None]:
from collections import Counter, defaultdict

# One list of tag strings per row of df_train
labels = df_train['tags'].apply(lambda x: x.split(' '))

# Tally how often each tag occurs across all training rows
counts = Counter(tag for image_tags in labels for tag in image_tags)

data = [go.Bar(x=list(counts.keys()), y=list(counts.values()))]
layout = dict(height=800, width=800, title='Distribution of training labels')
fig = dict(data=data, layout=layout)
# Plot the whole figure rather than the bare trace list, so the size and
# title defined in `layout` are actually applied to the chart.
py.iplot(fig, filename='train-label-dist')

In [None]:
# Display the first nine training images in a 3x3 grid, titled with
# the image name and its tags.
_, ax = plt.subplots(3, 3, sharex='col', sharey='row', figsize=(20, 20))

# ravel() walks the grid row by row, matching the row-major placement
for panel, (f, l) in zip(ax.ravel(), df_train[:9].values):
    img = cv2.imread('../input/train-jpg/{}.jpg'.format(f))
    # OpenCV loads images as BGR; convert to RGB before showing
    rgb_img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    panel.imshow(rgb_img)
    panel.set_title('{} - {}'.format(f, l))

plt.show()

In [None]:
# Load every training image, shrink it to 32x32 and build its multi-hot
# target vector (1 at the index of each tag the image carries).
num_classes = len(label_map)  # one target slot per known tag
for f, tags in tqdm(df_train.values, miniters=1000):
    img_path = '../input/train-jpg/{}.jpg'.format(f)
    img = cv2.imread(img_path)
    if img is None:
        # cv2.imread returns None instead of raising when a file cannot be
        # read; fail here with the offending path rather than inside resize.
        raise FileNotFoundError('Could not read image: {}'.format(img_path))
    targets = np.zeros(num_classes)
    for t in tags.split(' '):
        targets[label_map[t]] = 1
    x_train.append(cv2.resize(img, (32, 32)))
    y_train.append(targets)

In [None]:
# Inspect the raw pixel range of the loaded images
min_value = np.min(x_train)
max_value = np.max(x_train)

print('Min value in the training images array: ', min_value)
print('Max value in the training images array: ', max_value)

In [None]:
# Stack the Python lists into arrays and scale pixel values by 1/255
x_train = np.true_divide(np.array(x_train), 255.)
y_train = np.array(y_train)

print("Shape of the training images array is: ", x_train.shape, sep="\n")
print("Shape of the training labels array is: ", y_train.shape, sep="\n")

In [None]:
 # Shrink the arrays to smaller dtypes: uint8 for the 0/1 targets and
 # float16 for the pixels. x_train has already been divided by 255 in the
 # preceding cell, so it must only be cast here; dividing a second time
 # would squash every pixel into [0, 1/255].
 y_train = np.array(y_train, np.uint8)
 x_train = np.array(x_train, np.float16)
 print("Shape of the training images array is: ")
 print(x_train.shape)
 print("Shape of the training labels array is: ")
 print(y_train.shape)

In [None]:
# Hold out everything after the first 35000 samples for validation
# (approx. 86% training / 14% validation)
n_train = 35000

# Take the validation tail first, then truncate the training arrays
x_valid, y_valid = x_train[n_train:], y_train[n_train:]
x_train, y_train = x_train[:n_train], y_train[:n_train]

In [None]:
# Create structure of Convolutional Neural Network:
# two conv layers -> max-pool -> dropout -> dense head with 17 sigmoid outputs
cnn_layers = [
    Conv2D(32, kernel_size=(5, 5), activation='relu', input_shape=(32, 32, 3)),
    Conv2D(64, (3, 3), activation='relu'),
    MaxPooling2D(pool_size=(2, 2)),
    Dropout(0.25),
    Flatten(),
    Dense(128, activation='relu'),
    Dropout(0.5),
    Dense(17, activation='sigmoid'),
]
model = Sequential(cnn_layers)

# Each of the 17 outputs is an independent sigmoid and the targets are
# multi-hot vectors, so the loss is binary (per-output) cross-entropy
# rather than categorical cross-entropy.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Train for 12 epochs in mini-batches of 128, evaluating on the held-out
# validation arrays after every epoch; keep the History object for plotting.
history = model.fit(
    x=x_train,
    y=y_train,
    batch_size=128,
    epochs=12,
    verbose=1,
    validation_data=(x_valid, y_valid),
)

In [None]:
# Plot model accuracy and loss

fig, axs = plt.subplots(1, 2, figsize=(12, 3), sharey=False)
plt.subplots_adjust(top = 0.99, bottom=0.01, hspace=0.5, wspace=0.4)

# The accuracy metric is stored under 'acc' by older Keras releases and under
# 'accuracy' by newer ones; use whichever key this training run produced so
# the accuracy panel is not left empty.
acc_key = 'acc' if 'acc' in history.history else 'accuracy'

# Left panel: training vs. validation accuracy per epoch
axs[0].plot(history.history[acc_key])
axs[0].plot(history.history['val_' + acc_key])
axs[0].set_title('model accuracy')
axs[0].set_ylabel('accuracy')
axs[0].set_xlabel('epoch')
axs[0].legend(['train', 'test'], loc='upper left')

# Right panel: training vs. validation loss per epoch
axs[1].plot(history.history['loss'])
axs[1].plot(history.history['val_loss'])
axs[1].set_title('model loss')
axs[1].set_ylabel('loss')
axs[1].set_xlabel('epoch')
axs[1].legend(['train', 'test'], loc='upper left')
plt.show()