# DPhi Deep Learning Bootcamp Datathon

## Getting Started

First, we will get the zip file. We will use wget so that, no matter how many times we refresh the session, we don't have to drag-and-drop the files here. Then we will use the zipfile package to extract the files.

In [None]:
# !pip install wget
import os

import wget

link = 'https://dphi-live.s3.eu-west-1.amazonaws.com/dataset/weather.zip'  # URL of the dataset archive
# wget.download does not overwrite an existing file; it stores the new copy under a
# suffixed name instead, leaving duplicates behind. Skip the download when the archive
# (named after the last URL component) is already in the working directory.
if not os.path.exists(os.path.basename(link)):
    wget.download(link)

: 

In [None]:
import zipfile

# Unpack every member of the downloaded archive into /content.
with zipfile.ZipFile(file='/content/weather.zip', mode='r') as archive:
    archive.extractall(path='/content')

Then we will import the necessary libraries. Note that not all of them were used heavily; some were used just to get an idea of our data or for debugging statements.

In [None]:
import pandas as pd # to get the dataframe
import tensorflow as tf # for deep learning
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np # for numerical operations
import cv2 # to read or show the image
from sklearn.metrics import accuracy_score # to score our model
from keras.preprocessing.image import ImageDataGenerator, img_to_array # helpful in getting images
from tensorflow.keras.applications import vgg19 # get the model for image classifier

# Load the training table from the CSV file.
train_data = pd.read_csv(filepath_or_buffer='Training_set.csv')
# Show seven randomly chosen rows as a quick sanity check.
train_data.sample(n=7)

In [None]:
# Print the column dtypes and non-null counts to check for missing values
# (author's note from the original run: no null data).
train_data.info()

In [None]:
# Share of each label in the training set, rounded to four decimals.
train_data['label'].value_counts(normalize=True).round(4)

In [None]:
# Bar chart of how many images carry each label.
# The column is passed as `x=` explicitly: newer seaborn releases take the first
# positional argument as `data`, so the positional form warns or misbehaves.
sns.countplot(x=train_data.label)
# spread of labels seems balanced
plt.show()

## Getting the image data and preparing them for the model

We will first build the file path of every image, since the paths are what we use to load the images. It is important to attach the paths with a merge on the filename, because the file paths can be obtained in any order.

In [None]:
# Build a (filename, filepaths) table pointing into the extracted train folder.
images = pd.DataFrame({
    'filename': list(train_data.filename),
    'filepaths': ['/content/train/' + name for name in train_data.filename],
})
# Join on filename so each path lands on the row of the image it belongs to.
train_data = images.merge(train_data, on='filename')
train_data.sample(7)

Let us see what some of the images look like.

In [None]:
# Preview five training images, each titled with its label, on a 2x3 grid
# (the sixth panel is left empty, as before).
# sharex/sharey are given by keyword because recent Matplotlib versions no longer
# accept them positionally.
fig, axes = plt.subplots(2, 3, sharex=True, sharey=True, figsize=(10, 6))
for ax, idx in zip(axes.flat, (121, 520, 356, 0, 109)):
    # cv2.imread returns channels in BGR order while imshow expects RGB;
    # convert so the colours are displayed correctly.
    ax.imshow(cv2.cvtColor(cv2.imread(train_data.filepaths[idx]), cv2.COLOR_BGR2RGB))
    ax.set_title(train_data.label[idx])
plt.show()

In [None]:
# Read every training image, resize it to 150x150 and collect it as an array.
data_train = []
for path in train_data['filepaths']:  # iterate the paths directly instead of indexing by label
    img_array = cv2.imread(path)
    if img_array is None:
        # cv2.imread returns None (it does not raise) when a file is missing or unreadable;
        # fail here with the offending path instead of an obscure error inside cv2.resize.
        raise FileNotFoundError(f'could not read image: {path}')
    new_img_array = cv2.resize(img_array, (150, 150))  # resize for the sake of consistency
    data_train.append(img_to_array(new_img_array))
print(data_train[0].shape)  # still a list of per-image arrays at this point
data_train_arr = np.stack(data_train)/255  # one array for all images, scaled by 1/255
print(data_train_arr.shape)
# Encode the string labels as integer category codes.
label_train = train_data.label.astype('category').cat.codes.values
print(label_train.shape)

In [None]:
# Lookup from integer label code back to the original label string
# (used when writing the submission to decode predicted class indices).
temp = {code: name for code, name in zip(label_train, train_data.label)}

# Setting up and testing model

In [None]:
INPUT_SHAPE = (150, 150, 3)  # shape of one input image fed to the network

# Baseline network: one convolution + max-pool stage, then two dense layers
# with dropout and a 5-way softmax output.
model = tf.keras.models.Sequential([
    tf.keras.layers.Conv2D(
        filters=16,
        kernel_size=(3, 3),
        strides=(1, 1),
        activation='relu',
        padding='valid',
        input_shape=INPUT_SHAPE,
    ),
    tf.keras.layers.MaxPooling2D(pool_size=(2, 2)),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(256, activation='relu'),
    tf.keras.layers.Dropout(rate=0.3),
    tf.keras.layers.Dense(256, activation='relu'),
    tf.keras.layers.Dropout(rate=0.3),
    tf.keras.layers.Dense(5, activation='softmax'),
])

# Integer labels are used directly, hence the sparse categorical loss.
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

# Print the layer-by-layer overview.
model.summary()

In [None]:
# callback to avoid unnecessary complete execution
es_callback = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=2,
verbose=1)

history = model.fit(data_train_arr, label_train, batch_size=64,
          callbacks=[es_callback], validation_split=0.3, epochs=100,
          verbose=1)

In [None]:
# performance of the model
fig, ax = plt.subplots(1, 2, figsize=(10, 4))

history_df = pd.DataFrame(history.history)
history_df[['loss', 'val_loss']].plot(kind='line', ax=ax[0])
history_df[['accuracy', 'val_accuracy']].plot(kind='line', ax=ax[1])
plt.show()

## VGG19 model

We will now use the pretrained VGG19 network as a base layer to help improve our model's performance.

In [None]:
# VGG19 with ImageNet weights and without its top classifier layers
# (include_top=False), sized for our input images.
vgg_layers = vgg19.VGG19(
    include_top=False,
    weights='imagenet',
    input_shape=INPUT_SHAPE,
)
# Mark every VGG19 layer as trainable so the whole base is updated during fitting.
for base_layer in vgg_layers.layers:
    base_layer.trainable = True
# vgg_layers.summary()

In [None]:
# define sequential model
model = tf.keras.models.Sequential()
model.add(vgg_layers) # add the vgg19 layer
model.add(tf.keras.layers.Flatten())
model.add(tf.keras.layers.Dense(256, activation='relu'))
model.add(tf.keras.layers.Dropout(rate=0.3))
model.add(tf.keras.layers.Dense(256, activation='relu'))
model.add(tf.keras.layers.Dropout(rate=0.3))
model.add(tf.keras.layers.Dense(5, activation='softmax'))
# compilation
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=2e-5),
loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.summary()

In [None]:
# Stop when val_loss has not improved by at least 0.01 for five epochs and
# restore the weights of the best epoch.
es_callback = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    min_delta=0.01,
    patience=5,
    restore_best_weights=True,
    verbose=1,
)

# Train with 30% of the data held out for validation.
history = model.fit(
    x=data_train_arr,
    y=label_train,
    epochs=100,
    batch_size=32,
    validation_split=0.3,
    callbacks=[es_callback],
    verbose=1,
)

In [None]:
# Training curves of the VGG19-based model: loss on the left, accuracy on the right.
history_df = pd.DataFrame(history.history)

fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(10, 4))
history_df[['loss', 'val_loss']].plot.line(ax=ax[0])
history_df[['accuracy', 'val_accuracy']].plot.line(ax=ax[1]);
# Author's observation: with VGG19 the performance is better
plt.show()

In [None]:
# test the model on training data
valid_labels = np.argmax(model.predict(data_train_arr), axis=1)
accuracy_score(valid_labels, label_train)
# looks great!

# Submission

Now we will generate predictions and submit the file!

In [None]:
# Load the test table and attach the path of every image inside the extracted test folder.
test_data = pd.read_csv('Testing_set.csv')
file_paths = [[fname, '/content/test/' + fname] for fname in test_data.filename]
images = pd.DataFrame(file_paths, columns=['filename', 'filepaths'])
test_data = pd.merge(images, test_data, on='filename')

# Read and resize the test images exactly like the training images (150x150).
data_test = []
for path in test_data['filepaths']:  # iterate the paths directly instead of indexing by label
    img_array = cv2.imread(path)
    if img_array is None:
        # cv2.imread returns None (it does not raise) when a file is missing or unreadable;
        # fail here with the offending path instead of an obscure error inside cv2.resize.
        raise FileNotFoundError(f'could not read image: {path}')
    new_img_array = cv2.resize(img_array, (150, 150))
    data_test.append(img_to_array(new_img_array))



In [None]:
# Stack the test images into one array and scale them the same way as the training data.
data_test_arr = np.stack(data_test)/255
# Most probable class index for each test image.
a = model.predict(data_test_arr).argmax(axis=1)
# Translate the class indices back to the original label strings.
predictions = pd.DataFrame({'label': [temp[code] for code in a]})
# Write the submission file without the index column.
predictions.to_csv('DPhi_Deep_Learning_Akshar.csv', encoding='utf-8', index=False)

In [None]:
# Hand the submission CSV to the browser as a download.
# NOTE(review): google.colab is presumably only importable inside a Colab runtime — confirm before running elsewhere.
from google.colab import files 
files.download('DPhi_Deep_Learning_Akshar.csv')

In [None]:
import glob
import os

# Delete every downloaded copy of the archive so that re-running the wget cell
# does not leave duplicate files behind. Done in plain Python so the cell does not
# depend on IPython's automagic `rm` and does not fail when no archive is present.
for archive_path in glob.glob('weather*.zip'):
    os.remove(archive_path)