# Exploratory Data Analysis

## Importing Libraries

In [58]:
import os
import glob
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns 
import plotly.express as px
import tensorflow as tf


from functools import partial

## Downloading the Data
This is a one-time process: I download the data once and save it to my Google Drive for later use.

In [2]:
# !pip uninstall kaggle -y --quiet
# !pip install kaggle --quiet
# !mkdir -p ~/.kaggle
# !cp /content/drive/MyDrive/kaggle.json ~/.kaggle/
# !kaggle competitions download -c tpu-getting-started
# !unzip /content/tpu-getting-started.zip -d /content/drive/MyDrive/Projects/Flower_Classification/input

In [70]:
# Human-readable flower names, one per class. The position in this list is the
# integer label: entry i is the name shown for label i (label2class is built
# from it with range(len(CLASSES)) further down).
# NOTE(review): 'cyclamen ' and 'hippeastrum ' carry a trailing space; they are
# left untouched here — confirm whether they should be stripped before display.
CLASSES = [
    'pink primrose', 'hard-leaved pocket orchid', 'canterbury bells', 'sweet pea', 
    'wild geranium', 'tiger lily', 'moon orchid', 'bird of paradise', 'monkshood', 
    'globe thistle', 'snapdragon', "colt's foot", 'king protea', 'spear thistle', 
    'yellow iris', 'globe-flower', 'purple coneflower', 'peruvian lily', 
    'balloon flower', 'giant white arum lily', 'fire lily', 'pincushion flower', 
    'fritillary', 'red ginger', 'grape hyacinth', 'corn poppy', 
    'prince of wales feathers', 'stemless gentian', 'artichoke', 'sweet william', 
    'carnation', 'garden phlox', 'love in the mist', 'cosmos',  'alpine sea holly', 
    'ruby-lipped cattleya', 'cape flower', 'great masterwort',  'siam tulip', 
    'lenten rose', 'barberton daisy', 'daffodil',  'sword lily', 'poinsettia', 
    'bolero deep blue',  'wallflower', 'marigold', 'buttercup', 'daisy', 
    'common dandelion', 'petunia', 'wild pansy', 'primula',  'sunflower', 
    'lilac hibiscus', 'bishop of llandaff', 'gaura',  'geranium', 'orange dahlia', 
    'pink-yellow dahlia', 'cautleya spicata',  'japanese anemone', 
    'black-eyed susan', 'silverbush', 'californian poppy',  'osteospermum', 
    'spring crocus', 'iris', 'windflower',  'tree poppy', 'gazania', 'azalea', 
    'water lily',  'rose', 'thorn apple', 'morning glory', 'passion flower',  
    'lotus', 'toad lily', 'anthurium', 'frangipani',  'clematis', 'hibiscus', 
    'columbine', 'desert-rose', 'tree mallow', 'magnolia', 'cyclamen ', 
    'watercress',  'canna lily', 'hippeastrum ', 'bee balm', 'pink quill',  
    'foxglove', 'bougainvillea', 'camellia', 'mallow',  'mexican petunia',  
    'bromelia', 'blanket flower', 'trumpet creeper',  'blackberry lily', 
    'common tulip', 'wild rose']

## Preparing the dataset

Our data is stored as TFRecord files, so we first need to build a proper dataset from them before we can look inside. The code in this section is taken from <a href='https://www.kaggle.com/dimitreoliveira/flower-classification-with-tpus-eda-and-baseline'>here</a>.

In [3]:
# Sentinel handed to num_parallel_reads / num_parallel_calls in load_dataset.
# NOTE(review): newer TensorFlow releases also expose this as tf.data.AUTOTUNE — confirm
# the target TF version before switching.
AUTOTUNE = tf.data.experimental.AUTOTUNE
# Height and width that decode_image reshapes every decoded image to.
IMAGE_SIZE = [512, 512]

In [4]:
def decode_image(image):
    """Decode JPEG bytes into a float32 RGB tensor with values scaled to [0, 1].

    Args:
        image: Scalar string tensor holding the JPEG-encoded bytes.

    Returns:
        float32 tensor reshaped to ``[*IMAGE_SIZE, 3]``.
    """
    pixels = tf.image.decode_jpeg(image, channels=3)
    scaled = tf.cast(pixels, tf.float32) / 255.0
    # The reshape pins the static shape; it fails if the decoded image does not
    # contain exactly IMAGE_SIZE[0] * IMAGE_SIZE[1] * 3 values.
    return tf.reshape(scaled, [*IMAGE_SIZE, 3])

In [5]:
def read_labeled_tfrecord(example):
    """Parse one serialized labeled record into an ``(image, label)`` pair.

    Args:
        example: Serialized example carrying an ``image`` byte string and an
            int64 ``class`` feature.

    Returns:
        Tuple of the decoded image (see ``decode_image``) and the class id
        cast to int32.
    """
    feature_spec = {
        "image": tf.io.FixedLenFeature([], tf.string),  # raw JPEG bytes
        "class": tf.io.FixedLenFeature([], tf.int64),   # scalar class id
    }
    parsed = tf.io.parse_single_example(example, feature_spec)
    return decode_image(parsed['image']), tf.cast(parsed['class'], tf.int32)

def read_unlabeled_tfrecord(example):
    """Parse one serialized unlabeled record into an ``(image, id)`` pair.

    Args:
        example: Serialized example carrying an ``image`` byte string and a
            string ``id`` feature; there is no ``class`` feature in these records.

    Returns:
        Tuple of the decoded image (see ``decode_image``) and the id string tensor.
    """
    feature_spec = {
        "image": tf.io.FixedLenFeature([], tf.string),  # raw JPEG bytes
        "id": tf.io.FixedLenFeature([], tf.string),     # scalar record identifier
    }
    parsed = tf.io.parse_single_example(example, feature_spec)
    return decode_image(parsed['image']), parsed['id']

In [6]:
def load_dataset(filenames, labeled=True, ordered=False):
    """Build a tf.data pipeline that reads TFRecord files and parses each record.

    Args:
        filenames: TFRecord file path(s) passed to ``tf.data.TFRecordDataset``.
        labeled: When True, records are parsed with ``read_labeled_tfrecord``;
            otherwise with ``read_unlabeled_tfrecord``.
        ordered: When False, deterministic ordering is switched off on the
            dataset options.

    Returns:
        Dataset of ``(image, label)`` or ``(image, id)`` pairs.
    """
    options = tf.data.Options()
    if not ordered:
        options.experimental_deterministic = False

    parse_fn = read_labeled_tfrecord if labeled else read_unlabeled_tfrecord
    return (
        tf.data.TFRecordDataset(filenames, num_parallel_reads=AUTOTUNE)
        .with_options(options)
        .map(parse_fn, num_parallel_calls=AUTOTUNE)
    )

In [7]:
# Collect the training TFRecord shards from the Drive folder the download cell unzips into.
# NOTE(review): absolute Colab/Drive path; glob returns an empty list (no error) when
# nothing matches, e.g. if Drive is not mounted — confirm before running downstream cells.
filenames = glob.glob('/content/drive/MyDrive/Projects/Flower_Classification/input/tfrecords-jpeg-512x512/train/*.tfr*')

In [8]:
# Uses the defaults: labeled=True (image, label pairs) and ordered=False (non-deterministic order).
dataset = load_dataset(filenames)

## Looking at the dataset

In [42]:
def counts(ds):
    """Tally labels across an iterable of ``(image, label)`` elements.

    Args:
        ds: Iterable whose elements expose the label at index 1 as an object
            with a ``.numpy()`` method (e.g. the dataset from ``load_dataset``).

    Returns:
        Tuple ``(num_classes, num_samples, total)`` where ``num_classes`` is the
        set of distinct label values seen, ``num_samples`` is a dict mapping
        each label to its count (keys in first-seen order), and ``total`` is
        the number of elements iterated.
    """
    total = 0
    num_classes = set()
    num_samples = {}
    for element in ds:
        # Convert the label once per element instead of on every lookup.
        label = element[1].numpy()
        num_classes.add(label)
        # dict.get replaces the former broad `except Exception`, which also
        # swallowed unrelated errors and silently reset the count to 1.
        num_samples[label] = num_samples.get(label, 0) + 1
        total += 1

    return num_classes, num_samples, total

In [43]:
# One full pass over the training dataset; each record is parsed and decoded by the
# map step in load_dataset while it is being counted.
num_classes, num_samples, total = counts(dataset)

In [45]:
# num_classes is the set of distinct labels returned by counts, so its length is the class count.
print('Total no of samples: ', total)
print('Total no of classes: ', len(num_classes))

Total no of samples:  12753
Total no of classes:  104


In [65]:
# Bar positions (labels, in the order they were first seen) and bar heights (per-label counts).
x = list(num_samples.keys())
y = list(num_samples.values())

In [84]:
# Map integer label -> class name; the label is the position in CLASSES.
label2class = dict(enumerate(CLASSES))

In [85]:
# `x` holds labels in the order they were first read from the (unordered) dataset,
# so the hover names must be looked up per label; passing CLASSES directly would
# attach names by list position and mislabel the bars.
px.bar(x=x, y=y, color=x, title='Distribution of samples', hover_name=[CLASSES[label] for label in x])

### Observations:


*   The class imbalance is quite apparent, hence we will use ROC AUC as the metric.
*   We will need to use augmentations to increase the number of data samples.



## Looking at some images

In [86]:
# Materialize the first 20 (image, label) pairs as NumPy values for plotting.
samples = [[image.numpy(), label.numpy()] for image, label in dataset.take(20)]

In [87]:
# 5x4 grid of the sampled images, each titled with its class name.
plt.figure(figsize=(20,20))
for idx, (image, label) in enumerate(samples):
  plt.subplot(5,4,idx+1)
  plt.imshow(image)
  plt.axis('off')
  plt.title(label2class[label])
# Lay out once, after every subplot and title exists; calling it inside the loop
# recomputed the layout on each iteration and before the current title was set.
plt.tight_layout(pad=0.2)

Output hidden; open in https://colab.research.google.com to view.

## Looking at augmentations

In [96]:
def data_augment(image, label):
    """Apply random flips, saturation jitter and a random square crop to one image.

    Args:
        image: Image tensor with 3 channels; the crop assumes each spatial side
            is at least as large as the sampled crop size.
        label: Passed through unchanged.

    Returns:
        Tuple of the augmented image, resized back to 512x512, and ``label``.
    """
    full_side = 512
    min_side = int(full_side * .7)
    # Side length of the square crop, sampled as an int32 scalar from [min_side, full_side).
    crop_size = tf.random.uniform([], min_side, full_side, dtype=tf.int32)

    augmented = tf.image.random_flip_left_right(image)
    augmented = tf.image.random_flip_up_down(augmented)
    augmented = tf.image.random_saturation(augmented, lower=0, upper=2)
    augmented = tf.image.random_crop(augmented, size=[crop_size, crop_size, 3])
    # Resize restores the original resolution after the crop.
    augmented = tf.image.resize(augmented, size=[full_side, full_side])

    return augmented, label

In [97]:
# Same 5x4 grid as the raw samples, but each image goes through data_augment first.
plt.figure(figsize=(20,20))
for idx, (image, label) in enumerate(samples):
  plt.subplot(5,4,idx+1)
  plt.imshow(data_augment(image, label)[0])
  plt.axis('off')
  plt.title(label2class[label])
# Lay out once, after every subplot and title exists; calling it inside the loop
# recomputed the layout on each iteration and before the current title was set.
plt.tight_layout(pad=0.2)

Output hidden; open in https://colab.research.google.com to view.

## Conclusion:
For a given image we have a class label. There is a huge class imbalance in the dataset. Also, the flowers are quite easy to classify by the human eye, hence the Bayes error should be low.