In [1]:
# here we link our notebook to our gdrive space
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [2]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import matplotlib.image as img
from PIL import Image
import imageio
import imgaug as ia
import imgaug.augmenters as iaa
import random
import string
import csv
import cv2
import shutil

pd.set_option('max_colwidth', 500)
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

In [3]:
#path of data
base_line='/content/drive/MyDrive/Zindi/makerere-fall-armyworm-crop-challenge/data/'
image_base= base_line+'images/'

In [4]:
# Load files
train = pd.read_csv(os.path.join(base_line)+'Train.csv')
test = pd.read_csv(os.path.join(base_line)+'Test.csv')
samplesubmission = pd.read_csv(os.path.join(base_line)+'SampleSubmission.csv')

In [5]:
# build dict of images
train_images_dict={}

for img_id in train['Image_id']:
    train_images_dict[img_id] = (imageio.imread(image_base+str(img_id)))

In [None]:
test_images_dict={}
for img_id in test['Image_id']:
    test_images_dict[img_id] = (imageio.imread(image_base+str(img_id)))

In [None]:
# build dict of labels
train_labels_dict={}
index = 0
for img_id in train['Image_id']:
    train_labels_dict[img_id] = train['Label'][index] # label
    index += 1

In [None]:
#train_labels_dict.keys()
train_images_dict.keys()

dict_keys(['id_02amazy34fgh2.jpg', 'id_02mh3w48pmyc9.jpg', 'id_02rpb463h9d3w.jpg', 'id_02wc3jeeao8ol.jpg', 'id_03t2hapb8wz8p.jpg', 'id_04xrxyd43rlgz.jpg', 'id_05gqx7h8y6iye.jpg', 'id_06le9g89kqth5.jpg', 'id_082w0qygo3eth.jpg', 'id_086yvmu3nnvbe.jpg', 'id_09v3am3ppzesz.jpg', 'id_0bbnpqpkpbod3.jpg', 'id_0bd0chrhwa88r.jpg', 'id_0d6mm3xcpbqkw.jpg', 'id_0ff2fm63gzxvs.jpg', 'id_0g00jpbynmgfr.jpg', 'id_0gur3rc3nuft9.jpg', 'id_0k7gdyyx5zunr.jpg', 'id_0klqzvk8id4bt.jpg', 'id_0mytnx3vd7ebk.jpg', 'id_0ncnn142aww4v.jpg', 'id_0qm8i331u09jq.jpg', 'id_0qrjnjx2nbbde.jpg', 'id_0t41fhqaaqux9.jpg', 'id_0t76933bwfswn.jpg', 'id_0t7gw6z26pabf.jpg', 'id_0ub75edrc3qvg.jpg', 'id_0unlhj59tv0p4.jpg', 'id_0vn4379vhh52p.jpg', 'id_0wq0ldha65pkf.jpg', 'id_0wtod6711rn7r.jpg', 'id_0xb2nvq2kqdst.jpg', 'id_0yb73f56l5r4f.jpg', 'id_0z6gb99lwj01f.jpg', 'id_0ztqu05sh2a8j.jpg', 'id_10gekjcx26yu7.jpg', 'id_10xmisqifmnpb.jpg', 'id_10zte1vif5nq3.jpg', 'id_11udz3qrokt8s.jpg', 'id_14ck0mycohnrq.jpg', 'id_16cotxnq0qoa6.jpg', 'id_1

In [None]:
# class repartitions
zeros = [x for x in train_labels_dict.values() if x==0]
uns = [x for x in train_labels_dict.values() if x==1]
print("class 0: ", len(zeros))
print("class 1: ", len(uns))
print("diff: ", -len(zeros)+len(uns))

class 0:  810
class 1:  809
diff:  -1


In [6]:
# align all images vertically
def rotate_image(image):
  rotated_image = []
  img_height, img_width, c = image.shape
  if (img_width > img_height):
    rotated_image = cv2.rotate(image, cv2.ROTATE_90_CLOCKWISE)
    return rotated_image
  return image

def preprocessing():
  src = image_base
  dest = base_line + 'processed_dataV2/processed_images/'

  files = os.listdir(src)
  index = 1
  for fn in files:
    print(index, "/1619")
    image = cv2.imread(src+fn, 1)
    # Make a copy of the image
    image_copy = np.copy(image)
    # Change color to RGB (from BGR)
    image_copy = cv2.cvtColor(image_copy, cv2.COLOR_BGR2RGB)

    # step 1: rotate image
    rotated_image = rotate_image(image_copy)

    # step 2: resize image
    resized_image = cv2.resize(rotated_image, (224,224), interpolation = cv2.INTER_CUBIC)

    cv2.imwrite(os.path.join(dest, fn), resized_image)
    index += 1

# split processed data into train, val, test
def split_dataset():
  src = base_line+'processed_dataV2/processed_images/'
  dest_train = base_line+'processed_dataV2/train/'
  dest_val = base_line+'processed_dataV2/val/'
  dest_test = base_line+'processed_dataV2/test/'

  dataset_size = len(train_images_dict.keys()) // 2
  train_size = int(dataset_size * 0.85)
  val_size = int(dataset_size * 0.15)

  # ids of images labeled 0
  tr_zeros = [x for x in train_images_dict.keys() if train_labels_dict[x]==0]
  # ids of images labeled 1
  tr_uns = [x for x in train_images_dict.keys() if train_labels_dict[x]==1]

  train_zeros = tr_zeros[:train_size]
  val_zeros = tr_zeros[train_size:]
  train_uns = tr_uns[:train_size]
  val_uns = tr_uns[train_size:]

  # train set
  for id in train_zeros:
    shutil.copy(src+id, dest_train+'0/')
  for id in train_uns:
    shutil.copy(src+id, dest_train+'1/')

  # val set
  for id in val_zeros:
    shutil.copy(src+id, dest_val+'0/')
  for id in val_uns:
    shutil.copy(src+id, dest_val+'1/')

  # test set
  for id in test_images_dict.keys():
    shutil.copy(src+id, dest_test)

# data augmentation
def augment_data(source, destination):
  for fn in os.listdir(source):
    img = imageio.imread(source+fn)
      
    rotate=iaa.Affine(rotate=(-15, 15))
    rotated_image=rotate.augment_image(img)
    rotated_image6 = Image.fromarray(rotated_image)

    shear = iaa.Affine(shear=(-0.2,0.2))
    shear_image=shear.augment_image(img)
    shear_image6 = Image.fromarray(shear_image)

    scalex = iaa.Affine(scale={"x": (0.5, 1.5), "y": 1.0})
    scalex_image=scalex.augment_image(img)
    scalex_image6 = Image.fromarray(scalex_image)

    scaley = iaa.Affine(scale={"x": 1.0, "y": (0.5, 1.5)})
    scaley_image=scaley.augment_image(img)
    scaley_image6 = Image.fromarray(scaley_image)

    flip_hr=iaa.Fliplr(p=1.0)
    flip_hr_image= flip_hr.augment_image(img)
    flip_hr_image6 = Image.fromarray(flip_hr_image)
    
    #img_transform1 = [rotated_image, shear_image, scalex_image, scaley_image]
    #img_transform2 = [flip_hr_image]
    img_transform1 = []
    img_transform2 = [scalex_image]

    # apply these random transforms 5 times on dataset
    for _ in range(5):
      for transf in  img_transform1:
        letters = string.ascii_uppercase
        rnd = ''.join(random.choice(letters) for i in range(10))
        fn = 'IDAUG_'+rnd
        cv2.imwrite(os.path.join(destination, fn+'.jpg'), transf)
          
    # apply this random tansform once    
    for transf in  img_transform2:
      letters = string.ascii_uppercase
      rnd = ''.join(random.choice(letters) for i in range(10))
      fn = 'IDAUG_'+rnd
      cv2.imwrite(os.path.join(destination, fn+'.jpg'), transf)

In [None]:
#preprocessing()
#split_dataset()

In [7]:
source = base_line+'processed_dataV2/train/0/'
destination = base_line+'processed_dataV2/train_aug_2k/0/'
augment_data(source, destination)

source = base_line+'processed_dataV2/train/1/'
destination = base_line+'processed_dataV2/train_aug_2k/1/'
augment_data(source, destination)

In [11]:
source = base_line+'processed_dataV2/train_aug/0/'
print(len(os.listdir(source)))

14427


In [9]:
source = base_line+'processed_dataV2/train/0/'
dest = base_line+'processed_dataV2/train_aug_2k/0/'
for fn in os.listdir(source):
  shutil.copy(source+fn, dest+fn)

source = base_line+'processed_dataV2/train/1/'
dest = base_line+'processed_dataV2/train_aug_2k/1/'
for fn in os.listdir(source):
  shutil.copy(source+fn, dest+fn)