In [None]:
# Mount Google Drive into the Colab filesystem. The later cells read the
# annotation CSVs from, and save the images to, paths under this mount point.
from google.colab import drive
drive.mount("/content/gdrive")

Mounted at /content/gdrive


In [None]:
import os # to build file paths and clean up partial files
import shutil # to save it

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests # to get image from the web
from sklearn.model_selection import train_test_split

In [None]:
# ls gdrive/MyDrive/project_6/images/train/

## Step 1. Function to Download Images

### 1.1. Function

In [None]:
# Credit: https://towardsdatascience.com/how-to-download-an-image-using-python-38a75cfa21c

# Function to download images

def get_images(base_url, img_id, folder_dir, timeout=(10, 60)):
    """
    Download one image and save it under its original file name.

    Input:
            base_url: address to Azure blob container where images are stored
            img_id: image id (path of the image inside the container)
            folder_dir: file path to a directory where we want to save images to
            timeout: (connect, read) timeout in seconds handed to requests.
                     Without a timeout a stalled connection blocks forever.
    Output:
            image saved to the folder_dir; a status line is printed.
            Returns None in every case.
    """

    # Set up image URL
    image_url = base_url + img_id

    # Setup file path on local machine.
    # os.path.join also works when folder_dir has no trailing slash.
    file_name = img_id.split("/")[-1]
    file_path = os.path.join(folder_dir, file_name)

    # Download into a temporary sibling file first, so an interrupted transfer
    # never leaves a truncated image under the final name.
    tmp_path = file_path + ".part"

    try:
        # Stream the body instead of loading the whole image into memory.
        # The context manager releases the connection even on failure.
        with requests.get(image_url, stream=True, timeout=timeout) as r:
            # Check if the image was retrieved successfully
            if r.status_code != 200:
                print('Image Couldn\'t be retreived', file_name)
                return

            # iter_content decodes any transfer encoding and reports
            # mid-transfer network problems as requests exceptions.
            with open(tmp_path, 'wb') as f:
                for chunk in r.iter_content(chunk_size=1024 * 1024):
                    f.write(chunk)

        # Transfer finished: move the complete file to its final name.
        os.replace(tmp_path, file_path)
    except requests.exceptions.RequestException as err:
        # A network failure (timeout, dropped connection, ...) is reported
        # like a bad status code so that a download loop can carry on.
        print('Image Couldn\'t be retreived', file_name, '-', err)
        return
    finally:
        # Remove a leftover partial file; after a successful move there is none.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)

    print('Image sucessfully Downloaded: ', file_name)
    return

### 1.2. Test the Function

In [None]:
# Provide the base URL and image id
base_url = "https://lilablobssc.blob.core.windows.net/noaa-kotz/"
img_id = "Images/fl04/CENT/test_kotz_2019_fl04_C_20190510_000310.667291_rgb.jpg"

# Provide target directory.
# Use the absolute path under the Drive mount point (as the later download
# cells do) so the call does not depend on the current working directory.
train_folder_dir = "/content/gdrive/MyDrive/project_6/images/train/"

# Download and save the image
get_images(base_url, img_id, train_folder_dir)

Image sucessfully Downloaded:  test_kotz_2019_fl04_C_20190510_000310.667291_rgb.jpg


## Step 2. Download Annotated Images

### 2.1. List of Images

In [None]:
# Annotation table: 1 bounding box per row
# 4,113 rgb images with 14,311 bounding boxes
# Read through the absolute Drive mount path (as the other cells do) so the
# load does not depend on the current working directory.
annot_images = pd.read_csv("/content/gdrive/MyDrive/project_6/csv_files/surv_test_kamera_detections_20210212_full_paths.csv")

In [None]:
# Column dtypes and non-null counts (the ir_* columns are missing for some rows),
# followed by the first two rows as a preview.
annot_images.info()
annot_images.head(2)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14311 entries, 0 to 14310
Data columns (total 18 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   flight           14311 non-null  object 
 1   camera_view      14311 non-null  object 
 2   dt               14311 non-null  object 
 3   detection_id     14311 non-null  object 
 4   detection_type   14311 non-null  object 
 5   detection_score  14311 non-null  float64
 6   rgb_image_name   14311 non-null  object 
 7   rgb_left         14311 non-null  int64  
 8   rgb_right        14311 non-null  int64  
 9   rgb_top          14311 non-null  int64  
 10  rgb_bottom       14311 non-null  int64  
 11  ir_image_name    14088 non-null  object 
 12  ir_left          14088 non-null  float64
 13  ir_right         14088 non-null  float64
 14  ir_top           14088 non-null  float64
 15  ir_bottom        14088 non-null  float64
 16  rgb_image_path   14311 non-null  object 
 17  ir_image_pat

Unnamed: 0,flight,camera_view,dt,detection_id,detection_type,detection_score,rgb_image_name,rgb_left,rgb_right,rgb_top,rgb_bottom,ir_image_name,ir_left,ir_right,ir_top,ir_bottom,rgb_image_path,ir_image_path
0,fl04,C,20190510_000310.667291,test_kamera_fl04_C_81,ringed_seal,0.986021,test_kotz_2019_fl04_C_20190510_000310.667291_r...,3403,3471,474,418,test_kotz_2019_fl04_C_20190510_000310.667291_i...,344.486,350.924,73.9964,68.676,Images/fl04/CENT/test_kotz_2019_fl04_C_2019051...,Images/fl04/CENT/test_kotz_2019_fl04_C_2019051...
1,fl04,C,20190510_000311.417294,test_kamera_fl04_C_82,ringed_seal,0.723258,test_kotz_2019_fl04_C_20190510_000311.417294_r...,3114,3176,3534,3484,test_kotz_2019_fl04_C_20190510_000311.417294_i...,315.49,321.359,363.045,358.294,Images/fl04/CENT/test_kotz_2019_fl04_C_2019051...,Images/fl04/CENT/test_kotz_2019_fl04_C_2019051...


In [None]:
# Get all the unique rgb image paths.
# An image with several detections appears on several rows, so de-duplicate
# to get one entry per image: there are 4,113 annotated rgb images.
images_list = annot_images["rgb_image_path"].unique()
images_list.shape

(4113,)

In [None]:
# Use 1,000 images for train; the remaining 3,113 end up in `test`.
# The split is done on unique image paths rather than on bounding-box rows,
# so all boxes of one image land in the same subset.
# random_state is fixed so a re-run reproduces the same subsets.
train, test = train_test_split(images_list, train_size = 1000, random_state=42)
train.shape, test.shape

((1000,), (3113,))

In [None]:
# Keep every bounding-box row that belongs to one of the train images
# (1 bounding box per row): 1,000 images with 3,509 bounding boxes
in_train = annot_images["rgb_image_path"].isin(train)
annot_train = annot_images.loc[in_train]
annot_train.shape

(3509, 18)

In [None]:
# Keep every bounding-box row that belongs to one of the test images:
# 3,113 images with 10,802 bounding boxes
in_test = annot_images["rgb_image_path"].isin(test)
annot_test = annot_images.loc[in_test]
annot_test.shape

(10802, 18)

In [None]:
# Save both annotation subsets to csv files on Drive.
# index=False keeps the DataFrame row index out of the files.
annot_train.to_csv("/content/gdrive/MyDrive/project_6/csv_files/annot_train.csv", index=False)
annot_test.to_csv("/content/gdrive/MyDrive/project_6/csv_files/annot_test.csv", index=False)

### 2.2. Download 1,000 Train Images

In [None]:
# Provide the base URL (the image ids come from the `train` split)
base_url = "https://lilablobssc.blob.core.windows.net/noaa-kotz/"

# Provide target directory
train_folder_dir = "/content/gdrive/MyDrive/project_6/images/train/"

# Download every image of the 1,000-image train split into the target
# directory; get_images prints one status line per image.
for img_id in train:
  get_images(base_url, img_id, train_folder_dir)

All 1,000 images have been successfully downloaded. The first 5 out of 1,000 output messages are shown below.

Image sucessfully Downloaded:  polar_bear_2019_fl07_C_20190512_002853.877478_rgb.jpg

Image sucessfully Downloaded:  test_kotz_2019_fl05_L_20190510_054244.070383_rgb.jpg

Image sucessfully Downloaded:  test_kotz_2019_fl04_C_20190510_002748.161252_rgb.jpg

Image sucessfully Downloaded:  polar_bear_2019_fl07_C_20190511_232943.930450_rgb.jpg

Image sucessfully Downloaded:  test_kotz_2019_fl04_C_20190510_010217.050262_rgb.jpg

### 2.3. Download another 3,000 Train Images

In [None]:
# Reload the full annotation table (1 bounding box per row).
# Read through the absolute Drive mount path (as the other cells do) so the
# load does not depend on the current working directory.
annot_images = pd.read_csv("/content/gdrive/MyDrive/project_6/csv_files/surv_test_kamera_detections_20210212_full_paths.csv")

In [None]:
# Reload the annotations that were left out of the first 1,000-image train
# split (saved earlier in this notebook as annot_test.csv), then show the
# column summary and the first rows.
annot_test = pd.read_csv("/content/gdrive/MyDrive/project_6/csv_files/annot_test.csv")
annot_test.info()
annot_test.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10802 entries, 0 to 10801
Data columns (total 18 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   flight           10802 non-null  object 
 1   camera_view      10802 non-null  object 
 2   dt               10802 non-null  object 
 3   detection_id     10802 non-null  object 
 4   detection_type   10802 non-null  object 
 5   detection_score  10802 non-null  float64
 6   rgb_image_name   10802 non-null  object 
 7   rgb_left         10802 non-null  int64  
 8   rgb_right        10802 non-null  int64  
 9   rgb_top          10802 non-null  int64  
 10  rgb_bottom       10802 non-null  int64  
 11  ir_image_name    10670 non-null  object 
 12  ir_left          10670 non-null  float64
 13  ir_right         10670 non-null  float64
 14  ir_top           10670 non-null  float64
 15  ir_bottom        10670 non-null  float64
 16  rgb_image_path   10802 non-null  object 
 17  ir_image_pat

Unnamed: 0,flight,camera_view,dt,detection_id,detection_type,detection_score,rgb_image_name,rgb_left,rgb_right,rgb_top,rgb_bottom,ir_image_name,ir_left,ir_right,ir_top,ir_bottom,rgb_image_path,ir_image_path
0,fl04,C,20190510_000310.667291,test_kamera_fl04_C_81,ringed_seal,0.986021,test_kotz_2019_fl04_C_20190510_000310.667291_r...,3403,3471,474,418,test_kotz_2019_fl04_C_20190510_000310.667291_i...,344.486,350.924,73.9964,68.676,Images/fl04/CENT/test_kotz_2019_fl04_C_2019051...,Images/fl04/CENT/test_kotz_2019_fl04_C_2019051...
1,fl04,C,20190510_000443.666902,test_kamera_fl04_C_223,ringed_seal,0.993174,test_kotz_2019_fl04_C_20190510_000443.666902_r...,6114,6180,2413,2360,test_kotz_2019_fl04_C_20190510_000443.666902_i...,598.734,604.982,258.363,253.327,Images/fl04/CENT/test_kotz_2019_fl04_C_2019051...,Images/fl04/CENT/test_kotz_2019_fl04_C_2019051...
2,fl04,C,20190510_000503.166834,test_kamera_fl04_C_288,ringed_seal,0.996813,test_kotz_2019_fl04_C_20190510_000503.166834_r...,1635,1696,1742,1703,test_kotz_2019_fl04_C_20190510_000503.166834_i...,177.209,182.977,193.083,189.372,Images/fl04/CENT/test_kotz_2019_fl04_C_2019051...,Images/fl04/CENT/test_kotz_2019_fl04_C_2019051...
3,fl04,C,20190510_000503.166834,test_kamera_fl04_C_289,ringed_seal,0.752098,test_kotz_2019_fl04_C_20190510_000503.166834_r...,1596,1653,1869,1796,test_kotz_2019_fl04_C_20190510_000503.166834_i...,173.461,178.873,205.067,198.144,Images/fl04/CENT/test_kotz_2019_fl04_C_2019051...,Images/fl04/CENT/test_kotz_2019_fl04_C_2019051...
4,fl04,C,20190510_000539.916845,test_kamera_fl04_C_307,ringed_seal,0.998974,test_kotz_2019_fl04_C_20190510_000539.916845_r...,115,157,3260,3170,test_kotz_2019_fl04_C_20190510_000539.916845_i...,33.1479,37.1564,335.893,327.371,Images/fl04/CENT/test_kotz_2019_fl04_C_2019051...,Images/fl04/CENT/test_kotz_2019_fl04_C_2019051...


In [None]:
# Number of distinct images left after the 1,000 train images were taken out
# (3,113); 3,000 more of them will be used for train.
# dropna=False counts exactly what len(unique()) would count.
annot_test["rgb_image_path"].nunique(dropna=False)

3113

In [None]:
# Use another 3,000 images for train; only 113 images are left over
remaining_images = annot_test["rgb_image_path"].unique()
train3000, test113 = train_test_split(
    remaining_images,
    train_size=3000,
    random_state=42,
)
train3000.shape, test113.shape

((3000,), (113,))

In [None]:
# Keep every bounding-box row that belongs to one of the 3,000 extra train
# images (1 bounding box per row): 3,000 images with 10,328 bounding boxes
in_train3000 = annot_images["rgb_image_path"].isin(train3000)
annot_train3000 = annot_images.loc[in_train3000]
annot_train3000.shape

(10328, 18)

In [None]:
# Keep every bounding-box row that belongs to one of the held-out images:
# 113 images with 474 bounding boxes
in_test113 = annot_images["rgb_image_path"].isin(test113)
annot_test113 = annot_images.loc[in_test113]
annot_test113.shape

(474, 18)

In [None]:
# Save the 3,000-image train annotations and the 113-image leftover
# annotations to csv files on Drive.
# index=False keeps the DataFrame row index out of the files.
annot_train3000.to_csv("/content/gdrive/MyDrive/project_6/csv_files/annot_train3000.csv", index=False)
annot_test113.to_csv("/content/gdrive/MyDrive/project_6/csv_files/annot_test113.csv", index=False)

In [None]:
# Provide the base URL (the image ids come from the `train3000` split)
base_url = "https://lilablobssc.blob.core.windows.net/noaa-kotz/"

# Provide target directory (same folder as the first 1,000 train images)
train_folder_dir = "/content/gdrive/MyDrive/project_6/images/train/"

# Download every image of the 3,000-image split; get_images prints one
# status line per image.
for img_id in train3000:
  get_images(base_url, img_id, train_folder_dir)

# The download got stuck in the middle of this loop.
# NOTE(review): a stall like this is what an HTTP request with no timeout in
# effect looks like when the server stops responding. Confirm that get_images
# passes a timeout to requests.get.
# Re-running this cell starts again from the first image of train3000.

### 2.4. Download 113 Images for Evaluation