In [None]:
import os
import os.path as osp
import cv2 as cv
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from glob import glob
from datetime import datetime

In [None]:
# Root directory of the CCIW data tree. The DATA_PATH environment variable
# must be set; os.environ[...] raises KeyError otherwise.
DATA_PATH = osp.join(os.environ['DATA_PATH'], 'cciw/Data')

In [None]:
# Collect every quadrat still image (*.jpg) found below the GLNI directory of DATA_PATH.
glni_image_pattern = os.path.join(DATA_PATH,'Videos_and_stills/GLNI/*/*/*/Images/Quad*/*.jpg')
all_images = glob(glni_image_pattern)
# Alternative directory layout, kept for reference:
#all_images = glob(os.path.join(DATA_PATH,'Videos_and_stills/GLNI/*/*/May.*/Stills/Quad*/*.jpg'))

In [None]:
# Load the image table; the first CSV column is used as the row index.
imagetable_path = osp.join(DATA_PATH, 'Tables', 'ImageTable.csv')
image_df = pd.read_csv(
    imagetable_path,
    index_col=0,
)

In [None]:
# Paths of the analysis and dive tables.
analysis_path, dive_path = (
    osp.join(DATA_PATH, 'Tables', csv_name)
    for csv_name in ('Analysis.csv', 'Dives.csv')
)

# 'Count' is read as float; 'Date' is parsed to datetime so the .dt accessor
# can be used on it later.
analysis_df = pd.read_csv(analysis_path, index_col=0, dtype={'Count': float})
dive_df = pd.read_csv(dive_path, index_col=0, parse_dates=['Date'])

# Outer join on 'Dive Index': rows without a partner in the other table are kept.
data_df = analysis_df.merge(dive_df, on='Dive Index', how='outer')
data_df.columns

In [None]:
# Number of dives (non-null 'PSN' entries) per year-month.
# NOTE: this adds a 'Year-Month' column to dive_df, which later cells filter on.
dive_df['Year-Month'] = dive_df['Date'].dt.strftime('%Y-%m')
dive_df_by_year_month = dive_df.groupby('Year-Month')['PSN'].count()
# Re-key the counts with datetime objects instead of 'YYYY-MM' strings.
dive_df_by_year_month.index = list(
    map(lambda ym: datetime.strptime(ym, '%Y-%m'), dive_df_by_year_month.index)
)
#dive_df_by_year_month

In [None]:
# Month to process, formatted as 'YYYY-MM'.
YM = '2017-07'

# Directory that receives the cropped images for this month.
save_path = os.path.join(os.environ['DATA_PATH'], 'cciw/dataset_raw/Train', YM + '/port/')
print(save_path)

# Index labels of the dives whose 'Year-Month' equals YM.
in_selected_month = dive_df['Year-Month'].values == YM
dive_idx = dive_df[in_selected_month].index #['Dive Index']
print(dive_idx)

In [None]:
# Keep only the merged rows that belong to the dives selected for YM.
dive_mask = data_df['Dive Index'].isin(dive_idx)
relevant_dives_df = data_df.loc[dive_mask]

In [None]:
# Analysis indices of the rows selected for the month configured in YM
# (NumPy array taken from the 'Analysis Index' column).
indices = relevant_dives_df['Analysis Index'].values

In [None]:
# Images whose 'Analysis Index' is among the selected analysis indices.
image_mask = image_df['Analysis Index'].isin(indices)
relevant_images_df = image_df[image_mask]
# File names of those images (still a pandas Series at this point).
fnames = relevant_images_df['Name']
#fnames

In [None]:
# Base directory of the GLNI stills; image paths are built later by string
# concatenation onto this value, so the trailing '/' matters.
root = os.path.join(DATA_PATH,'Videos_and_stills/GLNI/')

In [None]:
# Sorted list of the image names selected for YM.
# The list is rebuilt from relevant_images_df rather than from fnames itself so
# the cell can be re-run: `fnames.to_list()` fails once fnames already is a
# plain list, and re-running also restores names removed by later filtering.
fnames = sorted(relevant_images_df['Name'].to_list())
print(len(fnames))

In [None]:
# Only keep the images with the highest suffix number (highest quality).
# The previous version removed items from fnames while iterating over it, which
# skips elements (hence its "run twice" note) and hid every error behind a bare
# except. Here the superseded names are collected first and filtered out in one
# deterministic pass, so a single run is enough and re-running changes nothing.
def _lower_numbered_names(name):
    """Return the '.nef' names that share `name`'s stem but carry a smaller suffix number.

    `name` is parsed as '<t0>_<t1>_<t2>_<stem>-<number>.<ext>'; the result lists
    '<t0>_<t1>_<t2>_<stem>-<k>.nef' for every k in 1 .. number-1 (numbers may be
    non-contiguous in practice, so all lower values are generated).
    """
    t = name.split('.')[0].split('_')
    img_nb = int(t[-1].split('-')[1])
    stem = t[0] + '_' + t[1] + '_' + t[2] + '_' + t[3].split('-')[0] + '-'
    return [stem + str(k) + '.nef' for k in range(1, img_nb)]

superseded = set()
for f in fnames:
    superseded.update(_lower_numbered_names(f))

# Build a new list instead of mutating fnames in place.
fnames = [f for f in fnames if f not in superseded]
print(len(fnames))

In [None]:
#fnames[0].split('/')

In [None]:
# Translate the month number of YM into the abbreviated month used in the
# image directory names. Extend the table when other months are needed.
MONTH_ABBREVIATIONS = {'06': 'Jun.', '07': 'Jul.', '08': 'Aug.'}

month_number = YM.split('-')[1]
if month_number not in MONTH_ABBREVIATIONS:
    # Fail loudly: merely printing 'invalid' left `month` undefined on a fresh
    # kernel, or silently reused the value of a previous run.
    raise ValueError('invalid month %r in YM=%r; supported: %s'
                     % (month_number, YM, sorted(MONTH_ABBREVIATIONS)))
month = MONTH_ABBREVIATIONS[month_number]
print('got month ', month)

In [None]:
#from tqdm import tqdm_notebook

In [None]:
# Record (position in fnames, pixel height) for every image so that portrait and
# landscape shots can be separated afterwards.
year = YM.split('-')[0]  # year directory follows YM instead of a hard-coded '2017'
dim = []
for i, f in enumerate(fnames):
    t = f.split('.')[0].split('_')
    PSN = t[1].split('-')[0]
    QUAD = t[1].split('-')[1]
    # Layout: <root><PSN>/<year>/<month><day>/Images/Quad<QUAD>/<name>.jpg
    file = root + PSN + '/' + year + '/' + month + t[2].split('-')[-1] + '/Images/Quad' + QUAD + '/' + f.split('.')[0] + '.jpg'
    im = cv.imread(file)
    if im is None:
        # cv.imread signals a missing or unreadable file by returning None.
        raise FileNotFoundError('could not read image: ' + file)
    dim.append((i, im.shape[0]))
dim = np.asarray(dim)

In [None]:
# Tally the distinct image heights and how often each one occurs.
vals, cts = np.unique(dim[:, 1], return_counts=True)
for summary in (vals, cts, cts.sum()):
    print(summary)

# Positions in fnames split by image height:
# 7378 rows -> portrait shots, 4924 rows -> landscape shots.
frame_ids, heights = dim[:, 0], dim[:, 1]
port_mode = frame_ids[heights == 7378]
land_mode = frame_ids[heights == 4924]

In [None]:
# Names of the landscape-oriented images.
landscape = [fnames[k] for k in land_mode]
print(len(landscape))
#landscape

In [None]:
# Names of the portrait-oriented images; the list is displayed as the cell output.
portrait = [fnames[k] for k in port_mode]
print(len(portrait))
portrait

In [None]:
# Show one portrait image together with the analysis values recorded for it.
i = 25
f = portrait[i]

# Analysis index (or indices) stored for this image name.
img_idx = relevant_images_df.loc[relevant_images_df['Name'] == f]['Analysis Index']
# Match on the 'Analysis Index' column, as the other cells do. The previous
# `.loc[img_idx]` looked the values up as row labels of the merged frame instead.
analysis_rows = relevant_dives_df[relevant_dives_df['Analysis Index'].isin(img_idx)]
if analysis_rows.empty:
    raise LookupError('no analysis row found for image ' + f)
biomass = analysis_rows['Biomass'].values
count = analysis_rows['Count'].values
live_cv = analysis_rows['Live Coverage'].values

t = f.split('.')[0].split('_')
PSN, QUAD = t[1].split('-')[0], t[1].split('-')[1]
year = YM.split('-')[0]  # year directory follows YM instead of a hard-coded '2017'
file = root + PSN + '/' + year + '/' + month + t[2].split('-')[-1] + '/Images/Quad' + QUAD + '/' + f.split('.')[0] + '.jpg'
im = cv.imread(file)
if im is None:
    # cv.imread signals a missing or unreadable file by returning None.
    raise FileNotFoundError('could not read image: ' + file)
# OpenCV loads BGR; matplotlib expects RGB.
rgb = cv.cvtColor(im, cv.COLOR_BGR2RGB)
plt.figure(figsize=(14, 12))
plt.imshow(rgb)

# Format scalars (first matching row), not whole arrays: formatting an array
# only works while it holds exactly one element.
title_str = f + ',  Biomass %.1f, Count %d, Live Coverage %.1f%%' % (biomass[0], count[0], live_cv[0])
plt.title(title_str)
plt.show()

In [None]:
# Crop a 3000 x 3000 pixel window out of the currently loaded image `im`.
# for portrait mode (7378, 4924)
y_start = 2200
x_start = 1050

# for landscape mode (4924, 7378)
#y_start = 1000
#x_start = 2200
y_end = y_start + 3000
x_end = x_start + 3000
print(x_end - x_start)
print(y_end - y_start)
# imc stays in OpenCV's BGR channel order because it is later passed to
# cv.resize / cv.imwrite.
imc = im[y_start:y_end, x_start:x_end, :]
plt.figure(figsize=(12, 12))
# Convert to RGB for display only; showing the BGR data directly swaps the
# red and blue channels.
plt.imshow(cv.cvtColor(imc, cv.COLOR_BGR2RGB))

In [None]:
# Downscale the crop and write it to save_path as '<name>_crop.jpg'.
# scale_percent is defined here because this cell runs before any later cell
# that sets it, so a fresh "Restart & Run All" would otherwise hit a NameError.
scale_percent = 75  # percent of original size

# NumPy shape is (rows, cols) = (height, width); cv.resize expects dsize=(width, height).
width = int(imc.shape[1] * scale_percent / 100)
height = int(imc.shape[0] * scale_percent / 100)
imcr = cv.resize(imc, (width, height)) # resize image

# cv.imwrite does not create directories and only reports failure through its
# return value, so create the target directory and check the result.
os.makedirs(save_path, exist_ok=True)
crop_file = os.path.join(save_path, f.split('.')[0] + '_crop.jpg')
if not cv.imwrite(crop_file, imcr):
    raise OSError('failed to write ' + crop_file)

In [None]:
# Display the output directory as the cell result.
save_path

In [None]:
# Resize factor applied to the cropped image before it is saved.
scale_percent = 75 # percent of original size

In [None]:
"""
# for landscape mode (4924, 7378)
x_start, x_end = 2100, 5100
#x_start, x_end = 2200, 5200
#y_start, y_end = 800, 3800
y_start, y_end = 1000, 4000
"""

In [None]:
#%matplotlib inline
#i = 2
#root_fname = fnames.values[portrait_mode][i].split('/')[-1].split('.')[0]
#guid = image_df[image_df['Name'].str.contains(root_fname)]['Analysis Index'].astype('int64')
#data_df[data_df['Analysis Index'].values == guid.values]

# Analysis indices of every image whose name contains the given fragment.
# regex=False treats the fragment literally; na=False skips missing names.
guid = image_df[image_df['Name'].str.contains('3801-1_2018-08', regex=False, na=False)]['Analysis Index'].astype('int64')
# Use isin() so the lookup also works when several distinct indices match; the
# element-wise `values == np.unique(...)` comparison only works for exactly one.
data_df[data_df['Analysis Index'].isin(guid.unique())]