# For plotting statistics from the lab analysis only

This notebook should be self-contained. If you are running it locally, you may need to change `DATA_PATH` so that it can find the data tables.

In [None]:
import csv
import sys

import os
import os.path as osp

import glob

import numpy as np

# for comparing predictions to lab analysis data frames
import pandas as pd

# Plotting backend. The 'text.usetex' option is switched on so that figure
# text is typeset with LaTeX (presumably requires a local LaTeX install --
# confirm on the target machine) and the font family is set to serif.
import matplotlib
# Set once on the matplotlib module before pyplot is imported ...
matplotlib.rc('text', usetex=True)
import matplotlib.pyplot as plt
# ... and once more through the pyplot interface (same option, same value).
plt.rc('text', usetex=True)
plt.rc('font', family='serif')

# evaluation metrics
from sklearn.metrics import r2_score

from plot_utils import *

# True when the 'google.colab' module has already been loaded into this
# interpreter; used below to choose where the data tables are read from.
IN_COLAB = 'google.colab' in sys.modules

In [None]:
'''
root_path = '/scratch/ssd/gallowaa/cciw/dataset_raw/Test/Lab/done/'
jpeg_files = glob.glob(root_path + '*.jpg')
jpeg_files.sort()
# Should equal 40
print(len(jpeg_files)) 

csvfile = 'lab-v100-originals-files.csv'

with open(csvfile, 'w') as f:
    csvwriter = csv.writer(f, delimiter=',')#, #escapechar='', quoting=csv.QUOTE_NONE)
    for i in range(len(jpeg_files)):
        csvwriter.writerow([jpeg_files[i].split('/')[-1]])
'''

The list of files is now stored in a CSV file in this repo for portability.

In [None]:
# Load the lab image file names (first column of each row) from the CSV
# checked into this repo.
csvfile = 'lab-v100-originals-files.csv'

# newline='' is what the csv module documents for files handed to csv.reader.
with open(csvfile, 'r', newline='') as f:
    # Blank rows are skipped: indexing row[0] on an empty row would raise
    # IndexError.
    jpeg_files = [
        row[0]
        for row in csv.reader(f, delimiter=',', quotechar='|')
        if row
    ]

print(len(jpeg_files))

In [None]:
# Decide where the data lives: the DATA_PATH environment variable for local
# runs, or a mounted Google Drive folder when running on Colab.
if not IN_COLAB:
    DATA_PATH = osp.join(os.environ['DATA_PATH'], 'cciw/Data')
else:
    from google.colab import drive
    drive.mount('/content/drive')
    DATA_PATH = r'/content/drive/My Drive/Data'

# All three tables sit in the 'Tables' sub-directory of DATA_PATH.
imagetable_path, analysis_path, dive_path = (
    osp.join(DATA_PATH, 'Tables', table_name)
    for table_name in ('ImageTable.csv', 'Analysis.csv', 'Dives.csv')
)

# First CSV column is used as the index of every table.
image_df = pd.read_csv(imagetable_path, index_col=0)
analysis_df = pd.read_csv(analysis_path, index_col=0, dtype={'Count': float})
dive_df = pd.read_csv(dive_path, index_col=0, parse_dates=['Date'])

# Outer join on 'Dive Index' keeps rows that appear in only one of the tables.
data_df = pd.merge(analysis_df, dive_df, how='outer', on='Dive Index')

In [None]:
# Columns of the merged table used in this notebook: the four summary
# measurements followed by the per-sieve columns.
summary_columns = ['Live Coverage', 'Empty Coverage', 'Biomass', 'Count']
sieve_columns = ['16mm', '14mm', '12.5mm', '10mm', '8mm', '6.3mm', '4mm', '2mm']

Y = data_df[summary_columns + sieve_columns]
Y

In [None]:
# Base font size passed to the axis-label, title and sub-label calls in the
# plotting cells below.
fontsize = 16

# Subplot layout fractions. NOTE(review): none of these six values is
# referenced in the visible cells -- presumably meant for
# `fig.subplots_adjust`; confirm before removing.
left = 0.02  # the left side of the subplots of the figure
right = 0.98   # the right side of the subplots of the figure
bottom = 0.05  # the bottom of the subplots of the figure
top = 0.95     # the top of the subplots of the figure
wspace = 0.15  # the amount of width reserved for space between subplots,
# expressed as a fraction of the average axis width
hspace = 0.1  # the amount of height reserved for space between subplots,
# expressed as a fraction of the average axis height

# Column indices into the `lab_targets` arrays built in later cells.
BIOMASS_IDX = 0
COUNT_IDX = 1
PRED_COUNT_IDX = 2

In [None]:
# One row per lab image. Columns: BIOMASS_IDX (0) = biomass, COUNT_IDX (1) =
# count, PRED_COUNT_IDX (2) = biomass re-weighted by the sieve size
# distribution (computed in the loop below).
lab_targets = np.zeros((len(jpeg_files), 3))

# Sieve column names in `data_df` and the matching numeric values, in the
# same order.
names = ['16mm', '14mm', '12.5mm', '10mm', '8mm', '6.3mm', '4mm', '2mm']
sieves = np.array([16, 14, 12.5, 10, 8, 6.3, 4, 2])

for i, jpeg_file in enumerate(jpeg_files):

    # Reduce the file name to the part between 'Lab_' and '_image' (extension
    # dropped); that fragment is matched against the image table's 'Name'.
    root_fname = jpeg_file.split('/')[-1].split('.')[0].split('_image')[0].split('Lab_')[1]
    guid = image_df[image_df['Name'].str.contains(root_fname)]['Analysis Index'].astype('int64')
    row = data_df[data_df['Analysis Index'].values == np.unique(guid.values)]

    # `.item()` extracts the single matching value explicitly. Assigning a
    # size-1 array into a scalar slot is deprecated since NumPy 1.25; like
    # that assignment, `.item()` raises ValueError unless exactly one row
    # matched.
    lab_targets[i, BIOMASS_IDX] = row['Biomass'].values.item()
    lab_targets[i, COUNT_IDX] = row['Count'].values.item()

    # Sieve columns of the matched row as a flat vector; the reshape fails
    # loudly if the match did not yield exactly one row.
    size_dist = row[names].values.astype(float).reshape(len(names))

    # Sum over sieves of biomass * size fraction * 2 / sieve value.
    lab_targets[i, PRED_COUNT_IDX] = (lab_targets[i, BIOMASS_IDX] * size_dist * 2 / sieves).sum()

# Missing measurements are replaced by zero before normalisation.
lab_targets[np.isnan(lab_targets)] = 0

# Scale every quantity by its maximum so all plotted axes span [0, 1].
biomass = lab_targets[:, BIOMASS_IDX] / lab_targets[:, BIOMASS_IDX].max()
a_count = lab_targets[:, COUNT_IDX] / lab_targets[:, COUNT_IDX].max()
p_count = lab_targets[:, PRED_COUNT_IDX] / lab_targets[:, PRED_COUNT_IDX].max()

In [None]:
# Two panels sharing both axes: a) count against biomass, b) count against
# the sieve-corrected quantity. Every input was max-normalised above.
fig, ax = plt.subplots(1, 2, figsize=(8, 4), sharex=True, sharey=True)
left_ax, right_ax = ax[0], ax[1]
panel_x = (biomass, p_count)
open_circle = dict(marker='o', s=40, facecolors='none', edgecolors='k')

left_ax.scatter(biomass, a_count, **open_circle)
left_ax.set_ylabel('Count', fontsize=fontsize)
left_ax.set_xlabel('Biomass (g)', fontsize=fontsize)

right_ax.scatter(p_count, a_count, **open_circle)
right_ax.set_xlabel('Biomass (g) \n corrected by sieve', fontsize=fontsize)

# The plot_utils helpers are applied panel a) first, then panel b), one
# helper at a time.
for axis, x_values in zip(ax, panel_x):
    draw_lines(axis, x_values, a_count)

for axis, x_values in zip(ax, panel_x):
    draw_rsquared(axis, a_count, x_values, fontsize)

for axis, tag in zip(ax, (r'\textbf{a)}', r'\textbf{b)}')):
    draw_sublabel(axis, tag, fontsize)

for axis in ax:
    pretty_axis(axis, fontsize)

plt.tight_layout()

In [None]:
#fname = 'lab_count_from_biomass'
#fig.savefig(fname + '.png')
#fig.savefig(fname + '.eps', format='eps')

# Plot biomass from live coverage

In [None]:
# Dataset root; the trailing slash matters because the JPEG glob pattern is
# appended by plain string concatenation.
root_path = '/media/angus/cciw/VOCdevkit/Train-v120-originals/'
# Alternative roots used previously:
#root_path = '/media/angus/cciw/VOCdevkit/Train-v111-originals/JPEGImages/'
#root_path = '/scratch/ssd/gallowaa/cciw/VOCdevkit/Validation-v101-originals/JPEGImages/'

# Source photographs, in sorted order.
jpeg_files = sorted(glob.glob(root_path + 'JPEGImages/*.jpg'))

# Should equal 121
print(len(jpeg_files))

# Segmentation masks (PNG), also in sorted order.
label_path = osp.join(root_path, 'SegmentationClass')
all_images = sorted(glob.glob(osp.join(label_path, '*.png')))
print(len(all_images))

In [None]:
'''
jpeg_files = []

csvfile = 'train-v111-originals-files.csv'
with open(csvfile, 'r') as f:
    spamreader = csv.reader(f, delimiter=',', quotechar='|')
    for row in spamreader:
        jpeg_files.append(row[0])
print(len(jpeg_files))        
'''

In [None]:
# One row per training image. Columns: 0 = biomass, 1 = count,
# 2 = live coverage.
lab_targets = np.zeros((len(jpeg_files), 3))

LIVE_COVERAGE = 2

for i, jpeg_file in enumerate(jpeg_files):

    # Reduce the file name to the part between 'GLNI_' and '_image'
    # (extension dropped); that fragment is matched against the image
    # table's 'Name'.
    root_fname = jpeg_file.split('/')[-1].split('.')[0].split('_image')[0].split('GLNI_')[1]
    guid = image_df[image_df['Name'].str.contains(root_fname)]['Analysis Index'].astype('int64')
    row = data_df[data_df['Analysis Index'].values == np.unique(guid.values)]

    # `.item()` extracts the single matching value explicitly. Assigning a
    # size-1 array into a scalar slot is deprecated since NumPy 1.25; like
    # that assignment, `.item()` raises ValueError unless exactly one row
    # matched.
    lab_targets[i, BIOMASS_IDX] = row['Biomass'].values.item()
    lab_targets[i, COUNT_IDX] = row['Count'].values.item()
    lab_targets[i, LIVE_COVERAGE] = row['Live Coverage'].values.item()

# Keep only images for which all three measurements are present.
valid_mask = ~np.isnan(lab_targets).any(axis=1)

biomass = lab_targets[:, BIOMASS_IDX][valid_mask]
count = lab_targets[:, COUNT_IDX][valid_mask]
live_coverage = lab_targets[:, LIVE_COVERAGE][valid_mask]

# Scale every quantity by its maximum so the largest value is 1.
biomass = biomass / biomass.max()
count = count / count.max()
live_coverage = live_coverage / live_coverage.max()

In [None]:
# Number of images whose biomass, count and live coverage are all non-NaN.
valid_mask.sum()

In [None]:
def draw_lines(ax, x, y, xlim):
    """Draw the least-squares line of ``y`` on ``x`` with a +/-1.96 std band.

    A straight line ``y = m * x + c`` is fitted with ``np.linalg.lstsq`` and
    drawn from 0 to ``xlim`` as a solid black line. Two dashed gray lines are
    drawn ``1.96 * std`` above and below it, where ``std`` is the root mean
    squared residual of the fit.

    Parameters
    ----------
    ax : object with a matplotlib-style ``plot`` method
    x, y : 1-D sequences of equal length
    xlim : float
        Right end of the x range over which the lines are drawn.
    """
    x = np.asarray(x, dtype=float)
    y = np.asarray(y, dtype=float)

    x_ = np.linspace(0, xlim)

    # Design matrix with columns [x, 1] so the solution is (slope, intercept).
    A = np.column_stack([x, np.ones(len(x))])
    (m, c), *_ = np.linalg.lstsq(A, y, rcond=-1)

    # The residual sum is computed directly: lstsq returns an *empty*
    # residual array for rank-deficient input or when there are no more
    # points than parameters, so indexing its first element would raise.
    residuals = y - (m * x + c)
    std = np.sqrt(np.sum(residuals ** 2) / len(y))

    ax.plot(x_, m * x_ + c, 'k', linestyle='-')
    ax.plot(x_, m * x_ + c + 1.96 * std, '--', color='gray')
    ax.plot(x_, m * x_ + c - 1.96 * std, '--', color='gray')

In [None]:
# Two panels sharing the x axis: a) biomass and b) count against live
# coverage expressed in percent. All three series were max-normalised above.
fig, ax = plt.subplots(1, 2, figsize=(8, 4), sharex=True, sharey=False)
coverage_pct = live_coverage * 100

# Raw strings: '\%' is not a valid Python escape sequence, so the non-raw
# form triggers an invalid-escape warning. The resulting text is unchanged.
ax[0].scatter(coverage_pct, biomass, marker='o', s=40, facecolors='none', edgecolors='k')
ax[0].set_xlabel(r'Live Coverage (\%)', fontsize=fontsize)
ax[0].set_ylabel('Biomass (g)', fontsize=fontsize)
ax[1].scatter(coverage_pct, count, marker='o', s=40, facecolors='none', edgecolors='k')
ax[1].set_ylabel('Count', fontsize=fontsize)
ax[1].set_xlabel(r'Live Coverage (\%)', fontsize=fontsize)

# Fitted line and +/-1.96 std band, drawn over 0..100 percent.
draw_lines(ax[0], coverage_pct, biomass, 100)
draw_lines(ax[1], coverage_pct, count, 100)

# R^2 of each target against its linear fit on live coverage.
ax[0].set_title(r'$\mathbf{R^2}$ = %.3f' % r2_score(biomass, linear_regression(live_coverage, biomass)), fontsize=fontsize + 1)
ax[1].set_title(r'$\mathbf{R^2}$ = %.3f' % r2_score(count, linear_regression(live_coverage, count)), fontsize=fontsize + 1)

draw_sublabel(ax[0], r'\textbf{a)}', fontsize, (.1, .85))
draw_sublabel(ax[1], r'\textbf{b)}', fontsize, (.1, .85))

pretty_axis(ax[0], fontsize, 105, 1.05)
pretty_axis(ax[1], fontsize, 105, 1.05)
plt.tight_layout()

# Base name for the saved figure; saving is currently disabled.
fname = 'train_v120_predict_biomass_and_count_from_live_coverage_ab'
#fig.savefig(fname + '.eps', format='eps')
#fig.savefig(fname + '.png', format='png')

# Live Coverage, Biomass, Count from Segmentation Masks

In [None]:
import cv2
from tqdm import tqdm

In [None]:
import warnings

# For every segmentation mask, record the share of array elements that carry
# the second-smallest distinct pixel value. The plot labels call this the
# fraction of mussel pixels -- presumably the smallest value is background;
# confirm against the label encoding.
pix_ct = []
for mask_path in tqdm(all_images):
    im = cv2.imread(mask_path)
    if im is None:
        # cv2.imread signals an unreadable file by returning None. Keep the
        # previous best-effort value of 0, but say so instead of hiding it.
        warnings.warn('could not read segmentation mask: %s' % mask_path)
        pix_ct.append(0)
        continue

    _, cts = np.unique(im, return_counts=True)
    # A mask with a single distinct value has no second class: score it 0,
    # as before, without a catch-all `except`.
    pix_ct.append(cts[1] / cts.sum() if len(cts) > 1 else 0)

pix_ct_np = np.asarray(pix_ct)
# Normalise so the mask with the largest fraction maps to 1.
pix_ct_np = pix_ct_np / pix_ct_np.max()

In [None]:
# Mask pixel fractions restricted to the images kept by `valid_mask`, so `x`
# has the same length as `biomass`, `count` and `live_coverage`. This assumes
# `all_images` and `jpeg_files` are in the same order -- TODO confirm.
x = pix_ct_np[valid_mask]

In [None]:
# v111
'''
mask_y = live_coverage > 0.4
mask_x = x < 0.2
upper_left = mask_x & mask_y
mask_y = live_coverage > 0.38
mask_x = x < 0.1
upper_left |= (mask_x & mask_y)
mask_y = live_coverage < 0.6
mask_x = x > 0.6
bottom_right = mask_x & mask_y
outliers = upper_left | bottom_right
inliers = np.invert(outliers)
'''
# v120
# Hand-picked outlier regions in the (mask fraction, live coverage) plane:
# high coverage with little mask area ...
upper_left = (x < 0.21) & (live_coverage >= 0.8)
# ... and low coverage with a lot of mask area.
bottom_right = (x > 0.25) & (live_coverage < 0.25)

outliers = upper_left | bottom_right
inliers = ~outliers

xin = x[inliers]
livein = live_coverage[inliers]
countin = count[inliers]
biomassin = biomass[inliers]

# The second line reports the fit over every sample; it used to be labelled
# "inliers" as well, which made the two numbers indistinguishable.
print('Live Coverage R^2 value on %d inliers = %.4f' % (len(xin), r2_score(livein, linear_regression(xin, livein))))
print('Live Coverage R^2 value on all %d samples = %.4f' % (len(x), r2_score(live_coverage, linear_regression(x, live_coverage))))

In [None]:
# Rescale the inlier mask fractions so that the largest one equals 1.
# NOTE(review): `x` (used for the outlier markers) is not rescaled with it.
xin = xin / xin.max()

In [None]:
# Display the largest inlier mask fraction.
xin.max()

In [None]:
#143 - 126

In [None]:
def _r2_of_fit(x_values, y_values):
    # R^2 of `y_values` against its linear fit on `x_values`; the target is
    # always passed to r2_score as the true values.
    return r2_score(y_values, linear_regression(x_values, y_values))


# Three panels sharing the x axis (mask pixel fraction): a) live coverage,
# b) biomass, c) count. Inliers are black, outliers red; each title gives
# R^2 for the inlier fit and for the fit over all samples.
fig, ax = plt.subplots(1, 3, figsize=(12, 4.5), sharex=True, sharey=False)

ax[0].scatter(xin, livein, marker='o', s=40, facecolors='none', edgecolors='k')
ax[0].scatter(x[outliers], live_coverage[outliers], marker='o', s=40, facecolors='none', edgecolors='r')
ax[0].set_ylabel('Live Coverage', fontsize=fontsize)
ax[0].set_title(r'$\mathbf{R^2}$ = %.3f in, %.3f all' % (_r2_of_fit(xin, livein), _r2_of_fit(x, live_coverage)), fontsize=fontsize + 1)
draw_lines(ax[0], xin, livein, 1)

ax[1].scatter(xin, biomassin, marker='o', s=40, facecolors='none', edgecolors='k')
ax[1].scatter(x[outliers], biomass[outliers], marker='o', s=40, facecolors='none', edgecolors='r')
ax[1].set_ylabel('Biomass (g)', fontsize=fontsize)
ax[1].set_xlabel('Fraction of Mussel Pixels \n in Segmentation Mask', fontsize=fontsize)
# Fixed: R^2 was scored with the mask fraction as the true values instead of
# biomass, unlike panel a).
ax[1].set_title(r'$\mathbf{R^2}$ = %.3f in, %.3f all' % (_r2_of_fit(xin, biomassin), _r2_of_fit(x, biomass)),  fontsize=fontsize + 1)
draw_lines(ax[1], xin, biomassin, 1)

ax[2].scatter(xin, countin, marker='o', s=40, facecolors='none', edgecolors='k')
ax[2].scatter(x[outliers], count[outliers], marker='o', s=40, facecolors='none', edgecolors='r')
ax[2].set_ylabel('Count', fontsize=fontsize)
# Fixed: same issue as panel b), with count as the target.
ax[2].set_title(r'$\mathbf{R^2}$ = %.3f in, %.3f all' % (_r2_of_fit(xin, countin), _r2_of_fit(x, count)),  fontsize=fontsize + 1)
draw_lines(ax[2], xin, countin, 1)

draw_sublabel(ax[0], r'\textbf{a)}', fontsize, (0.85, 0.05))
draw_sublabel(ax[1], r'\textbf{b)}', fontsize, (0.85, 0.05))
draw_sublabel(ax[2], r'\textbf{c)}', fontsize, (0.85, 0.05))

pretty_axis(ax[0], fontsize, 1.05, 1.05)
pretty_axis(ax[1], fontsize, 1.05, 1.05)
pretty_axis(ax[2], fontsize, 1.05, 1.05)

plt.tight_layout()
# The figure is written next to the notebook in both PNG and EPS format.
fname = 'train_v120_livecov_biomass_and_count_from_masks_abc'
fig.savefig(fname + '.png')
fig.savefig(fname + '.eps', format='eps')

In [None]:
def pretty_axis(ax, fontsize, xlim, ylim):
    """Apply the notebook's common axis styling to ``ax``.

    Sets both axes to start at 0 and end at ``xlim`` / ``ylim``, turns the
    grid on, makes the plotting box square and sets the tick label size to
    ``fontsize - 2``.

    Parameters
    ----------
    ax : matplotlib-style axes object
    fontsize : number
        Base font size; tick labels are drawn two points smaller.
    xlim, ylim : float
        Upper limits of the x and y axes.
    """
    ax.set_ylim(0, ylim)
    ax.set_xlim(0, xlim)
    ax.grid()
    # A numeric aspect is the on-screen length of one y unit relative to one
    # x unit, so xlim / ylim gives a square box. For equal limits this is 1,
    # i.e. the same as 'equal'; for limits that differ (e.g. 1.05 vs 105)
    # 'equal' would squash the panel into a sliver.
    if xlim and ylim:
        ax.set_aspect(xlim / ylim)
    else:
        ax.set_aspect('equal')
    ax.tick_params(labelsize=fontsize - 2)

In [None]:
# Two panels sharing the x axis (mask pixel fraction): e) live coverage in
# percent, f) biomass. Inliers are black, outliers red.
fig, ax = plt.subplots(1, 2, figsize=(8, 4), sharex=True, sharey=False)

ax[0].scatter(xin, 100*livein, marker='o', s=40, facecolors='none', edgecolors='k')
ax[0].scatter(x[outliers], 100*live_coverage[outliers], marker='o', s=40, facecolors='none', edgecolors='r')
# Raw string: '\%' is not a valid Python escape sequence.
ax[0].set_ylabel(r'Live Coverage (\%)', fontsize=fontsize)
ax[0].set_xlabel('Fraction of Mussel Pixels \n in Segmentation Mask', fontsize=fontsize)
ax[0].set_title(r'$\mathbf{R^2}$ = %.3f' % r2_score(livein, linear_regression(xin, livein)), fontsize=fontsize + 1)
draw_lines(ax[0], xin, 100*livein, 1)

# Fixed: this panel is labelled 'Biomass (g)' and the figure's file name says
# biomass, but the count series was being plotted and fitted. R^2 is now
# scored against biomass rather than against the mask fraction.
ax[1].scatter(xin, biomassin, marker='o', s=40, facecolors='none', edgecolors='k')
ax[1].scatter(x[outliers], biomass[outliers], marker='o', s=40, facecolors='none', edgecolors='r')
ax[1].set_ylabel('Biomass (g)', fontsize=fontsize)
ax[1].set_xlabel('Fraction of Mussel Pixels \n in Segmentation Mask', fontsize=fontsize)
ax[1].set_title(r'$\mathbf{R^2}$ = %.3f' % r2_score(biomassin, linear_regression(xin, biomassin)),  fontsize=fontsize + 1)
draw_lines(ax[1], xin, biomassin, 1)

draw_sublabel(ax[0], r'\textbf{e)}', fontsize, (0.85, 0.05))
draw_sublabel(ax[1], r'\textbf{f)}', fontsize, (0.85, 0.05))

# Panel e) runs to 105 on y because live coverage is plotted in percent.
pretty_axis(ax[0], fontsize, 1.05, 105)
pretty_axis(ax[1], fontsize, 1.05, 1.05)

plt.tight_layout()
# Base name for the saved figure; saving is currently disabled.
fname = 'train_v120_live_coverage_and_biomass_from_masks_cd'
#fig.savefig(fname + '.png')
#fig.savefig(fname + '.eps', format='eps')

In [None]:
def train(X, y):
    """Solve the linear least-squares problem ``X @ a ~= y``.

    Parameters
    ----------
    X : array of shape (M, N)
        Design matrix.
    y : array of shape (M,) or (M, K)
        Targets.

    Returns
    -------
    coefficients : ndarray
        Least-squares solution, shape (N,) or (N, K).
    residuals : ndarray
        Sum of squared residuals per target column, shape (1,) for 1-D ``y``
        and (K,) for 2-D ``y``. Never empty.
    """
    # rcond is passed explicitly so that older NumPy versions do not emit a
    # FutureWarning about the changing default.
    coefficients, residuals, _rank, _singular = np.linalg.lstsq(X, y, rcond=None)

    # lstsq leaves `residuals` empty for rank-deficient X or when M <= N.
    # Callers index its first element, so compute it from the solution.
    if residuals.size == 0:
        misfit = np.asarray(y) - np.asarray(X) @ coefficients
        residuals = np.atleast_1d(np.sum(np.abs(misfit) ** 2, axis=0))

    return coefficients, residuals

def predict(X, a):
    """Return the linear model output: the matrix product of ``X`` and ``a``."""
    prediction = X @ a
    return prediction

In [None]:
def build_X(x):
    """Build the two-column design matrix ``[1, x]`` for a straight-line fit.

    ``x`` is flattened in column-major order and paired with a column of
    ones, giving an array of shape ``(x.size, 2)`` whose first column is the
    intercept term.
    """
    flat = np.ravel(x, order='F')
    intercept = np.ones(flat.shape)
    return np.column_stack((intercept, flat))

In [None]:
def linear_prediction(x='Count', y='Biomass'):
    """Fit and plot a straight line between two columns of the global ``Y``.

    Rows in which either column is null are dropped, ``Y[y] ~= a + b * Y[x]``
    is fitted by least squares, and the data, the fitted line and a
    +/-1.96 std band are shown in a new figure titled with the R^2 score.

    Parameters
    ----------
    x, y : str
        Column names in ``Y`` for the predictor and the target.

    Returns
    -------
    (a, b, R2) : intercept, slope and R^2 of the fit.
    """
    x_column, y_column = Y[x], Y[y]
    keep = np.logical_and(x_column.notnull(), y_column.notnull())

    # Column vectors of the rows where both measurements are present.
    x_train = x_column[keep].values.reshape(-1, 1)
    y_train = y_column[keep].values.reshape(-1, 1)

    design = build_X(x_train)
    coefficients, residual = train(design, y_train)
    a, b = coefficients[0], coefficients[1]
    fitted = predict(design, coefficients)

    # Root mean squared residual, used as the band half-width scale.
    std = np.sqrt(residual[0] / len(x_train))
    R2 = r2_score(y_train, fitted)

    plt.figure()
    axis = plt.gca()
    Y.plot(ax=axis, x=x, y=y, style='.')

    x_ = np.linspace(0, x_train.max(), 100)
    axis.plot(x_, a+x_*b, '-')
    for factor in (1.96, -1.96):
        axis.plot(x_, factor*std+a+x_*b, '--', color='k', alpha=0.5)

    plt.title(f"R2: {R2:.03}")
    plt.show()
    return a[0], b[0], R2

In [None]:
def power_law_prediction(xs, ys):
    """Fit the power law ``ys ~= a * xs ** b`` and plot the fitted curve.

    Only pairs in which both values are positive are used. The fit is a
    least-squares line in log-log space, so the reported R^2 and the
    +/-1.96 std band both refer to the logarithms. The curve is drawn over a
    fixed logarithmic grid from 1e-1 to 1e4; the data points themselves are
    not drawn.

    Parameters
    ----------
    xs, ys : numpy arrays of equal length

    Returns
    -------
    (a, b, R2) : prefactor, exponent and log-space R^2 of the fit.
    """
    positive = np.logical_and(xs > 0, ys > 0)

    # Column vectors of the logged, strictly positive samples.
    x_train = np.log(xs[positive].reshape(-1, 1))
    y_train = np.log(ys[positive].reshape(-1, 1))

    design = build_X(x_train)
    coefficients, residual = train(design, y_train)

    # The intercept of the log-log line is log(a); the slope is the exponent.
    a = np.exp(coefficients[0])
    b = coefficients[1]
    fitted = predict(design, coefficients)

    std = np.sqrt(residual[0] / len(x_train))
    R2 = r2_score(y_train, fitted)

    plt.figure()
    axis = plt.gca()
    x_ = np.logspace(-1, 4, 100)
    axis.plot(x_, a*x_**b, '-')
    for factor in (1.96, -1.96):
        axis.plot(x_, np.exp(factor*std)*a*x_**b, '--', color='k', alpha=0.5)

    plt.title(f"R2: {R2:.03}")
    plt.show()
    return a[0], b[0], R2

In [None]:
# Power-law fit of count on live coverage, done step by step: a
# least-squares line through the logged, strictly positive samples.
xs, ys = live_coverage.copy(), count.copy()
valid = np.logical_and(xs > 0, ys > 0)

x_train = np.log(xs[valid])
y_train = np.log(ys[valid])

X = build_X(x_train)
sol, res = train(X, y_train)

# Intercept of the log-log line is log(a); the slope is the exponent b.
a, b = np.exp(sol[0]), sol[1]
y_pred = predict(X, sol)

# Root mean squared residual and R^2, both in log space.
std = np.sqrt(res[0] / len(x_train))
R2 = r2_score(y_train, y_pred)
print(R2)

In [None]:
# Same fit as the previous cell, through the helper that also draws the
# fitted curve; the cell output is the returned (a, b, R2) tuple.
power_law_prediction(live_coverage, count)

# Deprecated Code

In [None]:
'''
fig, ax = plt.subplots(1, 2, figsize=(8, 4), sharex=True, sharey=False)
ax[0].scatter(biomass, live_coverage*100, marker='o', s=40, facecolors='none', edgecolors='k')
ax[0].set_ylabel('Live Coverage (\%)', fontsize=fontsize)
ax[0].set_xlabel('Biomass (g)', fontsize=fontsize)
ax[1].scatter(count, live_coverage*100, marker='o', s=40, facecolors='none', edgecolors='k')
ax[1].set_ylabel('Live Coverage (\%)', fontsize=fontsize)
ax[1].set_xlabel('Count', fontsize=fontsize)
draw_lines(ax[0], biomass, live_coverage*100, 1)
draw_lines(ax[1], count, live_coverage*100, 1)
ax[0].set_title(r'$\mathbf{R^2}$ = %.4f' % r2_score(live_coverage, linear_regression(biomass, live_coverage)), fontsize=fontsize + 1)
ax[1].set_title(r'$\mathbf{R^2}$ = %.4f' % r2_score(live_coverage, linear_regression(count, live_coverage)), fontsize=fontsize + 1)
draw_sublabel(ax[0], r'\textbf{c)}', fontsize)
draw_sublabel(ax[1], r'\textbf{d)}', fontsize)
pretty_axis(ax[0], fontsize, 1.05, 105)
pretty_axis(ax[1], fontsize, 1.05, 105)
plt.tight_layout()
fname = 'train_v111_predict_live_coverage_from_biomass_and_count_cd'
'''
#fig.savefig(fname + '.eps', format='eps')