In [None]:
cd ..

In [None]:
import os

In [None]:
import numpy as np
import scipy.stats

In [None]:
from tqdm.autonotebook import tqdm

In [None]:
import echofilter.raw
import echofilter.raw.shardloader

In [None]:
# Use the root data directory constant defined by echofilter.raw.loader.
root_data_dir = echofilter.raw.loader.ROOT_DATA_DIR

In [None]:
# Selection of transects to analyse; these three values are forwarded to
# echofilter.raw.loader.get_partition_list when the path list is built.
partition = 'train'
partitioning_version = 'firstpass'
dataset = 'mobile'

In [None]:
# Depth cut-off: Sv columns whose entry in transect['depths'] exceeds this
# value are discarded whenever a segment is loaded.
# NOTE(review): units are presumably the same as transect['depths'] - confirm.
max_depth = 70

In [None]:
# Build the list of transect paths for the configured partition. The keyword
# options are gathered first so the call itself stays short.
partition_options = {
    'dataset': dataset,
    'partitioning_version': partitioning_version,
    'root_data_dir': root_data_dir,
    'full_path': True,
    'sharded': True,
}
transect_pths = echofilter.raw.loader.get_partition_list(partition, **partition_options)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
# Apply seaborn's default theme to the figures drawn in the cells below.
sns.set()

In [None]:
# Load segment 0 of the first transect and report its central tendency.
transect_pth = transect_pths[0]

# The first line of n_segment.txt holds the number of segments of the transect
segment_count_file = os.path.join(transect_pth, 'n_segment.txt')
with open(segment_count_file, 'r') as f:
    n_segment = int(f.readline().strip())

i_seg = 0
segment_dir = os.path.join(transect_pth, str(i_seg))
transect = echofilter.raw.shardloader.load_transect_from_shards_abs(segment_dir)

# Drop the first entry along axis 0 and keep only the columns whose depth is
# at most max_depth, then store the result as float32.
within_depth = transect['depths'] <= max_depth
transect['Sv'] = transect['Sv'][1:, within_depth].astype(np.float32)

for label, reducer in (('mean', np.mean), ('median', np.median)):
    print(label, reducer(transect['Sv']))

In [None]:
# Sanity check: nanpercentile ignores the NaN entry when taking the median.
sample_with_nan = [5, 3, 5, 4, np.nan]
np.nanpercentile(sample_with_nan, 50)

In [None]:
# Display the Sv array of the most recently loaded segment.
transect['Sv']

In [None]:
# Distribution of all Sv values in the loaded segment.
sv_values = transect['Sv']
sns.distplot(sv_values)

In [None]:
# Tabulate a set of percentiles of Sv, one "level value" pair per line.
qs = [0, .1, 1, 5, 7, 10, 25, 50, 75, 90, 93, 95, 99, 99.9, 100]
ps = np.percentile(transect['Sv'], qs)
for idx in range(len(qs)):
    print('{:5.1f} {:7.2f}'.format(qs[idx], ps[idx]))

In [None]:
# Spread estimate from the 10th-90th percentile range.
# NOTE(review): 2.56 is presumably the width of that range for a unit normal
# (2 * 1.2816), turning the range into a standard-deviation estimate - confirm.
decile_bounds = np.percentile(transect['Sv'], [10, 90])
np.diff(decile_bounds) / 2.56

In [None]:
# Spread estimate from the 7th-93rd percentile range.
# NOTE(review): 3 is presumably a rounding of that range's width for a unit
# normal (about 2.95) - confirm.
outer_bounds = np.percentile(transect['Sv'], [7, 93])
np.diff(outer_bounds) / 3

In [None]:
# Spread estimate from the interquartile range.
# NOTE(review): 1.35 is presumably the IQR of a unit normal (2 * 0.6745) - confirm.
quartile_bounds = np.percentile(transect['Sv'], [25, 75])
np.diff(quartile_bounds) / 1.35

In [None]:
# Standard deviation of every Sv value in the loaded segment.
# transect['Sv'] already had its first row removed when the segment was
# loaded, so no further [1:] slice is applied here; slicing again would drop a
# second row and disagree with the other statistics computed on the full array.
np.std(transect['Sv'])

In [None]:
# Median absolute deviation of Sv about its median, printed raw and scaled.
# NOTE(review): 1.4826 is presumably the usual factor that makes the MAD
# comparable to a standard deviation under normality - confirm.
sv_center = np.median(transect['Sv'])
abs_deviation = np.abs(transect['Sv'] - sv_center)
mad = np.median(abs_deviation)
for value in (mad, mad * 1.4826):
    print(value)

In [None]:
# Percentiles of Sv from the 60th down to the 10th in steps of five.
descending_levels = [60, 55, 50, 45, 40, 35, 30, 25, 20, 15, 10]
np.percentile(transect['Sv'], descending_levels)

In [None]:
# Size of the gap between neighbouring percentiles (60th down to 10th, step 5).
descending_levels = list(range(60, 5, -5))
level_values = np.percentile(transect['Sv'], descending_levels)
np.abs(np.diff(level_values))

In [None]:
# Closer look at the 40th, 35th and 30th percentiles of Sv.
np.percentile(transect['Sv'], q=[40, 35, 30])

In [None]:
# Plot the Sv distribution and print location/scale statistics for segment 0
# of up to the first ten transects in the partition.
n_preview = 10

# Slicing (rather than indexing with range(10)) avoids an IndexError when the
# partition contains fewer than ten transects.
for transect_pth in tqdm(transect_pths[:n_preview]):
    i_seg = 0
    transect = echofilter.raw.shardloader.load_transect_from_shards_abs(
        os.path.join(transect_pth, str(i_seg))
    )
    # Drop the first entry along axis 0 and keep only the columns whose depth
    # is at most max_depth.
    transect['Sv'] = transect['Sv'][1:, transect['depths'] <= max_depth]
    transect['Sv'] = transect['Sv'].astype(np.float32)
    sv = transect['Sv']

    plt.figure(figsize=(12, 9))
    sns.distplot(sv)
    plt.show()

    sv_median = np.median(sv)
    # One percentile call provides the bounds of all three ranges below
    p7, p10, p25, p75, p90, p93 = np.percentile(sv, [7, 10, 25, 75, 90, 93])

    summary = [
        ('mean', np.mean(sv)),
        ('median', sv_median),
        ('stdev', np.std(sv)),
        # MAD uses the full array, like every other statistic in this list
        ('mad', np.median(np.abs(sv - sv_median))),
        ('iqr', p75 - p25),
        ('idr', p90 - p10),
        ('i7r', p93 - p7),
    ]
    for label, value in summary:
        print('{:6s} {:7.2f}'.format(label, value))

In [None]:
# Per-segment summary statistics of Sv, gathered over every transect in the
# partition. Each list receives one entry per processed segment.
means, stdevs, medians, mads = [], [], [], []
percentiles, std25 = [], []

# Percentile levels evaluated for every segment
qs = [0, .1, 1, 5, 7, 10, 15, 20, 25, 30, 35, 40, 50, 75, 90, 93, 95, 99, 99.9, 100]

for transect_pth in tqdm(transect_pths):
    try:
        # The first line of n_segment.txt gives the number of segments
        segment_count_file = os.path.join(transect_pth, 'n_segment.txt')
        with open(segment_count_file, 'r') as f:
            n_segment = int(f.readline().strip())

        for i_seg in range(n_segment):
            segment_dir = os.path.join(transect_pth, str(i_seg))
            transect = echofilter.raw.shardloader.load_transect_from_shards_abs(segment_dir)

            # Drop the first entry along axis 0; keep columns down to max_depth
            within_depth = transect['depths'] <= max_depth
            transect['Sv'] = transect['Sv'][1:, within_depth]
            if len(transect['Sv']) < 2:
                # Too few rows left in this segment; skip it
                continue
            transect['Sv'] = transect['Sv'].astype(np.float32)
            sv = transect['Sv']

            means.append(np.nanmean(sv))
            stdevs.append(np.nanstd(sv))
            median = np.nanmedian(sv)
            medians.append(median)
            mads.append(np.nanmedian(np.abs(sv - median)))
            percentiles.append(np.nanpercentile(sv, qs))
            # Root-mean-square deviation about the 25th percentile
            pc25 = np.nanpercentile(sv, 25)
            std25.append(np.sqrt(np.nanmean(np.power(sv - pc25, 2))))

    except Exception as ex:
        # Report the failing transect and carry on with the next one
        print('Error loading shard from {}'.format(transect_pth))
        print(ex)

# Unweighted average of the per-segment means, ignoring NaN entries
MEAN = np.nanmean(means)
print('mean = {}'.format(MEAN))
print('mean of medians = {}'.format(np.nanmean(medians)))

# Arrays allow the column look-ups by percentile level done in later cells
qs = np.array(qs)
percentiles = np.array(percentiles)

In [None]:
# Second pass over the data: for every segment, the mean squared deviation of
# Sv from the global MEAN. The average of these values is reported as the
# overall variance.
variances = []

for transect_pth in tqdm(transect_pths):

    try:
        # Check how many segments the transect was divided into
        with open(os.path.join(transect_pth, 'n_segment.txt'), 'r') as f:
            n_segment = int(f.readline().strip())

        for i_seg in range(n_segment):
            transect = echofilter.raw.shardloader.load_transect_from_shards_abs(
                os.path.join(transect_pth, str(i_seg))
            )
            # Drop the first entry along axis 0; keep columns down to max_depth
            transect['Sv'] = transect['Sv'][1:, transect['depths'] <= max_depth]
            if len(transect['Sv']) < 2:
                continue
            transect['Sv'] = transect['Sv'].astype(np.float32)
            variances.append(np.nanmean(np.power(transect['Sv'] - MEAN, 2)))
    except Exception as ex:
        # Report the failing transect and carry on with the next one
        print('Error loading shard from {}'.format(transect_pth))
        print(ex)


# nanmean, matching how MEAN is aggregated: a segment without any finite
# sample contributes NaN above and would otherwise turn VARIANCE into NaN.
VARIANCE = np.nanmean(variances)
print('variance = {}'.format(VARIANCE))
print('stdev = {}'.format(np.sqrt(VARIANCE)))

In [None]:
# Interquartile range of each segment (75th minus 25th percentile column),
# followed by its average over segments.
col_75 = np.nonzero(qs == 75)[0][0]
col_25 = np.nonzero(qs == 25)[0][0]
iqrs = percentiles[:, col_75] - percentiles[:, col_25]
IQR = np.mean(iqrs)
print(IQR)

In [None]:
# 10th-90th percentile range of each segment, followed by its average over
# segments.
col_90 = np.nonzero(qs == 90)[0][0]
col_10 = np.nonzero(qs == 10)[0][0]
idrs = percentiles[:, col_90] - percentiles[:, col_10]
IDR = np.mean(idrs)
print(IDR)

In [None]:
# 7th-93rd percentile range of each segment, followed by its average over
# segments.
col_93 = np.nonzero(qs == 93)[0][0]
col_7 = np.nonzero(qs == 7)[0][0]
i7rs = percentiles[:, col_93] - percentiles[:, col_7]
I7R = np.mean(i7rs)
print(I7R)

In [None]:
# Print the competing scale estimates one per line: the two direct standard
# deviations first, then the rescaled MAD and percentile ranges.
# NOTE(review): the divisors/multiplier are presumably normal-distribution
# conversion factors to standard-deviation units - confirm.
scale_estimates = (
    np.sqrt(VARIANCE),
    np.mean(stdevs),
    np.mean(mads) * 1.4826,
    IQR / 1.35,
    IDR / 2.56,
    I7R / 3.0,
)
for estimate in scale_estimates:
    print(estimate)

In [None]:
# Table showing how much each estimator varies from segment to segment:
# standard error of the mean, mean, min, max, and the SEM / standard
# deviation / range expressed as a percentage of |mean|.
header_fmt = '{:6s} {:6s}  {:6s}  {:6s}  {:6s}  {:5s}  {:5s}  {:5s}'
row_fmt = '{:6s} {:6.4f}  {:6.1f}  {:6.1f}  {:6.1f}  {:5.3f}  {:5.2f}  {:5.1f}'

print(header_fmt.format('name', 'SEM', 'mean', 'min', 'max', 'pcerr', 'pcstd', 'pcran'))

named_estimators = [
    ('mean', means),
    ('median', medians),
    ('stdev', stdevs),
    ('MAD', mads),
    ('IQR', iqrs),
    ('IDR', idrs),
    ('I7R', i7rs),
    ('std25', std25),
]
for name, estimator in named_estimators:
    sem = scipy.stats.sem(estimator)
    average = np.mean(estimator)
    smallest = np.min(estimator)
    largest = np.max(estimator)
    magnitude = np.abs(average)
    print(
        row_fmt.format(
            name,
            sem,
            average,
            smallest,
            largest,
            sem / magnitude * 100,
            np.std(estimator) / magnitude * 100,
            (largest - smallest) / magnitude * 100,
        )
    )

In [None]:
# Same variability table as for the estimators, with one row per percentile
# level in qs (each column of `percentiles` holds that level for all segments).
header_fmt = '{:6s}  {:6s}  {:6s}  {:6s}  {:6s}  {:5s}  {:5s}  {:5s}'
row_fmt = '{:10.1f}  {:6.4f}  {:6.1f}  {:6.1f}  {:6.1f}  {:5.3f}  {:5.2f}  {:5.1f}'

print(header_fmt.format('percentile', 'SEM', 'mean', 'min', 'max', 'pcerr', 'pcstd', 'pcran'))

for iq, q in enumerate(qs):
    estimator = percentiles[:, iq]
    sem = scipy.stats.sem(estimator)
    average = np.mean(estimator)
    smallest = np.min(estimator)
    largest = np.max(estimator)
    magnitude = np.abs(average)
    print(
        row_fmt.format(
            q,
            sem,
            average,
            smallest,
            largest,
            sem / magnitude * 100,
            np.std(estimator) / magnitude * 100,
            (largest - smallest) / magnitude * 100,
        )
    )

In [None]:
# Distribution of the per-segment mean estimates.
fig = plt.figure(figsize=(15, 9))
ax = sns.distplot(means)
ax.set_title('mean estimates')
plt.show()

In [None]:
# Distribution of the per-segment median estimates.
fig = plt.figure(figsize=(15, 9))
ax = sns.distplot(medians)
ax.set_title('median estimates')
plt.show()

In [None]:
# Distribution of the per-segment standard deviation estimates.
fig = plt.figure(figsize=(15, 9))
ax = sns.distplot(stdevs)
ax.set_title('standard deviation estimates')
plt.show()

In [None]:
# Distribution of the per-segment median absolute deviation estimates.
fig = plt.figure(figsize=(15, 9))
ax = sns.distplot(mads)
ax.set_title('MAD estimates')
plt.show()

In [None]:
# Distribution of the per-segment interquartile range estimates.
fig = plt.figure(figsize=(15, 9))
ax = sns.distplot(iqrs)
ax.set_title('IQR estimates')
plt.show()

In [None]:
# Distribution of the per-segment 10th-90th percentile range estimates.
fig = plt.figure(figsize=(15, 9))
ax = sns.distplot(idrs)
ax.set_title('IDR estimates')
plt.show()

In [None]:
# Distribution of the per-segment 7th-93rd percentile range estimates.
fig = plt.figure(figsize=(15, 9))
ax = sns.distplot(i7rs)
ax.set_title('7-93 estimates')
plt.show()