In [None]:
from collections import OrderedDict
import csv
import datetime
import os

In [None]:
import torch
# import pandas as pd
import numpy as np

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

In [None]:
# Root directory of the dataset; the survey and .evl paths below are joined onto it.
# NOTE(review): machine-specific absolute path — adjust for your environment.
root_dir = '/media/scott/scratch/Datasets/dsforce/'

In [None]:
# Path to the survey CSV that is read by survey_reader / survey_loader below.
fname = os.path.join(root_dir, 'surveyExports/Survey17/Survey17_GR1_N0A_E_Sv_raw.csv')

In [None]:
# df = pd.read_csv(fname)
#
# Can't use pandas because of inconsistent columns. Attempting to do so generates this error:
#
# ParserError: Error tokenizing data. C error: Expected 2544 fields in line 3, saw 5977

In [None]:
# Converters applied to known metadata columns. Any header column that is not
# listed here is kept as the whitespace-stripped string read from the file.
SURVEY_FIELD_TYPES = {
    'Ping_index': int,
    'Distance_gps': float,
    'Distance_vl': float,
    'Ping_date': str,
    'Ping_time': str,
    'Ping_milliseconds': float,
    'Latitude': float,
    'Longitude': float,
    'Depth_start': float,
    'Depth_stop': float,
    'Range_start': float,
    'Range_stop': float,
    'Sample_count': int,
}


def survey_reader(fname):
    '''
    Creates a generator which iterates through a survey csv file.

    The first row is treated as the metadata header. Each later row is split
    into a metadata part (as many leading fields as the header has columns)
    and a data part (all remaining fields, parsed as floats). Rows may
    therefore have different numbers of data fields.

    Parameters
    ----------
    fname: str
        Path to survey CSV file.

    Returns
    -------
    generator
        Yields a tuple of `(metadata, data)`, where metadata is an
        `OrderedDict` keyed by header name, and data is a `numpy.ndarray`.
        Each yield corresponds to a single row in the data. Every row
        (except for the header) is yielded.

    Raises
    ------
    ValueError
        If a metadata field cannot be converted with its entry in
        `SURVEY_FIELD_TYPES`, or a data field cannot be parsed as a float.
    '''
    metadata_header = []
    n_meta = 0
    # 'utf-8-sig' drops a leading byte-order mark if one is present.
    # newline='' passes line endings to the csv module untranslated, which the
    # csv documentation requires so that newlines inside quoted fields survive.
    with open(fname, 'r', encoding='utf-8-sig', newline='') as hf:
        for i_row, row in enumerate(csv.reader(hf)):
            row = [entry.strip() for entry in row]
            if i_row == 0:
                # Header row: remember the column names, yield nothing
                metadata_header = row
                n_meta = len(metadata_header)
                continue
            metadata_d = OrderedDict()
            # zip stops at the shorter input, so a row with fewer fields than
            # the header simply produces fewer metadata entries
            for k, v in zip(metadata_header, row[:n_meta]):
                convert = SURVEY_FIELD_TYPES.get(k)
                metadata_d[k] = v if convert is None else convert(v)
            # Everything after the metadata columns is numeric sample data
            data = np.array([float(x) for x in row[n_meta:]])
            yield metadata_d, data

In [None]:
def count_lines(filename):
    '''
    Count the number of lines in a file.

    The file is read in fixed-size chunks, so it is never held in memory in
    full. A final line that is not terminated by a newline character is
    counted as a line too.

    Credit: https://stackoverflow.com/a/27518377

    Parameters
    ----------
    filename : str
        Path to file.

    Returns
    -------
    int
        Number of lines in file.
    '''
    lines = 0
    buf_size = 1024 * 1024
    last_char = ''

    # The context manager guarantees the handle is closed, even on error
    with open(filename) as f:
        read_f = f.read  # loop optimization

        buf = read_f(buf_size)
        while buf:
            lines += buf.count('\n')
            last_char = buf[-1]
            buf = read_f(buf_size)

    # Non-empty file whose last line has no trailing newline
    if last_char and last_char != '\n':
        lines += 1

    return lines

In [None]:
# Smoke test: print the metadata and samples of the first data row only,
# then stop so the remainder of the file is not read.
for meta, data in survey_reader(fname):
    print(meta, data)
    break

In [None]:
# Line count of the survey file (the header row is included in this number).
count_lines(fname)

In [None]:
def survey_loader(fname, skip_lines=1):
    '''
    Loads an entire survey CSV.

    The file is scanned once to count its lines, once to read the shape and
    depth range from the first kept row, and once more to fill the output
    arrays. Every kept row must have as many samples as the first kept row.

    Parameters
    ----------
    fname : str
        Path to survey CSV file.
    skip_lines : int, optional
        Number of initial entries to skip. Default is 1.

    Returns
    -------
    numpy.ndarray
        Timestamps for each row, in seconds. Note: not corrected for timezone.
    numpy.ndarray
        Depth of each column, in metres.
    numpy.ndarray
        Survey signal (echo strength, units unknown).

    Raises
    ------
    ValueError
        If no data rows remain after skipping `skip_lines` entries.
    '''

    # We remove one from the line count because of the header
    # which is excluded from output
    n_lines = count_lines(fname) - 1
    n_depths = None
    depth_start = None
    depth_stop = None

    # Peek at the first kept row to learn how many samples each row holds
    # and which depth range the columns span
    for i_line, (meta, row) in enumerate(survey_reader(fname)):
        if i_line < skip_lines:
            continue
        n_depths = len(row)
        depth_start = meta['Depth_start']
        depth_stop = meta['Depth_stop']
        break

    if n_depths is None:
        raise ValueError(
            'No data rows found in {} after skipping {} row(s)'.format(
                fname, skip_lines
            )
        )

    # Initialise output arrays
    data = np.empty((n_lines - skip_lines, n_depths))
    timestamps = np.empty((n_lines - skip_lines))
    # Columns are taken to be evenly spaced between the first kept row's
    # Depth_start and Depth_stop
    depths = np.linspace(depth_start, depth_stop, n_depths)

    for i_line, (meta, row) in enumerate(survey_reader(fname)):
        if i_line < skip_lines:
            continue
        i_entry = i_line - skip_lines
        data[i_entry, :] = row
        # Ping_milliseconds is scaled by 1000 and zero-padded to six digits
        # so that it can be parsed by strptime's %f (microseconds) directive
        timestamps[i_entry] = datetime.datetime.strptime(
            '{}T{}.{:06d}'.format(
                meta['Ping_date'],
                meta['Ping_time'],
                int(1000 * float(meta['Ping_milliseconds'])),
            ),
            '%Y-%m-%dT%H:%M:%S.%f',
        ).timestamp()

    # Missing samples are stored in the file as an extremely negative number;
    # replace those placeholders with real NaNs
    data[data < -1e6] = np.nan

    return timestamps, depths, data

In [None]:
# Load the whole survey: per-row timestamps, per-column depths and the 2-D signal array.
timestamps, depths, signals = survey_loader(fname)

In [None]:
# Inspect the per-row timestamps (seconds).
timestamps

In [None]:
# Inspect the depth assigned to each signal column (metres).
depths

In [None]:
# Inspect the signal array: one row per timestamp, one column per depth.
signals

In [None]:
# Number of rows (timestamps) that were loaded.
len(signals)

In [None]:
# Quick look at the signal array as an image, in its stored orientation
# (one image row per timestamp).
plt.imshow(signals)

In [None]:
# (number of rows, number of depth samples per row)
signals.shape

In [None]:
# Total number of samples in the signal array.
np.prod(signals.shape)

In [None]:
# Histogram of signal values, using every 10th row and every 10th column
# flattened to 1-D to keep the plot cheap.
# NOTE(review): survey_loader replaces placeholder values with NaN — confirm
# the installed matplotlib handles NaNs in plt.hist as intended.
plt.hist(np.reshape(signals[::10, ::10], -1), bins=100)

In [None]:
# Larger image of the transposed signal array, so that depth samples run
# along the vertical axis and rows (timestamps) along the horizontal axis.
plt.figure(figsize=(20, 20))
plt.imshow(signals.T)
plt.show()

In [None]:
# Plot the signal against real coordinates: timestamps on the x-axis and
# negated depths on the y-axis, so that greater depth is drawn lower down.
plt.figure(figsize=(12, 12))
plt.pcolormesh(timestamps, -depths, signals.T)
plt.show()

In [None]:
def evl_reader(fname):
    '''
    Iterate over the entries of an .evl file.

    The file is parsed as space-delimited rows. The first row is always
    discarded, as are any rows with fewer than four fields that appear before
    the first data row. Once a data row has been seen, a short row is an error.

    Parameters
    ----------
    fname : str
        Path to .evl file.

    Returns
    -------
    generator
        A generator which yields the timestamp (in seconds) and depth (in metres)
        for each entry. Note that the timestamp is not corrected for timezone
        (so make sure your timezones are internally consistent).

    Raises
    ------
    ValueError
        If a row with fewer than four fields follows a data row, or if the
        third field of a data row is not empty.
    '''
    with open(fname, 'r') as handle:
        records = csv.reader(handle, delimiter=' ')
        # The first row is never data
        next(records, None)

        data_started = False
        for fields in records:
            if len(fields) < 4:
                if data_started:
                    raise ValueError('Trying to skip data after parsing began')
                continue
            data_started = True

            # Date and time are the first two fields
            stamp = datetime.datetime.strptime(
                '{}T{}'.format(fields[0], fields[1]),
                '%Y%m%dT%H%M%S%f',
            ).timestamp()

            # Data rows must have an empty third field, i.e. two consecutive
            # spaces between the time and the depth
            if fields[2] != '':
                raise ValueError('row[2] was non-empty: {}'.format(fields[2]))

            yield stamp, float(fields[3])

In [None]:
def evl_loader(fname):
    '''
    Load every entry of an .evl file into arrays.

    Parameters
    ----------
    fname : str
        Path to .evl file.

    Returns
    -------
    numpy.ndarray
        Timestamps, in seconds.
    numpy.ndarray
        Depth, in metres.
    '''
    # Read all (timestamp, depth) pairs first, then split them into two columns
    entries = list(evl_reader(fname))
    stamps = [stamp for stamp, _ in entries]
    depth_values = [depth for _, depth in entries]
    return np.array(stamps), np.array(depth_values)

In [None]:
# Path to the "bottom" .evl file belonging to the same survey as the CSV above.
bottom_fname = os.path.join(root_dir, 'surveyExports/Survey17/Survey17_GR1_N0A_E_bottom.evl')

In [None]:
# Print every (timestamp, depth) entry of the bottom .evl file.
for t, v in evl_reader(bottom_fname):
    print(t, v)

In [None]:
# Same entries loaded as a pair of arrays: (timestamps, depths).
evl_loader(bottom_fname)

In [None]:
# Path to the "turbulence" .evl file of the same survey.
# NOTE(review): named `top` here — presumably the upper boundary line; confirm.
top_fname = os.path.join(root_dir, 'surveyExports/Survey17/Survey17_GR1_N0A_E_turbulence.evl')

In [None]:
# Load the turbulence .evl file as a pair of arrays: (timestamps, depths).
evl_loader(top_fname)

In [None]:
# Overlay both .evl lines on the survey signal, using the same axes
# convention as before (timestamps on x, negated depth on y).
plt.figure(figsize=(12, 12))

plt.pcolormesh(timestamps, -depths, signals.T)

# Reload both lines so this cell does not depend on the inspection cells above
t_bottom, d_bottom = evl_loader(bottom_fname)
t_top, d_top = evl_loader(top_fname)

# Bottom line in blue, turbulence ("top") line in cyan
plt.plot(t_bottom, -d_bottom, 'b')
plt.plot(t_top, -d_top, 'c')

plt.show()