# Data Creation/Training Notebook 

In [4]:
import os
import pandas as pd
import numpy as np
import datetime as dt

# custom
from utilities import extract_position_time
from image_generator import plot_sky, create_times_array, get_waypoints, create_times_array, DRCalc

# skyfield
from skyfield.api import load, utc, Topos

# Project Overview
---
### Notebooks

1. [Exploratory Data Analysis](../code/1_eda_notebook.ipynb)
2. [Data Cleaning and preparation](../code/2_data_cleaning_notebook.ipynb) [This Notebook]
3. [Modeling](../code/3_modeling_notebook.ipynb) 

### Scripts
- [image generation](../code/image_generator.py)
- [utility functions](../code/utilities.py)

---


## II. Data Cleaning Notebook

This notebook's purpose is to create and clean the images for the training grid. The actual plotting functions are largely located in the `image_generator` script linked above. As a result, the annotations in this notebook are sparser than in the other two. 

---

# I. Creating Training Images

Create Waypoints 

In [1]:
# corners of the training grid, each given as (latitude, longitude)
start, end = (39, -140), (37, -138)

In [5]:
# create waypoints between the two corners, using mile displacements 0 through 9
mile_offsets = list(range(10))
waypoints = get_waypoints(start, end, n=100, list_of_mile_displacements=mile_offsets)

Number of unique waypoints:  1919


Create Times

In [6]:
# first and last instant of the observation window
start_time = dt.datetime(2020, 3, 13, hour=4)
end_time = dt.datetime(2020, 3, 13, hour=8)

# times between start_time and end_time, evenly spaced by the interval n
times = create_times_array(start_time, end_time, n=1 / 60)

### Iterate Through Times for Each Position

Due to the large number of plots, it is necessary to create the images in batches to avoid memory issues.

In [5]:

# indices of the time steps rendered in this batch
batch = range(8)
print(f'Creating {len(times[0][batch]) * len(waypoints)} sky images, for grid {start} to {end}')

def batch_image_generator(batch, waypoints, times, base_directory, cloud_cover = 0):
    """
    Generates sky images for a given batch of waypoints and times

    Every waypoint is paired with every time selected by `batch`, and
    `plot_sky` is called once per pair. Images are written to
    `<base_directory><cloud_cover>/`, which is created if it does not exist.

    Args:
        batch (range): indices of the times to generate images for
        waypoints (np.array): list of waypoints; each one is read as (p[0], p[1])
        times (skyfield timelib.Time): array of times. The (tuple or list) pair
            that this notebook indexes as times[0] / times[1] is accepted too,
            in which case times[0] is used.
        base_directory (directory): directory to hold cloud cover image directories
        cloud_cover (int, optional): cloud cover to generate images for. Defaults to 0.
    """
    # The notebook passes the same `times` object it indexes as times[0][batch];
    # a plain tuple/list cannot be indexed by a range, so unwrap the pair form.
    sky_times = times[0] if isinstance(times, (tuple, list)) else times
    # honour the requested batch instead of a fixed slice
    batch_times = sky_times[batch]

    # create a directory for each cloud cover (named after the cloud_cover argument)
    directory = base_directory + str(cloud_cover) + '/'
    os.makedirs(directory, exist_ok=True)

    for p in waypoints:
        # the observer depends only on the waypoint, so build it once per waypoint
        observer = Topos(p[0], p[1])
        for t in batch_times:
            plot_sky(t, observer, cloud_cover = cloud_cover, img_directory = directory)

# batch_image_generator(batch, waypoints, times, base_directory = 'data/sky_images/', cloud_cover = 0)

Creating 15352 sky images, for grid (39, -140) to (37, -138)


The above function creates an image of the sky for every specified time and for every specified position in the training grid. For memory allocation purposes, it is necessary to do this in batches so that we don't crash the kernel. 

# II. Missing Images and Out of Order Image Sequences

The batch system creates the potential for missing images and for image sequences that are out of order. Since our goal is to teach the model how the sequence of stars changes for a specific position and time interval, it is important that the image sequences are complete and in the correct order. 

In [6]:
# training
# PNG images found in the training directory
files_train = [name for name in os.listdir('../images/train') if name.endswith('.png')]

# files is the list of image files analysed below (currently only the training images)
files = files_train

# report how many image files were found
print(f'Number of files: {len(files)}')

Number of files: 78679


## Create Dataframe of Times and Positions

In [7]:

# Parse every filename once (the original parsed each file twice).
# extract_position_time(f) is indexed as [0] -> position and [1] -> time.
parsed_names = [extract_position_time(f) for f in files]
data_df = pd.DataFrame({
    'times': [parsed[1] for parsed in parsed_names],
    'positions': [parsed[0] for parsed in parsed_names],
}).sort_values(by = 'times')
# every distinct time that appears in at least one filename
complete_times = data_df['times'].unique()

In [8]:
# for each position, find which times are missing
def fill_missing():
    """Regenerates training images for positions that lack one or more times.

    Reads the notebook-level `data_df` (columns 'times' and 'positions') and
    `complete_times`. For each unique position, the times recorded for that
    position are compared against `complete_times`. When any are absent, a
    times array spanning the first to the last missing time is requested from
    `create_times_array` (with 6 as its third argument) and `plot_sky` is
    called for each of those times, writing to '../images/train/' with a
    cloud cover of 0/8.

    Returns:
        None
    """
    # iterate for each position and find which times are missing
    for position in data_df['positions'].unique():
        # get all times for this position
        times = data_df[data_df['positions'] == position]['times'].unique()
        # find which times are missing (np.setdiff1d returns them sorted)
        missing = np.setdiff1d(complete_times, times)
        if len(missing) == 0:
            continue
        missing = [pd.to_datetime(m) for m in missing]
        # get times array
        times_array = create_times_array(missing[0], missing[-1], 6)
        # NOTE(review): other cells index the result of create_times_array as a
        # pair (times[0] for plotting, times[1] for display). If a tuple comes
        # back, iterate its first element rather than the pair itself —
        # confirm against create_times_array.
        if isinstance(times_array, tuple):
            times_array = times_array[0]
        # the observer depends only on the position, so build it once
        observer = Topos(position[0], position[1])
        for time in times_array:
            plot_sky(t = time, observer = observer, cloud_cover= 0/8, img_directory='../images/train/')

The `fill_missing` function examines each unique position and looks for times that are missing from the list of training times. If any position is missing a time the appropriate images are generated and sent to the training images directory. 

Every position now has an image for every time, in the correct sequence!

In [9]:
# number of positions recorded for each time
images_per_time = data_df.groupby(by='times')
images_per_time.count()

Unnamed: 0_level_0,positions
times,Unnamed: 1_level_1
2020-03-13 04:00:00,1919
2020-03-13 04:06:00,1919
2020-03-13 04:12:00,1919
2020-03-13 04:18:00,1919
2020-03-13 04:24:00,1919
2020-03-13 04:30:00,1919
2020-03-13 04:36:00,1919
2020-03-13 04:42:00,1919
2020-03-13 04:48:00,1919
2020-03-13 04:54:00,1919


# III. Create Validation Images 

In order to simulate the performance of our model on our fictitious voyage, we need to recreate the imagery that we would have seen overhead. These images may or may not align exactly with the times and positions that we trained the model on. 

To do this, we will use the custom `DRCalc` class, which takes a starting position, course, speed, and time interval and returns the Dead Reckoned position for that time interval up to 4 hours (our model's current temporal boundary). The sequence is:

1. Take a starting position (39 N, 140 W) and time (2020 March 13, 0400 UTC)
2. Use Mercator Sailing to determine the dead reckoned position at 0430 UTC using a course of 142 True and 20 Kts speed.
3. DR from the 0430 position to a 0500 position using a Mercator sailing, and so on until 0800.

We will assume that the vessel is tracking its position perfectly, i.e. current and wind are not displacing the vessel from its intended track at all. Modern autopilot and waypoint control systems such as Wärtsilä's Trackpilot enable a vessel to maintain a track with a cross-track error of 1 meter or less, so this is not unrealistic. 

In [8]:


# Use DRCalc to step the ship's position forward once per remaining entry in times[1].
# Each step covers the number of seconds in 1/60 of a minute (one second) and passes
# 110 and 20 as the last two DRCalc arguments (presumably course and speed — confirm
# against DRCalc).
lat = 39
long = -140
step_seconds = dt.timedelta(minutes=1/60).total_seconds()

# the starting position is the first entry
positions = [np.array([lat, long])]
for step in range(len(times[1]) - 1):
    ship = DRCalc(lat, long, step_seconds, 110, 20)
    # the forward dead-reckoned position becomes the start of the next step
    lat, long = ship.drlatfwds, ship.drlongfwds
    positions.append(np.array([lat, long]))

# show the first 10 positions
positions[:10]

[array([  39, -140]),
 array([  38.99996833, -139.9998885 ]),
 array([  38.99993666, -139.999777  ]),
 array([  38.99990499, -139.9996655 ]),
 array([  38.99987333, -139.99955399]),
 array([  38.99984166, -139.99944249]),
 array([  38.99980999, -139.99933099]),
 array([  38.99977832, -139.99921949]),
 array([  38.99974665, -139.99910799]),
 array([  38.99971498, -139.99899649])]

In [1]:
def create_voyage_images(times, positions, directory, cloud_cover=0/8):
    """Creates validation images for a given set of times and positions

    For every position i, an observer is built from that position and
    `plot_sky` is called with the i-th time, writing the image to `directory`.

    Args:
        times: Pair of parallel sequences; times[0][i] is the time passed to
            `plot_sky` (a Skyfield time) and times[1][i] is the value printed
            in the progress message.
        positions (np.array): Array of positions; position[0] is used as the
            latitude in degrees and position[1] as the longitude in degrees.
        directory (directory): Directory to hold validation images.
        cloud_cover (int, optional): Cloud cover of sky images. Defaults to 0/8.
    """
    for i, position in enumerate(positions):
        observer = Topos(latitude_degrees = position[0], longitude_degrees=position[1])
        print('Creating sky For :',times[1][i], position)
        # use the caller's directory and cloud cover rather than fixed values
        plot_sky(times[0][i], observer, cloud_cover=cloud_cover, img_directory=directory)

# create_voyage_images(times, positions, directory='../images_val/voyage_sims/voyages/60_seconds/')