# Less-Referenced Mosaic Evaluation

This notebook provides a cross-section of the Less-Referenced Mosaic creation process.


# Setup


## Imports


In [1]:
import copy
import os

In [2]:
import cv2
import numpy as np
import pandas as pd
import scipy
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.utils import check_random_state
import yaml

In [3]:
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.patches as patches
import seaborn as sns
sns.set_style('white')

In [4]:
from night_horizons import utils, preprocess, reference, mosaic, raster, pipelines, features

## Settings


In [5]:
# Load the shared configuration from config.yml into `settings`.
# NOTE(review): yaml.load with FullLoader is used here; if the config holds only
# plain scalars, lists, and mappings, yaml.safe_load may be sufficient — confirm.
with open('./config.yml', "r", encoding='UTF-8') as file:
    settings = yaml.load(file, Loader=yaml.FullLoader)

In [6]:
# Notebook-specific settings. They are merged into `settings` at the end of this
# cell, replacing any same-named top-level keys loaded from config.yml.
local_settings = {
    # Whether the raw images are combined with the referenced test images.
    'include_raw_images': True,
    # Output path of the mosaic, relative to settings['data_dir'].
    'mosaic_filepath': 'mosaics/less_referenced.tiff',
    # Seed passed to the train-test split.
    'random_state': 1682142,
    # Number of referenced training images per camera, keyed by camera number.
    # A value of 0 puts all of that camera's referenced images in the test set.
    'train_size': {
        0: 40,
        1: 40,
        2: 40,
    },
    # Number of rows of X to run through the mosaic; None means all of them.
    'n_loops': None,

    # Cuts for movement and altitude
    # NOTE(review): these two keys are not read by any cell in this notebook —
    # presumably consumed elsewhere; confirm.
    'gyro_mag_cut': 0.02,
    'percent_for_cruising': 85.,
    
    # Choices for search region
    # 'padding' is passed to the mosaic constructor;
    # 'use_approximate_georeferencing' is passed to the preprocessing steps.
    'padding': 2.,
    'use_approximate_georeferencing': True,
    # This set of choices assumes we have really good starting positions.
    # This is useful for debugging.
    # 'padding': 0.1,
    # 'use_approximate_georeferencing': False,

    # The fraction of non-nan georeferencings required to claim success.
    # 'acceptance_fraction' is compared against the success rate over all loops;
    # 'attempted_acceptance_fraction' is compared against the success rate after
    # return codes equal to 'dark_frame' are excluded.
    'acceptance_fraction': 0.85,
    'attempted_acceptance_fraction': 0.95,

    # Other mosaic options
    # These are passed as keyword arguments to the mosaic constructor.
    'mosaic_options': {
        'checkpoint_freq': 100,
        'memory_snapshot_freq': 100,
        'save_return_codes': ['linalg_err', 'opencv_err', 'bad_det'],
        'bad_images_dir': '../test/test_data/feature_matching/',

        # These values will be logged and checked for consistency.
        'log_keys': [
            'i',
            'ind',
            'return_code',
            'abs_det_M',
            'snapshot',
        ],
    },

    # Choices for how images are combined
    # These are passed as keyword arguments to features.ImageJoinerQueue.
    'image_joiners_options': {
        'defaults': {
            'feature_detector': 'AKAZE',
            'feature_matcher': 'BFMatcher',
        },
        'variations': [
            {'n_matches_used': 100, },
            {'n_matches_used': 500, },
            {'n_matches_used': 1000, },
            {'n_matches_used': 10, },
        ],
    },
}
# Shallow merge: each top-level key above replaces the one from config.yml.
settings.update(local_settings)

## Parse Settings


In [7]:
# Resolve the mosaic path against the data directory. The relative path is read
# from local_settings rather than from settings itself, so re-running this cell
# does not prepend data_dir a second time.
settings['mosaic_filepath'] = os.path.join(
    settings['data_dir'],
    local_settings['mosaic_filepath'],
)

In [8]:
# Turn every path given relative to the data directory into a full path,
# stored in the settings under the same key.
settings.update({
    key: os.path.join(settings['data_dir'], relpath)
    for key, relpath in settings['paths_relative_to_data_dir'].items()
})

In [9]:
# Build a random-state object from the configured seed.
# NOTE(review): the train-test split in this notebook passes
# settings['random_state'] directly rather than this object.
random_state = check_random_state(settings['random_state'])

In [10]:
# Color palette from the settings; individual entries are used to color the plots.
palette = sns.color_palette(settings['color_palette'])

In [11]:
# The 'crs' setting (presumably the coordinate reference system — confirm),
# shared by the preprocessers and the mosaic constructor.
crs = settings['crs']

In [12]:
# Keyword arguments for the mosaic constructor: the image-joiner queue, the
# output path, the search padding, the crs, and the remaining mosaic options.
constructor_kwargs = {
    'image_joiner': features.ImageJoinerQueue(**settings['image_joiners_options']),
    'filepath': settings['mosaic_filepath'],
    'padding': settings['padding'],
    'crs': crs,
    **settings['mosaic_options'],
}

# Prepare Data

The first part is to prepare the data (AKA extract/transform/load).


## Get filepaths


In [13]:
# Get the referenced filepaths, divided according to camera number.
# The filename pattern is "Geo <digits>_<camera number>.tif". The dot is escaped
# so it only matches a literal "." rather than any character.
referenced_fps = {
    i: utils.discover_data(
        settings['referenced_images_dir'],
        ['tif', 'tiff'],
        pattern=rf'Geo\s\d+_{i}\.tif',
    )
    for i in range(3)
}

In [14]:
# Discover the raw image filepaths in the images directory, passing the
# 'tif', 'tiff', and 'raw' extensions to the search.
raw_fps = utils.discover_data(settings['images_dir'], ['tif', 'tiff', 'raw'])

## Train-Test Split

We split the data into training data (data that is georeferenced) and test data (data that is not georeferenced, or for which we don't use the georeferencing information when we're building the models).

We set the train size to some small number, because ideally the user only needs to georeference a couple of images manually.


In [15]:
# Split each camera's referenced filepaths into a training part and a test part.
referenced_fps_train = []
referenced_fps_test = []
for camera_num, train_size_i in settings['train_size'].items():
    camera_fps = referenced_fps[camera_num]

    if train_size_i == 0:
        # No training images requested for this camera: everything is test data.
        referenced_fps_test.append(camera_fps)
    else:
        train_part, test_part = train_test_split(
            camera_fps,
            train_size=train_size_i,
            random_state=settings['random_state'],
            shuffle=True,
        )
        referenced_fps_train.append(train_part)
        referenced_fps_test.append(test_part)


In [16]:
# Combine the per-camera pieces into one object each, renumbering the index from 0.
# NOTE(review): both names are rebound from lists to the concatenated objects, so
# this cell presumably fails if re-run without re-running the split cell first.
# NOTE(review): pd.concat raises on an empty list, which would happen for the
# training list if every train size were 0 — confirm whether that case matters.
referenced_fps_train = pd.concat(referenced_fps_train, ignore_index=True)
referenced_fps_test = pd.concat(referenced_fps_test, ignore_index=True)

## Combine Referenced and Raw


In [17]:
# Offset the raw-image index by the number of referenced test images so the two
# sets do not share index labels once combined.
# Note that every run of this cell applies the offset again.
raw_fps.index = raw_fps.index + referenced_fps_test.size

In [18]:
# Name the final training and test filepaths, then build the full set of
# filepaths the mosaic will be run on (optionally including the raw images).
fps_train = referenced_fps_train
fps_test = referenced_fps_test
fps = (
    pd.concat([referenced_fps_test, raw_fps])
    if settings['include_raw_images']
    else referenced_fps_test
)

## Preprocessing


### y values

We get the y values first because a model fitted to them is used to compute the X values.


In [19]:
# Preprocesser applied to the referenced GeoTIFF filepaths to obtain the y values.
preprocessing_pipeline_y = preprocess.GeoTIFFPreprocesser(crs=crs)

In [20]:
# Get the geo-transforms of the referenced images, for both the training and the test set.
# NOTE(review): the test set also goes through fit_transform, which re-fits the
# preprocesser on the test filepaths — confirm the fit is stateless, or use transform.
y_train = preprocessing_pipeline_y.fit_transform(fps_train)
y_test = preprocessing_pipeline_y.fit_transform(fps_test)

100%|████████████████████████████████████████| 120/120 [00:00<00:00, 190.39it/s]
100%|██████████████████████████████████████████| 41/41 [00:00<00:00, 209.30it/s]


### X values

We use the sensor (high-altitude balloon) positions to provide approximate georeferencing, which saves computational time when building the less-referenced mosaic.


In [21]:
# This is the pipeline for approximate georeferencing.
# The steps come from the project's predefined preprocessing steps and are
# wrapped in a scikit-learn Pipeline.
preprocessing_steps = pipelines.PreprocessingPipelines.nitelite_preprocessing_steps(
    crs=crs,
    use_approximate_georeferencing=settings['use_approximate_georeferencing'],
)
preprocessing_pipeline = Pipeline(preprocessing_steps)
# Bare expression: displays the pipeline as the cell output.
preprocessing_pipeline

In [22]:
# Fit the pipeline on the training filepaths and their geo-transforms.
# The metadata__* keyword arguments use scikit-learn's <step>__<parameter> naming —
# presumably handing the log filepaths to the 'metadata' step's fit; confirm.
preprocessing_pipeline = preprocessing_pipeline.fit(
    fps_train,
    y_train,
    metadata__img_log_fp=settings['img_log_fp'],
    metadata__imu_log_fp=settings['imu_log_fp'],
    metadata__gps_log_fp=settings['gps_log_fp'],
)

In [23]:
# Get out the X values we'll use for the mosaic:
# X_train for the training filepaths, X for the filepaths the mosaic is run on.
X_train = preprocessing_pipeline.transform(fps_train)
X = preprocessing_pipeline.transform(fps)



In [24]:
# Truncate according to n_loops; when it is not set, every row of X is used.
n_loops = X.index.size if settings['n_loops'] is None else settings['n_loops']

# Keep a copy of the untruncated frame before cutting X down.
X_full = X.copy()
X = X.iloc[:n_loops]

In [25]:
# Drop the y values and filepaths that were filtered out
# Keep only the filepaths whose index labels are still present in X.
fps = fps.loc[X.index]
# Rows of X whose index labels also appear in y_test.
X_test = X.loc[X.index.isin(y_test.index)]
# Restrict y_test to those same labels.
y_test = y_test.loc[X_test.index]

# Build the Mosaic


## Initialization


In [26]:
# Create the mosaic object from the keyword arguments assembled in the settings section.
less_reffed_mosaic = mosaic.LessReferencedMosaic(**constructor_kwargs)

In [27]:
# This creates the dataset and adds the referenced mosaic.
# The training geo-transforms are passed as X, and the untruncated
# approximate positions (X_full) are passed as approx_y.
less_reffed_mosaic.fit(
    X=y_train,
    approx_y=X_full,
)

Found checkpoint file. Will fast forward to i=1601


## Run


In [29]:
# Run the mosaic on the (possibly truncated) X and collect the predictions.
y_pred = less_reffed_mosaic.predict(X)

 10%|███▉                                  | 1613/15806 [00:18<02:46, 85.41it/s]


UnboundLocalError: cannot access local variable 'return_code' where it is not associated with a value

# Evaluate


## Return Codes


In [None]:
# Tally the return codes logged by the mosaic and compare the overall success
# rate (relative to n_loops) with the requested one.
return_codes = pd.Series([entry['return_code'] for entry in less_reffed_mosaic.logs])
n_good = (return_codes == 'success').sum()
n_bad = n_loops - n_good
success_rate = n_good / n_loops

print(
    f"{n_bad} failures, for a success rate of {success_rate:.2g}. "
    f"Requested success rate is {settings['acceptance_fraction']}. "
    f'Return codes are...\n{return_codes.value_counts()}'
)
if success_rate < settings['acceptance_fraction']:
    print('Success rate not met!')


In [None]:
# Repeat the success-rate check, counting only the joins that were attempted,
# i.e. leaving out the entries whose return code is 'dark_frame'.
attempted_return_codes = return_codes.loc[return_codes != 'dark_frame']
n_attempted = len(attempted_return_codes)
n_good = (attempted_return_codes == 'success').sum()
n_bad = n_attempted - n_good
attempted_success_rate = n_good / n_attempted

print(
    f"Of the {n_attempted} attempted joins, {n_bad} failures, for a success rate of {attempted_success_rate:.2g}. "
    f"Requested success rate is {settings['attempted_acceptance_fraction']}. "
    f'Return codes are...\n{attempted_return_codes.value_counts()}'
)
if attempted_success_rate < settings['attempted_acceptance_fraction']:
    print('Success rate not met!')

In [None]:
# Find the first attempted join that did not succeed.
# The reported value is the index label of that entry. Filtering out dark frames
# keeps the labels of return_codes, so this is the entry's position in the logs
# (not its position within the filtered series).
# When every attempted join succeeded, nothing is reported as a failure.
is_failure = attempted_return_codes != 'success'
if is_failure.any():
    # idxmax on a boolean series gives the label of the first True entry.
    i_first_failure = is_failure.idxmax()
    print(f'First failure at i={i_first_failure}')
else:
    i_first_failure = None
    print('No failures among the attempted joins.')

## Locations of Images


In [None]:
# Distance of each predicted image center from the center stored on the
# 'order' step of the preprocessing pipeline.
center_coords = preprocessing_pipeline.named_steps['order'].center_
offsets_pred = y_pred[['x_center', 'y_center']] - center_coords
y_pred['d_to_center'] = np.linalg.norm(offsets_pred, axis=1)

In [None]:
# Align the predictions with the test set: take the rows of y_pred at the index
# labels of y_test, in that order. Labels missing from y_pred produce NaN rows.
y_pred_for_eval = y_pred.reindex(y_test.index)

In [None]:
# Map of image centers: all predictions colored by distance to the center,
# with the referenced test images (predicted and manual) drawn on top.
fig, ax = plt.subplots(figsize=(10, 10))

s = ax.scatter(
    y_pred['x_center'],
    y_pred['y_center'],
    c=y_pred['d_to_center'],
    cmap='viridis_r',
    label='predicted',
    s=10,
)
plt.colorbar(s)

# The two referenced sets share everything except data, color, and label.
referenced_layers = [
    (y_pred_for_eval, palette[1], 'referenced-predicted'),
    (y_test, palette[3], 'referenced-test'),
]
for layer_data, layer_color, layer_label in referenced_layers:
    ax.scatter(
        layer_data['x_center'],
        layer_data['y_center'],
        color=layer_color,
        label=layer_label,
        s=50,
    )

ax.set_xlabel('x_center [m]')
ax.set_ylabel('y_center [m]')
ax.legend()
ax.set_aspect('equal')

## Georeference Accuracy


In [None]:
# Estimate the consistency with the manual geotransforms
# Element-wise difference between the manual (test) values and the predictions.
y_err = y_test - y_pred_for_eval
# Carry over the predicted distance to center rather than a difference of distances.
y_err['d_to_center'] = y_pred_for_eval['d_to_center']
# Magnitude of the error in the image-center position.
y_err['offset'] = np.sqrt(y_err['x_center']**2. + y_err['y_center']**2.)

In [None]:
# Plot the center offset against the distance to the center.
fig, ax = plt.subplots()

sns.scatterplot(data=y_err, x='d_to_center', y='offset', ax=ax)

# Anchor the y-axis at zero while keeping the current upper limit.
y_upper = ax.get_ylim()[1]
ax.set_ylim(0, y_upper)

## Memory Usage


In [None]:
# Collect the memory snapshots from the log entries that have one.
snapshots = []
for log in less_reffed_mosaic.logs:
    if 'snapshot' in log:
        snapshots.append(log['snapshot'])

In [None]:
# Compare every snapshot with the first one, grouped by 'lineno', and total
# the size differences of each comparison.
diffs = [snapshot.compare_to(snapshots[0], 'lineno') for snapshot in snapshots]
sizes = np.array([sum(stat.size_diff for stat in diff) for diff in diffs])
sizes_GB = sizes / 1024**3

# Iteration assigned to each snapshot: consecutive multiples of memory_snapshot_freq.
iterations = np.arange(sizes_GB.size) * less_reffed_mosaic.memory_snapshot_freq

In [None]:
# Memory growth relative to the first snapshot, as a function of iteration.
fig, ax = plt.subplots(figsize=(10, 5))

ax.scatter(iterations, sizes_GB)

ax.set_xlabel('iteration')
ax.set_ylabel('total memory usage (GB)')

# Show the full range of iterations and anchor the y-axis at zero.
ax.set_xlim(0, n_loops)
y_upper = ax.get_ylim()[1]
ax.set_ylim(0, y_upper)


In [None]:
# Memory budget used for the extrapolation, in GB
# (presumably the RAM of the machine running the mosaic — confirm).
memory_budget_GB = 16.

# Memory growth at the first checkpoint, interpolated from the snapshot sizes,
# then converted into a growth per iteration.
delta_mem_per_checkpoint = float(
    scipy.interpolate.interp1d(iterations, sizes_GB)(less_reffed_mosaic.checkpoint_freq)
)
delta_mem_per_iteration = delta_mem_per_checkpoint / less_reffed_mosaic.checkpoint_freq

if delta_mem_per_iteration > 0:
    # Number of iterations after which the growth alone would fill the budget.
    highest_possible_iteration = memory_budget_GB / delta_mem_per_iteration
    print(
        f'The memory increases at an average of {delta_mem_per_iteration:.2g} GB per iteration,\n'
        f'    and the highest possible iteration is expected to be {int(highest_possible_iteration)}'
    )
else:
    # Zero, negative, or undefined growth: the division would give an infinite
    # or meaningless value, and int() cannot convert it.
    highest_possible_iteration = np.inf
    print(
        f'No positive memory growth measured ({delta_mem_per_iteration:.2g} GB per iteration),\n'
        f'    so no highest possible iteration is estimated.'
    )



In [None]:
# Size, in GB, of every entry in the comparison for the last snapshot.
snap_sizes_GB = np.array([stat.size for stat in diffs[-1]]) / 1024**3

In [None]:
# Histogram of per-entry memory usage on log-log axes.
fig, ax = plt.subplots(figsize=(10, 5))

# Logarithmically spaced bin edges spanning the strictly positive sizes.
positive_snap_sizes_GB = snap_sizes_GB[snap_sizes_GB > 0]
log_snap_sizes_GB = np.log10(positive_snap_sizes_GB)
bins = np.logspace(
    np.nanmin(log_snap_sizes_GB),
    np.nanmax(log_snap_sizes_GB),
    16,
)

ax.hist(snap_sizes_GB, bins)

ax.set_xscale('log')
ax.set_yscale('log')
ax.set_xlabel('memory usage [GB]')
ax.set_ylabel('count of objects')

## Correlation Coefficient Analysis


### Training Data Correlation Coefficient


In [None]:
# Perform scoring
# score() returns the overall score for the training X values; the mosaic's
# scores_ attribute is read afterwards (presumably the per-image scores — confirm).
total_score_train = less_reffed_mosaic.score(X_train)
score_train = less_reffed_mosaic.scores_


In [None]:
# Distance of each training image center from center_coords.
offsets_train = y_train[['x_center', 'y_center']] - center_coords
y_train['d_to_center'] = np.linalg.norm(offsets_train, axis=1)

In [None]:
# Training-set score as a function of distance to the center.
fig, ax = plt.subplots(figsize=(10, 5))

ax.scatter(y_train['d_to_center'], score_train)

# The y-axis is fixed to the [0, 1] range.
ax.set_ylim(0, 1)

ax.set_xlabel('distance to center [m]')
ax.set_ylabel('normalized correlation coefficient')
ax.set_title('training set accuracy')