In [None]:
# Dylan H.
# This notebook collects & sorts still images from the Global Irminger Sea Array
# for the iceflow detection model

# Getting Irminger Data

Irminger data can be found in the following raw-data directories (one per deployment):

- [R00007](https://rawdata.oceanobservatories.org/files/GI01SUMO/R00007/instruments/dcl13/vics/imageData/)
- [R00009](https://rawdata.oceanobservatories.org/files/GI01SUMO/R00009/instruments/dcl13/vics/imageData/)
- [R00010](https://rawdata.oceanobservatories.org/files/GI01SUMO/R00010/cg_data/dcl13/vics/vics_host_downloaded/imageData/)
- [R00005](https://rawdata.oceanobservatories.org/files/GI01SUMO/R00005/cg_data/dcl13/vics-hourly/)
- [R00006](https://rawdata.oceanobservatories.org/files/GI01SUMO/R00006/cg_data/dcl13/vics/imageData/)

In [5]:
import os
import shutil
from pathlib import Path

In [None]:
# make dir for raw data if it doesn't exist
os.makedirs("raw_data", exist_ok=True)

# sources
sources = ["https://rawdata.oceanobservatories.org/files/GI01SUMO/R00007/instruments/dcl13/vics/imageData/",
           "https://rawdata.oceanobservatories.org/files/GI01SUMO/R00009/instruments/dcl13/vics/imageData/",
           "https://rawdata.oceanobservatories.org/files/GI01SUMO/R00010/cg_data/dcl13/vics/vics_host_downloaded/imageData/",
           "https://rawdata.oceanobservatories.org/files/GI01SUMO/R00005/cg_data/dcl13/vics-hourly/",
           "https://rawdata.oceanobservatories.org/files/GI01SUMO/R00006/cg_data/dcl13/vics/imageData/"
          ]
# pull raw data
for url in sources:
    !wget -r -np -e robots=off -P raw_data/ {url} -q --show-progress

rawdata.oceanobserv     [ <=>                ]  24.34K  --.-KB/s    in 0s      
rawdata.oceanobserv     [ <=>                ]  24.34K  --.-KB/s    in 0s      
rawdata.oceanobserv     [ <=>                ]  24.34K  --.-KB/s    in 0s      
rawdata.oceanobserv     [ <=>                ]  24.34K  --.-KB/s    in 0s      
rawdata.oceanobserv     [ <=>                ]  24.34K  --.-KB/s    in 0s      
rawdata.oceanobserv     [ <=>                ]   2.25K  --.-KB/s    in 0s      
rawdata.oceanobserv     [ <=>                ]  22.31K  --.-KB/s    in 0s      
rawdata.oceanobserv     [ <=>                ]  21.35K  --.-KB/s    in 0s      
rawdata.oceanobserv     [ <=>                ]  27.08K  --.-KB/s    in 0s      
rawdata.oceanobserv     [ <=>                ]  23.26K  --.-KB/s    in 0s      
rawdata.oceanobserv     [ <=>                ]  24.22K  --.-KB/s    in 0s      
rawdata.oceanobserv     [ <=>                ]  20.40K  --.-KB/s    in 0s      
rawdata.oceanobserv     [ <=>           

### Moving Data

In [None]:
# We're going to move all the images to a single directory for ease

raw_path = Path("./raw_data/rawdata.oceanobservatories.org/files/GI01SUMO")
log_file = Path("./relocation_log.txt")
# everything that is NOT a halo-cam photo ends up here
halo_images_filtered = Path("./image_data/halo_filtered")
# halo-cam photos are set aside here
halo_images = Path("./image_data/halo_cam_2")

# filename fragments that identify a halo-cam photo
# (these look at the halo and aren't really useful...)
halo_cam_filters = ("cam61A0399F.jpg", "cam37C1DD9F.jpg", ".cam2.jpg")


def unique_destination(directory, filename):
    """Return a path inside `directory` for `filename` that does not exist yet.

    If `directory / filename` is free it is returned unchanged; otherwise
    `_1`, `_2`, ... is appended to the stem until an unused name is found.
    The candidate always stays inside `directory`, so a renamed duplicate
    can never land in a different folder than the one requested.
    """
    directory = Path(directory)
    name = Path(filename)
    candidate = directory / name.name
    catcher = 1
    while candidate.exists():
        candidate = directory / f"{name.stem}_{catcher}{name.suffix}"
        catcher += 1
    return candidate


def relocate_images(source_root, halo_dir, other_dir, log, halo_markers=halo_cam_filters):
    """Move every .jpg found under `source_root` into one of two flat directories.

    Parameters
    ----------
    source_root : path-like
        Directory tree to walk. It should not contain `halo_dir` or `other_dir`.
    halo_dir : path-like
        Destination for files whose name contains any of `halo_markers`.
    other_dir : path-like
        Destination for all remaining .jpg files.
    log : writable text stream
        Receives one "Moved: src -> dst" line per relocated file.
    halo_markers : iterable of str
        Substrings (case-sensitive) that mark a file as a halo-cam photo.

    Returns
    -------
    tuple of int
        (number of halo-cam files moved, number of other files moved)
    """
    halo_dir = Path(halo_dir)
    other_dir = Path(other_dir)
    halo_moved = 0
    other_moved = 0

    for root, dirs, files in os.walk(source_root):
        # os.walk order is arbitrary; sorting keeps the _1/_2 duplicate suffixes reproducible
        dirs.sort()
        for file in sorted(files):
            if not file.lower().endswith(".jpg"):
                continue

            is_halo = any(marker in file for marker in halo_markers)
            target_dir = halo_dir if is_halo else other_dir

            src_file = Path(root) / file
            # catch-all for potential naming duplicates (resolved inside the target directory)
            dst_file = unique_destination(target_dir, file)

            # move the file
            shutil.move(str(src_file), str(dst_file))

            # generate a log file
            log.write(f"Moved: {src_file} -> {dst_file}\n")

            if is_halo:
                halo_moved += 1
            else:
                other_moved += 1

    return halo_moved, other_moved


# make the dir if it doesn't exist
halo_images_filtered.mkdir(parents=True, exist_ok=True)
halo_images.mkdir(parents=True, exist_ok=True)

# start logging (append mode, so earlier runs stay in the log)
with open(log_file, "a") as log:
    relocate_images(raw_path, halo_images, halo_images_filtered, log)


### Begin filtering by cruise schedule

**NOTE:** This step wasn't very useful for the image analysis, but we're keeping it in case we need to sort by datetime later.

In [None]:
# With the halo cams removed we can use the cruise schedule to only include relevant time-frame images
from datetime import datetime
import pandas as pd
from pathlib import Path
import shutil

# timestamp layout expected at the start of each image filename: yyyymmdd_hhmmss
TS_FORMAT = "%Y%m%d_%H%M%S"
TS_LENGTH = 15  # len("yyyymmdd_hhmmss")


def parse_image_timestamps(filenames, ts_format=TS_FORMAT, ts_length=TS_LENGTH):
    """Build a DataFrame of filenames plus the timestamp parsed from each name.

    Columns:
      filename - the name as given
      ts_str   - the first `ts_length` characters of the name
      ts       - `ts_str` parsed with `ts_format`; NaT when the prefix does not
                 parse, so one oddly named file cannot abort the whole cell
    """
    frame = pd.DataFrame({"filename": pd.Series(list(filenames), dtype=object)})
    frame["ts_str"] = frame["filename"].str.slice(0, ts_length)
    frame["ts"] = pd.to_datetime(frame["ts_str"], format=ts_format, errors="coerce")
    return frame


def within_any_range(timestamps, ranges):
    """Return a boolean Series: True where a timestamp lies in any (start, end) range.

    Both bounds are inclusive. NaT entries never match.
    """
    # initialize false matches, then swap to True for every range that lines up
    hits = pd.Series(False, index=timestamps.index)
    for start, end in ranges:
        hits |= timestamps.between(start, end)
    return hits


# images path
images_path = Path("./image_data/halo_filtered")
# destination path
cruise_filtered = Path("./image_data/cruise_filtered")
# make directory
cruise_filtered.mkdir(parents=True, exist_ok=True)

# set-up timeframes (start, end) in TS_FORMAT
timeframes = [
    ("20200808_121800", "20200904_170200"),
    ("20210803_160600", "20210826_080500"),
    ("20220620_200000", "20220716_083500"),
    ("20230828_100000", "20230921_120800"),
    ("20240602_160200", "20240702_085000")
]

# toss files into a list; match the extension case-insensitively, same as the relocation step
files = sorted(p for p in images_path.glob("*") if p.name.lower().endswith(".jpg"))

# make a dataframe with the parsed timestamp of every image
df = parse_image_timestamps(f.name for f in files)

# declare ranges from timeframes
ranges = [
    (pd.to_datetime(s, format=TS_FORMAT),
     pd.to_datetime(e, format=TS_FORMAT))
    for s, e in timeframes
]

# flag the photos that fall inside any cruise window
mask = within_any_range(df['ts'], ranges)
# gather photos that align
filtered = df[mask].reset_index(drop=True)
# gather photos that don't align (including names with no parseable timestamp)
nonmatch = df[~mask].reset_index(drop=True)

# move images that matched into filtered directory
for filename in filtered['filename']:
    src = images_path / filename
    dst = cruise_filtered / filename
    # str() keeps this consistent with the relocation step and safe on older Python versions
    shutil.move(str(src), str(dst))

