In [None]:
import bs4
import requests
from datetime import datetime
from tqdm.auto import tqdm
from urllib.parse import urljoin
import re
from collections import defaultdict
import json

# <3 ChatGPT
def extract_cardinal_direction(text):
    """Return the first compass-point token found in ``text``, or ``None``.

    The search looks for one of the 16 compass abbreviations (N, NNE, NE, ...,
    NNW) standing alone between word boundaries. Matching is case-insensitive
    and the token is returned exactly as it appears in ``text``.
    """
    found = re.search(
        r'\b(N|NNE|NE|ENE|E|ESE|SE|SSE|S|SSW|SW|WSW|W|WNW|NW|NNW)\b',
        text,
        flags=re.IGNORECASE,
    )
    return found.group(1) if found else None
    
def extract_timestamp_and_urls(text):
    """Return the first and last frame timestamps referenced in ``text``.

    ``text`` is scanned for assignments of the form ``urls[<n>]=<value>;``.
    Every value containing a file name shaped like ``<8 digits>_<6 digits>.png``
    (presumably ``YYYYMMDD_HHMMSS`` -- confirm against the source pages)
    contributes that digit string as a timestamp.

    Args:
        text: Page source to scan.

    Returns:
        tuple: ``(first, last)`` timestamp strings in order of appearance.
        ``(None, None)`` is returned when ``text`` holds no ``urls[...]``
        entries or none of them carries a timestamped ``.png`` name.
    """
    url_pattern = re.compile(r'urls\[\d+\]=(.*?);', re.DOTALL)
    timestamp_pattern = re.compile(r'(\d{8}_\d{6})\.png')

    timestamps = []
    for url in url_pattern.findall(text):
        match = timestamp_pattern.search(url.strip(' "\''))
        # Entries without a timestamped PNG name are skipped rather than
        # crashing on ``None.group``.
        if match:
            timestamps.append(match.group(1))

    if not timestamps:
        # Nothing usable on the page: report that explicitly instead of
        # raising IndexError on an empty list.
        return None, None

    return timestamps[0], timestamps[-1]

In [None]:
def extract_event(cols, base_url=None):
    """Build one event record from a slice of catalog table cells.

    Args:
        cols: Sequence of table-cell tags. ``cols[0]`` and ``cols[2]`` supply
            the text joined into ``'datetime'``, ``cols[1]`` must contain the
            ``<a>`` link to the event page, and ``cols[3]`` holds the
            free-text comment.
        base_url: URL that the link's ``href`` is resolved against. When
            omitted, the notebook-level ``url`` variable is used, which is
            what this function originally relied on implicitly.

    Returns:
        dict: Keys ``datetime``, ``type``, ``faint``, ``narrow``, ``wide``,
        ``fast``, ``visible``, ``direction``, ``event_start_time`` and
        ``event_stop_time``.

    Raises:
        ValueError: If ``cols[1]`` contains no ``<a>`` element.
        requests.HTTPError: If the event page responds with an error status.
    """
    if base_url is None:
        # Backward compatibility with callers that depend on the global.
        base_url = url

    link = cols[1].find('a')
    if link is None:
        raise ValueError("event cell contains no link to an event page")

    comment = cols[3].text

    event = {}
    event['datetime'] = f"{cols[0].text} {cols[2].text}"
    event['type'] = link.text

    # Plain, case-sensitive substring flags derived from the comment text.
    event['faint'] = 'faint' in comment
    event['narrow'] = 'narrow' in comment
    event['wide'] = 'wide' in comment
    event['fast'] = 'fast' in comment
    event['visible'] = 'seen' not in comment
    event['direction'] = extract_cardinal_direction(comment)

    video = urljoin(base_url, link["href"]).strip()
    # A timeout stops a stalled server from hanging the whole scrape, and the
    # status check surfaces HTTP errors before the error page gets parsed.
    response = requests.get(video, timeout=30)
    response.raise_for_status()

    tstart, tend = extract_timestamp_and_urls(response.text)
    event['event_start_time'] = tstart
    event['event_stop_time'] = tend

    return event

In [None]:
def get_event_row(catalog_page) -> dict:
    """Collect the CME events listed on a catalog page.

    Every table row whose second cell links to text containing ``'CME'`` is
    split in two: the first six cells are passed to ``extract_event`` as the
    Stereo A record and the remaining cells as the Stereo B record.

    Args:
        catalog_page: URL of the catalog HTML page.

    Returns:
        dict: A ``defaultdict(list)`` with keys ``'stereo_a'`` and
        ``'stereo_b'``, each holding one event dict per matching row, in page
        order. Both keys are absent when no row matches.

    Raises:
        requests.HTTPError: If the catalog page responds with an error status.
    """
    # Fail fast on a stalled or erroring server instead of parsing an error page.
    response = requests.get(catalog_page, timeout=30)
    response.raise_for_status()

    soup = bs4.BeautifulSoup(response.text)
    rows = soup.find_all('tr')

    out = defaultdict(list)

    for row in tqdm(rows):
        cols = row.find_all('td')

        if len(cols) < 2:
            continue

        link = cols[1].find('a')

        if link is None or 'CME' not in link.text:
            continue

        # Stereo A
        a = extract_event(cols[:6])

        # Stereo B
        b = extract_event(cols[6:])

        # Append only after both halves were extracted so the two lists
        # stay index-aligned.
        out['stereo_a'].append(a)
        out['stereo_b'].append(b)

    return out

In [None]:
# Catalog page to scrape. `extract_event` also reads this module-level `url`
# when resolving event links, so it must be defined before scraping starts.
url = "https://cor1.gsfc.nasa.gov/catalog/cme/2014/Daniel_Hong_COR1_preliminary_event_list_2014-02.html"
# Scrape the page into a dict of event lists keyed 'stereo_a' / 'stereo_b'.
events = get_event_row(url)

In [None]:
import os
import shutil

In [None]:
# `glob` was used here without ever being imported, which raises NameError on
# a fresh kernel; import it alongside its first use.
from glob import glob

# Every entry directly under the onboard data directory; the loop further down
# uses each entry's base name as the date string to check file names against.
date_folders = glob("/media/josh/josh_tuf_a/data/fdl/2023/onboard/*")
# Destination for misplaced files; created up front so moves have a target.
os.makedirs("/media/josh/josh_tuf_a/data/fdl/2023/wrong", exist_ok=True)

In [None]:
# For each date folder, walk the directories whose path contains 'cor' and
# move out any file whose name starts with the folder's first four characters
# but does not contain the folder's full name.
for date_folder in date_folders:
    date = os.path.basename(date_folder)
    year = date[:4]

    for dirpath, dirnames, filenames in os.walk(date_folder):
        if 'cor' in dirpath:
            for file in filenames:
                same_year = file[:4] == year
                if same_year and date not in file:
                    shutil.move(os.path.join(dirpath, file), "/media/josh/josh_tuf_a/data/fdl/2023/wrong")

In [None]:
# Inspection cell: shows the base name of the last `date_folder` left behind by
# the loop above (the name is undefined if that loop never iterated).
os.path.basename(date_folder)

In [None]:
# Inspection cell: shows the last `date` value left behind by the loop above.
date