In [95]:
# import libraries related to querying links and downloading files from the web
import requests
import urllib.request
from bs4 import BeautifulSoup
import os
from IPython.display import Video

In [96]:
lapalma_url = 'http://tsih3.uio.no/lapalma/'
# recursively get all the subdirectories in the parent url directory
r = requests.get(lapalma_url)
soup = BeautifulSoup(r.text, 'html.parser')
obs_years = [a['href'] for a in soup.find_all('a', href=True) if a['href'].endswith('/')]
# choose the subdirs of the of form 20?? and ignore the rest
obs_years = [s for s in obs_years if s.startswith('20')]
# print the observation years withouth the trailing slash
for year in obs_years:
    print(year[:-1])

2013
2014
2015
2016
2017
2018
2019
2020
2021
2022


In [97]:
# recursively get all the subdirectories in the bs_years list
obs_dates = []
for subdir in obs_years:
    r = requests.get(lapalma_url + subdir)
    soup = BeautifulSoup(r.text, 'html.parser')
    obs_dates.extend([subdir + a['href'] for a in soup.find_all('a', href=True) if a['href'].endswith('/')])
# select the directories that are of the form 20??/20??-??-??/ and ignore the rest
obs_dates = [s for s in obs_dates if s.startswith('20') and s.count('/') == 2]
# print the first, last and total number of directories
print(f'first: {obs_dates[0][:-1]}, last: {obs_dates[-1][:-1]}, total: {len(obs_dates)}')

first: 2013/2013-06-30, last: 2022/2022.09.08, total: 111


In [98]:
# define a function to print all the observing dates for a given year in the obs_dates list
def print_obs_dates(year):
    obs_dates_year = [s for s in obs_dates if s.startswith(year)]
    print(f'first: {obs_dates_year[0][:-1]}, last: {obs_dates_year[-1][:-1]}, total: {len(obs_dates_year)}')
    # remove the trailing slash and the year in the front, and return the list in the form 20??-??-??
    obs_dates_year = [s[5:-1] for s in obs_dates_year]
    formatted_obs_dates_year = []
    # print an enumerated list of the observing dates
    for i, obs_date in enumerate(obs_dates_year):
       # obs_date = [s.replace('.', '-') for s in obs_date.split('/')]
        # print i with 2 digits and the observing date
        print(f'{i+1:02d}: {obs_date}')
        formatted_obs_dates_year.append(obs_date)
    return None

In [99]:
# define a function that takes a partial string and returns all the strings that match it from the obs_dates list
def find_obs_dates(partial_string):
    obs_dates_partial = [s for s in obs_dates if partial_string in s]
    # remove the trailing slash and the year in the front, and return the list in the form 20??-??-??
    obs_dates_partial = [s[5:-1] for s in obs_dates_partial]
    formatted_obs_dates_partial = []
    # print an enumerated list of the observing dates
    for i, obs_date in enumerate(obs_dates_partial):
        # print i with 2 digits and the observing date
        print(f'{i+1:02d}: {obs_date}')
        formatted_obs_dates_partial.append(obs_date)
    # if no match is found, print a message
    if len(obs_dates_partial) == 0:
        print('No observation dates found')
    return None

In [100]:
find_obs_dates('2020-04-')

No observation dates found


In [101]:
# write a function that takes obs_dates as input a returns a list of dates in the format 20??-??-??
def get_obs_dates(obs_dates):
    # remove the trailing slash and the year in the front, and return the list in the form 20??-??-??
    obs_dates = [s[5:-1] for s in obs_dates]
    # if the sepatator is not a dash, replace it with a dash
    obs_dates = [s.replace('.', '-') for s in obs_dates]
    return obs_dates

In [102]:
obs_dates_list = get_obs_dates(obs_dates)

In [103]:
obs_dates_list[110]

'2022-09-08'

In [104]:
obs_dates[110]

'2022/2022.09.08/'

In [105]:
# define a function that takes a url and a file extension as input and returns a list of files with the given extension, if the files are not founds it searches the subdirectories
def get_files(url, file_extension):
    # get the list of files with the given extension
    r = requests.get(url)
    soup = BeautifulSoup(r.text, 'html.parser')
    files = [url + a['href'] for a in soup.find_all('a', href=True) if a['href'].endswith(file_extension)]
    # if the list is empty, recursively search the subdirectories
    if not files:
        r = requests.get(url)
        soup = BeautifulSoup(r.text, 'html.parser')
        subdirs = [url + a['href'] for a in soup.find_all('a', href=True) if a['href'].endswith('/')]
        for subdir in subdirs:
            files.extend(get_files(subdir, file_extension))
    return files

In [106]:
len(obs_dates_list)

111

In [107]:
# for the obs_dates list, get the list of files with either .mp4 or .mov extension and save it as a dictionary wih the key being the observing date if the files are not founds then add a None value to the dictionary
video_links = {}
i = 0
for obs_date in obs_dates:
    # get the list of files with either .mp4 or .mov extension
    files = get_files(lapalma_url + obs_date + '/', '.mp4') + get_files(lapalma_url + obs_date + '/', '.mov')
    # if the list is not empty, save it as a dictionary wih the key being the observing date
    key = obs_date[5:-1]
    # replace the dots with dashes
    key = key.replace('.', '-')
    if files:
        video_links[key] = files
    # if the list is empty, add a None value to the dictionary
    else:
        video_links[key] = 'File not found'

In [108]:
image_links = {}
i = 0 
for obs_date in obs_dates:
    # get the list of files with either .mp4 or .mov extension
    files = get_files(lapalma_url + obs_date + '/', '.jpg')
    # if the list is not empty, save it as a dictionary wih the key being the observing date
    key = obs_date[5:-1]
    # replace the dots with dashes
    key = key.replace('.', '-')
    if files:
        image_links[key] = files
    # if the list is empty, add a None value to the dictionary
    else:
        image_links[key] = 'File not found'

In [109]:
# write a function that plays the mp4 file in the url
def play_mp4(url):
    return Video(url, width=720, height=480)

In [119]:
# define a function called video that takes a date string and prints an enumerated list of the video files for that date from the video_links dictionary
def video(date_string):
    # if the date is in the dictionary, print the list of video files
    if date_string in video_links:
        for i, video_link in enumerate(video_links[date_string]):
            print(f'{i:02d}: {video_link}')
    # if the date is not in the dictionary, print a message
    else:
        print('No video files found')
    return None

In [116]:
find_obs_dates('2020-09-')

01: 2020-09-18
02: 2020-09-19
03: 2020-09-20
04: 2020-09-27
05: 2020-09-28
06: 2020-09-29


In [120]:
video('2020-09-29')

00: http://tsih3.uio.no/lapalma/2020/2020-09-29//./nb_6173_2020-09-29T08:56:47_scans=0-74_stokes_corrected_im_-80_stokesV.mp4
01: http://tsih3.uio.no/lapalma/2020/2020-09-29//./nb_6563_2020-09-29T08:56:47_scans=0-74_corrected_im_+0.mp4
02: http://tsih3.uio.no/lapalma/2020/2020-09-29//./nb_6563_2020-09-29T08:56:47_scans=0-74_corrected_im_+1000.mp4
03: http://tsih3.uio.no/lapalma/2020/2020-09-29//./nb_6563_2020-09-29T08:56:47_scans=0-74_corrected_im_-1000.mp4
04: http://tsih3.uio.no/lapalma/2020/2020-09-29//./nb_8542_2020-09-29T08:56:47_scans=0-74_stokes_corrected_im_0_stokesI.mp4
05: http://tsih3.uio.no/lapalma/2020/2020-09-29//./wb_6563_2020-09-29T08:56:47_scans=0-74_corrected_im.mp4
06: http://tsih3.uio.no/lapalma/2020/2020-09-29//./Chromis-N_quick_2020-09-29_08:40:38_3969_3969_+0.mov
07: http://tsih3.uio.no/lapalma/2020/2020-09-29//./Chromis-N_quick_2020-09-29_08:40:38_3969_3969_+1560.mov
08: http://tsih3.uio.no/lapalma/2020/2020-09-29//./Chromis-N_quick_2020-09-29_08:56:47_3969_3969

In [121]:
play_mp4(video_links['2020-09-29'][1])