In [148]:
# import libraries related to querying links and downloading files from the web
# standard library
import ast
import csv
import os
import urllib.request

# third-party
import IPython
import ipywidgets as widgets
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from IPython.display import Video
from IPython.display import display
from tqdm import tqdm


In [116]:
lapalma_url = 'http://tsih3.uio.no/lapalma/'
# Fetch the top-level directory listing of the archive (one request, not recursive).
# The timeout keeps the cell from blocking forever on an unresponsive server, and
# raise_for_status() reports an HTTP error here instead of silently producing an
# empty list of years.
r = requests.get(lapalma_url, timeout=30)
r.raise_for_status()
soup = BeautifulSoup(r.text, 'html.parser')
# keep the links that point to a subdirectory (href ends with '/') and whose
# name starts with '20', i.e. the year folders; ignore the rest
obs_years = [
    a['href']
    for a in soup.find_all('a', href=True)
    if a['href'].endswith('/') and a['href'].startswith('20')
]
# print the observation years without the trailing slash
for year in obs_years:
    print(year[:-1])

2013
2014
2015
2016
2017
2018
2019
2020
2021
2022


In [117]:
# collect the subdirectories listed under every year in the obs_years list
obs_dates = []
for subdir in obs_years:
    # timeout: do not hang on an unresponsive server;
    # raise_for_status: do not silently skip a year whose listing failed
    r = requests.get(lapalma_url + subdir, timeout=30)
    r.raise_for_status()
    soup = BeautifulSoup(r.text, 'html.parser')
    # prefix each link with its year folder, e.g. '2013/' + '2013-06-30/'
    obs_dates.extend(
        subdir + a['href']
        for a in soup.find_all('a', href=True)
        if a['href'].endswith('/')
    )
# select the entries of the form 'year/date/' (they start with '20' and contain
# exactly two slashes) and ignore the rest
obs_dates = [s for s in obs_dates if s.startswith('20') and s.count('/') == 2]
# print the first, last and total number of directories
if obs_dates:
    print(f'first: {obs_dates[0][:-1]}, last: {obs_dates[-1][:-1]}, total: {len(obs_dates)}')
else:
    # indexing [0] / [-1] on an empty list would raise IndexError
    print('No observation dates found')

first: 2013/2013-06-30, last: 2022/2022.09.08, total: 111


In [118]:
# define a function to print all the observing dates for a given year in the obs_dates list
def print_obs_dates(year):
    """Print a summary and a numbered list of the observing dates of one year.

    Reads the module-level ``obs_dates`` list, whose entries have the form
    ``'2013/2013-06-30/'`` (year folder, date folder, trailing slash).

    Parameters
    ----------
    year : str
        Prefix that an entry must start with, e.g. ``'2015'``.

    Returns
    -------
    None
        The dates are only printed, never returned.
    """
    obs_dates_year = [s for s in obs_dates if s.startswith(year)]
    # Without a match the [0] / [-1] lookups below would raise IndexError,
    # so report the empty result the same way find_obs_dates does.
    if not obs_dates_year:
        print('No observation dates found')
        return None
    print(f'first: {obs_dates_year[0][:-1]}, last: {obs_dates_year[-1][:-1]}, total: {len(obs_dates_year)}')
    # print an enumerated list of the observing dates, numbered from 01
    for i, entry in enumerate(obs_dates_year, start=1):
        # entry[5:-1] removes the year in the front ('20??/') and the trailing slash
        print(f'{i:02d}: {entry[5:-1]}')
    return None

In [119]:
# define a function that takes a partial string and prints all the entries of the obs_dates list that match it
def find_obs_dates(partial_string):
    """Print a numbered list of the observing dates that contain ``partial_string``.

    Reads the module-level ``obs_dates`` list, whose entries have the form
    ``'2013/2013-06-30/'``. Some date folders use dots instead of dashes
    (e.g. ``'2022/2022.09.08/'``), so an entry also matches when the query is
    found after replacing its dots with dashes. The printed name is the folder
    name exactly as it appears in ``obs_dates``.

    Parameters
    ----------
    partial_string : str
        Text to search for, e.g. ``'2015-09-'``.

    Returns
    -------
    None
        The matches (or a "not found" message) are only printed.
    """
    matches = [
        s for s in obs_dates
        if partial_string in s or partial_string in s.replace('.', '-')
    ]
    # print an enumerated list of the observing dates, numbered from 01
    for i, entry in enumerate(matches, start=1):
        # entry[5:-1] removes the year in the front ('20??/') and the trailing slash
        print(f'{i:02d}: {entry[5:-1]}')
    # if no match is found, print a message
    if not matches:
        print('No observation dates found')
    return None

In [120]:
# look up the observing dates that contain '2020-04-'
find_obs_dates('2020-04-')

No observation dates found


In [121]:
# write a function that takes obs_dates as input and returns a list of dates in the format 20??-??-??
def get_obs_dates(obs_dates):
    """Convert entries such as '2022/2022.09.08/' into dates such as '2022-09-08'.

    Each entry loses its first five characters (the year folder and its slash)
    and its last character (the trailing slash); any dot in what remains is
    replaced with a dash. The input is not modified.
    """
    cleaned = []
    for entry in obs_dates:
        # drop the year in the front and the trailing slash
        date_part = entry[5:-1]
        # if the separator is not a dash, replace it with a dash
        cleaned.append(date_part.replace('.', '-'))
    return cleaned

In [122]:
# dates of all observing directories, with the year folder and trailing slash removed and dots replaced by dashes
obs_dates_list = get_obs_dates(obs_dates)

In [123]:
# spot-check the cleaned form of entry 110 (compare with the raw entry in the next cell)
obs_dates_list[110]

'2022-09-08'

In [124]:
# raw directory entry 110, before the year folder is stripped and dots are replaced
obs_dates[110]

'2022/2022.09.08/'

In [125]:
# define a function that takes a url and a file extension as input and returns a list of files with the given extension; if no files are found it searches the subdirectories
def get_files(url, file_extension):
    """Return the URLs of the links on a listing page that end with ``file_extension``.

    The page at ``url`` is downloaded once and parsed. Every link whose href
    ends with ``file_extension`` is returned as ``url + href`` (plain string
    concatenation, so ``url`` should end with '/'). Only when the page itself
    has no such link are the links ending with '/' followed, and the search
    repeated in each of them.

    Parameters
    ----------
    url : str
        Address of the directory listing to inspect.
    file_extension : str
        Suffix that the href must end with, e.g. ``'.mp4'``.

    Returns
    -------
    list of str
        The matching URLs; empty when nothing is found.
    """
    # NOTE(review): no timeout is passed, so an unresponsive server blocks this call.
    r = requests.get(url)
    soup = BeautifulSoup(r.text, 'html.parser')
    hrefs = [a['href'] for a in soup.find_all('a', href=True)]
    # get the list of files with the given extension
    files = [url + href for href in hrefs if href.endswith(file_extension)]
    # if the list is empty, recursively search the subdirectories; the links
    # parsed above are reused instead of downloading the same page again
    if not files:
        for href in hrefs:
            if href.endswith('/'):
                files.extend(get_files(url + href, file_extension))
    return files

In [126]:
# number of observing dates found
len(obs_dates_list)

111

In [127]:
# Build the dictionary video_links: for every entry of obs_dates the key is the
# observing date (dots replaced by dashes) and the value is the list of .mp4 and
# .mov links found for it, or the string 'File not found' when there are none.
# The dictionary is cached in a csv file so the archive is only crawled when
# that file does not exist yet.
file_video_links = 'video_links.csv'

if not os.path.isfile(file_video_links):
    video_links = {}
    for obs_date in tqdm(obs_dates):
        # obs_date already ends with '/', so it is joined without an extra separator
        date_url = lapalma_url + obs_date
        # get the list of files with either .mp4 or .mov extension
        files = get_files(date_url, '.mp4') + get_files(date_url, '.mov')
        # remove the year in the front and the trailing slash, and replace the dots with dashes
        key = obs_date[5:-1].replace('.', '-')
        # if the list is empty, store the 'File not found' marker instead
        video_links[key] = files if files else 'File not found'

    # save video_links in a csv file, once, after the whole crawl has finished;
    # newline='' is what the csv module expects for files it writes
    with open(file_video_links, 'w', newline='') as csv_file:
        w = csv.writer(csv_file)
        for key, val in video_links.items():
            w.writerow([key, val])

else:
    # load video_links from the csv file:
    video_links = {}
    with open(file_video_links, 'r', newline='') as csv_file:
        for row in csv.reader(csv_file):
            # tolerate blank lines in the cache file
            if not row:
                continue
            k, v = row
            # lists were written as their text representation ("['url1', 'url2']");
            # turn them back into real lists and keep the 'File not found' marker as is
            video_links[k] = ast.literal_eval(v) if v.startswith('[') else v
    print(file_video_links+' loaded.')

100%|██████████| 111/111 [00:12<00:00,  9.13it/s]

video_links.csv loaded.





In [128]:
# Build the dictionary image_links: for every entry of obs_dates the key is the
# observing date (dots replaced by dashes) and the value is the list of .jpg
# links found for it, or the string 'File not found' when there are none.
# The dictionary is cached in a csv file so the archive is only crawled when
# that file does not exist yet.
file_image_links = 'image_links.csv'

if not os.path.isfile(file_image_links):
    image_links = {}
    for obs_date in tqdm(obs_dates):
        # obs_date already ends with '/', so it is joined without an extra separator;
        # get the list of files with the .jpg extension
        files = get_files(lapalma_url + obs_date, '.jpg')
        # remove the year in the front and the trailing slash, and replace the dots with dashes
        key = obs_date[5:-1].replace('.', '-')
        # if the list is empty, store the 'File not found' marker instead
        image_links[key] = files if files else 'File not found'

    # save the image_links in a csv file; the with-block closes the file, and
    # newline='' is what the csv module expects for files it writes
    with open(file_image_links, 'w', newline='') as csv_file:
        w = csv.writer(csv_file)
        for key, val in image_links.items():
            w.writerow([key, val])

else:
    # load the image_links from the csv file:
    image_links = {}
    with open(file_image_links, 'r', newline='') as csv_file:
        for row in csv.reader(csv_file):
            # tolerate blank lines in the cache file
            if not row:
                continue
            k, v = row
            # lists were written as their text representation ("['url1', 'url2']");
            # turn them back into real lists and keep the 'File not found' marker as is
            image_links[k] = ast.literal_eval(v) if v.startswith('[') else v
    print(file_image_links+' loaded.')


image_links.csv loaded.


In [129]:
# write a function that plays the video file in the url
def play_mp4(url, width=720, height=480):
    """Return an IPython ``Video`` object for the file at ``url``.

    Parameters
    ----------
    url : str
        Address of the video, passed to ``Video`` unchanged.
    width, height : int, optional
        Size handed to ``Video``; the defaults (720 x 480) are the values
        that were previously fixed.

    Returns
    -------
    IPython.display.Video
        The player is shown when this object is the last expression of a cell
        (or is passed to ``display``).
    """
    return Video(url, width=width, height=height)

In [130]:
# define a function called video that takes a date string and prints an enumerated list of the video files for that date from the video_links dictionary
def video(date_string):
    """Print a numbered list (starting at 00) of the video links stored for a date.

    Reads the module-level ``video_links`` dictionary. The printed number is
    the index of the link in ``video_links[date_string]``.

    Parameters
    ----------
    date_string : str
        Key of ``video_links``, e.g. ``'2015-09-09'``.

    Returns
    -------
    None
        The links (or a "not found" message) are only printed.
    """
    links = video_links[date_string] if date_string in video_links else None
    # Dates without videos are stored as the string 'File not found'; looping
    # over that string would print it one character per line, so every
    # string or empty value is reported as "nothing found" instead.
    if not links or isinstance(links, str):
        print('No video files found')
        return None
    for i, video_link in enumerate(links):
        print(f'{i:02d}: {video_link}')
    return None

In [131]:
# list the observing dates that contain '2015-09-0' (days 01-09 of September 2015)
# alternative query: find_obs_dates('2020-09-')
find_obs_dates('2015-09-0')

01: 2015-09-05
02: 2015-09-09


In [132]:
# list the video links stored for 2015-09-09
# alternative query: video('2020-09-29')
video('2015-09-09')

00: http://tsih3.uio.no/lapalma/2015/2015-09-09//Bz+Bh_09Sep2015.mp4
01: http://tsih3.uio.no/lapalma/2015/2015-09-09//Bz+Bh_09Sep2015_B250G.mp4
02: http://tsih3.uio.no/lapalma/2015/2015-09-09//hacore+blos+sji2796_3pan_2015-09-09_075958.mp4


In [133]:
# play the video with index 1 in the list printed by video('2015-09-09')
# alternative: play_mp4(video_links['2020-09-29'][4])
play_mp4(video_links['2015-09-09'][1])

# Creating the database

In [134]:
# One hand-written example record for the database table.
# (pandas is imported as pd in the imports cell at the top of the notebook.)
# NOTE(review): 'url' holds a date here, not a link - looks like a placeholder; confirm.
raw_data = {
    'ID': ['2015-09-09 01'],
    'date': ['2015-09-09'],
    'url': ['2015-09-09'],
    'Target': ['Sunspot'],
    'Polarimetry': [False],
    'Instruments':['CRISP+CHROMIS'],
    'Photosphere': [False],
    'Chromosphere': [True],
    'Flux balance': [42],
    'Dynamism': ['Quiet'],
    }

# each key becomes a column; each one-element list becomes the single row
df = pd.DataFrame(raw_data)
# To look at the table use display(df) from IPython instead of print(df).

In [None]:
# Read https://ipywidgets.readthedocs.io/en/8.0.2/examples/Widget%20List.html for more information.

In [135]:
# Create a pandas-table of all the movies in the database:
video_dates = list(video_links.keys())
ids = []
dates = []
urls = []
for date in video_dates:
    links = video_links[date]
    # Dates without videos hold the string 'File not found'; skip them so the
    # string is not expanded into one bogus row per character.
    if isinstance(links, str):
        continue
    # Every id is the date followed by the index of the video within that day
    for movie, url in enumerate(links):
        ids.append(date+'_'+str(movie))
        dates.append(date)
        urls.append(url)

# Creating the container for the database:
from_links = {
    'ID': ids,
    'date': dates,
    'url': urls,
    # 'Target' is still to be filled in: start with empty strings.
    # (np.empty_like does not initialise its result, so its contents are not guaranteed.)
    'Target': [''] * len(dates),
    }
db_from_links = pd.DataFrame(from_links)

# Display the database:
display(db_from_links)

Unnamed: 0,ID,date,url,Target
0,2013-06-30_0,2013-06-30,http://tsih3.uio.no/lapalma/2013/2013-06-30//halpha_SDO_8pan_2013-06-30_091550.mp4,
1,2013-06-30_1,2013-06-30,http://tsih3.uio.no/lapalma/2013/2013-06-30//halpha_scan_30Jun2013_giant_tornado.mp4,
2,2013-06-30_2,2013-06-30,http://tsih3.uio.no/lapalma/2013/2013-06-30//wb6563_30Jun2013_ff_framesel_1s.mp4,
3,2013-06-30_3,2013-06-30,http://tsih3.uio.no/lapalma/2013/2013-06-30//./wb_6563_2013-06-30T09:15:50_scans=0-2133_histoopt.mp4,
4,2013-06-30_4,2013-06-30,http://tsih3.uio.no/lapalma/2013/2013-06-30//./wb_6563_2013-06-30T09:15:50_scans=0-2133_minmax.mp4,
...,...,...,...,...
3911,2022-09-08_10,2022-09-08,http://tsih3.uio.no/lapalma/2022/2022.09.08//./08:40:11/./Chromis-N_quick_2022-09-08_08:40:11.mov,
3912,2022-09-08_11,2022-09-08,http://tsih3.uio.no/lapalma/2022/2022.09.08//./08:40:11/./Chromis-N_quick_2022-09-08_08:40:11_3934_3934_+0.mov,
3913,2022-09-08_12,2022-09-08,http://tsih3.uio.no/lapalma/2022/2022.09.08//./08:40:11/./Chromis-N_quick_2022-09-08_08:40:11_3934_3934_+657.mov,
3914,2022-09-08_13,2022-09-08,http://tsih3.uio.no/lapalma/2022/2022.09.08//./08:40:11/./Chromis-N_quick_2022-09-08_08:40:11_3934_3934_-657.mov,


In [151]:
# Drop-down menu listing every date in video_dates; the last entry of the
# list is selected initially.
video_dates_widget = widgets.Dropdown(
    description='Dates:',
    options=video_dates,
    value=video_dates[-1],
    disabled=False,
)
display(video_dates_widget)

Dropdown(description='Dates:', index=108, options=('2013-06-30', '2013-09-01', '2014-09-09', '2014-09-15', '20…

In [152]:
# Display all the rows of the table whose date is the one currently selected
# in the drop-down (re-run this cell after changing the selection):
selected_date = video_dates_widget.value
is_selected_date = db_from_links['date'] == selected_date
display(db_from_links[is_selected_date])

Unnamed: 0,ID,date,url,Target
3665,2022-09-17_0,2022-09-17,http://tsih3.uio.no/lapalma/2022/2022-09-17//halpha+blos+AIA+HMI_6pan_2022.09.17_102405.mp4,
3666,2022-09-17_1,2022-09-17,http://tsih3.uio.no/lapalma/2022/2022-09-17//halpha+blos+AIA_6pan_2022.09.17_102405.mp4,
3667,2022-09-17_2,2022-09-17,http://tsih3.uio.no/lapalma/2022/2022-09-17//halpha+blos_3pan_2022.09.17_102405.mp4,
3668,2022-09-17_3,2022-09-17,http://tsih3.uio.no/lapalma/2022/2022-09-17//halpha+blos_3pan_2022.09.17_125133.mp4,
3669,2022-09-17_4,2022-09-17,http://tsih3.uio.no/lapalma/2022/2022-09-17//halpha_3pan_2022-09-17_102405.mp4,
...,...,...,...,...
3764,2022-09-17_99,2022-09-17,http://tsih3.uio.no/lapalma/2022/2022-09-17//./14:34:13/./Chromis-N_quick_2022-09-17_14:34:13_3969_3969_+0.mov,
3765,2022-09-17_100,2022-09-17,http://tsih3.uio.no/lapalma/2022/2022-09-17//./14:34:13/./Crisp-R_quick_2022-09-17_14:34:13.mov,
3766,2022-09-17_101,2022-09-17,http://tsih3.uio.no/lapalma/2022/2022-09-17//./14:34:13/./Crisp-R_quick_2022-09-17_14:34:13_6563_6563_+0.mov,
3767,2022-09-17_102,2022-09-17,http://tsih3.uio.no/lapalma/2022/2022-09-17//./14:34:13/./Crisp-R_quick_2022-09-17_14:34:13_6563_6563_+800.mov,


In [138]:
# Write the example table df to disk without the index column.
# NOTE(review): this saves the one-row example frame, not db_from_links - confirm that is intended.
df.to_csv(path_or_buf='raw_data.csv', index=False)