In [1]:
# import libraries related to querying links and downloading files from the web
from datetime import datetime
import glob
from IPython.display import display, clear_output, Video
import importlib
import ipywidgets as widgets
import pandas as pd
from pipmag import pipmag as pm
import re
import os

In [2]:
# Re-import pipmag so that edits made to the module since the first import
# take effect in this kernel. importlib.reload returns the reloaded module,
# which is rebound to the same alias.
pm = importlib.reload(pm)
print('Imported updated pipmag.py')

Imported updated pipmag.py


### ⚙️ Generating the dataframe from SST quicklooks

In [3]:
# Retrieve the years for which the La Palma Observatory has data at UiO.
# The result is only stored here (nothing is displayed by this cell's code);
# it is passed to pm.get_obs_dates in the next cell.
obs_years = pm.get_obs_years()

In [4]:
# Collect the observing dates for every available year, then flatten them
# into a single list via the pipmag helper.
obs_dates = pm.get_obs_dates(obs_years)
obs_dates_list = pm.get_obs_dates_list(obs_dates)
# Summarise the list: first entry, last entry and how many dates there are.
print(
    f'first entry: {obs_dates_list[0]}\n'
    f'last entry : {obs_dates_list[-1]}\n'
    f'total observing dates: {len(obs_dates_list)}'
)

first entry: 2013-06-30
last entry : 2023-05-16
total observing dates: 122


In [5]:
# Find the most recent cached media-links file matching the wildcard pattern
# in the data directory. The next cell treats a None result as "no cache
# available" and rebuilds the link list in that case.
latest_all_media_links_file = pm.get_latest_file('../data/all_media_links*')

Latest file: ../data/all_media_links_20230215_192642.pkl


In [6]:
# Build the full list of media links, using the cached pickle when available.
# With no cache (latest_all_media_links_file is None) the links are gathered
# from scratch and written to a new timestamped pickle.
if latest_all_media_links_file is None:
    # NOTE(review): `get_video_liks` looks like a typo for `get_video_links`;
    # confirm the name exposed by pipmag before renaming the call.
    video_links = pm.get_video_liks(obs_dates)  # video links, one entry per observing date
    image_links = pm.get_image_links(obs_dates)  # image links, one entry per observing date
    all_image_links = pm.get_all_links(image_links)  # one entry per image
    all_video_links = pm.get_all_links(video_links)  # one entry per video
    # report the counts once (the original printed the same numbers twice)
    print(f'number of video links: {len(all_video_links)}\nnumber of image links: {len(all_image_links)}')
    # merge both kinds of links into a single sorted list
    all_media_links = sorted(all_image_links + all_video_links)
    # Save next to the files that pm.get_latest_file('../data/all_media_links*')
    # searches, so the cache is actually found on the next run. The original
    # wrote to 'data/', which the lookup above never inspects.
    pm.save_pickle(all_media_links, '../data/' + pm.add_timestamp('all_media_links.pkl'))
else:
    # load the latest cached list
    all_media_links = pm.load_pickle(latest_all_media_links_file)
# total number of media links, whichever branch produced them
print(f'total number of media links: {len(all_media_links)}')

loaded ../data/all_media_links_20230215_192642.pkl successfully
total number of media links: 6878


In [7]:
# Extract a date-time string from every link. Links from which none could be
# extracted are returned separately in date_time_not_found.
date_time_from_all_media_links, date_time_not_found = pm.get_date_time_from_link_list(all_media_links)
# Keep only the links that carry a date-time. A set gives constant-time
# membership tests instead of rescanning the list for every link.
links_without_date_time = set(date_time_not_found)
all_media_links_with_date_time = [link for link in all_media_links if link not in links_without_date_time]
# report how many links have / lack a date-time
print(f'number of links with date and time: {len(all_media_links_with_date_time)}\nnumber of links without date and time: {len(date_time_not_found)}')
invalid_dates = pm.get_invalid_dates(date_time_from_all_media_links)
# Drop the date-time entries that are not in the correct format.
# NOTE(review): when invalid_dates is non-empty this list becomes shorter than
# all_media_links_with_date_time, which is later used together with it to
# build the dataframe; invalid_dates_links below is computed but not used to
# filter the links. Confirm whether those links should be removed as well.
invalid_date_set = set(invalid_dates)
date_time_from_all_media_links = [date for date in date_time_from_all_media_links if date not in invalid_date_set]
# Take the text before the first underscore of each invalid date. Entries
# without an underscore are skipped: re.search returns None for them, and the
# original unconditional .group(1) would raise AttributeError.
underscore_matches = (re.search(r'(.+?)_', date) for date in invalid_dates)
invalid_dates_pattern = [match.group(1) for match in underscore_matches if match is not None]
# links (among those with a date-time) that contain any of those prefixes
invalid_dates_links = [link for link in all_media_links_with_date_time if any(pattern in link for pattern in invalid_dates_pattern)]

number of links with date and time: 6817
number of links without date and time: 61
All dates in date_time_list are valid


In [8]:

# Turn the extracted date-time strings into datetime values.
date_time_from_all_media_links_datetime = pm.convert_to_datetime(date_time_from_all_media_links)
# Distinct timestamps (several links can share the same one).
unique_date_time_from_all_media_links_datetime = list(set(date_time_from_all_media_links_datetime))
print(
    'number of unique date_time_from_all_media_links_datetime values: '
    f'{len(unique_date_time_from_all_media_links_datetime)}'
)
# One row per link: the timestamp is the index, the link goes in column 'links'.
df = pd.DataFrame(
    data=all_media_links_with_date_time,
    index=date_time_from_all_media_links_datetime,
    columns=['links'],
)
# First and last index values and the number of rows.
print(
    f'first entry: {df.index[0]}\n'
    f'last entry : {df.index[-1]}\n'
    f'total entries: {len(df.index)}'
)

number of unique date_time_from_all_media_links_datetime values: 748
first entry: 2013-06-30 09:15:50
last entry : 2022-09-08 08:40:11
total entries: 6817


In [9]:
# Collapse rows that share a timestamp: each unique index value keeps a
# single row whose 'links' cell is the list of all links for that timestamp.
df = df.groupby(df.index).agg({'links': list})
# First and last index values and the number of rows after grouping.
print(
    f'first entry: {df.index[0]}\n'
    f'last entry : {df.index[-1]}\n'
    f'total entries: {len(df.index)}'
)

first entry: 2013-06-30 09:15:50
last entry : 2022-09-17 14:34:13
total entries: 748


In [10]:
# Keep the timestamp as a regular column and index the rows by a running
# observation number (0 .. len(df) - 1) called 'obs_id'.
df['date_time'] = df.index
df['obs_id'] = range(len(df))
df = df.set_index('obs_id')
# number of links collected for each observation
df['num_links'] = df['links'].apply(len)
# calendar components of the timestamp, one column each
for date_part in ('year', 'month', 'day'):
    df[date_part] = df['date_time'].apply(lambda stamp, attr=date_part: getattr(stamp, attr))
# time of day
df['time'] = df['date_time'].apply(lambda stamp: stamp.time())
# columns that start out empty (None)
for empty_column in ('target', 'comments', 'polarimetry'):
    df[empty_column] = None
# keywords passed to pm.get_instrument_info, grouped by instrument name
instrument_keywords = {
    'CRISP': ['wb_6563', 'ha', 'Crisp', '6173', '8542', '6563', 'crisp'],
    'CHROMIS': ['Chromis', 'cak', '4846'],
    'IRIS': ['sji'],
}
# 'instruments': result of pm.get_instrument_info for each row's list of links
df['instruments'] = df['links'].apply(lambda row_links: pm.get_instrument_info(row_links, instrument_keywords))
# 'video_links': result of pm.get_links_with_string with the strings 'mp4' and 'mov'
df['video_links'] = df['links'].apply(lambda row_links: pm.get_links_with_string(row_links, ['mp4', 'mov']))
# 'image_links': result of pm.get_links_with_string with the strings 'jpg' and 'png'
df['image_links'] = df['links'].apply(lambda row_links: pm.get_links_with_string(row_links, ['jpg', 'png']))
# final column order of the table
column_order = [
    'date_time', 'year', 'month', 'day', 'time',
    'instruments', 'target', 'comments',
    'video_links', 'image_links', 'links', 'num_links',
    'polarimetry',
]
df = df[column_order]
# print a summary of the dataframe
df.info()
# store the table as a timestamped pickle in the data directory
df.to_pickle('../data/' + pm.add_timestamp('la_palma_obs_data.pkl'))

<class 'pandas.core.frame.DataFrame'>
Int64Index: 748 entries, 0 to 747
Data columns (total 13 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   date_time    748 non-null    datetime64[ns]
 1   year         748 non-null    int64         
 2   month        748 non-null    int64         
 3   day          748 non-null    int64         
 4   time         748 non-null    object        
 5   instruments  746 non-null    object        
 6   target       0 non-null      object        
 7   comments     0 non-null      object        
 8   video_links  748 non-null    object        
 9   image_links  748 non-null    object        
 10  links        748 non-null    object        
 11  num_links    748 non-null    int64         
 12  polarimetry  0 non-null      object        
dtypes: datetime64[ns](1), int64(4), object(8)
memory usage: 81.8+ KB


### ➡️ Start here : Load the existing dataframe

In [2]:
# Find the most recent saved observation table (timestamped pickle) in the
# data directory; its path is used by the next cell to load the dataframe.
latest_updated_la_palma_obs_data_file = pm.get_latest_file('../data/la_palma_obs_data_*.pkl')

Latest file: ../data/la_palma_obs_data_20230706_092445.pkl


In [3]:
# Load the most recent observation table and print a summary of it.
# Results of pm.get_latest_file are checked against None elsewhere in this
# notebook, so fail early with a clear message instead of handing None to
# pandas.
if latest_updated_la_palma_obs_data_file is None:
    raise FileNotFoundError(
        "No file matching '../data/la_palma_obs_data_*.pkl' was found; "
        "generate the dataframe first."
    )
# Pickle files can execute arbitrary code when loaded; only load files that
# were written by this notebook.
df = pd.read_pickle(latest_updated_la_palma_obs_data_file)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 748 entries, 0 to 747
Data columns (total 13 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   date_time    748 non-null    datetime64[ns]
 1   year         748 non-null    int64         
 2   month        748 non-null    int64         
 3   day          748 non-null    int64         
 4   time         748 non-null    object        
 5   instruments  746 non-null    object        
 6   target       0 non-null      object        
 7   comments     0 non-null      object        
 8   video_links  748 non-null    object        
 9   image_links  748 non-null    object        
 10  links        748 non-null    object        
 11  num_links    748 non-null    int64         
 12  polarimetry  0 non-null      object        
dtypes: datetime64[ns](1), int64(4), object(8)
memory usage: 81.8+ KB


In [6]:
# Interactive widget for browsing the movies by year, month, day and time
# and for updating the listed columns of the dataframe.
editable_columns = ['target', 'instruments', 'polarimetry', 'comments']
selector = pm.VideoSelector2(df, editable_columns)
selector.create_widget()

Dropdown(description='Year:', options=(2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022), value=2013…

Dropdown(description='Month:', options=(), value=None)

Dropdown(description='Day:', options=(), value=None)

Dropdown(description='Time:', options=(), value=None)

Dropdown(description='Links:', options=(), value=None)

Button(description='Show', style=ButtonStyle())

Output()

Text(value='', description='target:')

Text(value='', description='instruments:')

Text(value='', description='polarimetry:')

Text(value='', description='comments:')

Button(description='Update', style=ButtonStyle())

In [7]:
# 🔍 ADS Search
# Build the search helper on the dataframe, then request the results for one
# entry, selected by `index`.
# NOTE(review): whether `index` is an obs_id label or a row position is not
# visible here; confirm against pm.ADS_Search.
search = pm.ADS_Search(df)
index = 36
search.get_results(index, pretty_print=True)

#,Title,First author,Bibcode,URL
1,On the dynamics of spicules and mass flows in the solar atmosphere,"Bose, Souvik",2021arXiv211010656B,https://ui.adsabs.harvard.edu/abs/2021arXiv211010656B
2,Evidence of the multi-thermal nature of spicular downflows. Impact on solar atmospheric heating,"Bose, Souvik",2021A&A...654A..51B,https://ui.adsabs.harvard.edu/abs/2021A&A...654A..51B
3,Spicules and downflows in the solar chromosphere,"Bose, Souvik",2021A&A...647A.147B,https://ui.adsabs.harvard.edu/abs/2021A&A...647A.147B
4,Analysis of Pseudo-Lyapunov Exponents of Solar Convection Using State-of-the-Art Observations,"Viavattene, Giorgio",2021Entrp..23..413V,https://ui.adsabs.harvard.edu/abs/2021Entrp..23..413V
5,Characterization and formation of on-disk spicules in the Ca II K and Mg II k spectral lines,"Bose, Souvik",2019A&A...631L...5B,https://ui.adsabs.harvard.edu/abs/2019A&A...631L...5B


In [355]:
# Save the updated dataframe as a timestamped pickle file.
# Write to '../data/', the directory that the loading cell searches with
# pm.get_latest_file('../data/la_palma_obs_data_*.pkl'). The original wrote
# to 'data/', so the updated table would not be picked up on the next load.
df.to_pickle('../data/' + pm.add_timestamp('la_palma_obs_data.pkl'))