# Tutorial

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import datetime

In [None]:
import os
import sys

# Make the package importable from this notebook: take the current working
# directory (not the location of this file), step one level up, and put that
# folder first on the module search path.
current_directory = os.getcwd()
parent_directory = os.path.abspath(
    os.path.join(current_directory, os.pardir)
)
sys.path.insert(0, parent_directory)

from mtsthelens import (
    manipulation_functions,
    plotting_functions,
    preprocessing_functions,
)

## Read & preprocess data

In [None]:
# Read input data
raw_data = preprocessing_functions.read_data('../example/example_data/example_data_eruption.csv')

# Data smoothing: centred 6-hour rolling median.
# The lowercase 'h' alias is used because the uppercase 'H' offset alias is
# deprecated since pandas 2.2 and triggers a FutureWarning.
raw_data = raw_data.rolling('6h', center=True).median()

# Remove outliers: preprocessing_functions.mask_df is applied to every column
# (axis=0); the original note calls this step "peak detection".
raw_data = raw_data.apply(preprocessing_functions.mask_df, axis=0)
raw_data

## Data manipulation

### Stack in Time

In [None]:
# Find the seasonal trends in the data, and create a new dataframe with the seasonality removed.
# stack_in_time returns two objects, unpacked here in that order.
data_seasonal_trends, data_seasonality_removed = manipulation_functions.stack_in_time(raw_data)
# Last expression of the cell: shown as the cell output
data_seasonality_removed

### Stack in Space

In [None]:
# Find the differences between the stations, and the average.
# stack_in_space returns two objects, unpacked here in that order.
data_median_stackspace, data_stackspace_year = manipulation_functions.stack_in_space(raw_data)
# Only the second return value is passed on to stack_space_year_param
data_yearly_params = manipulation_functions.stack_space_year_param(data_stackspace_year)
# Last expression of the cell: shown as the cell output
data_yearly_params

### Apply Filter

In [None]:
# Replace missing values with 0 before filtering.
# NOTE(review): this rebinds raw_data, so every later cell that uses raw_data
# (e.g. the plotting cells) sees the zero-filled version - confirm this is intended.
raw_data = raw_data.fillna(0)
data_filtered = manipulation_functions.filter_data(raw_data)
# Last expression of the cell: shown as the cell output
data_filtered

## Data plotting

In [None]:
# Load the extrusion rate table and use the 'Date of photography' column as index
extrusion_data = pd.read_csv(
    '../example/example_data/dome_extrusion.txt',
    header=0,
    skiprows=0,
).set_index('Date of photography')

# Parse the index as datetimes and make it timezone-naive
extrusion_data.index = pd.to_datetime(extrusion_data.index).tz_localize(None)
extrusion_data.head()

In [None]:
# Plot the seasonality-removed time stack against the raw data
plotting_functions.plot_stack_vs_raw(data_seasonality_removed, raw_data)

In [None]:
# Plot the filtered data against the raw data (same plotting helper as for the time stack)
plotting_functions.plot_stack_vs_raw(data_filtered, raw_data)

In [None]:
# Plot the yearly parameters of the space stack
plotting_functions.plot_space_params(data_yearly_params)

In [None]:
# Plot the extrusion rates together with the raw data, the yearly space-stack
# parameters and the filtered data (the four arguments, in that order)
plotting_functions.plot_extrusion(extrusion_data, raw_data, data_yearly_params, data_filtered)

### Bring data into used shape

In [None]:
# Preview the first rows of the three objects.
# NOTE(review): df, df_seasonality_removed and df_filter are not assigned in any
# cell shown above; presumably they correspond to raw_data,
# data_seasonality_removed and data_filtered - confirm before running.
df.head(), df_seasonality_removed.head(), df_filter.head()

In [None]:
# Show df as the cell output.
# NOTE(review): df is not assigned in any cell shown above - confirm where it comes from.
df


In [None]:
# NOTE(review): the space-stack cell earlier in this tutorial calls this helper as
# stack_space_year_param; confirm that stackSpace_yearParam still exists.
# NOTE(review): df_sta is only assigned in a later cell ("Plot Station sorted by
# distance"), so that cell has to be run first.
df_stat = manipulation_functions.stackSpace_yearParam(df) # extract statistical values

# append latitude and longitude of the station as rows for plotting
# (for every column name of df_stat, the first matching row of df_sta is used)
df_stat.loc['latitude'] = [df_sta.loc[df_sta['Station'] == sta, 'latitude'].values[0] for sta in df_stat.columns]
df_stat.loc['longitude'] = [df_sta.loc[df_sta['Station'] == sta, 'longitude'].values[0] for sta in df_stat.columns]
# Last expression of the cell: shown as the cell output
df_stat

In [None]:
import os

# Create a dictionary of DataFrames with the date as key ('year' is the
# grouping passed to df2dict)
dict_test = manipulation_functions.df2dict(df, 'year')
dict_stat = {}
for key, value in dict_test.items():
    df_stat = manipulation_functions.stackSpace_yearParam(value)  # extract statistical values

    # append latitude and longitude of the station as rows for plotting
    # (for every column name of df_stat, the first matching row of df_sta is used)
    df_stat.loc['latitude'] = [df_sta.loc[df_sta['Station'] == sta, 'latitude'].values[0] for sta in df_stat.columns]
    df_stat.loc['longitude'] = [df_sta.loc[df_sta['Station'] == sta, 'longitude'].values[0] for sta in df_stat.columns]
    dict_stat[key] = df_stat

# np.save does not create missing folders, so make sure the target folder exists
os.makedirs('output/data', exist_ok=True)

# Save the dictionary as npy (numpy stores a dict as a pickled object array)
np.save('output/data/my_file.npy', dict_stat)

### Create Plots for Animation

In [None]:
# Load the pickled dictionary back from the npy file. allow_pickle is a boolean
# flag: the former string 'TRUE' only worked because any non-empty string is truthy.
# .item() unwraps the 0-d object array into the original dict.
stations_data = np.load('output/data/my_file.npy', allow_pickle=True).item()
stations_data

In [None]:
# Create the animation plots from the loaded dictionary.
# NOTE(review): 'median' is presumably the statistic to display and 'inferno' the
# matplotlib colormap name - confirm against plotting_functions.animation.
plotting_functions.animation(stations_data, 'median', 'inferno')

### Plot Station sorted by distance

This part of the project is still in progress. The example data should produce an output, but we have not yet written the final functions and tests for this part.

In [None]:
# Load the station coordinates and drop the stations which are not of interest
sta_list = ['BLIS', 'CDF', 'EDM', 'ELK', 'FL2', 'HOA', 'HSR', 'JRO', 'JUN',  # specify the stations you want to use
            'LOO', 'MIDE', 'NED', 'RAFT', 'REM', 'SEP', 'SHW', 'SOS', 'SPN5',
            'STD', 'SUG', 'SWFL', 'TDL', 'USFR', 'VALT', 'YEL']

df_sta = pd.read_csv('./example_data/sta_log_long.txt', sep='|', header=0)  # coordinates
# Keep only the rows whose station is in sta_list (a direct membership test
# instead of negating membership in the complement set)
df_sta = df_sta[df_sta['Station'].isin(sta_list)]
df_sta = df_sta.drop_duplicates(subset=['Station'])  # keep a single row per station name
df_sta = df_sta.reset_index(drop=True)
df_sta.head()

In [None]:
# Get the distance between the stations and sort them in increasing order
# (relative to station SEP -> crater center)
ref_sta = 'SEP'  # reference station: the distance from every station to this one is computed

# Look up the reference coordinates once, instead of once per row inside the lambda
ref_rows = df_sta[df_sta['Station'] == ref_sta]
ref_latitude = ref_rows['latitude'].values[0]
ref_longitude = ref_rows['longitude'].values[0]

# Argument order of calculate_distance as used here:
# (station latitude, reference latitude, station longitude, reference longitude)
df_sta['dist'] = df_sta.apply(
    lambda row: preprocessing_functions.calculate_distance(
        row['latitude'], ref_latitude, row['longitude'], ref_longitude
    ),
    axis=1,
)
df_sta = df_sta.sort_values(by=['dist'])
sta_sorted = df_sta['Station'].to_list()  # station names, nearest to the reference first
df_sta.head()

In [None]:
# sort the columns according to their distance from the crater
def sort_columns(df, column_order):
    """Return ``df`` with its columns reordered to follow ``column_order``.

    Columns named in ``column_order`` come first, in that order. Names that
    are not columns of ``df`` are ignored, and a name listed more than once
    is placed only at its first position (so no column is duplicated).
    Columns of ``df`` that are not named in ``column_order`` follow in their
    original order.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are to be reordered.
    column_order : iterable
        Desired leading column labels.

    Returns
    -------
    pandas.DataFrame
        ``df`` selected with the reordered column list.
    """
    # dict.fromkeys drops repeated labels while keeping the first-seen order
    leading = [col for col in dict.fromkeys(column_order) if col in df.columns]

    # Set lookup keeps the "already placed?" check cheap for wide frames
    placed = set(leading)
    trailing = [col for col in df.columns if col not in placed]

    return df[leading + trailing]

# Optional clean-up, left disabled: rename columns / drop stations that are too short
# value = df.rename(columns={"YEL/VALT": "YEL"})
# value = df.drop(['NED','JRO'], axis=1)

# Daily medians first, then a centred 2-day rolling median on top of them
df = (
    df.resample('1D')
    .median()
    .rolling('2D', center=True)
    .median()
)

# Put the columns into the order given by sta_sorted and show the result
df_sorted = sort_columns(df, sta_sorted)
df_sorted

In [None]:
 # Normalize the columns
df_sorted_norm = df_sorted.apply(preprocessing_functions.norm, axis=0)  # norm is applied per column (axis=0)

# Display the result
df_sorted_norm

In [None]:
# Load the text file with UTC times and type of activity, indexed by the 'UTC' column
df_activity = pd.read_csv(
    './example_data/mt_st_helens_activity.txt',
    header=1,
    skiprows=11,
).set_index('UTC')

# Parse the index as datetimes and make it timezone-naive
df_activity.index = pd.to_datetime(df_activity.index).tz_localize(None)

# Work on a copy and pick rows by activity code and position.
# NOTE(review): 'd' / 'ed' presumably mark the start / end of dome activity, and
# the positional picks pair each start with its end - confirm against the data file.
activity_dome = df_activity.copy()
is_start = activity_dome['activity'] == 'd'
is_end = activity_dome['activity'] == 'ed'
activity_dome_start = activity_dome[is_start].take([0, 2, 3, 4, 5, 6])
activity_dome_end = activity_dome[is_end].take([1, 2, 3, 4, 5, 6])
activity_dome_start, activity_dome_end

In [None]:
from matplotlib.pyplot import cm

# The x-axis runs from 1 January of the first listed year to 1 January of the last
year_plot = [2004, 2005]

fig, ax = plt.subplots(figsize=(6.4*2, 4.8))

# One trace per column, each shifted 0.5 further up the y-axis than the previous one
offset = 0
for col in df_sorted_norm.columns:
    ax.plot(df_sorted_norm.index, df_sorted_norm[col] + offset, label=col)
    offset += 0.5  # Adjust the offset as needed

# One tick per trace (first tick at 0.25, then every 0.5), labelled with the column name
n_traces = len(df_sorted_norm.columns)
ax.set_yticks(np.arange(0.25, n_traces / 2, 0.5))
ax.set_yticklabels(df_sorted_norm.columns)

ax.set_xlim(
    datetime.datetime(year_plot[0], 1, 1),
    datetime.datetime(year_plot[-1], 1, 1),
)

# Grey levels for the shaded intervals (two more levels than intervals are generated)
color = cm.gray(np.linspace(0, 1, len(activity_dome_start)+2))

# Shade each interval from the i-th start time to the i-th end time
for i, start_stamp in enumerate(activity_dome_start.index):
    end_stamp = activity_dome_end.index[i]
    ax.axvspan(
        start_stamp.to_pydatetime(),
        end_stamp.to_pydatetime(),
        alpha=0.25,
        color=color[i],
    )

ax.set_title('DSAR')
# plt.savefig('output_path', dpi=300, bbox_inches='tight')
plt.show()