In [None]:
import matplotlib.pyplot as plt
# Plotting, data-handling and standard-library imports, plus project helpers
import pandas as pd
import numpy as np
import os
from datetime import datetime, timedelta
import sys
parent_dir = os.path.dirname(os.environ["GTE_DIR"].replace("Glaciation_time_estimator",""))
GTE_DIR=os.environ["GTE_DIR"]
sys.path.insert(0, parent_dir)
from Glaciation_time_estimator.Auxiliary_func.config_reader import read_config
from Glaciation_time_estimator.Data_postprocessing.Job_result_fp_generator import generate_tracking_filenames
import seaborn as sns

In [None]:
# Load the run configuration that lives directly inside GTE_DIR.
# The file name must be a *relative* component: os.path.join discards every
# earlier component when a later one starts with "/", so the previous
# '/config_half.yaml' resolved to the filesystem root instead of GTE_DIR.
config=read_config(os.path.join(GTE_DIR,'config_half.yaml'))
t_deltas = config['t_deltas']
agg_fact = config['agg_fact']
# Parallel sequences: entry i of each gives the lower / upper edge of temperature range i
min_temp_array, max_temp_array = config['min_temp_arr'],config['max_temp_arr']
# Folder name of the analysed period: "<start>_<end>", both formatted with config['time_folder_format']
folder_name=f"{config['start_time'].strftime(config['time_folder_format'])}_{config['end_time'].strftime(config['time_folder_format'])}"

In [None]:
# Threshold compared against each cloud's "max_ice_fraction" when selecting tracks for the variance histograms.
ice_cont_crit_frac=0.05

Load data

In [None]:
def Extract_array_from_df(series:pd.Series):
    """Stack the array-like entries of ``series`` into a single NumPy array.

    Returns ``None`` when the series holds no entries; otherwise returns
    ``np.stack`` applied to the series' values, i.e. one slot along the
    leading axis per entry.
    """
    if not series.empty:
        return np.stack(series.values)
    return None

In [None]:
from datetime import timedelta
start_ice_content_list=[]
# Per temperature range: number of selected clouds whose mean end ice fraction exceeds their mean start ice fraction
higher_final_IF_counter_temp=np.zeros(len(min_temp_array))
labels = [f"{min_temp_array[i]} to {max_temp_array[i]}" for i in range( len(min_temp_array))]
# Histogram bins are identical for every temperature range, so build them once
bins=np.linspace(0,0.2,10)
# Create a figure and two subplots side-by-side
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 6))
# NOTE(review): this cell reads `cloud_properties_df_list`, which is only built in a
# later cell, and `agg_fact_list`, which is not defined anywhere in the visible
# notebook — both must exist before this cell can run on a fresh kernel.
for i in range(len(min_temp_array)-1,-1,-1):
    min_temp=min_temp_array[i]
    max_temp=max_temp_array[i]
    df=cloud_properties_df_list[i][np.where(agg_fact_list==3)[0][0]]
    # Keep tracks that reach the critical ice fraction and last longer than one hour
    selected = (df["max_ice_fraction"]>ice_cont_crit_frac) & (df["track_length"]>timedelta(minutes=60))
    # NOTE(review): assumes a "start_ice_fraction" column mirroring "end_ice_fraction"
    # exists — the original cell used `start_ice_content` without defining it. TODO confirm.
    start_ice_content = Extract_array_from_df(df["start_ice_fraction"][selected])
    end_ice_content = Extract_array_from_df(df["end_ice_fraction"][selected])
    if start_ice_content is None or end_ice_content is None:
        # No track passed the selection for this range; its counter stays at 0
        continue
    higher_final_IF_counter_temp[i]=(start_ice_content.mean(axis=1)<end_ice_content.mean(axis=1)).sum()
    ax1.hist(np.var(start_ice_content,axis=1),bins=bins,label=labels[i])
    # The right panel is titled "Last 1h", so it must show the end-of-track data
    ax2.hist(np.var(end_ice_content,axis=1),bins=bins,label=labels[i])
ax1.set_xlim(0,0.2)
ax2.set_xlim(0,0.2)
ax1.set_title("First 1h")
ax2.set_title("Last 1h")
ax1.set_xlabel("Variance")
ax2.set_xlabel("Variance")
ax1.set_ylabel("Cloud number")
ax2.set_ylabel("Cloud number")
fig.suptitle("Variance of ice concentration at start and end of track")
ax1.legend()
ax2.legend()
plt.savefig('/cluster/work/climate/dnikolo/n2o/Glaciation_time_estimatior/Result_graphs/ice_content_variance_hist.png', dpi=400)
higher_final_IF_counter_temp

In [None]:
# One category label per temperature range, e.g. "<min> to <max>"
labels = [f"{t_low} to {max_temp_array[k]}" for k, t_low in enumerate(min_temp_array)]

# Bar chart of the per-range counter filled by the variance-histogram cell
fig_bar, ax_bar = plt.subplots(figsize=(10, 6))
ax_bar.bar(labels, higher_final_IF_counter_temp, color='lightblue', label="N glaciations")

# Title and axis labels
ax_bar.set_title('Glaciation Occurrences Across Temperature Ranges with weak definition', fontsize=14, fontweight='bold')
ax_bar.set_xlabel('Temperature Range (°C)', fontsize=12)
ax_bar.set_ylabel('Glaciation Number', fontsize=12)

# Tilt the category labels so neighbouring ranges do not overlap
plt.setp(ax_bar.get_xticklabels(), rotation=45)

# Horizontal guide lines only
ax_bar.grid(axis='y', linestyle='--', alpha=0.7)

# Counts are whole numbers, so restrict the y ticks to integers
ax_bar.yaxis.get_major_locator().set_params(integer=True)

# Finalise layout, write the figure to disk, then display it
fig_bar.tight_layout()
fig_bar.savefig('/cluster/work/climate/dnikolo/n2o/Glaciation_time_estimatior/Result_graphs/glaciation_counter_temp_weak_def.png', dpi=400)
plt.show()

In [None]:

# Nested list of dataframes, indexed as cloud_properties_df_list[temperature_range][pole]
cloud_properties_df_list = []

for i in range(len(min_temp_array)):
    min_temp, max_temp = min_temp_array[i], max_temp_array[i]
    # The file name encodes the aggregation factor and the absolute, rounded range edges
    parquet_name = f"Agg_{agg_fact:02}_T_{abs(round(min_temp)):02}_{abs(round(max_temp)):02}.parquet"

    pole_frames = []
    for pole in config["pole_folders"]:
        fp = os.path.join(config['postprocessing_output_dir'], pole, folder_name, parquet_name)

        # Read the tracks and tag every row with its temperature range, pole,
        # hemisphere name and lifetime expressed in hours
        df = pd.read_parquet(fp).assign(**{
            'min_temp': min_temp,
            'max_temp': max_temp,
            'pole': pole,
            'Hemisphere': "South" if pole == "sp" else "North",
            'Lifetime [h]': lambda frame: frame['track_length'] / pd.Timedelta(hours=1),
        })
        pole_frames.append(df)

    cloud_properties_df_list.append(pole_frames)

# Flatten the nested list into one dataframe with a fresh 0..N-1 index
combined_df = pd.concat(
    [pole_df for frames_of_range in cloud_properties_df_list for pole_df in frames_of_range],
    ignore_index=True,
)

In [None]:
def normalize_IF_hist(row, normalized_length=100):
    """Resample a cloud's ice-fraction history onto a fixed-length grid.

    The samples in ``row['ice_frac_hist']`` are treated as equally spaced over
    the interval [0, 1] and linearly interpolated onto ``normalized_length``
    equally spaced points of the same interval, so histories of different
    length become directly comparable.

    Parameters
    ----------
    row : mapping
        Anything indexable by ``'ice_frac_hist'`` (e.g. a DataFrame row).
    normalized_length : int, default 100
        Number of points in the returned array.

    Returns
    -------
    numpy.ndarray
        Array of length ``normalized_length``. An empty history yields an
        all-NaN array instead of raising inside ``np.interp``.
    """
    hist = np.asarray(row['ice_frac_hist'], dtype=float)
    if hist.size == 0:
        return np.full(normalized_length, np.nan)
    original_indices = np.linspace(0, 1, hist.size)
    target_indices = np.linspace(0, 1, normalized_length)
    return np.interp(target_indices, original_indices, hist)

def max_normalize_IF_hist(row, normalized_length=100):
    """Resample the part of an ice-fraction history that leads up to its maximum.

    Only the samples from the start of ``row['ice_frac_hist']`` up to and
    including the first occurrence of its maximum are kept; they are linearly
    interpolated onto ``normalized_length`` equally spaced points.

    Parameters
    ----------
    row : mapping
        Anything indexable by ``'ice_frac_hist'`` (e.g. a DataFrame row).
    normalized_length : int, default 100
        Number of points in the returned array.

    Returns
    -------
    (numpy.ndarray, int or None)
        The resampled history and the index of the maximum in the original
        history. When the maximum is exactly 0 the array is all zeros and the
        index is ``None``; an empty history gives an all-NaN array and ``None``.
    """
    hist = np.asarray(row['ice_frac_hist'])
    if hist.size == 0:
        return np.full(normalized_length, np.nan), None
    if np.max(hist) == 0:
        return np.zeros(normalized_length), None

    max_idx = np.argmax(hist)
    rising_part = hist[:max_idx + 1]
    original_indices = np.linspace(0, max_idx, len(rising_part))
    target_indices = np.linspace(0, max_idx, normalized_length)
    # np.interp already returns exactly `normalized_length` values, so no padding is required
    return np.interp(target_indices, original_indices, rising_part), max_idx

In [None]:
# Per-cloud results, collected row by row and attached as columns afterwards
norm_IF_hist_list = []
max_norm_IF_hist_list = []
Timesteps_to_max_list = []
for row_label, cloud_row in combined_df.iterrows():
    # Coarse progress indicator: one printed line per 10 000 rows
    if not row_label % 10000:
        print(row_label/10000)
    norm_IF_hist_list.append(normalize_IF_hist(cloud_row))
    hist_up_to_max, steps_to_max = max_normalize_IF_hist(cloud_row)
    max_norm_IF_hist_list.append(hist_up_to_max)
    Timesteps_to_max_list.append(steps_to_max)

# Attach the new columns (same order as before, so the column layout is unchanged)
for column_name, column_values in (
    ("Timesteps_to_max", Timesteps_to_max_list),
    ('max_norm_IF_hist', max_norm_IF_hist_list),
    ('norm_IF_hist', norm_IF_hist_list),
):
    combined_df[column_name] = column_values

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Grid layout: one row per pole folder, one column per temperature range
num_poles = len(config["pole_folders"])
num_temps = len(min_temp_array)

# squeeze=False keeps `axes` two-dimensional, so axes[row, col] also works
# when there is only a single pole or a single temperature range.
fig, axes = plt.subplots(num_poles, num_temps, figsize=(20, 10), sharex=True, sharey=True, squeeze=False)

# Iterate through temperature ranges and pole folders
for t_ind in range(num_temps):
    min_temp = min_temp_array[t_ind]
    max_temp = max_temp_array[t_ind]
    for pole_ind in range(num_poles):
        ax = axes[pole_ind, t_ind]  # Select the appropriate subplot
        df = cloud_properties_df_list[t_ind][pole_ind]

        # Histogram of cloud size with a logarithmic x axis and a KDE overlay
        sns.histplot(
            x=df["avg_size[km]"],
            bins=10,
            kde=True,
            log_scale=[True, False],
            color='skyblue',
            edgecolor='black',
            linewidth=0.8,
            ax=ax
        )

        # Each panel is titled with its temperature range
        ax.set_title(f"T: {min_temp}\u00B0 to {max_temp}\u00B0", fontsize=10)

        # Axis labels only on the outer panels (axes are shared)
        if pole_ind == num_poles - 1:  # Bottom row
            ax.set_xlabel("Time-Averaged Cloud Area [km²]", fontsize=8)
        if t_ind == 0:  # First column
            ax.set_ylabel("Number of Clouds", fontsize=8)

        # Set x-axis scale and grid
        ax.set_xlim(1e3, 1e6)
        ax.set_xscale('log')
        ax.grid(visible=True, which='both', linestyle='--', linewidth=0.5, alpha=0.7)

        # Customize ticks
        ax.tick_params(axis='both', which='major', labelsize=8)

# Row headers. NOTE(review): the fixed positions assume exactly two rows with the
# northern pole first — confirm against the order of config["pole_folders"].
fig.text(0.5, 0.95, "Northern Hemisphere", ha='center', va='center', fontsize=14, weight='bold')
fig.text(0.5, 0.50, "Southern Hemisphere", ha='center', va='center', fontsize=14, weight='bold')

# Save the figure inside GTE_DIR. Every component must be relative: a component
# starting with "/" makes os.path.join drop GTE_DIR and write to the filesystem root.
filename=os.path.join(GTE_DIR,"Result_graphs","size_hist_temp_subplots")
plt.savefig(filename+'.png', dpi=400)
plt.savefig(filename+'.pdf', dpi=400)

# Show the plot
plt.show()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Cloud-size histogram over all temperature ranges, one colour per hemisphere
plt.figure(figsize=(10, 6))
sns.histplot(data=combined_df, x="avg_size[km]",hue='Hemisphere',bins=20, kde=True, log_scale=[True, False], color='skyblue', edgecolor='black', linewidth=0.8)
# Labels and title
plt.xlabel("Time-Averaged Cloud Area [km²]", fontsize=12)
plt.ylabel("Number of Clouds", fontsize=12)
# The previous title ended in a dangling "between"; name the grouping that is actually plotted
plt.title("Histogram of Cloud Area by Hemisphere", fontsize=14, fontweight='bold')

# Customize x-axis
plt.xlim(1e3, 1e6)
plt.xscale('log')
plt.grid(visible=True, which='both', linestyle='--', linewidth=0.5, alpha=0.7)

# Ticks
plt.xticks(fontsize=10)
plt.yticks(fontsize=10)

# Save the figure
plt.tight_layout()  # Adjust layout to avoid clipping
# Path components must be relative: a component starting with "/" makes
# os.path.join drop GTE_DIR and write to the filesystem root.
filename=os.path.join(GTE_DIR,"Result_graphs","size_hist_by_pole")
plt.savefig(filename+'.png', dpi=400)
plt.savefig(filename+'.pdf', dpi=400)

# Show the plot
plt.show()


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Lifetime histogram (1-hour bins, logarithmic counts), one colour per hemisphere
plt.figure(figsize=(10, 6))
sns.histplot(data=combined_df, x="Lifetime [h]",hue='Hemisphere',binwidth=1, color='skyblue',log_scale=[False, True], edgecolor='black', linewidth=0.8)
# Labels and title
plt.xlabel("Tracking lifetime [h]", fontsize=12)
plt.ylabel("Number of Clouds", fontsize=12)
plt.title("Histogram of Cloud Lifetimes", fontsize=14, fontweight='bold')

# Logarithmic count axis and grid
plt.yscale('log')
plt.grid(visible=True, which='both', linestyle='--', linewidth=0.5, alpha=0.7)

# Ticks and visible lifetime window
plt.xticks(fontsize=10)
plt.yticks(fontsize=10)
plt.xlim(left=0, right=60)
# Save the figure
plt.tight_layout()  # Adjust layout to avoid clipping
# Path components must be relative: a component starting with "/" makes
# os.path.join drop GTE_DIR and write to the filesystem root.
filename=os.path.join(GTE_DIR,"Result_graphs","lifetime_hist_by_pole")
plt.savefig(filename+'.png', dpi=400)
plt.savefig(filename+'.pdf', dpi=400)

# Show the plot
plt.show()

In [None]:
def check_cloud(row, params):
    """Return True when a cloud track satisfies every threshold in ``params``.

    Checks, in order and stopping at the first failure:
      1. the maximum of ``row['norm_IF_hist']`` is at least ``params['crit_max_IF']``;
      2. ``row['Timesteps_to_max']`` is at least ``params['min_time_to_max']``;
      3. ``row['track_length']`` is at least ``params['crit_lifetime']``;
      4. that maximum is at least the mean of the first 10 history values
         plus ``params['min_delta_IF']``.
    """
    history = row['norm_IF_hist']
    peak_IF = max(history)
    if not peak_IF >= params['crit_max_IF']:
        return False
    if not row['Timesteps_to_max'] >= params['min_time_to_max']:
        return False
    if not row['track_length'] >= params['crit_lifetime']:
        return False
    return bool(peak_IF >= history[:10].mean() + params['min_delta_IF'])



In [None]:
from itertools import product
# Candidate values for every filter threshold used by check_cloud
crit_max_IF_vals = np.arange(0.1,0.91,0.2)
crit_lifetime_vals = [timedelta(minutes=n_minutes) for n_minutes in (45, 60, 120, 180)]
min_delta_IF_vals = [-1,0,0.3,0.6,0.9]
min_time_to_max_vals = [0,4,8]
print(crit_max_IF_vals, crit_lifetime_vals, min_delta_IF_vals)

# Column name -> candidate values; the dict order fixes both the column order
# and which parameter varies fastest in the Cartesian product
param_grid = {
    'crit_max_IF': crit_max_IF_vals,
    'crit_lifetime': crit_lifetime_vals,
    'min_delta_IF': min_delta_IF_vals,
    'min_time_to_max': min_time_to_max_vals,
}
all_combinations = product(*param_grid.values())

# One row per parameter combination
params_df = pd.DataFrame(all_combinations, columns=list(param_grid))


In [None]:
from functools import partial
from multiprocessing import Pool

In [None]:
def check_param_comb(params_row):
    """Count the clouds in ``combined_df`` that pass ``check_cloud`` for one parameter set.

    ``params_row`` is indexed by check_cloud with the keys 'crit_max_IF',
    'crit_lifetime', 'min_delta_IF' and 'min_time_to_max'. The per-cloud
    verdicts are written to ``combined_df['Filter_check']`` as a side effect,
    and the number of passing clouds is returned.
    """
    cloud_passes = partial(check_cloud, params=params_row)
    combined_df['Filter_check'] = combined_df.apply(cloud_passes, axis=1)
    n_passing = combined_df['Filter_check'].sum()
    return n_passing

In [None]:
# One task per parameter combination (each task receives a row of params_df)
param_rows = [params_row for _, params_row in params_df.iterrows()]

# Evaluate the combinations in a pool of 8 worker processes; pool.map returns
# the counts in the same order as the submitted rows
with Pool(8) as pool:
    n_passing_clouds = pool.map(check_param_comb, param_rows)

# Store the counts next to their parameter combination
params_df['N_passing_clouds'] = n_passing_clouds

print(params_df)

In [None]:
# Derived, plot-friendly columns for the heatmap axes
one_hour = timedelta(hours=1)
params_df["Min Lifetime [h]"] = params_df["crit_lifetime"].div(one_hour)
# Share of all clouds that pass the filter, in percent
params_df['Frac passing clouds'] = params_df["N_passing_clouds"].div(len(combined_df)).mul(100)
# Round away floating-point noise from np.arange so the labels stay short
params_df["crit_max_IF"] = params_df["crit_max_IF"].round(2)
# NOTE(review): dividing by 4 presumably converts 15-minute timesteps to hours — confirm
params_df["min_time_to_max [h]"] = params_df["min_time_to_max"].div(4)

In [None]:

# Combine two parameters per heatmap axis into a single string label
params_df["x_pair"] = params_df["crit_max_IF"].astype(str) + ", " + params_df["Min Lifetime [h]"].astype(str)
params_df["y_pair"] = params_df["min_delta_IF"].astype(str) + ", " + params_df["min_time_to_max [h]"].astype(str)

# Pivot table for heatmap: rows = y_pair, columns = x_pair, cells = passing fraction
heatmap_data = params_df.pivot_table(
    index="y_pair",
    columns="x_pair",
    values="Frac passing clouds",
    aggfunc="mean"
)

# Create the heatmap
plt.figure(figsize=(12, 8))
sns.heatmap(heatmap_data, annot=True, fmt=".01f", cmap="YlGnBu")
plt.title("Percentage fraction of filter passing clouds")
plt.xlabel("Max IF Threshold, Min Lifetime")
plt.ylabel("Min ΔIF from start , Min time to max IF")
# Path components must be relative: a component starting with "/" makes
# os.path.join drop GTE_DIR and write to the filesystem root.
filename=os.path.join(GTE_DIR,"Result_graphs","frac_filter_passing_clouds")
plt.savefig(filename+'.png', dpi=400)
plt.savefig(filename+'.pdf', dpi=400)
plt.show()

In [None]:
# Inspect which columns the combined dataframe currently has.
combined_df.keys()

In [None]:
# Cloud size against tracking lifetime; scatterplot returns the axes it drew on
size_lifetime_ax = sns.scatterplot(data=combined_df, x= "avg_size[km]", y = "Lifetime [h]")
size_lifetime_ax.set_xscale('log')
size_lifetime_ax.set_ylim(0,60)
size_lifetime_ax.set_xlim(1e3,1e6)

In [None]:
# Restrict the frame to the four columns compared in the pair plots.
clean_df = combined_df[["avg_size[km]", "Lifetime [h]", "min_temp","Timesteps_to_max"]]
# Only constructs the grid object here; no plotting function is mapped onto it in this cell.
sns.PairGrid(data=clean_df)

In [None]:

# Pair plot of the columns in 'clean_df', coloured by the lower edge of the temperature range
pairplot = sns.pairplot(clean_df, hue="min_temp", plot_kws={'alpha': 0.7})

# Use a logarithmic x axis wherever the x variable is the cloud size
for ax in pairplot.axes.flatten():
    if ax.get_xlabel() == 'avg_size[km]':
        ax.set_xscale('log')

# Path components must be relative: a component starting with "/" makes
# os.path.join drop GTE_DIR and write to the filesystem root.
filename=os.path.join(GTE_DIR,"Result_graphs","pairplot_all")
plt.savefig(filename+'.png', dpi=400)
plt.savefig(filename+'.pdf', dpi=400)
plt.show()

In [None]:
# Random subsample used for the correlation and pair-grid figures.
# The sample size is capped at the number of available rows (sampling more rows
# than exist without replacement raises), and a fixed seed makes the figures
# reproducible across kernel restarts.
clean_df = combined_df[["avg_size[km]", "Lifetime [h]", "min_temp", "Timesteps_to_max", "max_ice_fraction", "avg_lat"]].sample(n=min(50000, len(combined_df)), random_state=0)

In [None]:
# Pairwise correlation between the columns of the sampled frame
corr = clean_df.corr()

# Boolean mask hiding the upper triangle (including the diagonal), since the matrix is symmetric
mask = np.triu(np.ones(corr.shape, dtype=bool))

# Figure and axes that receive the heatmap
f, ax = plt.subplots(figsize=(11, 9))

# Diverging colormap centred on zero correlation
cmap = sns.diverging_palette(230, 20, as_cmap=True)

# Draw the masked heatmap on the axes created above
sns.heatmap(corr, mask=mask, cmap=cmap, center=0, linewidths=.5, cbar_kws={"shrink": .5}, ax=ax)

In [None]:
# Pair grid over every column of the sampled frame
g = sns.PairGrid(clean_df)

# Scatter plots off the diagonal, histogram with KDE on the diagonal
g.map_upper(sns.scatterplot)
g.map_lower(sns.scatterplot)
g.map_diag(sns.histplot, kde=True)

# In the column whose x variable is `min_temp`, draw a box plot per off-diagonal
# panel (on top of the scatter plot that is already there)
for col_idx, x_var in enumerate(g.x_vars):
    if x_var != "min_temp":
        continue
    for row_idx, y_var in enumerate(g.y_vars):
        if row_idx == col_idx:  # Leave the diagonal histogram untouched
            continue
        sns.boxplot(x=clean_df[x_var], y=clean_df[y_var], ax=g.axes[row_idx, col_idx])

# Adjust layout and display the plot
plt.tight_layout()
plt.show()

In [None]:
def half_filter(row, min_increase=0.1):
    """Tell whether the ice fraction is clearly higher in the second half of a track.

    Splits ``row['norm_IF_hist']`` at its midpoint (index 50 for the
    100-point normalised histories) and compares the means of the two halves.

    Parameters
    ----------
    row : mapping
        Anything indexable by ``'norm_IF_hist'`` (e.g. a DataFrame row).
    min_increase : float, default 0.1
        Amount by which the second-half mean must exceed the first-half mean.

    Returns
    -------
    bool
        True when ``mean(first half) + min_increase < mean(second half)``.
    """
    IF = np.asarray(row['norm_IF_hist'])
    midpoint = len(IF) // 2
    return bool(IF[:midpoint].mean() + min_increase < IF[midpoint:].mean())

In [None]:
# Try the filter on the first row of the combined dataframe.
half_filter(combined_df.iloc[0])

In [None]:
# Fraction of rows for which 'is_liq', 'is_mix' and 'is_ice' are all set (element-wise AND, then count / total).
(combined_df['is_liq']& combined_df['is_mix'] & combined_df['is_ice']).astype(bool).sum(axis=0)/len(combined_df)

In [None]:
# Rebuild the four-column frame from the full (unsampled) dataframe and draw a default pair plot.
# NOTE(review): this overwrites the sampled `clean_df` created in an earlier cell.
clean_df = combined_df[["avg_size[km]", "Lifetime [h]", "min_temp","Timesteps_to_max"]]
sns.pairplot(data=clean_df)

In [None]:
# Scratch check: dividing a timedelta by a number yields another timedelta (half of 45 minutes).
timedelta(minutes=45)/2