In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import os
from matplotlib.colors import Normalize
from scipy.interpolate import PchipInterpolator
from tqdm import tqdm
import sys
from pathlib import Path

# Plot configuration: render figures inline in the notebook, apply seaborn's
# "whitegrid" style and set the default matplotlib figure size.
%matplotlib inline
sns.set(style="whitegrid")
plt.rcParams['figure.figsize'] = (12, 6)

# Set up paths: the "src" folder is looked up next to the current working
# directory's parent and appended to sys.path so modules stored there can be imported.
# NOTE(review): this relies on the kernel's working directory being the notebook folder — confirm.
NOTEBOOK_DIR = Path.cwd()
SRC_DIR = NOTEBOOK_DIR.parent / "src"
sys.path.append(str(SRC_DIR))
# Import the shared paths
from paths import (
    HOURLY_HEATMAPS_DIR,
    HOURLY_HEATMAPS_IMAGES_DIR,
    HOURLY_GREYSCALE_IMAGES_DIR,
    CSV_DIR,
    DATAFRAMES_DIR
)

In [2]:
# Load the main per-site DataFrames (Teide first, then Roque) from their parquet files
df_teide, df_roque = (
    pd.read_parquet(DATAFRAMES_DIR / parquet_name)
    for parquet_name in ("df_teide_full_Info.parquet", "df_roque_full_Info.parquet")
)

In [3]:
# Create hourly heatmaps: iterate over every unique day present in the DataFrame and, within each day, over the hours that have data.
# One heatmap is created for each available integer hour (0 to 23) of each available day.

def _interpolate_profile_to_grid(profile, grid):
    """Resample one profile's ``turbulence_log`` onto ``grid`` with PCHIP.

    Returns a 1-D array with one value per grid altitude, or ``None`` when the
    profile cannot be used: fewer than 10 distinct altitudes, remaining NaN
    values, an interpolation error, or no overlap at all with ``grid``.
    """
    profile = profile.sort_values('altitude').drop_duplicates(subset='altitude')

    if profile['altitude'].nunique() < 10:
        return None
    if profile['turbulence_log'].isna().any():
        return None

    try:
        interpolator = PchipInterpolator(
            profile['altitude'], profile['turbulence_log'], extrapolate=False
        )
        resampled = interpolator(grid)
    except Exception:
        # Deliberate best-effort: a profile that cannot be interpolated
        # (e.g. invalid altitude values) is skipped instead of aborting the run.
        return None

    # extrapolate=False yields NaN for grid altitudes outside the profile's range.
    missing = np.isnan(resampled)
    if missing.all():
        # Nothing to average: filling with nanmean would leave an all-NaN column.
        return None
    if missing.any():
        resampled = np.nan_to_num(resampled, nan=np.nanmean(resampled))
    return resampled


def generate_hourly_heatmaps_unified(df, site_name, mode='color_full',list_filenames=False):
    """Render one PNG heatmap per (day, hour) found in ``df``.

    Each image has one column per usable profile recorded in that hour and one
    row per altitude of a fixed grid (1600 to 25000 in steps of 300). Images are
    written to ``NOTEBOOK_DIR.parent / <mode folder> / site_name`` as
    ``<day>_hour_<HH>.png``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``timestamp_begin``, ``turbulence_log``,
        ``profile_id`` and ``altitude``. The frame passed in is not modified;
        all cleaning happens on an internal copy.
    site_name : str
        Name of the output sub-folder; also shown in the title in
        ``'color_full'`` mode.
    mode : {'color_full', 'color_plain', 'grayscale'}
        ``'color_full'``: plasma colormap with title, axis labels and colorbar.
        ``'color_plain'``: plasma colormap, axes hidden.
        ``'grayscale'``: values rescaled from [-20, 20] to [0, 1] and clipped,
        gray colormap, axes hidden.
    list_filenames : bool
        When True, return the base names (without ``.png``) of the saved images.

    Returns
    -------
    list of str or None
        The saved base names when ``list_filenames`` is True, otherwise None.

    Raises
    ------
    AssertionError
        If ``mode`` is not one of the supported values.

    Notes
    -----
    The printed "missings count" only counts hours that have fewer than two
    distinct ``profile_id`` values. Hours skipped later because fewer than two
    profiles could be interpolated are not included in that count.
    """
    assert mode in ['color_full', 'color_plain', 'grayscale'], "Invalid mode"

    # Output folder based on mode
    output_dirs = {
        'color_full': 'hourly_heatmaps',
        'color_plain': 'hourly_heatmaps_images',
        'grayscale': 'hourly_greyscale_images'
    }
    output_dir = NOTEBOOK_DIR.parent / output_dirs[mode] / site_name
    os.makedirs(output_dir, exist_ok=True)

    filenames = []
    missings_counts = 0
    common_altitudes = np.arange(1600, 25000 + 1, 300)
    value_min, value_max = -20, 20  # shared colour / grayscale scale limits

    # Ensure datetime and numeric. assign() returns a new frame, so the
    # caller's DataFrame is left untouched.
    df = df.assign(
        timestamp_begin=pd.to_datetime(df['timestamp_begin']),
        turbulence_log=pd.to_numeric(df['turbulence_log'], errors='coerce'),
    )
    in_range = df['turbulence_log'].between(-30, 30) | df['turbulence_log'].isna()
    # Explicit copy: the next statement assigns a column on the filtered frame.
    df = df[in_range].copy()

    # Interpolate turbulence_log within each profile_id
    df['turbulence_log'] = df.groupby('profile_id')['turbulence_log'].transform(
        lambda group: group.interpolate(method='linear', limit_direction='both')
    )

    # Group once by calendar day and then by hour instead of re-filtering the
    # whole frame for every day; groupby iterates its keys in sorted order.
    day_key = df['timestamp_begin'].dt.date
    for current_day, df_day in tqdm(df.groupby(day_key)):
        hour_key = df_day['timestamp_begin'].dt.hour
        for hour, df_hour in df_day.groupby(hour_key):
            hour = int(hour)
            profile_ids = df_hour['profile_id'].unique()

            if len(profile_ids) < 2:
                missings_counts += 1
                continue

            columns = []
            for pid in profile_ids:
                column = _interpolate_profile_to_grid(
                    df_hour[df_hour['profile_id'] == pid], common_altitudes
                )
                if column is not None:
                    columns.append(column)

            if len(columns) < 2:
                continue

            # Shape: (number of grid altitudes, number of usable profiles)
            heatmap_data = np.column_stack(columns)

            # Plot
            fig, ax = plt.subplots(figsize=(4, 6))
            try:
                if mode == 'grayscale':
                    # Normalize to [0, 1]
                    gray_data = np.clip(
                        (heatmap_data - value_min) / (value_max - value_min), 0, 1
                    )
                    im = ax.imshow(
                        gray_data,
                        aspect='auto',
                        origin='lower',
                        cmap='gray',
                        vmin=0,
                        vmax=1
                    )
                else:
                    im = ax.imshow(
                        heatmap_data,
                        aspect='auto',
                        origin='lower',
                        extent=[0, heatmap_data.shape[1], common_altitudes[0], common_altitudes[-1]],
                        cmap='plasma',
                        norm=Normalize(vmin=value_min, vmax=value_max)
                    )

                if mode == 'color_full':
                    ax.set_title(f"{current_day} - {hour:02d}:00 - {site_name}")
                    ax.set_xlabel("Profiles in hour")
                    ax.set_ylabel("Altitude (m)")
                    fig.colorbar(im, ax=ax, label="Turbulence log")
                else:
                    ax.axis('off')

                filename = f"{current_day}_hour_{hour:02d}.png"
                filepath = os.path.join(output_dir, filename)
                filenames.append(filename.replace('.png', ''))

                fig.tight_layout()
                fig.savefig(filepath, bbox_inches='tight', pad_inches=0 if mode != 'color_full' else 0.1)
            finally:
                # Always release this specific figure, even if saving fails,
                # so thousands of iterations do not accumulate open figures.
                plt.close(fig)

    print("missings count =", missings_counts)
    if list_filenames:
        return filenames

#### Generating the heatmaps can be time-consuming; how long it takes depends mainly on the hardware of the machine where it is executed.
#### Check the timings shown by the progress bar while it runs, or generate the image sets one at a time.

#### The images are already included in the repository, so regenerating them is optional; if you want to give it a try, it is a one-time task.

In [5]:
# 1. Full color heatmaps with titles, axes, and colorbar (site "OT");
#    the returned base file names are kept for the CSV export below.
filenames_teide = generate_hourly_heatmaps_unified(df=df_teide, site_name="OT", mode="color_full", list_filenames=True)

100%|█████████████████████████████████████████| 163/163 [09:28<00:00,  3.49s/it]

missings count = 3





In [6]:
# Same full color rendering for site "ORM", keeping the returned base file names
filenames_roque = generate_hourly_heatmaps_unified(df=df_roque, site_name="ORM", mode="color_full", list_filenames=True)

100%|█████████████████████████████████████████| 236/236 [23:18<00:00,  5.92s/it]

missings count = 4





In [7]:
# Then create the other versions
# 2. Color heatmaps without titles, axes, or colorbar (Teide first, then Roque)
for site_df, site_code in ((df_teide, "OT"), (df_roque, "ORM")):
    generate_hourly_heatmaps_unified(site_df, site_name=site_code, mode="color_plain")

100%|█████████████████████████████████████████| 163/163 [06:54<00:00,  2.54s/it]


missings count = 3


100%|█████████████████████████████████████████| 236/236 [16:40<00:00,  4.24s/it]

missings count = 4





In [8]:
# 3. Grayscale heatmaps (normalized to [0,1]) without titles, axes, or colorbar
#    (Teide first, then Roque)
for site_df, site_code in ((df_teide, "OT"), (df_roque, "ORM")):
    generate_hourly_heatmaps_unified(site_df, site_name=site_code, mode="grayscale")

100%|█████████████████████████████████████████| 163/163 [06:43<00:00,  2.48s/it]


missings count = 3


100%|█████████████████████████████████████████| 236/236 [17:33<00:00,  4.46s/it]

missings count = 4





In [9]:
# Wrap each site's list of generated file names in a one-column DataFrame
# (column 'file_names') so it can be saved as .csv
df_filenames_teide, df_filenames_roque = (
    pd.DataFrame(data=site_filenames, columns=['file_names'])
    for site_filenames in (filenames_teide, filenames_roque)
)

In [10]:
# Write each site's file-name table to CSV_DIR without the index column
for names_frame, csv_name in (
    (df_filenames_teide, "tensors_names_teide.csv"),
    (df_filenames_roque, "tensors_names_roque.csv"),
):
    names_frame.to_csv(CSV_DIR / csv_name, index=False)