# Read MATLAB .mat File Using h5py
This notebook demonstrates how to read a MATLAB .mat file using the h5py library and extract only a custom list of variables.

In [1]:
import h5py
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Specify File Path and Variables
Load the CSV file that contains metadata about the variables.

In [2]:
# Variable metadata table, read from a semicolon-separated CSV file
meta_df = pd.read_csv(
    'data/meta_variables.csv',
    sep=';',
)
meta_df

Unnamed: 0,name,class,bytes,size,size_of_size,left_bound,right_bound,subcategory,category,created_place
0,A,double,3872,"[22, 22]",2,22,22,Unknown,Unknown,before_start_calculation_cycle_5
1,AB,double,24,"[3, 1]",2,3,1,Unknown,Unknown,before_clear_Prometheus_data_3
2,AC1,double,8,"[1, 1]",2,1,1,Unknown,Unknown,after_Read_ISON_Data_P_2
3,AC2,double,8,"[1, 1]",2,1,1,Unknown,Unknown,after_Read_ISON_Data_P_2
4,AC3,double,8,"[1, 1]",2,1,1,Unknown,Unknown,after_Read_ISON_Data_P_2
...,...,...,...,...,...,...,...,...,...,...
2172,zero_vel_cnt,double,8,"[1, 1]",2,1,1,Unknown,Unknown,before_start_calculation_cycle_5
2173,zero_vel_cnt_max,double,8,"[1, 1]",2,1,1,Unknown,Unknown,before_start_calculation_cycle_5
2174,zero_velocity,logical,1,"[1, 1]",2,1,1,Unknown,Unknown,before_start_calculation_cycle_5
2175,zero_velocity0,logical,1,"[1, 1]",2,1,1,Unknown,Unknown,before_start_calculation_cycle_5


Choose data to load

In [3]:
# Variables whose name starts with "Fi_", ordered by the last character of
# `created_place`, with a fresh 0..n-1 index.
(
    meta_df
    .loc[
        meta_df['name'].str.startswith('Fi_'),
        ['name', 'created_place', 'category', 'subcategory'],
    ]
    .sort_values(by='created_place', key=lambda places: places.str[-1])
    .reset_index(drop=True)
)

Unnamed: 0,name,created_place,category,subcategory
0,Fi_gps,after_Read_ISON_Data_P_2,coordinates_ins_gnss,"Plots of latitude, longitude (in degrees), alt..."
1,Fi_prom,after_Read_ISON_Data_P_2,coordinates_ins_gnss,"Plots of latitude, longitude (in degrees), alt..."
2,Fi_S,before_simulation_4,Unknown,Unknown
3,Fi_S_mass,before_simulation_4,coordinates_ins_gnss,"Plots of latitude, longitude (in degrees), alt..."
4,Fi_S_out,before_simulation_4,Unknown,Unknown
5,Fi_S_prev,before_simulation_4,Unknown,Unknown
6,Fi_S_pv,before_simulation_4,Unknown,Unknown
7,Fi_gps2imu,before_simulation_4,coordinates_ins_gnss,"Plots of latitude, longitude (in degrees), alt..."
8,Fi_gps2pv,before_simulation_4,coordinates_ins_gnss,"Plots of latitude, longitude (in degrees), alt..."
9,Fi_od,before_start_calculation_cycle_5,Unknown,Unknown


In [4]:
# Variables whose name ends with "_S_mass" (including their size), ordered by
# the last character of `created_place`, with a fresh 0..n-1 index.
(
    meta_df
    .loc[
        meta_df['name'].str.endswith('_S_mass'),
        ['name', 'created_place', 'size', 'category', 'subcategory'],
    ]
    .sort_values(by='created_place', key=lambda places: places.str[-1])
    .reset_index(drop=True)
)

Unnamed: 0,name,created_place,size,category,subcategory
0,Fi_S_mass,before_simulation_4,"[1, 10000]",coordinates_ins_gnss,"Plots of latitude, longitude (in degrees), alt..."
1,G_S_mass,before_simulation_4,"[1, 10000]",orientation_angless,Orientation angle plots
2,K_S_mass,before_simulation_4,"[1, 10000]",orientation_angless,Orientation angle plots
3,La_S_mass,before_simulation_4,"[1, 10000]",coordinates_ins_gnss,"Plots of latitude, longitude (in degrees), alt..."
4,T_S_mass,before_simulation_4,"[1, 10000]",air_data_computers,Air Data Computer
5,V_S_mass,before_simulation_4,"[3, 10000]",velocities,"North, east and vertical velocity components p..."
6,Vh_S_mass,before_simulation_4,"[1, 10000]",air_data_computers,Air Data Computer
7,h_S_mass,before_simulation_4,"[1, 10000]",coordinates_ins_gnss,"Plots of latitude, longitude (in degrees), alt..."
8,hss_S_mass,before_simulation_4,"[3, 10000]",Unknown,Unknown


In [5]:

# Variables whose name matches the velocity-style pattern and whose metadata
# has left_bound > 0 and right_bound == 10000, ordered by the last character
# of `created_place`, with a fresh 0..n-1 index.
(
    meta_df
    .loc[
        meta_df['name'].str.contains(r'^V_|V[a-z]_', regex=True)
        & (meta_df['left_bound'] > 0)
        & (meta_df['right_bound'] == 10000),
        ['name', 'created_place', 'category', 'subcategory'],
    ]
    .sort_values(by='created_place', key=lambda places: places.str[-1])
    .reset_index(drop=True)
)

Unnamed: 0,name,created_place,category,subcategory
0,Vn_prom,after_Read_ISON_Data_P_2,velocities,"North, east and vertical velocity components p..."
1,V_inp,after_Read_ISON_Data_P_2,sensorss,Plots of temperature and voltage changes
2,Ve_prom,after_Read_ISON_Data_P_2,velocities,"North, east and vertical velocity components p..."
3,Vh_gps,after_Read_ISON_Data_P_2,Unknown,Unknown
4,Vn_gps,after_Read_ISON_Data_P_2,velocities,"North, east and vertical velocity components p..."
5,V_ext,after_Read_ISON_Data_P_2,Unknown,Unknown
6,V_od,after_Read_ISON_Data_P_2,air_data_computers,Air Data Computer
7,Vv_gps,after_Read_ISON_Data_P_2,velocities,"North, east and vertical velocity components p..."
8,Ve_gps,after_Read_ISON_Data_P_2,velocities,"North, east and vertical velocity components p..."
9,Vv_prom,after_Read_ISON_Data_P_2,velocities,"North, east and vertical velocity components p..."


Select variables to read from file

In [6]:
# Hand-picked variable names that are read in addition to the
# automatically selected ones.
manual_list = [
    'ms_gps',
    'corr_cut',
    'cut_KF_1',
    'cut_KF',
    'La_prom',
    'Fi_prom',
    'Vn_prom',
    'Ve_prom',
    'Fi_S_mass',
    'La_S_mass',
    'V_S_mass',
]

In [7]:
# Names of all variables of class 'logical' whose metadata has
# left_bound > 0 and right_bound == 10000
(
    meta_df
    .loc[
        (meta_df['left_bound'] > 0)
        & (meta_df['right_bound'] == 10000)
        & (meta_df['class'] == 'logical'),
        'name',
    ]
    .reset_index(drop=True)
    .tolist()
)

['AHRS_speed_bit',
 'Acceleration_exceeding_detect',
 'Accelerometer_unit',
 'Air_speed_bit',
 'Air_speed_incorrect',
 'Air_speed_incorrect_simulated',
 'Ambient_Air_Data_bit',
 'Angular_rate_exceeding_detect',
 'Automatic_2D_calibration_of_mag',
 'Automatic_3D_calibration_of_mag',
 'BESTPOS_log_index',
 'BESTVEL_log_index',
 'BESTXYZ_log_index',
 'Baro_altimeter',
 'Base_station_range_bit',
 'Below_Threshold',
 'Clb_data_accumulation_calc',
 'Clb_status',
 'DVL_data_bit',
 'Date_bit',
 'Differential_pressure_Temperature_bit',
 'Differential_pressure_input_to_the_INS_algorithm',
 'Differential_pressure_measurement',
 'Differential_pressure_sensor',
 'Dopp_loc_bit',
 'Dual_Axis_Air_speed_bit',
 'Electronics_unit',
 'Environmental_temperature',
 'Ext_alt_bit',
 'Ext_hor_pos_bit',
 'Ext_pos_bit',
 'GNNS_antennas_position_bit',
 'GNSS_position_validity',
 'GNSS_receiver_input_to_the_INS_algorithm',
 'GNSS_receiver_unit',
 'GNSS_switch_on_command',
 'Ground_speed_bit',
 'Gyroscope_unit',
 '

Set the path to the MATLAB .mat file and define the custom list of variables to extract.

In [8]:
# Location of the MATLAB .mat file to read
mat_file_path = './data/test_data.mat'
#mat_file_path = './data/ISON_v04_20250619.mat'

# Variables to read from the .mat file: the names of all logical variables
# with left_bound > 0 and right_bound == 10000, followed by the entries of
# manual_list. Adjust the filter or manual_list to change the selection.
variables_to_read = [
    *meta_df.loc[
        (meta_df['left_bound'] > 0)
        & (meta_df['right_bound'] == 10000)
        & (meta_df['class'] == 'logical'),
        'name',
    ],
    *manual_list,
]
variables_to_read

['AHRS_speed_bit',
 'Acceleration_exceeding_detect',
 'Accelerometer_unit',
 'Air_speed_bit',
 'Air_speed_incorrect',
 'Air_speed_incorrect_simulated',
 'Ambient_Air_Data_bit',
 'Angular_rate_exceeding_detect',
 'Automatic_2D_calibration_of_mag',
 'Automatic_3D_calibration_of_mag',
 'BESTPOS_log_index',
 'BESTVEL_log_index',
 'BESTXYZ_log_index',
 'Baro_altimeter',
 'Base_station_range_bit',
 'Below_Threshold',
 'Clb_data_accumulation_calc',
 'Clb_status',
 'DVL_data_bit',
 'Date_bit',
 'Differential_pressure_Temperature_bit',
 'Differential_pressure_input_to_the_INS_algorithm',
 'Differential_pressure_measurement',
 'Differential_pressure_sensor',
 'Dopp_loc_bit',
 'Dual_Axis_Air_speed_bit',
 'Electronics_unit',
 'Environmental_temperature',
 'Ext_alt_bit',
 'Ext_hor_pos_bit',
 'Ext_pos_bit',
 'GNNS_antennas_position_bit',
 'GNSS_position_validity',
 'GNSS_receiver_input_to_the_INS_algorithm',
 'GNSS_receiver_unit',
 'GNSS_switch_on_command',
 'Ground_speed_bit',
 'Gyroscope_unit',
 '

# Read Selected Variables from .mat File
Test opening .mat file with h5py

In [None]:
# Disabled smoke test: for every requested variable, show the raw data, its
# number of dimensions and the length of its second axis, or report that the
# variable is missing from the file. Uncomment to run.
# with h5py.File(mat_file_path, 'r') as f:
#     for var in variables_to_read:
#         if var in f:
#             data = f[var][()]
#             display(data, data.ndim, data.shape[1])
#         else:
#             print(f"Variable '{var}' not found in the .mat file.")

Open the .mat file and extract only the specified variables using h5py.

In [9]:
# Read the selected variables from the .mat file into `wide_dfs`
# (variable name -> value).
#   * A 2-D array with a single column becomes a DataFrame with one column
#     named after the variable.
#   * A 2-D array with several columns becomes a DataFrame whose columns are
#     named "<variable>.1", "<variable>.2", ...
#   * Anything else (scalars, 1-D arrays, arrays with more than two
#     dimensions) is stored unchanged.
# Variables that are missing from the file are reported and skipped.
wide_dfs = {}
with h5py.File(mat_file_path, 'r') as f:
    for var in variables_to_read:
        if var not in f:
            print(f"Variable '{var}' not found in the .mat file.")
            continue

        data = f[var][()]

        # Guard on ndim, not just on the type: data.shape[1] does not exist
        # for 1-D arrays, and pd.DataFrame rejects arrays with more than two
        # dimensions.
        if isinstance(data, np.ndarray) and data.ndim == 2:
            n_cols = data.shape[1]
            if n_cols == 1:
                col_names = [var]
            else:
                col_names = [f"{var}.{i+1}" for i in range(n_cols)]
            wide_dfs[var] = pd.DataFrame(data, columns=col_names)
        else:
            wide_dfs[var] = data  # Keep as is if not a 2D array

In [11]:
# Build one wide DataFrame by placing the per-variable DataFrames from
# wide_dfs side by side.
# combine_vars controls which variables take part: by default every key of
# wide_dfs; replace it with an explicit list such as
# ['cut_KF', 'another_var'] to combine only a subset.
combine_vars = [*wide_dfs]

# Entries of wide_dfs that are not DataFrames are left out
dfs_to_combine = [
    candidate
    for candidate in (wide_dfs[name] for name in combine_vars)
    if isinstance(candidate, pd.DataFrame)
]

if not dfs_to_combine:
    print('No DataFrames to combine.')
else:
    df_merged = pd.concat(dfs_to_combine, axis=1)
    display(df_merged.head())

Unnamed: 0,AHRS_speed_bit,Acceleration_exceeding_detect.1,Acceleration_exceeding_detect.2,Acceleration_exceeding_detect.3,Accelerometer_unit,Air_speed_bit,Air_speed_incorrect,Air_speed_incorrect_simulated,Ambient_Air_Data_bit,Angular_rate_exceeding_detect.1,...,cut_KF.16,La_prom,Fi_prom,Vn_prom,Ve_prom,Fi_S_mass,La_S_mass,V_S_mass.1,V_S_mass.2,V_S_mass.3
0,0,0,0,0,0,1,0,0,0,0,...,0.0,-9.320627,39.348621,0.0,0.0,39.348621,-9.320627,0.0024,-0.0007,-0.0331
1,0,0,0,0,0,1,0,0,0,0,...,0.0,-9.320627,39.348621,0.0,0.0,39.348621,-9.320627,0.0,-0.0,-0.03
2,0,0,0,0,0,1,0,0,0,0,...,0.0,-9.320627,39.348621,0.0,0.0,39.348621,-9.320627,0.0,-0.0,-0.03
3,0,0,0,0,0,1,0,0,0,0,...,0.0,-9.320627,39.348621,0.0,0.0,39.348621,-9.320627,0.0,-0.0,-0.03
4,0,0,0,0,0,1,0,0,0,0,...,0.0,-9.320627,39.348621,0.0,0.0,39.348621,-9.320627,0.0,-0.0,-0.03


Analyze correlation between data

In [12]:
# Labels of the df_merged columns that start with "cut_KF" or "V_S"
selected_columns = [
    label
    for label, keep in zip(
        df_merged.columns,
        df_merged.columns.str.contains(r'^cut_KF|^V_S', regex=True),
    )
    if keep
]
selected_columns

['cut_KF_1.1',
 'cut_KF_1.2',
 'cut_KF.1',
 'cut_KF.2',
 'cut_KF.3',
 'cut_KF.4',
 'cut_KF.5',
 'cut_KF.6',
 'cut_KF.7',
 'cut_KF.8',
 'cut_KF.9',
 'cut_KF.10',
 'cut_KF.11',
 'cut_KF.12',
 'cut_KF.13',
 'cut_KF.14',
 'cut_KF.15',
 'cut_KF.16',
 'V_S_mass.1',
 'V_S_mass.2',
 'V_S_mass.3']

In [70]:
# Show only the columns listed in selected_columns.
# Alternative (disabled): keep every column that has at least one non-zero value
#df_merged.loc[:,(df_merged != 0).any(axis=0)]
df_merged[selected_columns]

Unnamed: 0,cut_KF_1.1,cut_KF_1.2,cut_KF.1,cut_KF.2,cut_KF.3,cut_KF.4,cut_KF.5,cut_KF.6,cut_KF.7,cut_KF.8,...,cut_KF.10,cut_KF.11,cut_KF.12,cut_KF.13,cut_KF.14,cut_KF.15,cut_KF.16,V_S_mass.1,V_S_mass.2,V_S_mass.3
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0024,-0.0007,-0.0331
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0000,-0.0000,-0.0300
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0000,-0.0000,-0.0300
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0000,-0.0000,-0.0300
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0000,-0.0000,-0.0300
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0500,-0.0300,0.0500
9996,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0500,-0.0300,0.0500
9997,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0500,-0.0300,0.0500
9998,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0400,-0.0300,0.0500


In [13]:
# Pairwise correlation matrix of the selected columns
corr = df_merged.loc[:, selected_columns].corr()

# Boolean mask for pairs whose correlation MAGNITUDE lies between 0.1 and 0.8
# (excluding self-correlation). The absolute value has to be taken of the
# correlation matrix itself; applying abs() to the constant thresholds is a
# no-op and would drop every negative correlation.
# Stricter alternative: use 0.5 as the lower bound.
abs_corr = corr.abs()
mask = (abs_corr >= 0.1) & (abs_corr <= 0.8) & (abs_corr != 1.0)

# Option 1: Get the pairs as a list of (column, column, signed correlation)
# tuples. The matrix is symmetric, so every pair appears in both orders.
selected_pairs = [
    (col1, col2, float(corr.loc[col1, col2]))
    for col1 in corr.columns
    for col2 in corr.columns
    if mask.loc[col1, col2]
]

selected_pairs
# Option 2: Set values outside the range to NaN for visualization
#filtered_corr = corr.where(mask)
#filtered_corr

[('cut_KF.5', 'V_S_mass.2', 0.48438730765620774),
 ('cut_KF.5', 'V_S_mass.3', 0.49424931481909495),
 ('V_S_mass.2', 'cut_KF.5', 0.48438730765620774),
 ('V_S_mass.3', 'cut_KF.5', 0.49424931481909495)]

In [None]:
# Exclude columns that contain only zeros, then show the correlation matrix
# as a colour-graded table (disabled)
#df_merged.loc[:, (df_merged != 0).any(axis=0)].corr().T.style.format('{:.2f}').background_gradient(cmap='cividis') 

In [14]:
# Summary statistics for the two columns of interest
df_merged[['cut_KF.5', 'V_S_mass.2']].describe()

Unnamed: 0,cut_KF.5,V_S_mass.2
count,10000.0,10000.0
mean,0.13,-0.062423
std,0.33632,0.046804
min,0.0,-0.17
25%,0.0,-0.1
50%,0.0,-0.07
75%,0.0,-0.01
max,1.0,0.01


In [15]:
# Sum of all values in the cut_KF.5 column
df_merged['cut_KF.5'].sum()

np.float64(1300.0)

In [80]:
# Ratio of the number of rows where cut_KF.5 equals 1 to the number of rows
# where it equals 0
col = df_merged['cut_KF.5']
ratio = col.eq(1).sum() / col.eq(0).sum()
print(ratio)

0.14942528735632185
