In [1]:
import os
import numpy as np
import pandas as pd
import xarray as xr
import tempfile
import matplotlib.pyplot as plt
import cartopy.crs as ccrs
from datetime import datetime

pd.set_option("display.max_columns",30)

In [2]:
from station.data_submission import DataSubmission
from station.station import StationData


from station.dat_to_nc_all_var import DatToNcAllVar
from station.dat_to_nc_all_var_copy import DatToNcConverter2

In [7]:
# Instantiate the converter
name = "marshall_hourly"
directory = "measurements/Marshall"
target_directory = "station_data_as_nc"


converter = DatToNcConverter2(
    name=name,
    directory=directory,
    target_directory=target_directory,
    hourly=True,
    save_raw=True, 
    raw_directory="station_data_raw",
    save_processed=True,
    processed_directory="station_data_processed"
)

In [8]:
converter.execute()

Reading .dat files: 100%|██████████| 2465/2465 [00:14<00:00, 168.73it/s]


Resampling data to hourly intervals...


Resampling variables: 100%|██████████| 22/22 [01:11<00:00,  3.24s/it]

Saving to station_data_as_nc/marshall_hourly.nc
NetCDF file saved successfully.
Data processing complete.





In [36]:
ds = xr.open_dataset("station_data_as_nc/marshall_allvar.nc")

In [37]:
ds

In [38]:
raw_vienna = converter.raw_df
processed_vienna = converter.processed_df
resampled_vienna = converter.resampled_df

In [39]:
raw_vienna.head()

Unnamed: 0_level_0,bmp180_temp,bmp180_pres,bmp180_slp,bmp180_alt,bmp280_temp,bmp280_pres,bmp280_slp,bmp280_alt,bme_temp,bme_pres,bme_slp,bme_alt,bme_hum,htu_temp,htu_hum,mcp9808,tipping,vis_light,ir_light,uv_light,wind_dir,wind_speed
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
2017-03-01 05:24:00,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,0.0,-999.99,-999.99,-999.99,0.0,-999.99
2017-03-01 05:25:00,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,0.0,-999.99,-999.99,-999.99,0.0,0.0
2017-03-01 05:26:00,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,0.0,-999.99,-999.99,-999.99,0.0,0.0
2017-03-01 05:27:00,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,0.0,-999.99,-999.99,-999.99,0.0,0.0
2017-03-01 05:28:00,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,0.0,-999.99,-999.99,-999.99,0.0,0.0


In [32]:
processed_vienna.head()

Unnamed: 0_level_0,bmp180_temp,bmp180_pres,bmp180_slp,bmp180_alt,bmp280_temp,bmp280_pres,bmp280_slp,bmp280_alt,bme_temp,bme_pres,bme_slp,bme_alt,bme_hum,htu_temp,htu_hum,mcp9808,tipping,vis_light,ir_light,uv_light,wind_dir,wind_speed
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
2017-04-24 07:14:00,,,,,,,,,,,,,,,,,0.0,,,,,0.0
2017-04-24 07:15:00,,,,,,,,,,,,,,,,,0.0,,,,70.1,0.0
2017-04-24 07:16:00,,,,,,,,,,,,,,,,,0.0,,,,0.0,0.0
2017-04-24 07:17:00,,,,,,,,,,,,,,,,,0.0,,,,76.7,0.0
2017-04-24 07:18:00,,,,,,,,,,,,,,,,,0.0,,,,0.0,0.0


In [33]:
resampled_vienna.head() 

Unnamed: 0_level_0,bmp180_temp,bmp180_pres,bmp180_slp,bmp180_alt,bmp280_temp,bmp280_pres,bmp280_slp,bmp280_alt,bme_temp,bme_pres,bme_slp,bme_alt,bme_hum,htu_temp,htu_hum,mcp9808,tipping,vis_light,ir_light,uv_light,wind_dir,wind_speed
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
2017-04-24 07:00:00,,,,,,,,,,,,,,,,,0.0,,,,23.349669,
2017-04-24 08:00:00,,,,,,,,,,,,,,,,,0.0,,,,20.898067,
2017-04-24 09:00:00,,,,,,,,,,,,,,,,,0.0,,,,19.371252,
2017-04-24 10:00:00,,,,,,,,,,,,,,,,,0.2,,,,44.898214,0.398644
2017-04-24 11:00:00,,,,,,,,,,,,,,,,,,,,,,


In [34]:

def df_statistics(df):
    """
    Computes statistics for a pandas DataFrame with a datetime index.
    
    Parameters:
    df (pd.DataFrame): DataFrame with datetime index and measurement columns.
    
    Returns:
    pd.DataFrame: A summary DataFrame containing:
        - Total Potential Measurements
        - Actual Measurements (Non-NaN)
        - Percentage of Measurements Present
        - Non-Zero Measurements
        - Percentage of Non-Zero Measurements

    Use processed Dataframe where -999.9 values are replaced with NaN and values out of bounds are replaced with NaN
    """
    if not isinstance(df.index, pd.DatetimeIndex):
        try:
            df.index = pd.to_datetime(df.index) # import since index is dtype object before
        except Exception as e:
            raise ValueError(f"Failed to convert index to DatetimeIndex: {e}")
    
    
    df = df.sort_index()
    
    # Create a complete minute-wise index
    full_index = pd.date_range(start=df.index.min(), end=df.index.max(), freq="min")  # min stands for minute frequency
    
    # Reindex the dataframe to include all minutes, filling missing with NaN
    df = df.reindex(full_index)    
    
    total_minutes = full_index.shape[0]
    
    # Number of potential measurements per column
    total_potential = total_minutes
    
    # Actual measurements (non-NaN) per column
    actual_measurements = df.notna().sum()
    
    # Percentage of measurements present
    percentage_present = (actual_measurements / total_potential) * 100
    
    # Non-zero measurements per column (assuming numeric data)
    non_zero_measurements = (df != 0).sum()
    
    # Percentage of non-zero measurements
    percentage_non_zero = (non_zero_measurements / total_potential) * 100
    
    # Compile the statistics into a summary DataFrame
    summary = pd.DataFrame({
        'Total Potential Measurements': total_potential,
        'Actual Measurements': actual_measurements,
        'Percentage Present (%)': percentage_present.round(2),
        'Non-Zero Measurements': non_zero_measurements,
        'Percentage Non-Zero (%)': percentage_non_zero.round(2)
    })
    
    return summary

# Example Usage:
# Assuming you have a dataframe `df` with a datetime index and multiple columns.

# import pandas as pd
# df = pd.read_csv('your_data.csv', parse_dates=True, index_col='datetime_column')
# stats = dataframe_statistics(df)
# print(stats)


In [39]:
def dataset_statistics(ds, time_dim='time', freq='h'):
    """
    Computes statistics for an xarray Dataset with a time dimension.
    
    Parameters:
    ds (xr.Dataset): xarray Dataset with a time dimension and multiple data variables.
    time_dim (str): Name of the time dimension. Default is 'time'.
    freq (str): Frequency string for expected measurements (default 'T' for minutes).
    
    Returns:
    pd.DataFrame: A summary DataFrame containing statistics for each data variable:
        - Total Potential Measurements
        - Actual Measurements (Non-NaN)
        - Percentage Present (%)
        - Non-Zero Measurements
        - Percentage Non-Zero (%)
    """
    
    # Step 1: Validate Time Dimension
    if time_dim not in ds.dims:
        raise ValueError(f"The Dataset does not contain a '{time_dim}' dimension.")
    
    # Extract the time coordinates
    time = ds.coords[time_dim]
    
    # Ensure the time coordinate is of datetime type
    if not np.issubdtype(time.dtype, np.datetime64):
        try:
            ds = ds.copy()
            ds[time_dim] = pd.to_datetime(ds[time_dim].values)
            time = ds.coords[time_dim]
        except Exception as e:
            raise ValueError(f"Failed to convert '{time_dim}' to datetime: {e}")
    
    # Sort the Dataset by time
    ds = ds.sortby(time_dim)
    
    # Step 2: Create a Complete Time Index
    start_time = pd.to_datetime(time.values.min())
    end_time = pd.to_datetime(time.values.max())
    full_time = pd.date_range(start=start_time, end=end_time, freq=freq)
    total_potential = len(full_time)
    
    # Reindex the Dataset to include all expected timestamps
    ds = ds.reindex({time_dim: full_time})
    
    # Step 3: Initialize a Dictionary to Hold Statistics
    stats_dict = {
        'Total Potential Measurements': total_potential,
        'Actual Measurements': {},
        'Percentage Present (%)': {},
        'Non-Zero Measurements': {},
        'Percentage Non-Zero (%)': {}
    }
    
    # Step 4: Iterate Over Data Variables and Compute Statistics
    for var in ds.data_vars:
        data = ds[var]
        
        # Ensure the data is numeric; skip non-numeric variables
        if not np.issubdtype(data.dtype, np.number):
            print(f"Skipping non-numeric variable '{var}'.")
            continue
        
        # Actual Measurements: Count of non-NaN values
        actual = data.notnull().sum().item()
        
        # Percentage Present
        perc_present = (actual / total_potential) * 100
        
        # Non-Zero Measurements
        non_zero = (data != 0).sum().item()
        
        # Percentage Non-Zero
        perc_non_zero = (non_zero / total_potential) * 100
        
        # Populate the statistics dictionary
        stats_dict['Actual Measurements'][var] = actual
        stats_dict['Percentage Present (%)'][var] = round(perc_present, 2)
        stats_dict['Non-Zero Measurements'][var] = non_zero
        stats_dict['Percentage Non-Zero (%)'][var] = round(perc_non_zero, 2)
    
    # Step 5: Convert the Dictionary to a pandas DataFrame
    summary_df = pd.DataFrame(stats_dict)
    
    # Reorder the DataFrame for better readability
    summary_df = summary_df[
        ['Total Potential Measurements',
         'Actual Measurements',
         'Percentage Present (%)',
         'Non-Zero Measurements',
         'Percentage Non-Zero (%)']
    ]
    
    return summary_df


In [10]:
def counting_tipping_values(station_data, variabel, raworprocessed):
    # Number of potential measurement
    potential_measurements = station_data.shape[0]

    # Count the number of measurements
    measurement_count = station_data[variabel].count()

    # Count the number of times tipping is 0.0
    tipping_zero_count = station_data[station_data[variabel] == 0.0].shape[0]

    # Count the number of times tipping is non-zero and not NaN
    tipping_non_zero_count = station_data[(station_data[variabel] != 0.0) & (station_data[variabel].notna())].shape[0]

    # Total count of rows where tipping is not NaN
    tipping_total_count = station_data[station_data[variabel].notna()].shape[0]

    # Print the counts
    print("Raw or processed:", raworprocessed)
    print(f"Potential measurements: {potential_measurements}")
    print(f"Tipping equals 0.0: {tipping_zero_count}")
    print(f"Tipping is non-zero and not NaN: {tipping_non_zero_count}")
    print(f"Total number of tipping measurements (non-NaN): {tipping_total_count}")

    # Calculate percentages
    tipping_zero_percentage = (tipping_zero_count / tipping_total_count) * 100
    tipping_non_zero_percentage = (tipping_non_zero_count / tipping_total_count) * 100

    print(f"Percentage of tipping values equal to 0.0: {tipping_zero_percentage:.2f}%")
    print(f"Percentage of tipping values non-zero: {tipping_non_zero_percentage:.2f}%")

# Marshall

## Basic

In [11]:
marshall_raw_df = pd.read_csv("station_data_raw/marshall_hourly_raw_data.csv", index_col=0)
marshall_processed_df = pd.read_csv("station_data_processed/marshall_hourly_processed_data.csv", index_col=0)
ds_marshall = xr.open_dataset("station_data_as_nc/marshall_hourly.nc")

In [38]:
ds_marshall

In [12]:
marshall_processed_df.describe()


Unnamed: 0,bmp180_temp,bmp180_pres,bmp180_slp,bmp180_alt,bmp280_temp,bmp280_pres,bmp280_slp,bmp280_alt,bme_temp,bme_pres,bme_slp,bme_alt,bme_hum,htu_temp,htu_hum,mcp9808,tipping,vis_light,ir_light,uv_light,wind_dir,wind_speed
count,2413617.0,2414535.0,2413942.0,2409843.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2612900.0,2400251.0,2551416.0,2759821.0,1735939.0,1749167.0,1736749.0,2762346.0,2692444.0
mean,11.18363,822.4099,1010.259,1743.0,,,,,,,,,,11.51809,46.92207,10.87879,0.0006877982,601.1301,3866.441,187.2449,144.5796,1.391728
std,10.63827,19.24599,24.09141,0.0,,,,,,,,,,10.57538,23.64455,10.6144,0.07311341,288.119,2904.967,146.0509,120.8759,1.780924
min,-36.8,306.9,366.33,1743.0,,,,,,,,,,-29.5,0.1,-34.7,0.0,0.0,0.0,0.0,0.0,0.0
25%,3.3,821.02,1005.98,1743.0,,,,,,,,,,3.7,28.2,3.1,0.0,263.0,280.0,3.0,0.0,0.0
50%,11.7,825.1,1012.18,1743.0,,,,,,,,,,12.0,43.5,11.1,0.0,653.0,4325.0,212.0,149.4,0.85
75%,19.3,828.54,1018.98,1743.0,,,,,,,,,,19.5,64.8,18.9,0.0,865.0,6621.0,326.0,250.7,2.11
max,45.0,1097.7,1353.76,1743.0,,,,,,,,,,44.9,99.9,43.9,28.8,1919.0,15155.0,966.0,360.0,99.55


In [13]:
marshall_processed_df.head()

Unnamed: 0_level_0,bmp180_temp,bmp180_pres,bmp180_slp,bmp180_alt,bmp280_temp,bmp280_pres,bmp280_slp,bmp280_alt,bme_temp,bme_pres,bme_slp,bme_alt,bme_hum,htu_temp,htu_hum,mcp9808,tipping,vis_light,ir_light,uv_light,wind_dir,wind_speed
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
2017-03-01 05:24:00,,,,,,,,,,,,,,,,,0.0,,,,0.0,
2017-03-01 05:25:00,,,,,,,,,,,,,,,,,0.0,,,,0.0,0.0
2017-03-01 05:26:00,,,,,,,,,,,,,,,,,0.0,,,,0.0,0.0
2017-03-01 05:27:00,,,,,,,,,,,,,,,,,0.0,,,,0.0,0.0
2017-03-01 05:28:00,,,,,,,,,,,,,,,,,0.0,,,,0.0,0.0


In [14]:
marshall_raw_df.describe()

Unnamed: 0,bmp180_temp,bmp180_pres,bmp180_slp,bmp180_alt,bmp280_temp,bmp280_pres,bmp280_slp,bmp280_alt,bme_temp,bme_pres,bme_slp,bme_alt,bme_hum,htu_temp,htu_hum,mcp9808,tipping,vis_light,ir_light,uv_light,wind_dir,wind_speed
count,3549033.0,3549033.0,3549033.0,3549033.0,3549033.0,3549033.0,3549033.0,3549033.0,3549033.0,3549033.0,3549033.0,3549033.0,3549033.0,3549033.0,3549033.0,3549033.0,3549033.0,3549033.0,3549033.0,3549033.0,3549033.0,3549033.0
mean,-312.0907,239.8543,367.319,862.5385,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,-255.2583,-291.9516,-273.2406,-222.3711,-216.8339,1398.466,-419.0067,-109.1284,-240.3002
std,471.6672,850.0179,937.8128,1280.58,5.79803e-12,5.79803e-12,5.79803e-12,5.79803e-12,5.79803e-12,5.79803e-12,5.79803e-12,5.79803e-12,5.79803e-12,445.8262,490.218,454.4957,415.8371,825.3469,3174.661,602.2131,487.2279,428.5013
min,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99
25%,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,0.0,-999.99,-999.99,-999.99,0.0,0.0
50%,3.9,821.35,1006.36,1743.0,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,6.2,28.9,4.8,0.0,-999.99,-999.99,-999.99,59.5,0.1
75%,15.9,826.86,1015.33,1743.0,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,16.9,53.5,16.0,0.0,607.0,4032.0,188.0,227.8,1.64
max,84.5,1097.7,1353.76,1743.0,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,111.7,99.9,123.9,28.8,1919.0,15155.0,966.0,360.0,99.55


In [15]:
marshall_raw_df.head()

Unnamed: 0_level_0,bmp180_temp,bmp180_pres,bmp180_slp,bmp180_alt,bmp280_temp,bmp280_pres,bmp280_slp,bmp280_alt,bme_temp,bme_pres,bme_slp,bme_alt,bme_hum,htu_temp,htu_hum,mcp9808,tipping,vis_light,ir_light,uv_light,wind_dir,wind_speed
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
2017-03-01 05:24:00,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,0.0,-999.99,-999.99,-999.99,0.0,-999.99
2017-03-01 05:25:00,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,0.0,-999.99,-999.99,-999.99,0.0,0.0
2017-03-01 05:26:00,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,0.0,-999.99,-999.99,-999.99,0.0,0.0
2017-03-01 05:27:00,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,0.0,-999.99,-999.99,-999.99,0.0,0.0
2017-03-01 05:28:00,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,0.0,-999.99,-999.99,-999.99,0.0,0.0


In [16]:
print(marshall_raw_df.info())
print(marshall_processed_df.info())


<class 'pandas.core.frame.DataFrame'>
Index: 3549033 entries, 2017-03-01 05:24:00 to 2023-11-29 19:56:00
Data columns (total 22 columns):
 #   Column       Dtype  
---  ------       -----  
 0   bmp180_temp  float64
 1   bmp180_pres  float64
 2   bmp180_slp   float64
 3   bmp180_alt   float64
 4   bmp280_temp  float64
 5   bmp280_pres  float64
 6   bmp280_slp   float64
 7   bmp280_alt   float64
 8   bme_temp     float64
 9   bme_pres     float64
 10  bme_slp      float64
 11  bme_alt      float64
 12  bme_hum      float64
 13  htu_temp     float64
 14  htu_hum      float64
 15  mcp9808      float64
 16  tipping      float64
 17  vis_light    float64
 18  ir_light     float64
 19  uv_light     float64
 20  wind_dir     float64
 21  wind_speed   float64
dtypes: float64(22)
memory usage: 622.8+ MB
None
<class 'pandas.core.frame.DataFrame'>
Index: 3549033 entries, 2017-03-01 05:24:00 to 2023-11-29 19:56:00
Data columns (total 22 columns):
 #   Column       Dtype  
---  ------       -----  

In [21]:
print(type(marshall_processed_df.index))
print(marshall_processed_df.index.dtype)

<class 'pandas.core.indexes.base.Index'>
object


In [24]:
stats = df_statistics(marshall_processed_df)
print(stats)

             Total Potential Measurements  Actual Measurements  \
bmp180_temp                       3549033              2413617   
bmp180_pres                       3549033              2414535   
bmp180_slp                        3549033              2413942   
bmp180_alt                        3549033              2409843   
bmp280_temp                       3549033                    0   
bmp280_pres                       3549033                    0   
bmp280_slp                        3549033                    0   
bmp280_alt                        3549033                    0   
bme_temp                          3549033                    0   
bme_pres                          3549033                    0   
bme_slp                           3549033                    0   
bme_alt                           3549033                    0   
bme_hum                           3549033                    0   
htu_temp                          3549033              2612900   
htu_hum   

In [25]:
stats = dataframe_statistics_with_reindex(marshall_processed_df)
print(stats)

  full_index = pd.date_range(start=df.index.min(), end=df.index.max(), freq='T')  # 'T' stands for minute frequency


             Total Potential Measurements  Actual Measurements  \
bmp180_temp                       3549033              2413617   
bmp180_pres                       3549033              2414535   
bmp180_slp                        3549033              2413942   
bmp180_alt                        3549033              2409843   
bmp280_temp                       3549033                    0   
bmp280_pres                       3549033                    0   
bmp280_slp                        3549033                    0   
bmp280_alt                        3549033                    0   
bme_temp                          3549033                    0   
bme_pres                          3549033                    0   
bme_slp                           3549033                    0   
bme_alt                           3549033                    0   
bme_hum                           3549033                    0   
htu_temp                          3549033              2612900   
htu_hum   

In [40]:
summary = dataset_statistics(ds_marshall)
print(summary)

             Total Potential Measurements  Actual Measurements  \
bmp180_temp                         59151                38990   
bmp180_pres                         59151                39811   
bmp180_slp                          59151                39802   
bmp180_alt                          59151                    0   
bmp280_temp                         59151                    0   
bmp280_pres                         59151                    0   
bmp280_slp                          59151                    0   
bmp280_alt                          59151                    0   
bme_temp                            59151                    0   
bme_pres                            59151                    0   
bme_slp                             59151                    0   
bme_alt                             59151                    0   
bme_hum                             59151                    0   
htu_temp                            59151                42839   
htu_hum   

In [None]:
counting_tipping_values(marshall_processed_df, "processed")

Raw or processed: processed
Tipping equals 0.0: 45248
Tipping is non-zero and not NaN: 1140
Total number of tipping measurements (non-NaN): 46388
Percentage of tipping values equal to 0.0: 97.54%
Percentage of tipping values non-zero: 2.46%


# Vienna

## Basic

In [41]:
vienna_raw_df = pd.read_csv("station_data_raw/vienna_hourly_raw_data.csv", index_col=0)
vienna_processed_df = pd.read_csv("station_data_processed/vienna_hourly_processed_data.csv", index_col=0)
ds_vienna = xr.open_dataset("station_data_as_nc/vienna_hourly.nc")

In [33]:
vienna_processed_df.head()

Unnamed: 0_level_0,bmp180_temp,bmp180_pres,bmp180_slp,bmp180_alt,bmp280_temp,bmp280_pres,bmp280_slp,bmp280_alt,bme_temp,bme_pres,bme_slp,bme_alt,bme_hum,htu_temp,htu_hum,mcp9808,tipping,vis_light,ir_light,uv_light,wind_dir,wind_speed
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
2017-04-24 07:14:00,,,,,,,,,,,,,,,,,0.0,,,,,0.0
2017-04-24 07:15:00,,,,,,,,,,,,,,,,,0.0,,,,70.1,0.0
2017-04-24 07:16:00,,,,,,,,,,,,,,,,,0.0,,,,0.0,0.0
2017-04-24 07:17:00,,,,,,,,,,,,,,,,,0.0,,,,76.7,0.0
2017-04-24 07:18:00,,,,,,,,,,,,,,,,,0.0,,,,0.0,0.0


In [30]:
vienna_processed_df.describe()

Unnamed: 0,bmp180_temp,bmp180_pres,bmp180_slp,bmp180_alt,bmp280_temp,bmp280_pres,bmp280_slp,bmp280_alt,bme_temp,bme_pres,bme_slp,bme_alt,bme_hum,htu_temp,htu_hum,mcp9808,tipping,vis_light,ir_light,uv_light,wind_dir,wind_speed
count,0.0,0.0,0.0,0.0,1135884.0,1135825.0,1135801.0,1132564.0,0.0,0.0,0.0,0.0,0.0,1821771.0,1779866.0,886720.0,1924638.0,1153430.0,1153436.0,1153434.0,1923084.0,991046.0
mean,,,,,14.62842,964.8641,983.2735,159.0,,,,,,12.41188,68.78195,10.982326,0.001245325,338.6394,1100.471,46.38932,183.3463,0.837312
std,,,,,9.280641,102.3951,104.527,0.0,,,,,,8.565497,17.75864,8.994721,0.07158464,165.0865,1672.651,85.0547,110.0088,0.793955
min,,,,,-31.7,382.22,389.18,159.0,,,,,,-39.7,1.2,-15.4,0.0,0.0,0.0,0.0,0.0,0.0
25%,,,,,6.7,990.95,1009.91,159.0,,,,,,5.4,56.3,3.6,0.0,261.0,253.0,2.0,82.0,0.24
50%,,,,,14.56,997.39,1016.17,159.0,,,,,,11.8,70.3,9.7,0.0,263.0,272.0,3.0,197.4,0.67
75%,,,,,22.43,1002.48,1021.62,159.0,,,,,,18.9,82.8,18.0,0.0,340.0,1023.0,45.0,280.7,1.24
max,,,,,42.27,1087.58,1107.61,159.0,,,,,,38.1,99.9,41.7,28.0,1630.0,13541.0,736.0,360.0,98.24


In [31]:
vienna_raw_df.describe()

Unnamed: 0,bmp180_temp,bmp180_pres,bmp180_slp,bmp180_alt,bmp280_temp,bmp280_pres,bmp280_slp,bmp280_alt,bme_temp,bme_pres,bme_slp,bme_alt,bme_hum,htu_temp,htu_hum,mcp9808,tipping,vis_light,ir_light,uv_light,wind_dir,wind_speed
count,2673207.0,2673207.0,2673207.0,2673207.0,2673207.0,2673207.0,2673207.0,2673207.0,2673207.0,2673207.0,2673207.0,2673207.0,2673207.0,2673207.0,2673207.0,2673207.0,2673207.0,2673207.0,2673207.0,2673207.0,2673207.0,2673207.0
mean,-999.99,-999.99,-999.99,-999.99,-568.8633,-165.1387,-157.3345,-508.9579,-999.99,-999.99,-999.99,-999.99,-999.99,-310.0451,-288.3837,-664.3431,-280.0229,-422.4009,-93.6824,-548.4988,-148.7072,-628.9503
std,4.661161e-12,4.661161e-12,4.661161e-12,4.661161e-12,501.5931,973.5705,982.7422,572.7031,4.661161e-12,4.661161e-12,4.661161e-12,4.661161e-12,4.661161e-12,471.7284,504.3525,476.1447,449.0081,671.8095,1513.093,521.2565,539.794,483.401
min,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99
25%,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99
50%,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,5.8,56.2,-999.99,0.0,-999.99,-999.99,-999.99,112.8,-999.99
75%,-999.99,-999.99,-999.99,-999.99,11.63,995.62,1014.45,159.0,-999.99,-999.99,-999.99,-999.99,-999.99,15.5,76.3,3.5,0.0,262.0,254.0,3.0,256.5,0.36
max,-999.99,-999.99,-999.99,-999.99,81.47,1087.58,1107.61,159.0,-999.99,-999.99,-999.99,-999.99,-999.99,81.4,99.9,124.7,28.0,1630.0,13541.0,736.0,360.0,98.24


In [32]:
print(vienna_processed_df.info())
print(vienna_raw_df.info())

<class 'pandas.core.frame.DataFrame'>
Index: 2673207 entries, 2017-04-24 07:14:00 to 2022-05-24 16:40:00
Data columns (total 22 columns):
 #   Column       Dtype  
---  ------       -----  
 0   bmp180_temp  float64
 1   bmp180_pres  float64
 2   bmp180_slp   float64
 3   bmp180_alt   float64
 4   bmp280_temp  float64
 5   bmp280_pres  float64
 6   bmp280_slp   float64
 7   bmp280_alt   float64
 8   bme_temp     float64
 9   bme_pres     float64
 10  bme_slp      float64
 11  bme_alt      float64
 12  bme_hum      float64
 13  htu_temp     float64
 14  htu_hum      float64
 15  mcp9808      float64
 16  tipping      float64
 17  vis_light    float64
 18  ir_light     float64
 19  uv_light     float64
 20  wind_dir     float64
 21  wind_speed   float64
dtypes: float64(22)
memory usage: 469.1+ MB
None
<class 'pandas.core.frame.DataFrame'>
Index: 2673207 entries, 2017-04-24 07:14:00 to 2022-05-24 16:40:00
Data columns (total 22 columns):
 #   Column       Dtype  
---  ------       -----  

In [None]:
counting_tipping_values(vienna_raw_df, "raw")

Raw or processed: raw
Tipping equals 0.0: 1918917
Tipping is non-zero and not NaN: 754290
Total number of tipping measurements (non-NaN): 2673207
Percentage of tipping values equal to 0.0: 71.78%
Percentage of tipping values non-zero: 28.22%


In [None]:
counting_tipping_values(vienna_processed_df, "processed")

Raw or processed: processed
Tipping equals 0.0: 30236
Tipping is non-zero and not NaN: 1840
Total number of tipping measurements (non-NaN): 32076
Percentage of tipping values equal to 0.0: 94.26%
Percentage of tipping values non-zero: 5.74%


In [35]:
stats = dataframe_statistics_with_reindex(vienna_processed_df)
print(stats)

  full_index = pd.date_range(start=df.index.min(), end=df.index.max(), freq='T')  # 'T' stands for minute frequency


             Total Potential Measurements  Actual Measurements  \
bmp180_temp                       2673207                    0   
bmp180_pres                       2673207                    0   
bmp180_slp                        2673207                    0   
bmp180_alt                        2673207                    0   
bmp280_temp                       2673207              1135884   
bmp280_pres                       2673207              1135825   
bmp280_slp                        2673207              1135801   
bmp280_alt                        2673207              1132564   
bme_temp                          2673207                    0   
bme_pres                          2673207                    0   
bme_slp                           2673207                    0   
bme_alt                           2673207                    0   
bme_hum                           2673207                    0   
htu_temp                          2673207              1821771   
htu_hum   

In [42]:
summary = dataset_statistics(ds_vienna)
print(summary)

             Total Potential Measurements  Actual Measurements  \
bmp180_temp                         44554                    0   
bmp180_pres                         44554                    0   
bmp180_slp                          44554                    0   
bmp180_alt                          44554                    0   
bmp280_temp                         44554                17070   
bmp280_pres                         44554                17072   
bmp280_slp                          44554                17072   
bmp280_alt                          44554                    0   
bme_temp                            44554                    0   
bme_pres                            44554                    0   
bme_slp                             44554                    0   
bme_alt                             44554                    0   
bme_hum                             44554                    0   
htu_temp                            44554                27336   
htu_hum   

# Barbados

In [43]:
barbados_raw_df = pd.read_csv("station_data_raw/barbados_hourly_raw_data.csv", index_col=0)
barbados_processed_df = pd.read_csv("station_data_processed/barbados_hourly_processed_data.csv", index_col=0)
ds_barbados = xr.open_dataset("station_data_as_nc/barbados_hourly.nc")


## Basic

In [None]:
barbados_raw_df.describe()

Unnamed: 0,bmp180_temp,bmp180_pres,bmp180_slp,bmp180_alt,bmp280_temp,bmp280_pres,bmp280_slp,bmp280_alt,bme_temp,bme_pres,...,bme_hum,htu_temp,htu_hum,mcp9808,tipping,vis_light,ir_light,uv_light,wind_dir,wind_speed
count,1817508.0,1817508.0,1817508.0,1817508.0,1817508.0,1817508.0,1817508.0,1817508.0,1817508.0,1817508.0,...,1817508.0,1817508.0,1817508.0,1817508.0,1817508.0,1817508.0,1817508.0,1817508.0,1817508.0,1817508.0
mean,-999.99,-999.99,-999.99,-999.99,-212.3054,484.2709,506.9669,-21.88401,-999.99,-999.99,...,-999.99,-196.7659,-436.3836,-260.3876,-43.56546,91.18536,1256.991,-164.6803,-762.592,-41.9657
std,2.501111e-12,2.501111e-12,2.501111e-12,2.501111e-12,433.2454,822.5395,835.3257,537.9648,2.501111e-12,2.501111e-12,...,2.501111e-12,419.785,542.6432,459.2763,204.1328,637.4778,2569.836,474.4023,425.4855,207.2575
min,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,...,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99
25%,-999.99,-999.99,-999.99,-999.99,23.12,648.6,669.32,274.0,-999.99,-999.99,...,-999.99,9.3,-999.99,-999.99,0.0,259.0,252.0,1.0,-999.99,1.24
50%,-999.99,-999.99,-999.99,-999.99,24.85,983.62,1014.75,274.0,-999.99,-999.99,...,-999.99,23.5,69.7,24.2,0.0,261.0,254.0,2.0,-999.99,2.55
75%,-999.99,-999.99,-999.99,-999.99,26.7,985.47,1016.69,274.0,-999.99,-999.99,...,-999.99,25.3,87.5,25.9,0.0,430.0,1988.0,92.0,-999.99,3.65
max,-999.99,-999.99,-999.99,-999.99,84.41,1084.97,1119.66,274.0,-999.99,-999.99,...,-999.99,124.3,99.9,123.9,12.6,1960.0,14052.0,984.0,0.0,99.98


In [None]:
barbados_raw_df.head()

Unnamed: 0_level_0,bmp180_temp,bmp180_pres,bmp180_slp,bmp180_alt,bmp280_temp,bmp280_pres,bmp280_slp,bmp280_alt,bme_temp,bme_pres,...,bme_hum,htu_temp,htu_hum,mcp9808,tipping,vis_light,ir_light,uv_light,wind_dir,wind_speed
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2020-06-12 00:00:00,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,...,-999.99,-999.99,-999.99,-999.99,0.0,-999.99,-999.99,-999.99,0.0,0.0
2020-06-12 00:01:00,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,...,-999.99,-999.99,-999.99,-999.99,0.0,-999.99,-999.99,-999.99,0.0,0.0
2020-06-12 00:02:00,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,...,-999.99,-999.99,-999.99,-999.99,0.0,-999.99,-999.99,-999.99,0.0,0.0
2020-06-12 00:03:00,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,...,-999.99,-999.99,-999.99,-999.99,0.0,-999.99,-999.99,-999.99,0.0,0.0
2020-06-12 00:04:00,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,...,-999.99,-999.99,-999.99,-999.99,0.0,-999.99,-999.99,-999.99,0.0,0.0


In [None]:
barbados_processed_df.describe()

Unnamed: 0,bmp180_temp,bmp180_pres,bmp180_slp,bmp180_alt,bmp280_temp,bmp280_pres,bmp280_slp,bmp280_alt,bme_temp,bme_pres,...,htu_temp,htu_hum,mcp9808,tipping,vis_light,ir_light,uv_light,wind_dir,wind_speed,tas
count,0.0,0.0,0.0,0.0,19275.0,19304.0,19304.0,0.0,0.0,0.0,...,19539.0,15417.0,17389.0,29008.0,22612.0,21491.0,13230.0,0.0,26956.0,24780.0
mean,,,,,26.363526,984.414959,1015.581027,,,,...,22.188743,85.960735,25.575539,0.193285,419.594529,1990.336373,147.866062,,3.065972,297.753908
std,,,,,1.96642,11.590826,11.95317,,,,...,6.201811,8.186707,2.048191,1.34119,228.350229,2400.179046,129.135906,,2.429038,2.330054
min,,,,,-21.155,648.6,669.32,,,,...,1.5,55.9,0.0,0.0,0.0,0.0,0.0,,0.001,251.995
25%,,,,,24.975,983.83,1014.975,,,,...,21.9,80.15,24.2,0.0,260.0,253.5,29.0,,1.963417,296.56125
50%,,,,,26.16,985.0,1016.21,,,,...,24.4,86.9,25.4,0.0,266.5,499.0,124.5,,2.895667,297.826667
75%,,,,,27.66,986.125,1017.37,,,,...,26.2,92.55,27.1,0.0,536.625,3262.25,227.0,,3.824583,299.106667
max,,,,,33.41,990.78,1021.94,,,,...,31.1,99.8,32.2,66.2,1221.0,10258.5,516.5,,74.617241,303.87


In [None]:
barbados_processed_df.head()

Unnamed: 0_level_0,bmp180_temp,bmp180_pres,bmp180_slp,bmp180_alt,bmp280_temp,bmp280_pres,bmp280_slp,bmp280_alt,bme_temp,bme_pres,...,htu_temp,htu_hum,mcp9808,tipping,vis_light,ir_light,uv_light,wind_dir,wind_speed,tas
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2020-06-08 15:00:00,,,,,,,,,,,...,,,,0.0,,,,,,
2020-06-08 16:00:00,,,,,,,,,,,...,,,,0.0,,,,,,
2020-06-08 17:00:00,,,,,,,,,,,...,,,,0.0,,,,,,
2020-06-08 18:00:00,,,,,,,,,,,...,,,,0.0,,,,,,
2020-06-08 19:00:00,,,,,,,,,,,...,,,,,,,,,,


In [None]:
print(barbados_processed_df.info())
print(barbados_raw_df.info())

<class 'pandas.core.frame.DataFrame'>
Index: 30293 entries, 2020-06-08 15:00:00 to 2023-11-21 23:00:00
Data columns (total 23 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   bmp180_temp  0 non-null      float64
 1   bmp180_pres  0 non-null      float64
 2   bmp180_slp   0 non-null      float64
 3   bmp180_alt   0 non-null      float64
 4   bmp280_temp  19275 non-null  float64
 5   bmp280_pres  19304 non-null  float64
 6   bmp280_slp   19304 non-null  float64
 7   bmp280_alt   0 non-null      float64
 8   bme_temp     0 non-null      float64
 9   bme_pres     0 non-null      float64
 10  bme_slp      0 non-null      float64
 11  bme_alt      0 non-null      float64
 12  bme_hum      0 non-null      float64
 13  htu_temp     19539 non-null  float64
 14  htu_hum      15417 non-null  float64
 15  mcp9808      17389 non-null  float64
 16  tipping      29008 non-null  float64
 17  vis_light    22612 non-null  float64
 18  ir_light     21491 

In [None]:
counting_tipping_values(barbados_raw_df, "raw")

Raw or processed: raw
Tipping equals 0.0: 1721851
Tipping is non-zero and not NaN: 95657
Total number of tipping measurements (non-NaN): 1817508
Percentage of tipping values equal to 0.0: 94.74%
Percentage of tipping values non-zero: 5.26%


In [None]:
counting_tipping_values(barbados_processed_df, "processed")

Raw or processed: processed
Tipping equals 0.0: 26140
Tipping is non-zero and not NaN: 2868
Total number of tipping measurements (non-NaN): 29008
Percentage of tipping values equal to 0.0: 90.11%
Percentage of tipping values non-zero: 9.89%


In [37]:
stats = dataframe_statistics_with_reindex(barbados_processed_df)
print(stats)

  full_index = pd.date_range(start=df.index.min(), end=df.index.max(), freq='T')  # 'T' stands for minute frequency


             Total Potential Measurements  Actual Measurements  \
bmp180_temp                       1817508                    0   
bmp180_pres                       1817508                    0   
bmp180_slp                        1817508                    0   
bmp180_alt                        1817508                    0   
bmp280_temp                       1817508              1395374   
bmp280_pres                       1817508              1395378   
bmp280_slp                        1817508              1395368   
bmp280_alt                        1817508              1395392   
bme_temp                          1817508                    0   
bme_pres                          1817508                    0   
bme_slp                           1817508                    0   
bme_alt                           1817508                    0   
bme_hum                           1817508                    0   
htu_temp                          1817508              1426764   
htu_hum   

In [44]:
summary = dataset_statistics(ds_barbados)
print(summary)

             Total Potential Measurements  Actual Measurements  \
bmp180_temp                         30293                    0   
bmp180_pres                         30293                    0   
bmp180_slp                          30293                    0   
bmp180_alt                          30293                    0   
bmp280_temp                         30293                19275   
bmp280_pres                         30293                19304   
bmp280_slp                          30293                19304   
bmp280_alt                          30293                    0   
bme_temp                            30293                    0   
bme_pres                            30293                    0   
bme_slp                             30293                    0   
bme_alt                             30293                    0   
bme_hum                             30293                    0   
htu_temp                            30293                19539   
htu_hum   