In [65]:
import glob

# Concatenate every file matching "result*.txt" in the working directory,
# in sorted filename order, into a single UTF-8 text file. A newline is
# appended after each file's content.
files = glob.glob("result*.txt")
files.sort()

with open("humidity_merged_results.txt", "w", encoding="utf-8") as merged:
    for fn in files:
        # Read the whole part first, then append it to the merged output.
        with open(fn, "r", encoding="utf-8") as part:
            content = part.read()
        merged.write(content)
        merged.write("\n")
        print(f"Merged file: {fn}")


Merged file: result (1).txt
Merged file: result (2).txt
Merged file: result (3).txt
Merged file: result.txt


In [60]:
import os

# Drop data rows whose 4th comma-separated field is empty (or that have fewer
# than 4 fields, e.g. blank lines); comment/header lines are kept unchanged.
#
# NOTE(review): the merge cell writes "humidity_merged_results.txt" relative to
# the working directory; this absolute path assumes that directory is
# ~/Downloads — confirm.
input_file  = '/Users/shuzhou/Downloads/humidity_merged_results.txt'
output_file = '/Users/shuzhou/Downloads/humidity_merged_results_filtered.txt'

# The merged file is written as UTF-8, so read and write it as UTF-8 explicitly
# instead of relying on the platform's default encoding.
with open(input_file, 'r', encoding='utf-8') as fin, \
        open(output_file, 'w', encoding='utf-8') as fout:
    for line in fin:
        # Preserve comment/header lines starting with '#'
        if line.startswith('#'):
            fout.write(line)
            continue

        # Split by comma and strip whitespace around every field
        parts = [p.strip() for p in line.rstrip('\n').split(',')]
        # Write only lines that have at least 4 fields and a non-empty 4th field
        if len(parts) >= 4 and parts[3] != '':
            fout.write(line)

print(f"Filtered results saved to: {output_file}")


Filtered results saved to: /Users/shuzhou/Downloads/humidity_merged_results_filtered.txt


In [62]:

import numpy as np

# Load the .npy file at `path`, report the array's shape and dtype, then
# print the leading slice along the first axis.
path = '/Users/shuzhou/Downloads/capsule-0341365/data/global_temp/temp_global_hourly_train.npy'
arr = np.load(path)

for label, value in (("shape:", arr.shape), ("dtype:", arr.dtype)):
    print(label, value)

print(arr[0:1])

shape: (12280, 3850, 1)
dtype: float64
[[[ 10.]
  [ 82.]
  [ 88.]
  ...
  [270.]
  [250.]
  [250.]]]


In [63]:
import numpy as np

# Load the .npy file at `path`, report the array's shape and dtype, then
# print its second element.
path = '/Users/shuzhou/Downloads/capsule-0341365/data/global_temp/data_time_train.npy'
arr = np.load(path)

for label, value in (("shape:", arr.shape), ("dtype:", arr.dtype)):
    print(label, value)

print(arr[1])


shape: (12280,)
dtype: <U19
2019-01-01 01:00:00


In [64]:
import numpy as np
import pandas as pd
import os
import re

knmi_file = '/Users/shuzhou/Downloads/humidity_merged_results_filtered.txt'

# Read station metadata from the header (comment) lines.
# Matches lines like: "# <station_id> <longitude> <latitude> <elevation> ..."
station_header_re = re.compile(r'#\s*(\d+)\s+([\d\.\-]+)\s+([\d\.\-]+)\s+([\d\.\-]+)')

# station_id -> [lat, lon, alt]
station_info = {}
with open(knmi_file, 'r', encoding='utf-8') as f:
    for line in f:
        # The input is a concatenation of several result files, each with its
        # own header block, so comment lines can appear after data rows.
        # Scan every comment line instead of stopping at the first data row;
        # otherwise stations described only in a later header would be missed.
        if not line.startswith('#'):
            continue
        m = station_header_re.match(line)
        if m:
            stn = int(m.group(1))
            lon = float(m.group(2))
            lat = float(m.group(3))
            alt = float(m.group(4))
            station_info[stn] = [lat, lon, alt]
# Parse the data rows (comment lines are skipped) into four named columns,
# then derive three helper columns:
#   date         - calendar day parsed from YYYYMMDD
#   datetime     - date plus HH hours
#   datetime_str - datetime rendered as 'YYYY-MM-DD HH:MM:SS'
df = pd.read_csv(
    knmi_file,
    names=['STN', 'YYYYMMDD', 'HH', 'U'],
    header=None,
    comment='#',
    skipinitialspace=True,
).assign(
    date=lambda frame: pd.to_datetime(frame['YYYYMMDD'], format='%Y%m%d'),
    datetime=lambda frame: frame['date'] + pd.to_timedelta(frame['HH'], unit='h'),
    datetime_str=lambda frame: frame['datetime'].dt.strftime('%Y-%m-%d %H:%M:%S'),
)

# Get sorted list of unique station IDs
unique_stations = sorted(df['STN'].unique())

# Fail early with a readable message if a station that appears in the data
# has no metadata parsed from the header lines.
missing_stations = [int(stn) for stn in unique_stations if stn not in station_info]
if missing_stations:
    raise KeyError(f"No header metadata found for station(s): {missing_stations}")

# One [lat, lon, alt] row per station, in the same order as the station axis
# of humidity_array.
stations_array = np.array([
    station_info[stn] for stn in unique_stations
])

# Deduplicate timestamps and sort, then create arrays.
# The 'YYYY-MM-DD HH:MM:SS' format sorts chronologically as plain strings.
unique_times = sorted(df['datetime_str'].unique())
time_array = np.array(unique_times, dtype=str)

# Prepare a 3D array: [time, station, 1 feature]; NaN marks (time, station)
# pairs that have no observation.
humidity_array = np.full((len(unique_times), len(unique_stations), 1), np.nan)

# Create lookup maps for indexing
station_indices = {stn: i for i, stn in enumerate(unique_stations)}
time_indices = {t: i for i, t in enumerate(unique_times)}

# Populate the humidity array with one indexed assignment instead of a
# per-row Python loop. If a (time, station) pair occurs more than once, the
# last occurrence wins, exactly as it would with sequential assignment.
t_idx = df['datetime_str'].map(time_indices).to_numpy(dtype=np.intp)
s_idx = df['STN'].map(station_indices).to_numpy(dtype=np.intp)
humidity_array[t_idx, s_idx, 0] = df['U'].to_numpy()

# Chronological split along the time axis: the first 70% of time points go to
# train, the next 10% to validation, and the remaining 20% to test.
n_times = len(unique_times)
train_end, val_end = int(n_times * 0.7), int(n_times * 0.8)
split_points = [train_end, val_end]

train_time, val_time, test_time = np.split(time_array, split_points)
train_humidity, val_humidity, test_humidity = np.split(humidity_array, split_points, axis=0)

# Write the station table first, then each split's timestamps and values.
output_dir = '/Users/shuzhou/Downloads/knmi_humidity_model_format/'
os.makedirs(output_dir, exist_ok=True)

outputs = [
    ('stations_sorted.npy', stations_array),
    ('data_time_train.npy', train_time),
    ('humidity_knmi_hourly_train.npy', train_humidity),
    ('data_time_val.npy', val_time),
    ('humidity_knmi_hourly_val.npy', val_humidity),
    ('data_time_test.npy', test_time),
    ('humidity_knmi_hourly_test.npy', test_humidity),
]
for filename, payload in outputs:
    np.save(os.path.join(output_dir, filename), payload)

# Summary of what was written
print(f"Conversion complete! Data saved to: {output_dir}")
print(f"Number of stations: {len(unique_stations)}")
print(f"Total time points: {n_times}")
print(f"Training time points: {len(train_time)} ({len(train_time)/n_times*100:.1f}%)")
print(f"Validation time points: {len(val_time)} ({len(val_time)/n_times*100:.1f}%)")
print(f"Test time points: {len(test_time)} ({len(test_time)/n_times*100:.1f}%)")
print(f"Data shapes -> Train: {train_humidity.shape}, Val: {val_humidity.shape}, Test: {test_humidity.shape}")


Conversion complete! Data saved to: /Users/shuzhou/Downloads/knmi_humidity_model_format/
Number of stations: 34
Total time points: 8784
Training time points: 6148 (70.0%)
Validation time points: 879 (10.0%)
Test time points: 1757 (20.0%)
Data shapes -> Train: (6148, 34, 1), Val: (879, 34, 1), Test: (1757, 34, 1)
