# Weather Data Combination

Combine all daily weather data from cuaca-harian folder (5 stations) and create unified ID column.

In [28]:
import pandas as pd
import numpy as np
import os
import re
from glob import glob

# Get all weather CSV files from cuaca-harian folder
weather_folder = 'data/cuaca-harian'
weather_files = sorted(glob(os.path.join(weather_folder, '*.csv')))

print(f"Found {len(weather_files)} weather files:")
for file in weather_files:
    print(f"  - {os.path.basename(file)}")

# Extract station ID from filename (dki1 -> DKI1, etc.)
def extract_station_id(filename):
    basename = os.path.basename(filename).lower()
    match = re.search(r'dki(\d)', basename)
    if match:
        return f'DKI{match.group(1)}'
    return None

print("\nExtracted station IDs:")
for file in weather_files:
    station = extract_station_id(file)
    print(f"  {os.path.basename(file)} → {station}")

Found 5 weather files:
  - cuaca-harian-dki1-bundaranhi.csv
  - cuaca-harian-dki2-kelapagading.csv
  - cuaca-harian-dki3-jagakarsa.csv
  - cuaca-harian-dki4-lubangbuaya.csv
  - cuaca-harian-dki5-kebonjeruk.csv

Extracted station IDs:
  cuaca-harian-dki1-bundaranhi.csv → DKI1
  cuaca-harian-dki2-kelapagading.csv → DKI2
  cuaca-harian-dki3-jagakarsa.csv → DKI3
  cuaca-harian-dki4-lubangbuaya.csv → DKI4
  cuaca-harian-dki5-kebonjeruk.csv → DKI5


In [29]:
# Load all weather files and combine them
all_dfs = []

for file in weather_files:
    station_id = extract_station_id(file)
    
    # Load CSV file
    df = pd.read_csv(file)
    
    # Add station column
    df['stasiun'] = station_id
    
    # Add ID column in format YYYY-MM-DD_DKIx
    df['ID'] = df['time'].astype(str) + '_' + df['stasiun']
    
    all_dfs.append(df)
    print(f"Loaded {len(df)} rows from {station_id}")

# Combine all dataframes
combined_weather_df = pd.concat(all_dfs, ignore_index=True)

print(f"\n=== COMBINED DATASET ===")
print(f"Total rows: {len(combined_weather_df)}")
print(f"Date range: {combined_weather_df['time'].min()} to {combined_weather_df['time'].max()}")
print(f"Unique stations: {combined_weather_df['stasiun'].nunique()}")
print(f"Stations: {sorted(combined_weather_df['stasiun'].unique())}")

print(f"\nDataFrame shape: {combined_weather_df.shape}")
print(f"\nColumn names: {combined_weather_df.columns.tolist()}")

Loaded 5722 rows from DKI1
Loaded 5722 rows from DKI2
Loaded 5722 rows from DKI3
Loaded 5722 rows from DKI4
Loaded 5722 rows from DKI5

=== COMBINED DATASET ===
Total rows: 28610
Date range: 2010-01-01 to 2025-08-31
Unique stations: 5
Stations: ['DKI1', 'DKI2', 'DKI3', 'DKI4', 'DKI5']

DataFrame shape: (28610, 26)

Column names: ['time', 'temperature_2m_max (°C)', 'temperature_2m_min (°C)', 'precipitation_sum (mm)', 'precipitation_hours (h)', 'wind_speed_10m_max (km/h)', 'wind_direction_10m_dominant (°)', 'shortwave_radiation_sum (MJ/m²)', 'temperature_2m_mean (°C)', 'relative_humidity_2m_mean (%)', 'cloud_cover_mean (%)', 'surface_pressure_mean (hPa)', 'wind_gusts_10m_max (km/h)', 'winddirection_10m_dominant (°)', 'relative_humidity_2m_max (%)', 'relative_humidity_2m_min (%)', 'cloud_cover_max (%)', 'cloud_cover_min (%)', 'wind_gusts_10m_mean (km/h)', 'wind_speed_10m_mean (km/h)', 'wind_gusts_10m_min (km/h)', 'wind_speed_10m_min (km/h)', 'surface_pressure_max (hPa)', 'surface_pressure

In [30]:
# Reorder columns to put ID and stasiun near the front
column_order = ['ID', 'time', 'stasiun'] + [col for col in combined_weather_df.columns 
                                             if col not in ['ID', 'time', 'stasiun']]
combined_weather_df = combined_weather_df[column_order]

print("Sample data (first 5 rows):")
print(combined_weather_df[['ID', 'time', 'stasiun', 'temperature_2m_max (°C)', 
                           'precipitation_sum (mm)', 'wind_speed_10m_max (km/h)']].head())

print("\n\nData from each station (first row):")
for station in sorted(combined_weather_df['stasiun'].unique()):
    station_data = combined_weather_df[combined_weather_df['stasiun'] == station].iloc[0]
    print(f"\n{station}:")
    print(f"  ID: {station_data['ID']}")
    print(f"  Date: {station_data['time']}")
    print(f"  Max Temp: {station_data['temperature_2m_max (°C)']}°C")
    print(f"  Precipitation: {station_data['precipitation_sum (mm)']}mm")

print("\n\nData summary by station:")
print(combined_weather_df.groupby('stasiun').agg({
    'time': ['min', 'max', 'count']
}).round(2))

Sample data (first 5 rows):
                ID        time stasiun  temperature_2m_max (°C)  \
0  2010-01-01_DKI1  2010-01-01    DKI1                     29.4   
1  2010-01-02_DKI1  2010-01-02    DKI1                     30.8   
2  2010-01-03_DKI1  2010-01-03    DKI1                     30.4   
3  2010-01-04_DKI1  2010-01-04    DKI1                     30.3   
4  2010-01-05_DKI1  2010-01-05    DKI1                     29.9   

   precipitation_sum (mm)  wind_speed_10m_max (km/h)  
0                     4.0                       16.0  
1                     6.5                       14.7  
2                     7.6                       12.6  
3                     0.9                       19.3  
4                    14.3                       15.9  


Data from each station (first row):

DKI1:
  ID: 2010-01-01_DKI1
  Date: 2010-01-01
  Max Temp: 29.4°C
  Precipitation: 4.0mm

DKI2:
  ID: 2010-01-01_DKI2
  Date: 2010-01-01
  Max Temp: 29.4°C
  Precipitation: 5.2mm

DKI3:
  ID: 2010-01-

In [31]:
# Save the combined weather dataset
output_file = 'weather_2010-2025.csv'
combined_weather_df.to_csv(output_file, index=False)

print(f"✓ Combined weather dataset saved to: {output_file}")
print(f"  Total rows: {len(combined_weather_df)}")
print(f"  Total columns: {len(combined_weather_df.columns)}")
print(f"  File size: {combined_weather_df.memory_usage(deep=True).sum() / 1024 / 1024:.2f} MB")

# Show info about the key columns
print(f"\nKey columns:")
print(f"  - ID: YYYY-MM-DD_DKIx format")
print(f"  - time: Date in YYYY-MM-DD format")
print(f"  - stasiun: Station ID (DKI1-DKI5)")
print(f"  - {len(combined_weather_df.columns) - 3} weather metrics")

print(f"\nDate range: {combined_weather_df['time'].min()} to {combined_weather_df['time'].max()}")
print(f"Total unique combinations (ID): {combined_weather_df['ID'].nunique()}")

✓ Combined weather dataset saved to: weather_2010-2025.csv
  Total rows: 28610
  Total columns: 26
  File size: 9.82 MB

Key columns:
  - ID: YYYY-MM-DD_DKIx format
  - time: Date in YYYY-MM-DD format
  - stasiun: Station ID (DKI1-DKI5)
  - 23 weather metrics

Date range: 2010-01-01 to 2025-08-31
Total unique combinations (ID): 28610
