In [2]:
import pandas as pd
import numpy as np

In [3]:
print("--- Step 1: Loading Datasets ---")

# Load our primary base file, df_historical.csv, which already contains Temp, Precip, and RH.
# The path uses forward slashes: they resolve on Windows, Linux and macOS alike, and avoid
# the invalid '\d' escape sequence that a backslash path in a normal string literal contains.
try:
    df_historical = pd.read_csv('../data/df_historical.csv', index_col='Date', parse_dates=True)
    print("Successfully loaded df_historical.csv")
    print(f"Base historical shape: {df_historical.shape}")
except FileNotFoundError:
    print("ERROR: df_historical.csv not found. Please ensure it is in the ../data directory.")
    # Re-raise instead of calling exit(): in a notebook exit() shuts the kernel down,
    # whereas a raised exception stops "Run All" at this cell and keeps the traceback.
    raise

--- Step 1: Loading Datasets ---
Successfully loaded df_historical.csv
Base historical shape: (9422, 15)


In [4]:
# Load the new datasets that need to be added.
# Both files are indexed by their 'Time' column, parsed as datetimes.
# Forward-slash paths are portable across operating systems and avoid the invalid
# '\d', '\w' and '\g' escape sequences that the backslash paths contained.
try:
    df_wind = pd.read_csv('../data/wind_speed_direction_KTM.csv', parse_dates=['Time'], index_col='Time')
    df_solar = pd.read_csv('../data/globsolar_ktm.csv', parse_dates=['Time'], index_col='Time')
    print("Successfully loaded wind and solar data.")
except FileNotFoundError as e:
    print(f"ERROR: Could not find a required data file. Details: {e}")
    # Re-raise rather than exit(): exit() would shut down the notebook kernel,
    # while raising halts execution here and leaves the kernel state inspectable.
    raise

Successfully loaded wind and solar data.


In [5]:
# Preview the first five rows of the raw wind data as loaded.
df_wind.head(n=5)

Unnamed: 0_level_0,"Wind direction (deg) at 10 m, 1 h avg","Wind speed (m/s) at 10 m, 1 h avg"
Time,Unnamed: 1_level_1,Unnamed: 2_level_1
2020-09-17 03:00:00,24.0,0.8
2020-09-17 04:00:00,135.0,1.0
2020-09-17 05:00:00,184.0,1.0
2020-09-17 06:00:00,286.0,1.4
2020-09-17 07:00:00,272.0,1.7


In [6]:
# Preview the first five rows of the raw solar radiation data as loaded.
df_solar.head(n=5)

Unnamed: 0_level_0,Global Solar Radiation 1 hour average
Time,Unnamed: 1_level_1
2020-09-17 03:00:00,212.8
2020-09-17 04:00:00,224.2
2020-09-17 05:00:00,402.6
2020-09-17 06:00:00,600.2
2020-09-17 07:00:00,285.9


In [7]:
print("\n--- Step 2: Processing New Wind and Solar Data ---")

# 2a. Process Wind Data
print("Processing and resampling wind data...")

# Give the two raw columns clear names, by position (direction first, speed second).
# Only the first two columns are renamed so that this cell can be re-run safely:
# assigning a two-element list to `df_wind.columns` fails with a length mismatch once
# the Wind_Dir_x / Wind_Dir_y columns created below already exist on the frame.
df_wind = df_wind.rename(
    columns=dict(zip(df_wind.columns[:2], ['Wind_Dir_deg', 'Wind_Speed_ms']))
)

# Decompose the direction angle into unit-vector components so that the daily mean is
# taken over cos/sin values rather than over raw degrees, which wrap around at 360.
wind_dir_rad = df_wind['Wind_Dir_deg'] * np.pi / 180
df_wind['Wind_Dir_x'] = np.cos(wind_dir_rad)
df_wind['Wind_Dir_y'] = np.sin(wind_dir_rad)

# Aggregate to one row per calendar day: mean and max speed, mean direction components.
df_wind_daily = df_wind.resample('D').agg({
    'Wind_Speed_ms': ['mean', 'max'],
    'Wind_Dir_x': 'mean',
    'Wind_Dir_y': 'mean',
})
# Flatten the two-level column index produced by the dict-of-lists aggregation.
df_wind_daily.columns = ['Wind_Speed_ms_mean', 'Wind_Speed_ms_max', 'Wind_Dir_x_mean', 'Wind_Dir_y_mean']
print("-> Wind data resampled to daily.")


--- Step 2: Processing New Wind and Solar Data ---
Processing and resampling wind data...
-> Wind data resampled to daily.


In [8]:
# Preview the first five rows of the daily wind aggregates.
df_wind_daily.head(n=5)

Unnamed: 0_level_0,Wind_Speed_ms_mean,Wind_Speed_ms_max,Wind_Dir_x_mean,Wind_Dir_y_mean
Time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2020-09-17,1.557143,3.3,0.144514,-0.113014
2020-09-18,1.317391,3.0,-0.189129,0.434765
2020-09-19,1.258333,4.3,0.050985,0.057511
2020-09-20,1.891667,5.5,-0.345494,0.095298
2020-09-21,1.679167,4.6,0.089347,0.542182


In [9]:
# 2b. Process Solar Radiation Data
print("Processing and resampling solar radiation data...")

# Shorten the raw column name, then reduce the series to daily mean / max / sum.
df_solar = df_solar.rename(columns={'Global Solar Radiation 1 hour average': 'Solar_Rad'})
df_solar_daily = df_solar.resample('D')['Solar_Rad'].agg(['mean', 'max', 'sum'])
df_solar_daily.columns = ['Solar_Rad_mean', 'Solar_Rad_max', 'Solar_Rad_sum']

print("-> Solar Radiation data resampled to daily.")

Processing and resampling solar radiation data...
-> Solar Radiation data resampled to daily.


In [10]:
print("\n--- Step 3: Joining All DataFrames ---")

# Attach the daily wind and solar aggregates to the historical base frame.
# A left join on the index keeps every historical row; dates without wind/solar
# observations receive NaN in the new columns.
df_recent_enriched = df_historical.join(
    other=[df_wind_daily, df_solar_daily],
    how='left',
)
print(f"Shape after joining all data: {df_recent_enriched.shape}")


--- Step 3: Joining All DataFrames ---
Shape after joining all data: (9422, 22)


In [11]:
# Print column dtypes and non-null counts of the joined frame.
df_recent_enriched.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 9422 entries, 1999-01-01 to 2024-10-17
Data columns (total 22 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Precipitation_mm              9422 non-null   float64
 1   Min_Temp_C                    9422 non-null   float64
 2   Max_Temp_C                    9422 non-null   float64
 3   Precipitation_7d_rolling_sum  9422 non-null   float64
 4   Year                          9422 non-null   int64  
 5   Month                         9422 non-null   int64  
 6   Day_of_Year                   9422 non-null   int64  
 7   Day_of_Week                   9422 non-null   int64  
 8   Month_sin                     9422 non-null   float64
 9   Month_cos                     9422 non-null   float64
 10  Day_of_Year_sin               9422 non-null   float64
 11  Day_of_Year_cos               9422 non-null   float64
 12  RH_mean                       9422 non-null 

In [12]:
print("\n--- Step 4: Filtering to Recent Period & Cleaning ---")

# KEY STEP: keep only the rows that have both a wind and a solar daily mean.
# Rows lacking either value are discarded, which restricts the frame to the
# recent period where all data sources are available.
required_columns = ['Wind_Speed_ms_mean', 'Solar_Rad_mean']
df_recent_enriched = df_recent_enriched.dropna(subset=required_columns)
print(f"Shape after dropping rows with no wind/solar data: {df_recent_enriched.shape}")


--- Step 4: Filtering to Recent Period & Cleaning ---
Shape after dropping rows with no wind/solar data: (1381, 22)


In [13]:
# Print column dtypes and non-null counts again, now for the filtered frame.
df_recent_enriched.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 1381 entries, 2020-09-17 to 2024-10-02
Data columns (total 22 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Precipitation_mm              1381 non-null   float64
 1   Min_Temp_C                    1381 non-null   float64
 2   Max_Temp_C                    1381 non-null   float64
 3   Precipitation_7d_rolling_sum  1381 non-null   float64
 4   Year                          1381 non-null   int64  
 5   Month                         1381 non-null   int64  
 6   Day_of_Year                   1381 non-null   int64  
 7   Day_of_Week                   1381 non-null   int64  
 8   Month_sin                     1381 non-null   float64
 9   Month_cos                     1381 non-null   float64
 10  Day_of_Year_sin               1381 non-null   float64
 11  Day_of_Year_cos               1381 non-null   float64
 12  RH_mean                       1381 non-null 

In [14]:
# Preview the first five rows of the filtered, enriched frame.
df_recent_enriched.head(n=5)

Unnamed: 0_level_0,Precipitation_mm,Min_Temp_C,Max_Temp_C,Precipitation_7d_rolling_sum,Year,Month,Day_of_Year,Day_of_Week,Month_sin,Month_cos,...,RH_mean,RH_min,RH_max,Wind_Speed_ms_mean,Wind_Speed_ms_max,Wind_Dir_x_mean,Wind_Dir_y_mean,Solar_Rad_mean,Solar_Rad_max,Solar_Rad_sum
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2020-09-17,0.01,19.6,27.5,17.61,2020,9,261,3,-1.0,-1.83697e-16,...,87.1875,73.11,96.72,1.557143,3.3,0.144514,-0.113014,140.452381,600.2,2949.5
2020-09-18,2.0,20.8,25.6,15.21,2020,9,262,4,-1.0,-1.83697e-16,...,92.24375,80.63,100.0,1.317391,3.0,-0.189129,0.434765,93.147826,406.8,2142.4
2020-09-19,20.5,20.5,30.8,35.71,2020,9,263,5,-1.0,-1.83697e-16,...,87.35625,63.72,100.0,1.258333,4.3,0.050985,0.057511,192.583333,1058.1,4622.0
2020-09-20,0.0,20.0,31.0,27.81,2020,9,264,6,-1.0,-1.83697e-16,...,85.26,62.67,98.27,1.891667,5.5,-0.345494,0.095298,243.220833,958.9,5837.3
2020-09-21,0.0,20.5,31.8,26.81,2020,9,265,0,-1.0,-1.83697e-16,...,90.05125,59.4,100.0,1.679167,4.6,0.089347,0.542182,142.808333,1000.7,3427.4


In [15]:
# Fill any minor, remaining gaps *within* the recent period.
# method='time' weights the interpolation by the spacing of the datetime index.
df_recent_enriched = df_recent_enriched.interpolate(method='time')
print("Interpolated any small gaps within the recent data.")

Interpolated any small gaps within the recent data.


In [16]:
# Save the final, clean dataframe to a new CSV file.
# The forward-slash path is portable across operating systems and avoids the invalid
# '\d' escape sequence that the backslash form contained.
df_recent_enriched.to_csv('../data/df_recent_enriched.csv')

# Summary of what was written: shape, covered date range, and the first rows.
print("\n✅ Success! The recent, enriched dataset has been created and saved as 'df_recent_enriched.csv'")
print(f"Final shape: {df_recent_enriched.shape}")
print(f"Date range: {df_recent_enriched.index.min()} to {df_recent_enriched.index.max()}")
print("--- Final DataFrame Head ---")
print(df_recent_enriched.head())


✅ Success! The recent, enriched dataset has been created and saved as 'df_recent_enriched.csv'
Final shape: (1381, 22)
Date range: 2020-09-17 00:00:00 to 2024-10-02 00:00:00
--- Final DataFrame Head ---
            Precipitation_mm  Min_Temp_C  Max_Temp_C  \
Date                                                   
2020-09-17              0.01        19.6        27.5   
2020-09-18              2.00        20.8        25.6   
2020-09-19             20.50        20.5        30.8   
2020-09-20              0.00        20.0        31.0   
2020-09-21              0.00        20.5        31.8   

            Precipitation_7d_rolling_sum  Year  Month  Day_of_Year  \
Date                                                                 
2020-09-17                         17.61  2020      9          261   
2020-09-18                         15.21  2020      9          262   
2020-09-19                         35.71  2020      9          263   
2020-09-20                         27.81  2020      9