In [1]:
import pandas as pd
from statsmodels.tsa.stattools import adfuller

In [2]:
# Read the cleaned weather observations; the 'date' column is parsed as datetime
df = pd.read_csv(
    'cleaned_weather_data.csv',
    parse_dates=['date'],
)

In [3]:
# Preview the first five rows of the loaded frame
df.head(n=5)

Unnamed: 0,date,location_id,temperature,rainfall,wind_speed,precipitation
0,2010-01-01,0,26.0,0.0,12.2,0.0
1,2010-01-02,0,26.4,15.7,11.6,15.7
2,2010-01-03,0,28.0,0.0,13.0,0.0
3,2010-01-04,0,27.5,7.4,13.6,7.4
4,2010-01-05,0,27.7,9.6,10.3,9.6


In [4]:
# Columns that are checked for stationarity, in processing order
variables = [
    'temperature',
    'rainfall',
    'wind_speed',
    'precipitation',
]

In [5]:
# Accumulator for the transformed per-location frames
stationary_data = list()

In [6]:
# ADF test function
def adf_test(series, var_name, loc, alpha=0.05):
    """Run an Augmented Dickey-Fuller test on one series and print a report.

    The report contains the test statistic, the p-value, every critical
    value returned by ``adfuller`` and a one-line verdict.

    Parameters
    ----------
    series : pandas.Series
        Observations to test. Missing values are dropped before testing.
    var_name : str
        Label for the series; used only in the printed report.
    loc : object
        Location identifier; used only in the printed report.
    alpha : float, default 0.05
        Significance level. The series is reported as stationary when the
        p-value is strictly below this threshold.

    Returns
    -------
    bool
        True when the p-value is below ``alpha``; False otherwise.
        False is also returned when ``adfuller`` raises; in that case the
        error is printed instead of propagated, so a failed test is
        indistinguishable from "not stationary" for the caller.
    """
    try:
        result = adfuller(series.dropna(), autolag='AIC')
        adf_stat, p_value, critical_values = result[0], result[1], result[4]

        print(f"Location {loc} - {var_name} ADF Statistic: {adf_stat:.4f}, p-value: {p_value:.4f}")
        for key, val in critical_values.items():
            print(f"   Critical Value ({key}): {val:.4f}")

        if p_value < alpha:
            print(f"   {var_name} is stationary (reject H0)")
            return True

        print(f"   {var_name} is NOT stationary (fail to reject H0)")
        return False
    except Exception as e:
        # Deliberate best-effort: report the failure and keep the notebook
        # running; the caller treats this the same as a non-stationary series.
        print(f"   ADF test failed for {var_name} at location {loc}: {e}")
        return False

In [7]:
# Process each location
#
# For every location, each variable is tested for stationarity. A variable
# that fails the test is first-differenced into '<var>_diff' and re-tested;
# the original column is dropped only when the differenced one passes.
#
# The accumulator is emptied first so that re-running this cell does not
# append a second copy of every location (which would duplicate rows in the
# combined frame).
stationary_data.clear()

# NOTE(review): the set of columns can differ between locations ('<var>' in
# one, '<var>_diff' in another), so concatenating the per-location frames may
# yield NaN-filled columns — confirm this is intended.
for loc in df['location_id'].unique():
    print(f"\nProcessing location: {loc}")

    # Private, chronologically ordered copy of this location's rows, indexed by date.
    loc_df = (
        df[df['location_id'] == loc]
        .sort_values('date')
        .copy()
        .set_index('date')
    )

    for var in variables:
        print(f"\nChecking stationarity for '{var}' at location {loc}:")
        if adf_test(loc_df[var], var, loc):
            continue  # already stationary: keep the column unchanged

        new_var = var + '_diff'
        print(f"Applying first differencing to '{var}' → '{new_var}'")
        loc_df[new_var] = loc_df[var].diff()

        print(f"Re-checking stationarity for '{new_var}' at location {loc}:")
        if adf_test(loc_df[new_var], new_var, loc):
            # The differenced series replaces the original one.
            loc_df = loc_df.drop(columns=[var])
        else:
            # Both the original and the differenced column are kept.
            print(f"Warning: '{new_var}' is still not stationary")

    # Drop rows containing NaN in any column (differencing leaves one in the
    # first row), re-set location_id, and restore 'date' as a regular column.
    stationary_data.append(
        loc_df.dropna().assign(location_id=loc).reset_index()
    )


Processing location: 0

Checking stationarity for 'temperature' at location 0:
Location 0 - temperature ADF Statistic: -4.5997, p-value: 0.0001
   Critical Value (1%): -3.4316
   Critical Value (5%): -2.8621
   Critical Value (10%): -2.5671
   temperature is stationary (reject H0)

Checking stationarity for 'rainfall' at location 0:
Location 0 - rainfall ADF Statistic: -10.0279, p-value: 0.0000
   Critical Value (1%): -3.4316
   Critical Value (5%): -2.8621
   Critical Value (10%): -2.5671
   rainfall is stationary (reject H0)

Checking stationarity for 'wind_speed' at location 0:
Location 0 - wind_speed ADF Statistic: -5.6065, p-value: 0.0000
   Critical Value (1%): -3.4316
   Critical Value (5%): -2.8621
   Critical Value (10%): -2.5671
   wind_speed is stationary (reject H0)

Checking stationarity for 'precipitation' at location 0:
Location 0 - precipitation ADF Statistic: -10.0279, p-value: 0.0000
   Critical Value (1%): -3.4316
   Critical Value (5%): -2.8621
   Critical Value (1

In [8]:
# Stack the per-location frames row-wise into one frame with a fresh integer index
final_stationary_df = pd.concat(
    stationary_data,
    axis=0,
    ignore_index=True,
)

In [9]:
# Write the combined frame to CSV without the integer row index
final_stationary_df.to_csv('stationary_weather_data.csv', index=False)

# Confirmation messages, one per line
print(
    "\nAll locations processed.",
    "Saved stationary dataset as 'stationary_weather_data.csv'",
    sep="\n",
)


All locations processed.
Saved stationary dataset as 'stationary_weather_data.csv'
