In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
from datetime import datetime, timedelta

# Load the daily weather observations, parse the "date" column as day-first
# dates, then derive the calendar features from the parsed dates. Done as one
# chain so `data` is built in a single step.
data = (
    pd.read_csv("weather_1jan2020_24jan2025.csv")
    .assign(date=lambda frame: pd.to_datetime(frame['date'], dayfirst=True))
    .assign(
        day=lambda frame: frame['date'].dt.day,
        month=lambda frame: frame['date'].dt.month,
        year=lambda frame: frame['date'].dt.year,
        day_of_year=lambda frame: frame['date'].dt.dayofyear,
        # Monday is 0 and Sunday is 6 in pandas' dayofweek convention.
        day_of_week=lambda frame: frame['date'].dt.dayofweek,
    )
)

In [4]:
# Peek at the first parsed dates to confirm the day-first parsing worked.
print(data['date'].head())
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() only appended a stray "None" line after the report.
data.info()

0   2020-01-01
1   2020-01-02
2   2020-01-03
3   2020-01-04
4   2020-01-05
Name: date, dtype: datetime64[ns]
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1853 entries, 0 to 1852
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   date         1851 non-null   datetime64[ns]
 1   tavg         1851 non-null   float64       
 2   tmin         1851 non-null   float64       
 3   tmax         1851 non-null   float64       
 4   prcp         960 non-null    float64       
 5   snow         0 non-null      float64       
 6   wdir         1851 non-null   float64       
 7   wspd         1851 non-null   float64       
 8   wpgt         1851 non-null   float64       
 9   pres         1851 non-null   float64       
 10  day          1851 non-null   float64       
 11  month        1851 non-null   float64       
 12  year         1851 non-null   float64       
 13  day_of_year  1851 non-null   float64       


In [5]:
# 'snow' has no non-null values in this dataset, so it carries no information.
data = data.drop(columns=['snow'])

# shift() looks back by ROW POSITION, not by calendar date, so "lag N" only
# means "N days earlier" when the rows are in chronological order with one row
# per day. Enforce the ordering here instead of relying on the CSV's row order
# (a stable sort keeps the original order of any duplicate dates; rows with a
# missing date are placed last).
data = data.sort_values('date', kind='mergesort')

# Lag features: tavg and prcp from 1, 7 and 30 rows (days) earlier.
for lag in [1, 7, 30]:
    data[f'tavg_lag_{lag}'] = data['tavg'].shift(lag)
    data[f'prcp_lag_{lag}'] = data['prcp'].shift(lag)

# Drop every row that has a NaN in ANY column. This removes the leading rows
# whose lags are undefined, but it also removes every row where 'prcp' (or one
# of its lags) is missing, which is a large share of this dataset.
# NOTE(review): after this step the remaining rows are no longer guaranteed to
# be consecutive days.
data = data.dropna()

# Model inputs, grouped by where they come from. The concatenation order is the
# column order used for training and for prediction.
# NOTE(review): tmin/tmax/prcp/wind/pressure are same-day observations of the
# day whose tavg is being predicted - confirm they are available at forecast time.
features = (
    # calendar position of the day
    ['day', 'month', 'day_of_year', 'day_of_week']
    # same-day weather observations
    + ['tmin', 'tmax', 'prcp', 'wdir', 'wspd', 'wpgt', 'pres']
    # average temperature 1, 7 and 30 rows earlier
    + ['tavg_lag_1', 'tavg_lag_7', 'tavg_lag_30']
    # precipitation 1, 7 and 30 rows earlier
    + ['prcp_lag_1', 'prcp_lag_7', 'prcp_lag_30']
)

# Column the model learns to predict.
target = 'tavg'

# Chronological hold-out split: every row dated 2024 or earlier is used for
# training, and the rows from 2025 are kept aside for testing.
train_data = data.loc[data['year'] <= 2024]
test_data = data.loc[data['year'] == 2025]

# Separate the predictor columns from the target column for each partition.
X_train, y_train = train_data[features], train_data[target]
X_test, y_test = test_data[features], test_data[target]

# Fit a 100-tree random forest on the training partition; random_state is
# pinned so repeated runs use the same seed. fit() returns the estimator.
model = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)

# Score the held-out 2025 rows: MAE, and RMSE as the square root of the MSE.
y_pred = model.predict(X_test)
mae, rmse = (
    mean_absolute_error(y_test, y_pred),
    np.sqrt(mean_squared_error(y_test, y_pred)),
)

print(f"Mean Absolute Error: {mae:.2f}°C")
print(f"Root Mean Squared Error: {rmse:.2f}°C")

Mean Absolute Error: 0.69°C
Root Mean Squared Error: 1.00°C


In [6]:
def prepare_features_for_prediction(data, target_date):
    """
    Build the one-row feature frame used to predict ``tavg`` on ``target_date``.

    Only rows dated strictly before ``target_date`` are used. The most recent
    such row supplies the weather inputs (tmin, tmax, prcp, wdir, wspd, wpgt,
    pres) and the lag-1 values; the rows 7 and 30 positions from the end supply
    the longer lags. Lags are therefore positional: they equal "N days earlier"
    only when the history has one row per day right up to the day before
    ``target_date``.

    Parameters
    ----------
    data : pandas.DataFrame
        Observations with a datetime ``date`` column and the columns ``tavg``,
        ``tmin``, ``tmax``, ``prcp``, ``wdir``, ``wspd``, ``wpgt``, ``pres``.
    target_date : datetime.datetime or pandas.Timestamp
        Date to build features for (anything ``pandas.Timestamp`` accepts).

    Returns
    -------
    pandas.DataFrame
        A single row whose columns are, in order: day, month, day_of_year,
        day_of_week, tmin, tmax, prcp, wdir, wspd, wpgt, pres, tavg_lag_1,
        tavg_lag_7, tavg_lag_30, prcp_lag_1, prcp_lag_7, prcp_lag_30.
        A history-based value is NaN when there are too few earlier rows to
        supply it (including when there are none at all).

    Warns
    -----
    UserWarning
        When the row used for a lag is not dated exactly that many days before
        ``target_date`` (gaps in the history, or a target far past its end).
    """
    import warnings  # local import so this cell does not depend on the import cell

    target_ts = pd.Timestamp(target_date)

    # Chronologically ordered observations strictly before the target date.
    history = data[data['date'] < target_ts].sort_values('date')
    available = len(history)

    def value_from_end(column, offset):
        """Value `offset` rows from the end of `column`, or NaN if history is too short."""
        if available < offset:
            return np.nan
        return history[column].iloc[-offset]

    # The lags are taken by position; flag every lag whose source row is not
    # actually dated `lag` days before the target, so stale inputs are visible.
    target_day = target_ts.normalize()
    misaligned = [
        lag for lag in (1, 7, 30)
        if available >= lag
        and history['date'].iloc[-lag].normalize() != target_day - pd.Timedelta(days=lag)
    ]
    if misaligned:
        warnings.warn(
            f"lag feature(s) {misaligned} for {target_day.date()} are taken from rows "
            "that are not exactly that many days earlier; the values are stale",
            UserWarning,
            stacklevel=2,
        )

    # Key order matters: it must match the column order the model was trained on.
    row = {
        'day': target_ts.day,
        'month': target_ts.month,
        'day_of_year': target_ts.dayofyear,
        'day_of_week': target_ts.weekday(),
        # The target day's own observations are unknown, so the latest known
        # values are carried forward.
        'tmin': value_from_end('tmin', 1),
        'tmax': value_from_end('tmax', 1),
        'prcp': value_from_end('prcp', 1),
        'wdir': value_from_end('wdir', 1),
        'wspd': value_from_end('wspd', 1),
        'wpgt': value_from_end('wpgt', 1),
        'pres': value_from_end('pres', 1),
        'tavg_lag_1': value_from_end('tavg', 1),
        'tavg_lag_7': value_from_end('tavg', 7),
        'tavg_lag_30': value_from_end('tavg', 30),
        'prcp_lag_1': value_from_end('prcp', 1),
        'prcp_lag_7': value_from_end('prcp', 7),
        'prcp_lag_30': value_from_end('prcp', 30),
    }

    return pd.DataFrame([row])

In [7]:
# Forecast for 1 February 2025.
target_date = datetime(2025, 2, 1)
features_for_target_date = prepare_features_for_prediction(data, target_date)

# predict() returns one value per input row; the feature frame has exactly one row.
predicted_temp = model.predict(features_for_target_date)[0]
# The weekday and date in the label are formatted from target_date itself, so
# the text cannot drift from the date that is actually being predicted.
print(f"Predicted temperature next week ({target_date.strftime('%A %d/%m/%Y')}): {predicted_temp:.2f}°C")

Predicted Temperature for this date (Saturday) next week (1/2/2025) is:)01/02/2025: 4.37°C


In [8]:
# Forecast for 25 January 2026 (a Sunday; the previous label said Saturday).
# NOTE(review): the history-based inputs come from the latest rows available
# before this date - presumably about a year older than the target; confirm
# that this is acceptable for the forecast.
target_date = datetime(2026, 1, 25)
features_for_target_date = prepare_features_for_prediction(data, target_date)

# predict() returns one value per input row; the feature frame has exactly one row.
predicted_temp = model.predict(features_for_target_date)[0]
# The weekday and date in the label are formatted from target_date itself, so
# the text cannot drift from the date that is actually being predicted.
print(f"Predicted temperature next year ({target_date.strftime('%A %d/%m/%Y')}): {predicted_temp:.2f}°C")

Predicted Temperature for this date (Saturday) next year (25/1/2026) is:)25/01/2026: 4.40°C


In [9]:
# Forecast for 26 January 2025 (a Sunday; the previous label hard-coded
# "Saturday ... 25/1/2025", which did not match the target date).
target_date = datetime(2025, 1, 26)
features_for_target_date = prepare_features_for_prediction(data, target_date)

# predict() returns one value per input row; the feature frame has exactly one row.
predicted_temp = model.predict(features_for_target_date)[0]
# The weekday and date in the label are formatted from target_date itself, so
# the text cannot drift from the date that is actually being predicted.
print(f"Predicted temperature for {target_date.strftime('%A %d/%m/%Y')}: {predicted_temp:.2f}°C")
# NOTE(review): the reference values below are quoted for 28/1/2025, while the
# prediction above is for 26/1/2025 - confirm which date was intended.
print("According to Meteostat the average temperature on 28/1/2025 was 4.8°C, and based on Apple's weather app: 4.5°C (3°C lowest & 6°C highest) ")

Predicted Temperature for this date (Saturday) next year (25/1/2025) is:)26/01/2025: 4.40°C
According to Meteostat the average temperatur on 28/1/2025 was 4.8°C, and based om Apple's weather app: 4.5°C (3°C lowest & 6°C highest) 


In [13]:
# Back-check on 1 August 2024. This date lies inside the training period
# (year <= 2024), so it is a sanity check rather than an out-of-sample forecast.
target_date = datetime(2024, 8, 1)
features_for_target_date = prepare_features_for_prediction(data, target_date)

# predict() returns one value per input row; the feature frame has exactly one row.
predicted_temp = model.predict(features_for_target_date)[0]
# The weekday and date in the label are formatted from target_date itself, so
# the text cannot drift from the date that is actually being predicted.
print(f"Predicted temperature for {target_date.strftime('%A %d/%m/%Y')}: {predicted_temp:.2f}°C")
# Reference value for the same day (the old sentence also named 28/1/2025 by mistake).
print("According to Meteostat the average temperature on 1/8/2024 was: 16.5°C ")

Predicted Temperature for (1/8/2024) was:)01/08/2024: 16.74°C
According to Meteostat the average temperatur on 28/1/2025 was 1/8/2024 was: 16.5°C 
