In [15]:
# STEP 0: IMPORTS
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import TimeSeriesSplit, RandomizedSearchCV
from sklearn.preprocessing import MinMaxScaler
import joblib
import shap
import warnings
warnings.filterwarnings("ignore")
import os

In [5]:
# STEP 1: LOAD DATA & FEATURE ENGINEERING
df = pd.read_csv("F:/projects/personal project/weather prediction_v1/data/weather_atmospheric_processed.csv", parse_dates=['time'], index_col='time')

# Create lag features
for col in df.columns:
    df[f"{col}_lag1"] = df[col].shift(1)

# Rolling means
for col in df.columns:
    df[f"{col}_roll3"] = df[col].rolling(window=3).mean()

# Time-based features
df['month'] = df.index.month
df['dayofyear'] = df.index.dayofyear

# Drop rows with NaNs due to shifting/rolling
df.dropna(inplace=True)

In [9]:
# STEP 2: MULTI-STEP TARGET CREATION (t+1 to t+3)
target_var = 'TMP'  # Target to forecast
df['TMP_t+1'] = df[target_var].shift(-1)
df['TMP_t+2'] = df[target_var].shift(-2)
df['TMP_t+3'] = df[target_var].shift(-3)
df.dropna(inplace=True)

In [11]:
# STEP 3: TRAIN-TEST SPLIT & SCALING
features = df.drop(columns=['TMP_t+1', 'TMP_t+2', 'TMP_t+3'])
targets = df[['TMP_t+1', 'TMP_t+2', 'TMP_t+3']]

scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(features)

train_size = int(len(df) * 0.8)
X_train, X_test = X_scaled[:train_size], X_scaled[train_size:]
y_train, y_test = targets[:train_size], targets[train_size:]

In [17]:
# STEP 4: RANDOM FOREST WITH HYPERPARAMETER TUNING
param_dist = {
    'n_estimators': [50, 100],
    'max_depth': [10, None],
    'min_samples_split': [2, 5],
}
# Define the model save directory
model_dir = "F:/projects/personal project/weather prediction_v1/models/"

best_models = {}

for step in ['TMP_t+1', 'TMP_t+2', 'TMP_t+3']:
    rf = RandomForestRegressor(random_state=42)
    search = RandomizedSearchCV(rf, param_distributions=param_dist, n_iter=3, cv=2, n_jobs=-1)
    search.fit(X_train, y_train[step])
    best_models[step] = search.best_estimator_
    model_path = os.path.join(model_dir, f"rf_model_{step}.pkl")
    joblib.dump(best_models[step], model_path)


In [19]:
# STEP 5: EVALUATION
for step in ['TMP_t+1', 'TMP_t+2', 'TMP_t+3']:
    model = best_models[step]
    preds = model.predict(X_test)
    print(f"\n📊 {step} Evaluation")
    print("MAE:", mean_absolute_error(y_test[step], preds))
    print("RMSE:", np.sqrt(mean_squared_error(y_test[step], preds)))
    print("R2 Score:", r2_score(y_test[step], preds))

    # Save prediction plots
    plt.figure(figsize=(10, 4))
    plt.plot(y_test[step].values[:100], label='Actual')
    plt.plot(preds[:100], label='Predicted')
    plt.title(f"{step} - Actual vs Predicted")
    plt.legend()
    plt.savefig(f"prediction_plot_{step}.png")
    plt.close()


📊 TMP_t+1 Evaluation
MAE: 0.4450822181090124
RMSE: 0.7312935582267434
R2 Score: 0.9995650732044311

📊 TMP_t+2 Evaluation
MAE: 0.5487915975801441
RMSE: 0.8121955301700466
R2 Score: 0.9994635097197827

📊 TMP_t+3 Evaluation
MAE: 0.5984560427480963
RMSE: 0.8621495567024434
R2 Score: 0.9993954929615083


In [21]:
# STEP 6: BASELINE MODEL
print("\n📉 Baseline Model (predict tomorrow = today)")
y_naive = X_test[:, list(features.columns).index('TMP')]  # today's TMP = tomorrow's forecast
y_true = y_test['TMP_t+1'].values
print("MAE:", mean_absolute_error(y_true, y_naive))
print("RMSE:", np.sqrt(mean_squared_error(y_true, y_naive)))


📉 Baseline Model (predict tomorrow = today)
MAE: 251.60300377432955
RMSE: 254.00514240939933


In [23]:
# STEP 7: EXPLAINABILITY WITH SHAP (t+1 only)
explainer = shap.TreeExplainer(best_models['TMP_t+1'])
shap_values = explainer.shap_values(X_test[:100])
shap.summary_plot(shap_values, features.columns, show=False)
plt.savefig("F:/projects/personal project/weather prediction_v1/outputs/shap_summary_t+1.png")
plt.close()


In [25]:
# STEP 8: CROSS-VALIDATION
print("\n📊 Cross-validation (R2 scores)")
tscv = TimeSeriesSplit(n_splits=5)
for step in ['TMP_t+1']:
    scores = []
    for train_idx, val_idx in tscv.split(X_scaled):
        model = RandomForestRegressor()
        model.fit(X_scaled[train_idx], targets[step].iloc[train_idx])
        preds = model.predict(X_scaled[val_idx])
        scores.append(r2_score(targets[step].iloc[val_idx], preds))
    print(f"{step}: Mean R2 = {np.mean(scores):.4f}, Std = {np.std(scores):.4f}")


📊 Cross-validation (R2 scores)
TMP_t+1: Mean R2 = 0.9996, Std = 0.0001


In [27]:
# STEP 9: MODEL DRIFT
print("\n📊 Model Drift Analysis")
early = slice(0, len(X_scaled)//2)
late = slice(len(X_scaled)//2, None)
model = best_models['TMP_t+1']
print("Early R2:", r2_score(targets['TMP_t+1'].iloc[early], model.predict(X_scaled[early])))
print("Late  R2:", r2_score(targets['TMP_t+1'].iloc[late], model.predict(X_scaled[late])))


📊 Model Drift Analysis
Early R2: 0.9999688765719816
Late  R2: 0.9998039280927691


In [29]:
# STEP 10: SAVE SCALER
scaler_path = os.path.join(model_dir, "scaler.pkl")
joblib.dump(scaler, scaler_path)
print("\n✅ Project pipeline completed. Ready for dashboard!")


✅ Project pipeline completed. Ready for dashboard!


>LSTM MODEL

In [31]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.callbacks import EarlyStopping
import numpy as np

scaler = joblib.load(os.path.join(model_dir, "scaler.pkl"))
X_scaled = scaler.transform(features)
y = targets
# Reshape X for LSTM: (samples, timesteps=1, features)
X_lstm = X_scaled.reshape((X_scaled.shape[0], 1, X_scaled.shape[1]))

# Define function to train and save LSTM model
def train_and_save_lstm(y_data, label):
    model = Sequential([
        LSTM(64, input_shape=(1, X_lstm.shape[2]), activation='relu'),
        Dense(1)
    ])
    model.compile(optimizer='adam', loss='mse')
    model.fit(X_lstm, y_data, epochs=20, batch_size=32, verbose=1)
    model_path = os.path.join(model_dir, f"lstm_model_{label}.h5")
    model.save(model_path)
    print(f"✅ LSTM model saved: lstm_model_{label}.h5")

# Train models for t+1, t+2, t+3
train_and_save_lstm(y['TMP_t+1'], 'TMP_t+1')
train_and_save_lstm(y['TMP_t+2'], 'TMP_t+2')
train_and_save_lstm(y['TMP_t+3'], 'TMP_t+3')

Epoch 1/20
[1m4600/4600[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 2ms/step - loss: 16674.3496
Epoch 2/20
[1m4600/4600[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 2ms/step - loss: 87.1544
Epoch 3/20
[1m4600/4600[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 1ms/step - loss: 17.5639
Epoch 4/20
[1m4600/4600[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 1ms/step - loss: 8.3863
Epoch 5/20
[1m4600/4600[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 1ms/step - loss: 5.6620
Epoch 6/20
[1m4600/4600[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 1ms/step - loss: 4.5847
Epoch 7/20
[1m4600/4600[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 2ms/step - loss: 3.9958
Epoch 8/20
[1m4600/4600[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 1ms/step - loss: 3.6845
Epoch 9/20
[1m4600/4600[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 1ms/step - loss: 3.3156
Epoch 10/20
[1m4600/4600[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0



✅ LSTM model saved: lstm_model_TMP_t+1.h5
Epoch 1/20
[1m4600/4600[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 2ms/step - loss: 16544.0684
Epoch 2/20
[1m4600/4600[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 1ms/step - loss: 77.9648
Epoch 3/20
[1m4600/4600[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 1ms/step - loss: 28.6089
Epoch 4/20
[1m4600/4600[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 1ms/step - loss: 9.7166
Epoch 5/20
[1m4600/4600[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 1ms/step - loss: 3.8662
Epoch 6/20
[1m4600/4600[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 1ms/step - loss: 2.7464
Epoch 7/20
[1m4600/4600[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 1ms/step - loss: 2.1738
Epoch 8/20
[1m4600/4600[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 1ms/step - loss: 1.8969
Epoch 9/20
[1m4600/4600[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 1ms/step - loss: 1.7351
Epoch 10/20
[1m4600/460



✅ LSTM model saved: lstm_model_TMP_t+2.h5
Epoch 1/20
[1m4600/4600[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 1ms/step - loss: 16006.2607
Epoch 2/20
[1m4600/4600[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 1ms/step - loss: 79.7976
Epoch 3/20
[1m4600/4600[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 1ms/step - loss: 8.2125
Epoch 4/20
[1m4600/4600[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 2ms/step - loss: 3.3639
Epoch 5/20
[1m4600/4600[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 1ms/step - loss: 2.6322
Epoch 6/20
[1m4600/4600[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 1ms/step - loss: 2.1847
Epoch 7/20
[1m4600/4600[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 1ms/step - loss: 1.9660
Epoch 8/20
[1m4600/4600[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 1ms/step - loss: 1.7808
Epoch 9/20
[1m4600/4600[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 1ms/step - loss: 1.7362
Epoch 10/20
[1m4600/4600



✅ LSTM model saved: lstm_model_TMP_t+3.h5
