In [3]:
# Colab-only: mount Google Drive at /content/drive so the project files under
# /content/drive/MyDrive can be read and written by the cells below.
from google.colab import drive

drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
import random
import time
import numpy as np

In [6]:
# Raw data set: semicolon-separated, with a comma as the decimal mark.
csv_path = "/content/drive/MyDrive/DATA_SCIENCE_PROJECT/data_v2_full.csv"
df = pd.read_csv(csv_path, sep=";", decimal=",")

In [None]:
# (rows, columns) of the freshly loaded frame.
df.shape

In [None]:
# Column labels available in the loaded data.
df.columns

In [None]:
# Per-column flag: True where the column contains at least one missing value.
df.isnull().any()

In [None]:
# Column dtypes, printed as plain text.
print(df.dtypes)

In [None]:
# Collapse the timestamp in `date` to its calendar year and expose it as `year`.
# Guarded so the cell is idempotent: after the first run `date` no longer exists,
# and the unguarded version raised KeyError when the cell was executed again.
if "date" in df.columns:
    df = (
        df
        .assign(date=pd.to_datetime(df["date"]).dt.year)
        .rename(columns={"date": "year"})
    )

# Keep rows in chronological order; the train/test split further down is
# unshuffled, so row order decides which rows end up in the test set.
df = df.sort_values(by=["year", "day_of_year"])

In [None]:
# Target is the `slp` column; the feature matrix is every remaining column
# except `entry` and `rlm`.
y = df["slp"]
excluded_columns = ["entry", "rlm", "slp"]
X = df.drop(columns=excluded_columns)

In [None]:
# Shapes of the feature matrix and the target, one per line.
print(X.shape, y.shape, sep="\n")

In [None]:
# First five values of the target series.
print(y.head())

In [None]:
# A notebook cell only renders its LAST expression, so the bare `X.head()` that
# used to open this cell was evaluated and thrown away. `display` (injected into
# the namespace by the IPython kernel) renders the preview explicitly.
display(X.head())
X.columns

In [None]:
# NOTE: this seeds Python's built-in `random` module only. The split below does
# not depend on it: shuffle=False makes the split purely positional.
random.seed(123)

# Chronological hold-out: the leading 80 % of rows train the model, the trailing
# 20 % are kept for testing.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    shuffle=False,
    test_size=0.2,
)

In [None]:
# First rows of the training features.
X_train.head()

In [None]:
# Last training row, i.e. the boundary just before the test period starts.
X_train.tail(1)

In [None]:
# First rows of the test features; because the split is unshuffled these rows
# directly follow the last training row.
X_test.head()

In [None]:
# Sizes of the training and test feature matrices, one per line.
print(X_train.shape, X_test.shape, sep="\n")

In [None]:
# Column labels of X; reused below as the index of the importance Series.
feature_names = X.columns

In [None]:
# dtypes of the training features, checked right before fitting the model.
X_train.dtypes

In [None]:
# Random forest with default hyper-parameters; random_state=1 pins the forest's
# internal randomness so repeated runs give the same model.
forest = RandomForestRegressor(random_state=1)
# Fit on the training portion only; the test rows stay unseen for later evaluation.
forest.fit(X_train, y_train)

In [None]:
# Impurity-based importances reported by the fitted forest, labelled by feature
# name and ordered from most to least important.
important_features = (
    pd.Series(forest.feature_importances_, index=feature_names)
    .sort_values(ascending=False)
)
print(important_features)

## Simple bar plot of feature importances

In [None]:
# Vertical bar chart of the sorted importances.
important_features.plot.bar(title="Feature Importance in Random Forest")

## Feature importance based on mean decrease in impurity (MDI)

In [None]:
# `estimators_` (plural) is the list of fitted trees. The singular `estimator_`
# used here before is only the unfitted template the trees are cloned from,
# and the variable it was assigned to was never used.
individual_trees = forest.estimators_

# Recompute the forest-level importance by hand: average each tree's own
# importances feature by feature.
manual_feature_importances = np.mean(
    [tree.feature_importances_ for tree in individual_trees],
    axis=0,
)

In [None]:
# Checks whether the manual per-tree mean matches the importances reported by
# the forest, within floating-point tolerance.
np.allclose(manual_feature_importances, forest.feature_importances_)

In [None]:
# Spread of each feature's importance across the individual trees
# (array in the ORIGINAL column order of X).
std = np.std([tree.feature_importances_ for tree in forest.estimators_], axis=0)

# `important_features` is sorted by importance, so the error bars must be put
# in that same order; passing the raw array attached each bar to another
# feature's standard deviation.
std_sorted = pd.Series(std, index=feature_names).reindex(important_features.index)

fig, ax = plt.subplots()
important_features.plot.barh(xerr=std_sorted, ax=ax)
ax.set_title("Feature importances using MDI")
ax.set_xlabel("Mean decrease in impurity")
fig.tight_layout()

Standard deviation (SD) is a statistical measure that tells you how spread out the values are from their mean (average).



- The most important feature is clearly apparent_temperature_mean (huge bar).

- Other strong features include apparent_temperature_max, temperature_2m_mean, and temperature_2m_max.

- Features like sunrise, sunset, daylight_duration, rain_sum, windspeed_10m_max, etc., have very low importance.

- Calendar features (year, day_of_year, day_of_week, holiday) have almost no impact.

This tells us that temperature-related features dominate gas consumption prediction, which makes sense for heating demand.

## Feature importance based on feature permutation

Permutation feature importance overcomes limitations of impurity-based feature importance: it is not biased toward high-cardinality features, and it can be computed on a held-out test set.

In [None]:
from sklearn.inspection import permutation_importance

In [None]:
# Shuffle each feature 10 times on the held-out test set and record how much
# the model's score changes; random_state makes the shuffles repeatable.
result = permutation_importance(
    forest,
    X_test,
    y_test,
    n_repeats=10,
    random_state=42,
    n_jobs=2,
)

In [None]:
# Mean importance over the permutation repeats, labelled by feature name.
forest_importances = pd.Series(result.importances_mean, index=feature_names)

In [None]:
fig, ax = plt.subplots()
# Horizontal bars put the importance on the x axis, so the spread across
# repeats belongs in `xerr`; `yerr` drew the error bars along the wrong axis.
forest_importances.plot.barh(xerr=result.importances_std, ax=ax)
ax.set_title("Feature importances using permutation on full model")
# The model is a regressor scored with its default metric (R²), not accuracy.
ax.set_xlabel("Mean decrease in R² score")
fig.tight_layout()
plt.show()

Differences between MDI and permutation importance
1. Different measurement approaches
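MDI (Mean Decrease in Impurity): Measures how much each feature reduces impurity when used in splits. For a regression forest like this one, the impurity is the variance (squared error) of the target; Gini applies to classification. It reflects how often and how effectively a feature is used in the tree structure.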

    Permutation importance: Measures how much the model's score (R² for this regressor, not accuracy) decreases when a feature's values are randomly shuffled. It reflects the feature's actual predictive contribution to the fitted model.
2. Why the results differ
Order swap:
MDI: apparent_temperature_mean (0.45) > temperature_2m_mean (0.28)
Permutation: temperature_2m_mean (0.225) > apparent_temperature_mean (0.145)
This suggests:
apparent_temperature_mean is used more in splits (MDI), but when shuffled, temperature_2m_mean has a larger impact on accuracy.
temperature_2m_mean may be more directly predictive, while apparent_temperature_mean may be correlated and used as a proxy.
Magnitude differences:
apparent_temperature_max and temperature_2m_max are much smaller in permutation (approx.0.015 vs approx.0.12 and approx. 0.08).
This suggests they are used in splits but don't add much unique predictive power beyond the mean features.
Feature appearance:
sunrise appears in permutation (approx. 0.01) but not prominently in MDI, indicating it adds some predictive value that MDI didn't capture.
3. Why this happens
MDI can be biased toward features with more categories or splits.
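MDI doesn't account for feature correlation, and permutation importance is affected by it too: when two features are strongly correlated, shuffling one still leaves the model the other, so both can look less important than they are.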
Permutation measures actual impact on model performance, not just tree usage.
4. Which to trust?
Permutation importance is generally preferred because it:
Measures actual predictive contribution
Accounts for feature interactions (but strongly correlated features can share or mask each other's importance)
Is less biased by feature characteristics
Recommendation: Use temperature_2m_mean and apparent_temperature_mean as your primary features, with temperature_2m_max as a secondary feature. The other features appear to add little predictive value.

Why error bars matter?

- Large error bars: importance varies a lot across trees (less stable)
- Small error bars: consistent importance (more stable)
- Error bars crossing zero: the spread is larger than the mean importance, so the estimate is unreliable (MDI itself is never negative; permutation importance can be)

Which features to use?

- apparent_temperature_mean
- temperature_2m_mean
- apparent_temperature_max
- temperature_2m_max
- day_of_year

In [18]:
# An earlier cell converts `date` to a calendar year and renames it to `year`,
# so on a top-to-bottom run `df['date']` no longer exists and the hard-coded
# list raised KeyError. Use whichever of the two columns is present.
date_col = 'date' if 'date' in df.columns else 'year'

# Target (`slp`) plus the features kept after the importance analysis.
cols = [date_col,
        'day_of_year',
        'slp',
        'apparent_temperature_mean',
        'temperature_2m_mean',
        'apparent_temperature_max',
        'temperature_2m_max',
        'sunrise']

# .copy() makes df_reduced an independent frame rather than a selection of df.
df_reduced = df[cols].copy()

In [19]:
import os

# Destination of the reduced data set on the mounted Drive.
folder_name = '/content/drive/MyDrive/DATA_SCIENCE_PROJECT'
file_name = 'df_reduced.csv'

os.makedirs(folder_name, exist_ok=True)
full_path = os.path.join(folder_name, file_name)
# index=True keeps the row index as the first, unnamed CSV column.
df_reduced.to_csv(full_path, index=True, sep=';')
# Report the name that was actually written (the message used to say 'df_clean.csv').
print(f"File '{file_name}' saved in: {full_path}")

File 'df_clean.csv' saved in: /content/drive/MyDrive/DATA_SCIENCE_PROJECT/df_reduced.csv


In [20]:
# Preview the reduced frame (the name was misspelled as `df_reducedl`, which raises NameError).
df_reduced.head()

Unnamed: 0,date,day_of_year,slp,apparent_temperature_mean,temperature_2m_mean,apparent_temperature_max,temperature_2m_max,sunrise
0,2016-01-01 00:00:00+00:00,1,1935724.0,-0.4,2.4,2.2,4.3,26400
1,2016-01-02 00:00:00+00:00,2,2410158.0,-6.9,-1.5,-0.7,2.5,26340
2,2016-01-03 00:00:00+00:00,3,3009970.0,-13.2,-6.8,-11.8,-5.4,26340
3,2016-01-04 00:00:00+00:00,4,3201452.0,-11.5,-5.7,-9.1,-3.5,26340
4,2016-01-05 00:00:00+00:00,5,3169938.0,-11.3,-5.8,-10.6,-5.2,26340


In [15]:
# NOTE(review): `df_clean_model` is not defined in any visible cell, and this
# cell's execution count (15) precedes the cells above it, so it presumably
# relies on leftover kernel state and will raise NameError on Restart & Run All.
# TODO confirm which frame was intended (possibly `df_reduced`).
df_clean_model.describe()

Unnamed: 0,slp,day_of_year,sunrise
count,3560.0,3560.0,3560.0
mean,988192.4,179.662079,17927.696629
std,752107.5,104.39644,5540.841244
min,-435171.6,1.0,9960.0
25%,254846.2,90.0,12660.0
50%,810164.1,179.0,17670.0
75%,1627962.0,268.0,23220.0
max,3341411.0,366.0,26400.0
