In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
import numpy as np


In [2]:
# Load the dataset
def load_data(file_path):
    """Read a CSV file into a pandas DataFrame.

    Parameters
    ----------
    file_path : str, path-like or file-like
        Source of the CSV data; handed unchanged to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The table parsed with pandas' default CSV options.
    """
    return pd.read_csv(file_path)


In [3]:
# Daily dataset; the relative path is resolved against the working directory.
daily_data_path = "final_data_daily.csv"
daily_data = load_data(daily_data_path)

# Preview the first five rows to sanity-check the parse.
daily_data.head(5)

Unnamed: 0,Date,DailyPrecipitation,MaxHourlyPrecipitation,HDMaxPrecipitation,DailyMeanTemperature,HourlyMinTemperature,HDMinTemperature,HourlyMaxTemperature,HDMaxTemperature,DailyMeanWindspeed,...,Electricity_NonHousehold_2000to20000MWh_EuroPerKWh,Electricity_NonHousehold_20000to70000MWh_EuroPerKWh,Electricity_NonHousehold_70000to150000MWh_EuroPerKWh,Electricity_NonHousehold_MoreThan_150000MWh_EuroPerKWh,AveragePrice_Electricity_Household,AveragePrice_Electricity_NonHousehold,AveragePrice_NaturalGas_Household,AveragePrice_NaturalGas_NonHousehold,GDP,NAO
0,2009-01-15,0.0,0.0,1,2.6,10,24,5.0,14,3.3,...,0.118,0.108,0.104,0.08,0.1424,0.134429,0.856667,0.5046,158583.0,-0.01
1,2009-01-16,-0.1,-0.1,14,1.7,1,8,3.7,23,3.9,...,0.117989,0.10795,0.103945,0.079956,0.142302,0.134369,0.856098,0.504255,158583.0,-0.01
2,2009-01-17,0.7,0.3,12,5.0,28,3,7.5,16,5.1,...,0.117978,0.107901,0.10389,0.079912,0.142203,0.13431,0.855529,0.50391,158583.0,-0.01
3,2009-01-18,5.4,2.9,4,5.5,21,24,7.7,3,6.1,...,0.117967,0.107851,0.103834,0.079867,0.142105,0.134251,0.854959,0.503566,158583.0,-0.01
4,2009-01-19,4.1,1.0,19,4.8,21,1,8.8,18,6.5,...,0.117956,0.107801,0.103779,0.079823,0.142007,0.134192,0.85439,0.503221,158583.0,-0.01


In [4]:
# Clean and preprocess numeric columns
def clean_numeric_columns(df, exclude=("Date", "day_of_week")):
    """Coerce columns to numeric dtype, accepting ',' as the decimal separator.

    Each converted column is rendered as text, has ',' replaced by '.', and is
    parsed with ``pd.to_numeric(..., errors='coerce')``; values that still
    cannot be parsed become NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Input table. It is not modified; a converted copy is returned.
    exclude : iterable of str, optional
        Column names left untouched. The default protects the columns that
        ``preprocess_data`` consumes: coercing ``Date`` strings such as
        ``'2009-01-15'`` to numbers turns the whole column into NaN, which in
        turn makes the derived Day/Month/Year features NaN. ``day_of_week`` is
        skipped because it is label-encoded later and may hold text labels.
        Pass ``()`` to convert every column.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` in which every non-excluded column is numeric.
    """
    skipped = set(exclude or ())
    cleaned = df.copy()  # never write into the caller's frame
    for column in cleaned.columns:
        if column in skipped:
            continue
        # regex=False: ',' is a literal separator, not a pattern.
        as_text = cleaned[column].astype(str).str.replace(',', '.', regex=False)
        cleaned[column] = pd.to_numeric(as_text, errors='coerce')
    return cleaned

# Add time-based features and encode categorical variables
def preprocess_data(df):
    """Derive calendar features from ``Date`` and integer-encode ``day_of_week``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input table. It is not modified; all changes are made on a copy.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` where:

        * if a ``Date`` column exists, it is parsed with
          ``pd.to_datetime(..., errors='coerce')``, replaced by ``Day``,
          ``Month`` and ``Year`` columns, and then dropped (unparseable dates
          yield missing values in the three derived columns);
        * if a ``day_of_week`` column exists, its values are replaced by the
          integer codes produced by ``LabelEncoder``.
    """
    processed = df.copy()  # keep the caller's frame untouched

    if 'Date' in processed.columns:
        dates = pd.to_datetime(processed['Date'], errors='coerce')
        # Drop first, then assign, so the new columns land at the end of the frame.
        processed = processed.drop(columns=['Date']).assign(
            Day=dates.dt.day,
            Month=dates.dt.month,
            Year=dates.dt.year,
        )

    if 'day_of_week' in processed.columns:
        processed['day_of_week'] = LabelEncoder().fit_transform(processed['day_of_week'])

    return processed
# Numeric coercion first, then date/weekday feature engineering.
daily_data = preprocess_data(clean_numeric_columns(daily_data))

In [5]:
# Define the modeling process
def train_model(X, y, param_grid, test_size=0.2, random_state=42, cv=3):
    """Tune a random-forest regressor by grid search and score it on a hold-out set.

    The data is split before any scaling, and the scaler is fitted on the
    training rows only, so no statistics from the hold-out rows leak into
    preprocessing.

    Parameters
    ----------
    X : array-like or pandas.DataFrame
        Feature matrix, one row per sample.
    y : array-like or pandas.Series
        Target values aligned with ``X``.
    param_grid : dict
        Hyperparameter grid handed to ``GridSearchCV``.
    test_size : float, default 0.2
        Fraction of rows held out for the final evaluation.
    random_state : int, default 42
        Seed used for both the train/test split and the forest.
    cv : int, default 3
        Number of cross-validation folds used by the grid search.

    Returns
    -------
    tuple
        ``(best_params, rmse, mae, r2)`` where ``best_params`` is the winning
        grid entry and the three metrics are computed on the hold-out rows.
    """
    # NOTE(review): this is a shuffled random split; if the rows are a daily
    # time series, a chronological split may be more appropriate. Confirm.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # Fit scaling statistics on the training rows only, then reuse them.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    grid_search = GridSearchCV(
        estimator=RandomForestRegressor(random_state=random_state),
        param_grid=param_grid,
        cv=cv,
        scoring='neg_mean_squared_error',
        verbose=2,
        n_jobs=-1,
    )
    grid_search.fit(X_train_scaled, y_train)

    # best_estimator_ is refit on the whole training split by GridSearchCV.
    y_pred = grid_search.best_estimator_.predict(X_test_scaled)

    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    return grid_search.best_params_, rmse, mae, r2

# Main script
def main(file_path='final_data_daily.csv'):
    """Run the full pipeline: load, clean, select features, tune, and report.

    Parameters
    ----------
    file_path : str, default 'final_data_daily.csv'
        CSV file holding the daily dataset.

    Raises
    ------
    KeyError
        If the target or any of the selected feature columns is absent from
        the preprocessed data.
    """
    data = load_data(file_path)

    # Clean and preprocess the dataset
    data_cleaned = preprocess_data(clean_numeric_columns(data))

    # Define target variable and features
    target = 'DailyLoadConsumption'
    top_10_features = [
        'day_of_week',
        'HourlyMaxTemperature',
        'Week',
        'Electricity_NonHousehold_500to2000MWh_EuroPerKWh',
        'Electricity_NonHousehold_70000to150000MWh_EuroPerKWh',
        'DailyMeanTemperature',
        'MaxHourlyPrecipitation',
        'DailyMeanWindspeed',
        'OnshoreWindEnergy',
        'SolarPower'
    ]

    # Fail early with one message that lists every absent column.
    required = [target, *top_10_features]
    missing = [name for name in required if name not in data_cleaned.columns]
    if missing:
        raise KeyError(f"Columns missing from {file_path!r}: {missing}")

    X = data_cleaned[top_10_features]
    y = data_cleaned[target]

    # Define hyperparameter grid
    param_grid = {
        'n_estimators': [100, 200, 300],
        'max_depth': [10, 20, 30, None],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4]
    }

    # Train model and evaluate
    best_params, rmse, mae, r2 = train_model(X, y, param_grid)

    print("Best Parameters:", best_params)
    print("RMSE:", rmse)
    print("MAE:", mae)
    print("R^2:", r2)

if __name__ == "__main__":
    main()


Fitting 3 folds for each of 108 candidates, totalling 324 fits
Best Parameters: {'max_depth': 10, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 300}
RMSE: 22926.661175353092
MAE: 19483.735926681828
R^2: 0.45276286533016774
