In [None]:
80:20

In [1]:
import pandas as pd
import pmdarima as pm
from pmdarima import auto_arima
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.preprocessing import LabelEncoder
import numpy as np
import warnings
warnings.filterwarnings("ignore")

# Load your dataset (assuming it is a CSV file)
df = pd.read_csv('phase_2_data_with_survey.csv')

# Convert 'Date' and 'Hour' columns to a datetime object
df['Datetime'] = pd.to_datetime(df['Date']) + pd.to_timedelta(df['Hour'], unit='h')

# Set 'Datetime' as index
df.set_index('Datetime', inplace=True)

# Encode categorical variables
label_encoder = LabelEncoder()
df['home_size_encoded'] = label_encoder.fit_transform(df['home_size'])
df['electric_car_encoded'] = df['electric_car'].map({'Yes': 1, 'No': 0})
df['electrically_heated_encoded'] = df['electrically_heated'].map({'Yes': 1, 'No': 0})

# List of exogenous variables
exog_vars = ['Temperature', 'home_size_encoded', 'electric_car_encoded', 'electrically_heated_encoded','no_of_people']

def fit_forecast_arima(df, split_ratio=0.8):
    ids = df['ID'].unique()[:1000]
    forecasts = []
    i = 1
    for id_ in ids:
        print(f"{i} Processing ID: {id_}")
        i += 1
        df_id = df[df['ID'] == id_].sort_index()

        # Split the data into train and test
        split_index = int(len(df_id) * split_ratio)
        train = df_id.iloc[:split_index]
        test = df_id.iloc[split_index:]

        # Fit the ARIMA model
        model = auto_arima(train['Demand_kWh'],
                           exogenous=train[exog_vars],
                           seasonal=False,
                           stepwise=True,
                           suppress_warnings=True,
                           trace=False)

        # Forecast
        n_periods = len(test)
        forecast = model.predict(n_periods=n_periods, exogenous=test[exog_vars])

        # Store the forecast and actual values
        forecast_df = pd.DataFrame({
            'ID': id_,
            'Datetime': test.index,
            'Actual_Demand': test['Demand_kWh'],
            'Predicted_Demand': forecast
        })

        forecasts.append(forecast_df)

    return pd.concat(forecasts)

# Run the forecasting function
results_df = fit_forecast_arima(df)

# Calculate overall error metrics
mse = mean_squared_error(results_df['Actual_Demand'], results_df['Predicted_Demand'])
mae = mean_absolute_error(results_df['Actual_Demand'], results_df['Predicted_Demand'])
rmse = np.sqrt(mse)

print(f"Overall MSE: {mse}")
print(f"Overall MAE: {mae}")
print(f"Overall RMSE: {rmse}")

# Write the results to a file
results_df['Date'] = results_df['Datetime'].dt.date
results_df['Hour'] = results_df['Datetime'].dt.hour
results_df = results_df[['ID', 'Date', 'Hour', 'Actual_Demand', 'Predicted_Demand']]
results_df.to_csv('arima_phase_2_with_survey_8020.csv', index=False)

print("Forecast results have been written to arima_phase_1_with_survey_8020.csv")


1 Processing ID: Exp_3928
2 Processing ID: Exp_3921
3 Processing ID: Exp_3913
4 Processing ID: Exp_3917
5 Processing ID: Exp_3884
6 Processing ID: Exp_3901
7 Processing ID: Exp_3889
8 Processing ID: Exp_3843
9 Processing ID: Exp_3924
10 Processing ID: Exp_3935
11 Processing ID: Exp_3874
12 Processing ID: Exp_3887
13 Processing ID: Exp_3866
14 Processing ID: Exp_3841
15 Processing ID: Exp_3850
16 Processing ID: Exp_3886
17 Processing ID: Exp_3915
18 Processing ID: Exp_3875
19 Processing ID: Exp_3882
20 Processing ID: Exp_3930
21 Processing ID: Exp_3906
22 Processing ID: Exp_3854
23 Processing ID: Exp_3936
24 Processing ID: Exp_3893
25 Processing ID: Exp_3898
26 Processing ID: Exp_3839
27 Processing ID: Exp_3938
28 Processing ID: Exp_3899
29 Processing ID: Exp_3933
30 Processing ID: Exp_3910
31 Processing ID: Exp_3862
32 Processing ID: Exp_3908
33 Processing ID: Exp_3863
34 Processing ID: Exp_3840
35 Processing ID: Exp_3888
36 Processing ID: Exp_3967
37 Processing ID: Exp_3926
38 Process

In [7]:
import pandas as pd
import pmdarima as pm
from pmdarima import auto_arima
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.preprocessing import LabelEncoder
import numpy as np
import warnings
warnings.filterwarnings("ignore")

# Load your dataset (assuming it is a CSV file)
df = pd.read_csv('phase_2_data_with_survey.csv')
df_clean = df.dropna()
# Convert 'Date' and 'Hour' columns to a datetime object
df_clean['Datetime'] = pd.to_datetime(df_clean['Date']) + pd.to_timedelta(df_clean['Hour'], unit='h')

# Set 'Datetime' as index
df_clean.set_index('Datetime', inplace=True)

# Encode categorical variables
label_encoder = LabelEncoder()
df_clean['home_size_encoded'] = label_encoder.fit_transform(df_clean['home_size'])
df_clean['electric_car_encoded'] = df_clean['electric_car'].map({'Yes': 1, 'No': 0})
df_clean['electrically_heated_encoded'] = df_clean['electrically_heated'].map({'Yes': 1, 'No': 0})

# List of exogenous variables
exog_vars = ['Temperature', 'home_size_encoded', 'electric_car_encoded', 'electrically_heated_encoded','no_of_people']

def fit_forecast_arima(df, split_ratio=0.6):
    ids = df['ID'].unique()[:1000]
    forecasts = []
    i = 1
    for id_ in ids:
        print(f"{i} Processing ID: {id_}")
        i += 1
        df_id = df[df['ID'] == id_].sort_index()

        # Split the data into train and test
        split_index = int(len(df_id) * split_ratio)
        train = df_id.iloc[:split_index]
        test = df_id.iloc[split_index:]

        # Fit the ARIMA model
        model = auto_arima(train['Demand_kWh'],
                           exogenous=train[exog_vars],
                           seasonal=False,
                           stepwise=True,
                           suppress_warnings=True,
                           trace=False)

        # Forecast
        n_periods = len(test)
        forecast = model.predict(n_periods=n_periods, exogenous=test[exog_vars])

        # Store the forecast and actual values
        forecast_df = pd.DataFrame({
            'ID': id_,
            'Datetime': test.index,
            'Actual_Demand': test['Demand_kWh'],
            'Predicted_Demand': forecast
        })

        forecasts.append(forecast_df)

    return pd.concat(forecasts)

# Run the forecasting function
results_df = fit_forecast_arima(df_clean)

# Calculate overall error metrics
mse = mean_squared_error(results_df['Actual_Demand'], results_df['Predicted_Demand'])
mae = mean_absolute_error(results_df['Actual_Demand'], results_df['Predicted_Demand'])
rmse = np.sqrt(mse)

print(f"Overall MSE: {mse}")
print(f"Overall MAE: {mae}")
print(f"Overall RMSE: {rmse}")

# Write the results to a file
results_df['Date'] = results_df['Datetime'].dt.date
results_df['Hour'] = results_df['Datetime'].dt.hour
results_df = results_df[['ID', 'Date', 'Hour', 'Actual_Demand', 'Predicted_Demand']]
results_df.to_csv('arima_phase_2_with_survey_6040.csv', index=False)

print("Forecast results have been written to arima_phase_1_with_survey_8020.csv")


1 Processing ID: Exp_3928
2 Processing ID: Exp_3921
3 Processing ID: Exp_3913
4 Processing ID: Exp_3917
5 Processing ID: Exp_3884
6 Processing ID: Exp_3901
7 Processing ID: Exp_3889
8 Processing ID: Exp_3843
9 Processing ID: Exp_3924
10 Processing ID: Exp_3935
11 Processing ID: Exp_3874
12 Processing ID: Exp_3887
13 Processing ID: Exp_3866
14 Processing ID: Exp_3841
15 Processing ID: Exp_3850
16 Processing ID: Exp_3886
17 Processing ID: Exp_3915
18 Processing ID: Exp_3875
19 Processing ID: Exp_3882
20 Processing ID: Exp_3930
21 Processing ID: Exp_3906
22 Processing ID: Exp_3854
23 Processing ID: Exp_3936
24 Processing ID: Exp_3893
25 Processing ID: Exp_3898
26 Processing ID: Exp_3839
27 Processing ID: Exp_3938
28 Processing ID: Exp_3899
29 Processing ID: Exp_3933
30 Processing ID: Exp_3910
31 Processing ID: Exp_3862
32 Processing ID: Exp_3908
33 Processing ID: Exp_3863
34 Processing ID: Exp_3840
35 Processing ID: Exp_3888
36 Processing ID: Exp_3967
37 Processing ID: Exp_3926
38 Process

ValueError: Input contains NaN.

: 

In [5]:
df = pd.read_csv('phase_2_data_with_survey.csv')
df_cleaned = df[df['ID'].isna()]

# Display the cleaned DataFrame
print("\nCleaned DataFrame (rows with any value in 'id' dropped):")
print(df_cleaned)

# Write the cleaned DataFrame to a CSV file
df_cleaned.to_csv('phase_2_data_with_survey_cl.csv', index=False)

print("\nCleaned DataFrame has been written to 'cleaned_data.csv'")


Cleaned DataFrame (rows with any value in 'id' dropped):
Empty DataFrame
Columns: [ID, From, Date, Hour, Participation_Phase, Demand_kWh, Temperature, home_size, electric_car, no_of_people, electrically_heated]
Index: []

Cleaned DataFrame has been written to 'cleaned_data.csv'
