In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import statsmodels.api as sm
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error

In [2]:
# Read the cleaned train / test / validation splits from their CSV files
csv_paths = ("data/train_data.csv", "data/test_data.csv", "data/validate_data.csv")
training_data, test_data, validation_data = map(pd.read_csv, csv_paths)

In [3]:
# Column names of the model inputs and of the single response variable
features = ["Plant_Production_GWh", "Population_k", "tmax"]
target = ["Max_Demand_GW"]

# Slice every split into (inputs, response). Indexing with a list keeps each
# result a DataFrame, so the response stays two-dimensional (one column).
x_train, y_train = training_data[features], training_data[target]
x_test, y_test = test_data[features], test_data[target]
x_validate, y_validate = validation_data[features], validation_data[target]

In [4]:
# Highest total degree of the generated polynomial terms
degree = 2

# Fit the expansion on the training inputs only, then push every split
# through the same fitted transformer so all three share one column layout
poly = PolynomialFeatures(degree=degree).fit(x_train)
x_train_poly, x_test_poly, x_validate_poly = map(
    poly.transform, (x_train, x_test, x_validate)
)

In [None]:
# Polynomial regression = ordinary linear regression on the expanded inputs.
# fit() returns the estimator itself, so construction and training chain.
poly_regression_model = LinearRegression().fit(x_train_poly, y_train)

In [None]:
# Predict both held-out splits with the fitted model
y_test_prediction, y_validate_prediction = (
    poly_regression_model.predict(design)
    for design in (x_test_poly, x_validate_poly)
)

# Error metrics for the test split
mse_test = mean_squared_error(y_true=y_test, y_pred=y_test_prediction)
mae_test = mean_absolute_error(y_true=y_test, y_pred=y_test_prediction)

# Error metrics for the validation split
mse_validation = mean_squared_error(y_true=y_validate, y_pred=y_validate_prediction)
mae_validation = mean_absolute_error(y_true=y_validate, y_pred=y_validate_prediction)

# Report: test metrics, a blank separator, then validation metrics
print(f"Mean Squared Error on Test Set: {mse_test}")
print(f"Mean Absolute Error on Test Set: {mae_test}")
print("\n")
print(f"Mean Squared Error on Validation Set: {mse_validation}")
print(f"Mean Absolute Error on Validation Set: {mae_validation}")

In [None]:
# Reference line y = x spanning the observed validation targets; a perfect
# prediction would land exactly on it
x_ideal = np.linspace(y_validate.min(), y_validate.max(), 100)
y_ideal = x_ideal

# Actual vs predicted values on the validation set, drawn on an explicit Axes
fig, ax = plt.subplots()
ax.scatter(y_validate, y_validate_prediction, label="Actual vs Predicted")
ax.plot(x_ideal, y_ideal, color="red", label="Ideal Line", linestyle='--')
ax.set(
    title="Actual vs Predicted Values on Validation Dataset",
    xlabel="Actual Values",
    ylabel="Predicted Values",
)
ax.legend()
plt.show()

In [None]:
# Calculating the MAPE on the Validation dataset
def mean_absolute_percentage_error(actual, predictions):
    """Return the mean absolute percentage error (MAPE), in percent.

    Both inputs are flattened to 1-D before they are compared, so a column
    vector of shape (n, 1) (for example a single-column DataFrame) and a flat
    array of shape (n,) are paired element by element. Without flattening,
    NumPy would broadcast such a pair to an (n, n) matrix and the function
    would silently return a wrong score.

    Parameters
    ----------
    actual : array-like
        Observed values. Every error is divided by its observed value, so a
        zero entry makes the result ``inf`` or ``nan``.
    predictions : array-like
        Predicted values, with the same number of elements as ``actual``
        (a single value is broadcast against all observations).

    Returns
    -------
    float
        ``mean(|(actual - predictions) / actual|) * 100``.

    Raises
    ------
    ValueError
        Raised by NumPy when the flattened inputs have incompatible lengths.
    """
    # ravel() makes the pairing independent of (n,) vs (n, 1) input layouts
    actual = np.asarray(actual).ravel()
    predictions = np.asarray(predictions).ravel()
    return np.mean(np.abs((actual - predictions) / actual)) * 100

# Score the validation predictions and report the result with two decimals
mape = mean_absolute_percentage_error(
    actual=y_validate,
    predictions=y_validate_prediction,
)
print(f'\nMAPE on validation dataset: {mape:.2f}%')

In [None]:
# Per-term significance (t-tests) from a statsmodels OLS refit on the training data.
# The polynomial design matrix is a bare NumPy array, so the summary would label
# its rows only as const, x1, x2, ...; wrapping it in a DataFrame that carries the
# transformer's own term names makes every row identifiable. The index is copied
# from y_train because statsmodels requires pandas endog/exog indices to match.
poly_term_names = poly.get_feature_names_out()
x_train_poly_named = pd.DataFrame(
    x_train_poly, columns=poly_term_names, index=y_train.index
)

# NOTE(review): PolynomialFeatures emits a constant "1" column by default, in
# which case add_constant (has_constant="skip") leaves the matrix unchanged
# rather than adding a second intercept — confirm if the `poly` settings change.
x_train_poly_intercept = sm.add_constant(x_train_poly_named, has_constant="skip")
sm_model = sm.OLS(y_train, x_train_poly_intercept).fit()
print(sm_model.summary())