In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

# Load the dataset.
# 'spg.csv' is expected to sit in the current working directory.
try:
    df = pd.read_csv('spg.csv')
except FileNotFoundError:
    # Abort with a non-zero exit status and send the message to stderr.
    # The bare `exit()` helper is provided by the `site` module for
    # interactive use; called without arguments it ends the process with
    # status 0, which makes a failed run look successful to callers.
    raise SystemExit(
        "Error: spg.csv not found. Please ensure the file is in the correct directory."
    ) from None

# The final column of the frame is the value to predict
# ('generated_power_kw' in this dataset); every column that precedes it
# is used as a model input.
target_column = df.columns[-1]
feature_columns = list(df.columns[:-1])

# Target series and feature matrix, both taken from the loaded frame.
y = df[target_column]
X = df[feature_columns]

print(f"Features selected: {feature_columns}")
print(f"Target selected: {target_column}")

# Hold out 20% of the rows for evaluation. The fixed random_state keeps the
# partition identical from one run to the next.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

print(f"\nTraining data shape: {X_train.shape}")
print(f"Testing data shape: {X_test.shape}")

# Build and fit an ordinary least-squares model on the training portion.
# `fit` returns the estimator itself, so construction and training are chained.
print("\nTraining the Linear Regression model with all features...")
model = LinearRegression().fit(X_train, y_train)
print("Model training complete.")

# Score the held-out rows.
print("\nMaking predictions on the test set...")
y_pred = model.predict(X_test)

# Evaluation metrics, all computed on the held-out test set.

# MSE: mean of the squared residuals; large misses dominate the score.
mse = mean_squared_error(y_test, y_pred)

# RMSE: square root of the MSE, which puts the error back into the
# units of the target variable.
rmse = np.sqrt(mse)

# MAE: mean of the absolute residuals, i.e. the typical size of an error.
mae = mean_absolute_error(y_test, y_pred)

# R²: share of the target's variance that the predictions account for.
r2 = r2_score(y_test, y_pred)

# Emit the whole report in one call; sep="\n" places each entry on its own line.
print(
    "\n--- Model Evaluation ---",
    f"R-squared (R²): {r2:.4f}",
    f"Mean Absolute Error (MAE): {mae:.2f}",
    f"Mean Squared Error (MSE): {mse:.2f}",
    f"Root Mean Squared Error (RMSE): {rmse:.2f}",
    sep="\n",
)

# Report the fitted weight for each feature, followed by the intercept.
# The coefficients are paired with the feature names by position.
print("\n--- Model Coefficients ---")
for feature_name, weight in zip(feature_columns, model.coef_):
    print(f"{feature_name}: {weight:.4f}")
print(f"Intercept: {model.intercept_:.4f}")


Features selected: ['temperature_2_m_above_gnd', 'relative_humidity_2_m_above_gnd', 'mean_sea_level_pressure_MSL', 'shortwave_radiation_backwards_sfc', 'wind_speed_10_m_above_gnd', 'wind_speed_80_m_above_gnd', 'wind_speed_900_mb', 'wind_gust_10_m_above_gnd', 'angle_of_incidence', 'zenith', 'azimuth']
Target selected: generated_power_kw

Training data shape: (3370, 11)
Testing data shape: (843, 11)

Training the Linear Regression model with all features...
Model training complete.

Making predictions on the test set...

--- Model Evaluation ---
R-squared (R²): 0.7015
Mean Absolute Error (MAE): 407.45
Mean Squared Error (MSE): 272656.10
Root Mean Squared Error (RMSE): 522.16

--- Model Coefficients ---
temperature_2_m_above_gnd: -9.4610
relative_humidity_2_m_above_gnd: -6.0717
mean_sea_level_pressure_MSL: 19.2784
shortwave_radiation_backwards_sfc: 1.6691
wind_speed_10_m_above_gnd: 15.7338
wind_speed_80_m_above_gnd: 11.6604
wind_speed_900_mb: -32.4831
wind_gust_10_m_above_gnd: -3.8830
ang