In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

# Load the dataset.
# 'solarpowergeneration.csv' is read relative to the current working directory.
try:
    df = pd.read_csv('solarpowergeneration.csv')
except FileNotFoundError:
    print("Error: solarpowergeneration.csv not found. Please ensure the file is in the correct directory.")
    # Abort with a non-zero status so a calling shell or pipeline can detect
    # the failure. `raise SystemExit` is used rather than the `exit()` helper:
    # that helper is installed by the `site` module for interactive sessions,
    # is not guaranteed to exist (e.g. under `python -S`), and when called
    # without an argument reports success (status 0).
    raise SystemExit(1)

# --- Data Cleaning Steps ---
print("\n--- Starting Data Cleaning ---")

# Step 1 - impute missing values.
# Each NaN in a numeric column is replaced by that column's mean; columns
# without a numeric mean are not touched. Imputation neither adds nor removes
# rows, so the two counts reported below are always equal.
# NOTE: the means are computed over the full dataset, i.e. before the
# train/test split performed later in this script.
initial_rows = len(df)
column_means = df.mean(numeric_only=True)
df = df.fillna(column_means)
print(f"Missing values handled by filling with column mean. Initial rows: {initial_rows}, Rows after filling: {df.shape[0]}")

# Step 2 - drop rows that are exact duplicates of an earlier row
# (the first occurrence is the one that is kept).
initial_rows = len(df)
df = df.drop_duplicates()
print(f"Duplicate rows removed. Initial rows: {initial_rows}, Rows after removing duplicates: {df.shape[0]}")

print("--- Data Cleaning Complete ---")
# --- End Data Cleaning Steps ---


# Partition the columns by position: the final column is the regression
# target and every column before it is used as a feature.
# This relies on the CSV keeping its target ('power-generated') last.
column_names = df.columns.tolist()
feature_columns = column_names[:-1]
target_column = column_names[-1]

# X is the feature matrix, y the target vector.
X, y = df[feature_columns], df[target_column]

print(f"\nFeatures selected: {feature_columns}")
print(f"Target selected: {target_column}")

# Hold out part of the data for evaluation.
#   test_size=0.2   -> 20% of the rows go to the test set, 80% to training
#   random_state=42 -> fixed seed, so the same split is produced on every run
split_options = {"test_size": 0.2, "random_state": 42}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

print(f"\nTraining data shape: {X_train.shape}")
print(f"Testing data shape: {X_test.shape}")

# Build a Linear Regression model and fit it on the training split.
# `fit` returns the estimator itself, so construction and training are chained.
print("\nTraining the Linear Regression model...")
model = LinearRegression().fit(X_train, y_train)
print("Model training complete.")

# Predict the target for the held-out test rows.
print("\nMaking predictions on the test set...")
y_pred = model.predict(X_test)

# Evaluate the predictions against the held-out targets.
# The four metrics are independent of one another, apart from RMSE, which is
# derived from MSE.

# Mean Squared Error (MSE): mean of the squared residuals, so large errors
# are penalized more heavily than small ones.
mse = mean_squared_error(y_test, y_pred)

# Root Mean Squared Error (RMSE): square root of MSE, which brings the value
# back to the units of the target variable and makes it easier to interpret.
rmse = np.sqrt(mse)

# Mean Absolute Error (MAE): mean of the absolute residuals; indicates the
# typical magnitude of an error.
mae = mean_absolute_error(y_test, y_pred)

# R-squared (R²), the coefficient of determination: the proportion of the
# variance in the target that is predictable from the features.
r2 = r2_score(y_test, y_pred)

print("\n--- Model Evaluation ---")
print(f"R-squared (R²): {r2:.4f}")
print(f"Mean Absolute Error (MAE): {mae:.2f}")
print(f"Mean Squared Error (MSE): {mse:.2f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.2f}")

# Report the fitted parameters: one coefficient per feature, listed in the
# same order as the feature columns, followed by the intercept.
print("\n--- Model Coefficients ---")
for col, weight in zip(feature_columns, model.coef_):
    print(f"{col}: {weight:.4f}")
print(f"Intercept: {model.intercept_:.4f}")



--- Starting Data Cleaning ---
Missing values handled by filling with column mean. Initial rows: 2920, Rows after filling: 2920
Duplicate rows removed. Initial rows: 2920, Rows after removing duplicates: 2920
--- Data Cleaning Complete ---

Features selected: ['distance-to-solar-noon', 'temperature', 'wind-direction', 'wind-speed', 'sky-cover', 'visibility', 'humidity', 'average-wind-speed-(period)', 'average-pressure-(period)']
Target selected: power-generated

Training data shape: (2336, 9)
Testing data shape: (584, 9)

Training the Linear Regression model...
Model training complete.

Making predictions on the test set...

--- Model Evaluation ---
R-squared (R²): 0.6251
Mean Absolute Error (MAE): 4981.20
Mean Squared Error (MSE): 39495176.52
Root Mean Squared Error (RMSE): 6284.52

--- Model Coefficients ---
distance-to-solar-noon: -22473.2469
temperature: -77.1000
wind-direction: 40.5586
wind-speed: -6.7174
sky-cover: -918.1244
visibility: 189.8545
humidity: -152.6505
average-wind-