In [2]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
import joblib

# Load the training data
train_data = pd.read_csv("wind_power_gen_5years_training_data.csv")

# Convert DateTime column to datetime dtype
train_data['DateTime'] = pd.to_datetime(train_data['DateTime'])

# Drop DateTime column as it's not a feature
train_data.drop("DateTime", axis=1, inplace=True)

# Separate features and target variable
X_train = train_data.drop("PowerGen", axis=1)  # Features (AirTemp, Pressure, WindSpeed)
y_train = train_data["PowerGen"]  # Target variable

# Load the testing data
test_data = pd.read_csv("wind_power_gen_3months_validation_data.csv")

# Convert DateTime column to datetime dtype
test_data['DateTime'] = pd.to_datetime(test_data['DateTime'])

# Drop DateTime column as it's not a feature
test_data.drop("DateTime", axis=1, inplace=True)

# Ensure feature columns in testing data match those in training data
X_test = test_data[X_train.columns]

# Define and train the Random Forest model
model = RandomForestRegressor(n_estimators=100, max_features="sqrt", random_state=42)
model.fit(X_train, y_train)

# Make predictions on the testing data
predicted_power = model.predict(X_test)

# Load the actual power generation values for the testing period
actual_power = test_data["PowerGen"]

# Calculate Mean Squared Error
mse = mean_squared_error(actual_power, predicted_power)
print("Mean Squared Error:", mse)

# Calculate Mean Absolute Error
mae = mean_absolute_error(actual_power, predicted_power)

# Calculate the range of actual power generation values
actual_power_range = max(actual_power) - min(actual_power)

# Compute accuracy percentage
accuracy_percentage = 100 * (1 - (mae / actual_power_range))
print("Accuracy Percentage:", accuracy_percentage)

# Save the trained model to a .joblib file
joblib.dump(model, 'PowerGen.joblib')
print("Model saved as 'PowerGen.joblib'")


KeyError: 'DateTime'