In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.svm import SVR

In [2]:
# Read the raw solar dataset from disk, then take a copy to clean so that
# the cleaning steps operate on `df1` and leave `df` as loaded.
raw_csv_path = r"C:\Users\user\Desktop\Ashar CEP\Solar_dataset.csv"
df = pd.read_csv(raw_csv_path)
df1 = df.copy()

In [3]:
# 1. Data type conversion: force both power readings to a numeric dtype.
# errors='coerce' turns any entry that cannot be parsed into NaN instead of
# raising.
for power_column in ('DC_POWER', 'AC_POWER'):
    df1[power_column] = pd.to_numeric(df1[power_column], errors='coerce')

In [4]:
# 2. Assemble a single datetime column ('Date') from the separate date/time
# component columns, then discard the components.
date_part_columns = ['Year', 'Month', 'Day', 'Hour', 'Minute']
df1['Date'] = pd.to_datetime(df1[date_part_columns])
df1 = df1.drop(columns=date_part_columns)

In [5]:
# 3. Keep only the rows whose 'YIELD' value is zero or positive.
non_negative_yield = df1['YIELD'] >= 0
df1 = df1[non_negative_yield]

In [6]:
# 4. Handle missing values: discard every row that contains at least one NaN.
df1 = df1.dropna(axis=0, how='any')

In [7]:
# 5. Derive a categorical 'Day_Part' column from the hour of each timestamp.
# Intervals are left-closed (right=False): [0, 6), [6, 12), [12, 15),
# [15, 19), [19, 24). 'Night' labels both the first and the last interval,
# which is why ordered=False is passed (duplicate labels need an unordered
# categorical).
hour_of_day = df1['Date'].dt.hour
day_part_edges = [0, 6, 12, 15, 19, 24]
day_part_names = ['Night', 'Morning', 'Afternoon', 'Evening', 'Night']
df1['Day_Part'] = pd.cut(
    hour_of_day,
    bins=day_part_edges,
    labels=day_part_names,
    right=False,
    ordered=False,
)

In [8]:
# 6. Persist the cleaned frame as a new CSV file; the row index is not written.
cleaned_csv_path = r"C:\Users\user\Desktop\Ashar CEP\New_Solar_dataset.csv"
df1.to_csv(cleaned_csv_path, index=False)

In [9]:
# Reload the cleaned dataset produced by the "Save the cleaned DataFrame" step.
# The file name must match the one written there (New_Solar_dataset.csv);
# reading a differently named file means a fresh Restart & Run All either
# fails or silently trains on a stale file unrelated to the cleaning above.
df = pd.read_csv("C:/Users/user/Desktop/Ashar CEP/New_Solar_dataset.csv")

# Features: every column except the target ('YIELD'), the two power readings
# and the 'Date' timestamp. Target: 'YIELD'.
X = df.drop(['YIELD', 'AC_POWER', 'DC_POWER', 'Date'], axis=1)
y = df['YIELD']

# Label encode categorical columns.
# Each column gets its own encoder: refitting a single LabelEncoder on the
# second column would discard the first column's classes_, making it
# impossible to map encoded PLANT_ID values back to the original ids.
# `label_encoder` is kept as the name of the Day_Part encoder, so it ends up
# in the same fitted state as before.
plant_id_encoder = LabelEncoder()
X['PLANT_ID'] = plant_id_encoder.fit_transform(X['PLANT_ID'])

label_encoder = LabelEncoder()
X['Day_Part'] = label_encoder.fit_transform(X['Day_Part'])

In [None]:
# One seed shared by the split, the tree ensembles and the test-point sampling
# below, so the whole cell is reproducible on a fresh kernel.
RANDOM_STATE = 42

# Split the data into training and testing sets (10% held out for testing)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=RANDOM_STATE
)

# Train machine learning models
# NOTE(review): SVR is fit on unscaled features here; SVMs are sensitive to
# feature scale, so consider wrapping it with a scaler — confirm intent.
models = {
    'Random Forest': RandomForestRegressor(n_estimators=100, random_state=RANDOM_STATE),
    'Gradient Boosting': GradientBoostingRegressor(n_estimators=100, random_state=RANDOM_STATE),
    'SVR': SVR(kernel='linear')  # You can change the kernel as needed
}

best_model_name = None
best_model = None
best_testing_r2_score = -np.inf

# Per-model metrics, one entry per model in insertion order of `models`
results = {'Model': [], 'Mean Squared Error': [], 'Training R2 Score': [], 'Testing R2 Score': []}

# Find the best-performing model based on testing R2 score
for model_name, model in models.items():
    model.fit(X_train, y_train)

    # Predict on the test set
    y_test_pred = model.predict(X_test)

    # Calculate testing R2 score
    testing_r2_score = r2_score(y_test, y_test_pred)

    # Store results in the dictionary
    results['Model'].append(model_name)
    results['Mean Squared Error'].append(mean_squared_error(y_test, y_test_pred))
    results['Training R2 Score'].append(r2_score(y_train, model.predict(X_train)))
    results['Testing R2 Score'].append(testing_r2_score)

    # Update the best model
    if testing_r2_score > best_testing_r2_score:
        best_testing_r2_score = testing_r2_score
        best_model_name = model_name
        best_model = model

# NOTE(review): the winner is picked on the same test set whose score is
# reported below, so that score is an optimistic estimate; a separate
# validation split or cross-validation would give an unbiased number.

# Print results: the collected per-model metrics first (best test R2 on top),
# then the winner.
results_df = pd.DataFrame(results).sort_values('Testing R2 Score', ascending=False)
print(results_df.to_string(index=False))
print("Best Model:", best_model_name)
print("Testing R2 Score of the Best Model:", best_testing_r2_score)

# Choose two random data points from the test set.
# A seeded generator keeps the selection identical across runs, and the sample
# size is capped so a test set with fewer than two rows does not raise.
rng = np.random.default_rng(RANDOM_STATE)
n_random_points = min(2, X_test.shape[0])
random_points_indices = rng.choice(X_test.shape[0], size=n_random_points, replace=False)
random_points = X_test.iloc[random_points_indices, :]

random_points_y = y_test.iloc[random_points_indices]

In [None]:
# Run the selected best model on the sampled test points.
predictions = best_model.predict(random_points)

# Report each prediction next to the actual target value.
print(f"\nPredictions for {best_model_name} (Best Model):")
actual_values = list(random_points_y)
for i in range(min(len(actual_values), len(predictions))):
    actual_yield = actual_values[i]
    prediction = predictions[i]
    print(f"Data Point {i + 1} - Predicted YIELD: {prediction}, Actual YIELD: {actual_yield}")