<a href="https://colab.research.google.com/github/AasthathecoderX/Edunet_Energy/blob/main/Edunet_Predictions.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd

# Load the raw dataset from Google Drive into `df`.
# Edit the path below so it points at your own copy of the CSV.
# Example: '/content/drive/MyDrive/data/my_dataset.csv'
#
# Only the read itself sits inside `try`. On failure a hint is printed and the
# exception is re-raised, so the notebook stops here instead of failing later
# with a confusing NameError on `df`.
try:
    df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Datasets/Solar power plant dataset/2022 All zones/Final Dataset.xlsx - Sheet1.csv')
except FileNotFoundError:
    print("Error: Make sure the file path is correct and the file exists in your Google Drive.")
    raise
except Exception as e:
    print(f"An error occurred: {e}")
    raise
else:
    print("Dataset loaded successfully!")
    display(df.head())

In [None]:
# Number of missing (NaN/None) entries in each column of the raw frame.
missing_per_column = df.isna().sum()
display(missing_per_column)

In [None]:
# Quick overview of the loaded frame: column names, dimensions, and a preview.
print("All columns in the dataset:", df.columns.tolist(), sep="\n")
print(f"\nDataset shape: {df.shape}")
print("\nFirst few rows:")
# Bare last expression so the notebook renders the preview as a rich table.
df.head()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Draw one box plot per numerical column to eyeball outliers.

# Select only numerical columns for outlier detection
numerical_cols = df.select_dtypes(include=['float64', 'int64']).columns

# Size the subplot grid from the number of columns instead of assuming 3x3:
# a fixed 3x3 grid makes plt.subplot raise as soon as there are more than 9
# numerical columns. With 9 or fewer columns the layout is unchanged
# (3 rows x 3 columns on a 15x10 figure).
n_plot_cols = 3
n_plot_rows = max(3, -(-len(numerical_cols) // n_plot_cols))  # ceiling division

# Keep the same height per row as the original 15x10 figure.
plt.figure(figsize=(15, 10 * n_plot_rows / 3))
for i, col in enumerate(numerical_cols):
    plt.subplot(n_plot_rows, n_plot_cols, i + 1)
    sns.boxplot(y=df[col])
    plt.title(col)

plt.tight_layout()
plt.show()

In [None]:
# Preprocessing: encode the label column, split off the target, scale features.
#
# NOTE: this cell is not idempotent. get_dummies removes 'Unnamed: 0' from `df`,
# so running the cell a second time raises KeyError; re-run the loading cell first.

# 1. Identify and handle the categorical column 'Unnamed: 0'.
# Forward-fill propagates the last non-missing label down to the rows below it
# (Series.ffill() replaces the deprecated fillna(method='ffill')). Rows that
# come before the first label have nothing to inherit and become 'UNKNOWN'.
df['Unnamed: 0'] = df['Unnamed: 0'].ffill().fillna('UNKNOWN')

# Convert the categorical column to numerical using one-hot encoding
# (new columns are named 'Zone_<label>').
df = pd.get_dummies(df, columns=['Unnamed: 0'], prefix='Zone')


# 2. Examine the column names to identify the target variable related to solar generation.

target_variable = '1)All Sky Surface Shortwave Downward Irradiance'

# 3. Separate the target variable from the features.
X = df.drop(columns=[target_variable])
y = df[target_variable]

# 4. Select the numerical features for scaling.
# Only float64/int64 columns are picked, so one-hot columns of any other dtype
# (e.g. bool) are left unscaled.
numerical_cols_for_scaling = X.select_dtypes(include=['float64', 'int64']).columns

# 5. Apply a scaling technique (e.g., StandardScaler) to the numerical features.
from sklearn.preprocessing import StandardScaler

# NOTE(review): the scaler is fit on every row, including rows that the later
# train/test split holds out, so test-set statistics leak into the scaling.
# Consider fitting on the training rows only.
scaler = StandardScaler()
X[numerical_cols_for_scaling] = scaler.fit_transform(X[numerical_cols_for_scaling])

display(X.head())
display(y.head())

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Report the dimensions of each partition.
for part_name, part in (
    ("X_train", X_train),
    ("X_test", X_test),
    ("y_train", y_train),
    ("y_test", y_test),
):
    print(f"Shape of {part_name}:", part.shape)

In [None]:
from sklearn.ensemble import GradientBoostingRegressor

# Instantiate the GradientBoostingRegressor with default hyperparameters.
# random_state is pinned so repeated runs build the same trees; this matches
# the seed used for the train/test split and for the other models in this
# notebook.
gbr = GradientBoostingRegressor(random_state=42)

# Train the model using the X_train and y_train dataframes
gbr.fit(X_train, y_train)

print("Model training complete.")

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

# Evaluate the trained regressor on the held-out test set.

# Make predictions on the test set
y_pred = gbr.predict(X_test)

# Calculate the metrics
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)  # same units as the target, unlike MSE
r2 = r2_score(y_test, y_pred)

# Print the metrics
print(f"Mean Absolute Error (MAE): {mae:.4f}")
print(f"Mean Squared Error (MSE): {mse:.4f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.4f}")
print(f"R-squared (R2): {r2:.4f}")
# R2 is the share of target variance the model explains, not a classification
# "accuracy"; it can be negative for a model worse than predicting the mean.
print(f"Variance explained (R2): {r2*100:.2f}%")

In [None]:
# After fitting your scaler and doing get_dummies:
import joblib

# Persist what is needed to rebuild inputs at inference time: the post-encoding
# column order, the fitted scaler, and the trained model.
encoded_feature_names = X.columns.tolist()
joblib.dump(encoded_feature_names, "solar_feature_columns.joblib")
joblib.dump(scaler, "solar_scaler.joblib")
# Left as the final bare expression so the cell still shows the written path.
joblib.dump(gbr, "solar_prediction_model.joblib")


In [None]:
# ============================================
# NEW SECTION: Train 5-Feature Models for Frontend
# ============================================
# This section creates models that match the frontend inputs:
# latitude, longitude, roof_area, orientation_code, slope

import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import joblib

print("Creating synthetic dataset with 5 features matching frontend...")

n_samples = 1000

# Seed NumPy's global generator. The five draws below, and any later
# np.random calls in the notebook, consume this stream in order, so the
# sequence of draws must not be rearranged.
np.random.seed(42)

# Latitude in degrees, sampled over 8..35 (India range).
latitudes = np.random.uniform(low=8, high=35, size=n_samples)
# Longitude in degrees, sampled over 68..97 (India range).
longitudes = np.random.uniform(low=68, high=97, size=n_samples)
# Roof area in square metres, sampled over 50..500.
roof_areas = np.random.uniform(low=50, high=500, size=n_samples)
# Orientation code: 0=North, 1=East, 2=South, 3=West (upper bound 4 is exclusive).
orientations = np.random.randint(low=0, high=4, size=n_samples)
# Roof slope in degrees, sampled over 0..45.
slopes = np.random.uniform(low=0, high=45, size=n_samples)

# Assemble the frame; column order follows the frontend's input order.
column_names = ('latitude', 'longitude', 'roof_area', 'orientation_code', 'slope')
column_values = (latitudes, longitudes, roof_areas, orientations, slopes)
data_5features = pd.DataFrame(dict(zip(column_names, column_values)))

print(f"Generated {n_samples} samples with 5 features")
print("\nFirst few rows:")
print(data_5features.head())

In [None]:
# Build the two synthetic targets from the generated features.
print("\nGenerating target variables...")

# Latitude term: largest (1.5) at latitude 15, dropping by 1/50 per degree away.
solar_irradiance_factor = 1.5 - np.abs(latitudes - 15) / 50

# Per-orientation multiplier (South=best in Northern hemisphere).
orientation_factors = {0: 0.7, 1: 0.85, 2: 1.0, 3: 0.85}  # North, East, South, West
orientation_multiplier = np.fromiter(
    (orientation_factors[code] for code in orientations),
    dtype=float,
    count=len(orientations),
)

# Slope term: largest (1.0) at 25 degrees, dropping by 1/50 per degree away.
slope_factor = 1 - np.abs(slopes - 25) / 50

# Annual solar savings (kWh/year): 150 kWh per sq metre scaled by the factors
# above and by +/-15% multiplicative noise. The noise is drawn here, before the
# consumption noise, so the seeded random stream is consumed in the same order.
solar_noise = np.random.uniform(0.85, 1.15, n_samples)
solar_savings = (
    roof_areas * 150 * solar_irradiance_factor
    * orientation_multiplier * slope_factor * solar_noise
)

# Monthly electricity consumption (kWh/month): a base of 100, plus half the
# roof area as a stand-in for house size, plus uniform noise in [-50, 50).
consumption_noise = np.random.uniform(-50, 50, n_samples)
electricity_consumption = 100 + roof_areas / 2 + consumption_noise

# Attach both targets to the existing frame (in place).
for target_name, target_values in (
    ('solar_savings', solar_savings),
    ('electricity_consumption', electricity_consumption),
):
    data_5features[target_name] = target_values

print(f"Solar savings range: {solar_savings.min():.2f} to {solar_savings.max():.2f} kWh/year")
print(f"Electricity consumption range: {electricity_consumption.min():.2f} to {electricity_consumption.max():.2f} kWh/month")
print("\nDataset with targets:")
print(data_5features.head())

In [None]:
# ============================================
# Train Solar Prediction Model (5 features)
# ============================================
# Fits a gradient-boosting regressor on the synthetic frame to predict
# 'solar_savings', reports held-out error, and saves the model to disk.

print("\n" + "="*50, "Training Solar Prediction Model", "="*50, sep="\n")

# Feature matrix (column order matters at inference time) and target vector.
FEATURES = ['latitude', 'longitude', 'roof_area', 'orientation_code', 'slope']
X = data_5features.loc[:, FEATURES].to_numpy()
y_solar = data_5features['solar_savings'].to_numpy()

# 80/20 split with a fixed seed.
X_train, X_test, y_train_solar, y_test_solar = train_test_split(
    X, y_solar, test_size=0.2, random_state=42
)

print(f"\nTraining samples: {len(X_train)}")
print(f"Test samples: {len(X_test)}")
print(f"Features: {FEATURES}")

# Fit the regressor; fit() returns the estimator itself.
print("\nTraining GradientBoostingRegressor...")
solar_model = GradientBoostingRegressor(
    n_estimators=100, learning_rate=0.1, max_depth=5, random_state=42
).fit(X_train, y_train_solar)

# Score on the held-out rows.
y_pred_solar = solar_model.predict(X_test)
mae_solar = mean_absolute_error(y_test_solar, y_pred_solar)
mse_solar = mean_squared_error(y_test_solar, y_pred_solar)
rmse_solar = np.sqrt(mse_solar)
r2_solar = r2_score(y_test_solar, y_pred_solar)

print("\n--- Solar Model Performance ---")
print(f"MAE: {mae_solar:.2f} kWh/year")
print(f"RMSE: {rmse_solar:.2f} kWh/year")
print(f"R² Score: {r2_solar:.4f}")

# Persist the fitted model.
model_filename = 'solar_model_5features.joblib'
joblib.dump(solar_model, model_filename)
print(f"\nModel saved as: {model_filename}")

In [None]:
# ============================================
# Train Electricity Consumption Model (5 features)
# ============================================
# Fits an XGBoost regressor on the same feature matrix `X` to predict
# 'electricity_consumption', reports held-out error, and saves the model.

print("\n" + "="*50, "Training Electricity Consumption Model", "="*50, sep="\n")

# Target vector; the features are the `X` built for the solar model.
y_electricity = data_5features['electricity_consumption'].to_numpy()

# Same split parameters as the solar model (test_size=0.2, random_state=42).
X_train, X_test, y_train_elec, y_test_elec = train_test_split(
    X, y_electricity, test_size=0.2, random_state=42
)

print(f"\nTraining samples: {len(X_train)}")
print(f"Test samples: {len(X_test)}")
print(f"Features: {FEATURES}")

# Fit the regressor; fit() returns the estimator itself.
print("\nTraining XGBRegressor...")
electricity_model = XGBRegressor(
    n_estimators=100, learning_rate=0.1, max_depth=5, random_state=42
).fit(X_train, y_train_elec)

# Score on the held-out rows.
y_pred_elec = electricity_model.predict(X_test)
mae_elec = mean_absolute_error(y_test_elec, y_pred_elec)
mse_elec = mean_squared_error(y_test_elec, y_pred_elec)
rmse_elec = np.sqrt(mse_elec)
r2_elec = r2_score(y_test_elec, y_pred_elec)

print("\n--- Electricity Model Performance ---")
print(f"MAE: {mae_elec:.2f} kWh/month")
print(f"RMSE: {rmse_elec:.2f} kWh/month")
print(f"R² Score: {r2_elec:.4f}")

# Persist the fitted model.
model_filename = 'electricity_model_5features.joblib'
joblib.dump(electricity_model, model_filename)
print(f"\nModel saved as: {model_filename}")

In [None]:
# ============================================
# Test the models with sample data
# ============================================
# Smoke test: push one hand-written row through both trained models.

print("\n" + "="*50, "Testing Models with Sample Input", "="*50, sep="\n")

# One row in FEATURES order: lat, lon, roof_area, orientation (2=South), slope.
# The coordinates are Bangalore's.
sample_input = np.array([[12.9716, 77.5946, 120, 2, 25]])
sample_row = sample_input[0]

print("\nSample Input:")
print(f"Latitude: {sample_row[0]}")
print(f"Longitude: {sample_row[1]}")
print(f"Roof Area: {sample_row[2]} sq.m")
print(f"Orientation: {sample_row[3]} (2=South)")
print(f"Slope: {sample_row[4]} degrees")

# Each predict() call returns a length-1 array for the single input row.
(solar_prediction,) = solar_model.predict(sample_input)
(electricity_prediction,) = electricity_model.predict(sample_input)

print("\n--- Predictions ---")
print(f"Solar Savings: {solar_prediction:.2f} kWh/year")
print(f"Electricity Consumption: {electricity_prediction:.2f} kWh/month")

print("\n" + "="*50, "SUCCESS! Models are ready for deployment", "="*50, sep="\n")

In [None]:
# ============================================
# Download Models to Local Machine
# ============================================

from google.colab import files

print("\nDownloading trained models...")
print("These files will be saved to your Downloads folder.\n")

# Trigger a browser download for each saved model file, solar model first.
for model_file in (
    'solar_model_5features.joblib',
    'electricity_model_5features.joblib',
):
    print(f"Downloading {model_file}...")
    files.download(model_file)

print("\n" + "="*50, "DOWNLOAD COMPLETE!", "="*50, sep="\n")

# Follow-up instructions for wiring the models into the Flask backend.
for instruction in (
    "\nNext steps:",
    "1. Replace the old model files in your Flask backend directory",
    "2. Update your app.py to load these new 5-feature models:",
    "   - solar_model = joblib.load('solar_model_5features.joblib')",
    "   - electricity_model = joblib.load('electricity_model_5features.joblib')",
    "3. The models now expect exactly 5 features in this order:",
    "   [latitude, longitude, roof_area, orientation_code, slope]",
    "4. Restart your Flask server and test!",
):
    print(instruction)