<a href="https://colab.research.google.com/github/AasthathecoderX/Edunet_Energy/blob/main/Edunet_Prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd

# Electricity-bill dataset, read from a Google Drive path under /content/drive
# (the Drive is presumably mounted before this cell runs -- no mount call is visible here).
electricity_csv = "/content/drive/MyDrive/Colab Notebooks/Datasets/archive(1)/electricity_bill_dataset.csv"
df = pd.read_csv(electricity_csv)

# Preview the first rows to confirm the load worked.
display(df.head())

In [None]:
# Quick data-quality pass: first rows, then per-column missing-value counts,
# followed by the dtype / non-null summary printed by DataFrame.info().
display(df.head(), df.isnull().sum())
df.info()

In [None]:
from sklearn.model_selection import train_test_split

# Predictor columns for the electricity-bill model. They are passed to the model
# as-is (no encoding step); the original author treats them as numeric.
selected_features = [
    'Fan', 'Refrigerator', 'AirConditioner', 'Television', 'Monitor',
    'MotorPump', 'Month', 'MonthlyHours', 'TariffRate',
]
df_selected = df.loc[:, selected_features].copy()

# Features and target ('ElectricityBill' is the column being predicted).
X, y = df_selected, df['ElectricityBill']

# 80/20 hold-out split with a fixed seed so the partition is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Preview each of the four pieces in the same order they were returned.
for split in (X_train, X_test, y_train, y_test):
    display(split.head())

In [None]:
from xgboost import XGBRegressor

# Electricity-bill regressor: XGBoost with library-default hyperparameters,
# fitted on the training split. The fitted estimator is the cell's output.
model = XGBRegressor()
model.fit(X=X_train, y=y_train)

In [None]:
from sklearn.metrics import mean_squared_error, r2_score

# Score the electricity-bill model on the held-out split.
y_pred = model.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'Mean Squared Error: {mse}')
print(f'R-squared: {r2}')

# Sanity-check the target the model was trained on.
# The target name is read from the Series itself (y.name) instead of a separate
# `target_variable` global: that global is only assigned later, in the solar
# preprocessing cell, so referencing it here raised NameError on a fresh kernel
# (or printed the solar column name when cells were run out of order).
print("Target variable:", y.name)
print("Sample targets:", y.head())
print("Target range:", y.min(), "to", y.max())

# Spot-check a single held-out row against its true value.
# The target is the bill amount, not an energy reading, so no "kWh" unit is
# printed (the currency unit is not stated anywhere in this notebook).
sample_input = X_test.iloc[0:1]
prediction = model.predict(sample_input)
actual = y_test.iloc[0]
print(f"Predicted bill: {prediction[0]:.2f}")
print(f"Actual bill: {actual:.2f}")


In [None]:
# Side-by-side table of true vs. predicted values for the test split,
# indexed like y_test so rows can be traced back to the source data.
y_pred = model.predict(X_test)
predictions_df = y_test.to_frame('Original').assign(Predicted=y_pred)
display(predictions_df.head())

In [None]:
import joblib

# Persist the fitted electricity-bill model to Google Drive with joblib.
electricity_model_path = '/content/drive/MyDrive/Colab Notebooks/electricity_prediction_model-1.joblib'
joblib.dump(model, electricity_model_path)

print("Model saved successfully as 'electricity_prediction_model-1.joblib'")

In [None]:
import pandas as pd

# Solar prediction: load the solar dataset from Google Drive.
# NOTE: this rebinds `df`, replacing the electricity dataset loaded earlier.
solar_csv = '/content/drive/MyDrive/Colab Notebooks/Datasets/Solar power plant dataset/2022 All zones/Final Dataset.xlsx - Sheet1.csv'
try:
    df = pd.read_csv(solar_csv)
except FileNotFoundError:
    # Print the hint, then re-raise. Swallowing the error would leave `df`
    # pointing at the electricity data and make every later solar cell fail
    # with a misleading KeyError instead of the real cause.
    print("Error: Make sure the file path is correct and the file exists in your Google Drive.")
    raise

# Any other read error (parse error, permissions, ...) propagates with its full traceback.
print("Dataset loaded successfully!")
display(df.head())

In [None]:
# ============================================
# CRITICAL: Verify target column
# ============================================
irradiance_col = '1)All Sky Surface Shortwave Downward Irradiance'

print("Dataset columns:", df.columns.tolist())
print("\nTarget column check:")
irradiance = df[irradiance_col]

# Summary statistics followed by the first rows of the target column.
# Per the original author's note, values should be around 4-7 kWh/m²/day, NOT 3000+.
print(irradiance.describe(), "\nFirst 5 values:", irradiance.head(), sep="\n")

# Missing-value count for every column.
display(df.isnull().sum())

In [None]:
import math

import matplotlib.pyplot as plt
import seaborn as sns

# Box plot of every float64/int64 column, for a visual outlier check.
numerical_cols = df.select_dtypes(include=['float64', 'int64']).columns

# Size the grid from the number of columns. A fixed 3x3 grid made plt.subplot
# raise ValueError as soon as the frame had more than nine numeric columns.
# The grid never shrinks below 3 rows, so the layout for <= 9 columns is unchanged.
n_cols = 3
n_rows = max(3, math.ceil(len(numerical_cols) / n_cols))

fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 10 * n_rows / 3), squeeze=False)
flat_axes = axes.ravel()

for ax, col in zip(flat_axes, numerical_cols):
    sns.boxplot(y=df[col], ax=ax)
    ax.set_title(col)

# Hide grid slots that have no column to show.
for ax in flat_axes[len(numerical_cols):]:
    ax.set_visible(False)

fig.tight_layout()
plt.show()

In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler

# 1. Encode the zone column 'Unnamed: 0'.
#    The column is only filled on some rows, so the last seen value is carried
#    forward (Series.ffill replaces the deprecated fillna(method='ffill')); any
#    leading gaps become 'UNKNOWN'. It is then one-hot encoded into 'Zone_*' columns.
#    The guard makes the cell safe to re-run: after the first run 'Unnamed: 0'
#    no longer exists in `df`, and the unguarded code raised KeyError.
if 'Unnamed: 0' in df.columns:
    df['Unnamed: 0'] = df['Unnamed: 0'].ffill().fillna('UNKNOWN')
    df = pd.get_dummies(df, columns=['Unnamed: 0'], prefix='Zone')
elif not any(col.startswith('Zone_') for col in df.columns):
    raise KeyError("Expected zone column 'Unnamed: 0' (or already-encoded 'Zone_*' columns) in df")

# 2. Target variable for the solar model.
target_variable = '1)All Sky Surface Shortwave Downward Irradiance'

# 3. Features: three numeric predictors plus every one-hot zone indicator.
numerical_cols_for_scaling = ['LAT', 'LON', '4)Cloud Amount']
selected_features = numerical_cols_for_scaling + [col for col in df.columns if col.startswith('Zone_')]

# .copy() gives X its own data, so the scaled values assigned below do not
# trigger SettingWithCopyWarning or write into a view of `df`.
X = df[selected_features].copy()
y = df[target_variable]

# 4. Standardize only the numeric predictors; the 0/1 zone indicators stay as they are.
# NOTE(review): the scaler is fitted on all rows before the train/test split, so
# test-set statistics leak into it. Consider fitting on the training rows only.
scaler = StandardScaler()
X[numerical_cols_for_scaling] = scaler.fit_transform(X[numerical_cols_for_scaling])

display(X.head())
display(y.head())

In [None]:
from sklearn.model_selection import train_test_split

# 80/20 hold-out split of the solar features/target with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Report the dimensions of each piece.
split_shapes = {
    "Shape of X_train:": X_train.shape,
    "Shape of X_test:": X_test.shape,
    "Shape of y_train:": y_train.shape,
    "Shape of y_test:": y_test.shape,
}
for label, shape in split_shapes.items():
    print(label, shape)

In [None]:
from sklearn.ensemble import GradientBoostingRegressor

# Solar irradiance regressor: gradient boosting with default hyperparameters.
# random_state is pinned (same seed as the train/test split) so that re-running
# the notebook reproduces the same fitted model and the same saved artifact;
# without it the estimator's internal randomness is unseeded.
gbr = GradientBoostingRegressor(random_state=42)

# Fit on the training split.
gbr.fit(X_train, y_train)

print("Model training complete.")

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

# Predict on the held-out split and compute the regression metrics.
y_pred = gbr.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
rmse = np.sqrt(mse)

# One metric per line. The last line restates R2 as a percentage; it is the
# same quantity as the line above it, not a separate accuracy measurement.
print(
    f"Mean Absolute Error (MAE): {mae:.4f}",
    f"Mean Squared Error (MSE): {mse:.4f}",
    f"Root Mean Squared Error (RMSE): {rmse:.4f}",
    f"R-squared (R2): {r2:.4f}",
    f"Accuracy (R2): {r2*100:.2f}%",
    sep="\n",
)

# Sanity-check which target the model was trained on and what its values look like.
print("Target variable name:", target_variable)
print("Sample target values:", y.head())
print("Target statistics:", y.describe(), sep="\n")

# Spot-check the first held-out row against its true value.
sample_input = X_test.head(1)
prediction = gbr.predict(sample_input)
actual = y_test.iloc[0]
print(f"Sample prediction: {prediction[0]:.2f}", f"Actual value: {actual:.2f}", sep="\n")


In [None]:
import joblib

# Persist the solar model together with the scaler that its numeric inputs need.
solar_model_path = '/content/drive/MyDrive/Colab Notebooks/solar_prediction_model.joblib'
solar_scaler_path = '/content/drive/MyDrive/Colab Notebooks/solar_scaler.joblib'

for artifact, destination in ((gbr, solar_model_path), (scaler, solar_scaler_path)):
    joblib.dump(artifact, destination)

print("✓ Model saved successfully as 'solar_prediction_model.joblib'")
print("✓ Scaler saved successfully as 'solar_scaler.joblib'")

# Round-trip check: reload both artifacts and run one hand-written example.
banner = "=" * 60
print("\n" + banner)
print("VERIFICATION - Testing saved model")
print(banner)

loaded_model = joblib.load(solar_model_path)
loaded_scaler = joblib.load(solar_scaler_path)

# Test case: Bangalore, flagged as SOUTH ZONE. The zone columns listed here are
# assumed to match the 'Zone_*' columns produced during preprocessing -- verify
# against the training frame if the dataset changes.
test_raw = pd.DataFrame({
    'LAT': [12.9716],
    'LON': [77.5946],
    '4)Cloud Amount': [30],
    'Zone_CENTRAL ZONE': [0],
    'Zone_EAST ZONE': [0],
    'Zone_NORTH ZONE': [0],
    'Zone_SOUTH ZONE': [1],
    'Zone_WEST ZONE': [0]
})

# Scale the numeric inputs with the reloaded scaler; zone flags stay untouched.
scaled_cols = ['LAT', 'LON', '4)Cloud Amount']
test_scaled = test_raw.copy()
test_scaled[scaled_cols] = loaded_scaler.transform(test_raw[scaled_cols])

# Predict daily irradiance and convert it to an annual figure:
# 365 days times a 0.75 factor (presumably a system performance ratio -- the
# notebook does not explain it).
test_pred = loaded_model.predict(test_scaled)
annual_kwh = test_pred[0] * 365 * 0.75

print(f"\nTest prediction (Bangalore, 30% cloud):")
print(f"  Irradiance: {test_pred[0]:.2f} kWh/m²/day")
print(f"  Annual generation (1kW): {annual_kwh:.0f} kWh/year")
print(f"  Expected: 1200-1500 kWh/year")

# Plausibility window for the annual estimate.
verdict = (
    "\n✓ Loaded model works correctly!"
    if 1000 < annual_kwh < 1600
    else "\n⚠️ Something is still wrong!"
)
print(verdict)
