# Energy Consumption Prediction

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

## Loading the Dataset

In [None]:
# Load the raw dataset from a local CSV file into the working DataFrame `df`.
# NOTE(review): this is an absolute, machine-specific Windows path — confirm it
# matches the environment the notebook is run in.
df = pd.read_csv("C:\\Users\\user\\Downloads\\Energy_Consumption_Prediction.csv")

## Data Preprocessing

### Date Creation

In [None]:
# Assemble one timestamp per row from the 'year' and 'month' columns; the day
# is fixed to 1, so every date is the first day of its month.
df['date'] = pd.to_datetime(df[['year', 'month']].assign(day=1))

### Extract time-based features



In [None]:
# Re-derive the calendar columns from the parsed 'date' column so they are
# consistent with it. Both columns already exist, so they are overwritten in
# place and the column order is unchanged.
date_accessor = df['date'].dt
df['year'] = date_accessor.year
df['month'] = date_accessor.month

### Define features and target

In [None]:
# Column to predict, and the input columns used to predict it.
target = 'value'
features = ['number_of_accounts', 'full_fips', 'month', 'year']

# Turn +/- infinity into NaN so the missing-value handling treats them the
# same way as ordinary gaps.
df.replace(to_replace=[np.inf, -np.inf], value=np.nan, inplace=True)

### Handling missing values

In [None]:
# Fill missing values in place using the median of each numeric column.
# Only numeric columns have a median computed, so other columns are left as-is.
df.fillna(df.select_dtypes(include=[np.number]).median(), inplace=True)

### Remove duplicates


In [None]:
# Remove fully duplicated rows in place (the first occurrence is kept by default).
df.drop_duplicates(inplace=True)

### Detecting outliers and removing them

In [None]:
def detect_and_remove_outliers(df, cols, iqr_multiplier=1.5):
    """Drop rows whose values fall outside the IQR fences of the given columns.

    Columns are processed in the order given, and each column's quartiles are
    computed on the rows that survived the filters of the previous columns.
    Names in ``cols`` that are not columns of ``df`` are skipped. Rows holding
    NaN in a processed column are dropped, because NaN never satisfies the
    range test.

    Args:
        df: Input DataFrame. It is not modified; a filtered frame is returned.
        cols: Iterable of column names to filter on.
        iqr_multiplier: Fence width in units of the interquartile range. The
            default of 1.5 reproduces the conventional rule; larger values
            keep more rows.

    Returns:
        DataFrame containing only the rows that lie within
        ``[Q1 - k * IQR, Q3 + k * IQR]`` for every processed column.

    Raises:
        ValueError: If ``iqr_multiplier`` is negative.
    """
    if iqr_multiplier < 0:
        raise ValueError("iqr_multiplier must be non-negative")

    for col in cols:
        if col not in df.columns:
            # Missing columns are tolerated so callers can pass a superset.
            continue
        q1, q3 = df[col].quantile([0.25, 0.75])
        fence = iqr_multiplier * (q3 - q1)
        # `between` is inclusive on both ends, matching `>=` / `<=`.
        df = df[df[col].between(q1 - fence, q3 + fence)]
    return df

# Filter outliers on every model input column and on the target.
# NOTE(review): this also applies the IQR rule to 'full_fips', 'month' and
# 'year', which look like identifier/calendar columns rather than measurements
# — confirm that filtering on them is intended.
df = detect_and_remove_outliers(df, features + [target])

In [None]:
# Guard against the outlier filter leaving too few rows to train on
# (`len(df) < 5` also covers the completely empty case).
if len(df) < 5:
    print("Warning: Too many outliers removed! Consider adjusting the IQR threshold.")
    # Fall back to the unfiltered data. Read from the same location as the
    # initial load, then repeat the preprocessing steps (everything except the
    # outlier filter) so that the 'date' column and the cleaned values that the
    # later plotting and forecasting cells rely on exist again.
    df = pd.read_csv("C:\\Users\\user\\Downloads\\Energy_Consumption_Prediction.csv")
    df['date'] = pd.to_datetime(df[['year', 'month']].assign(day=1))
    df['month'] = df['date'].dt.month
    df['year'] = df['date'].dt.year
    df.replace([np.inf, -np.inf], np.nan, inplace=True)
    df.fillna(df.select_dtypes(include=[np.number]).median(), inplace=True)
    df.drop_duplicates(inplace=True)

In [None]:
# Report the DataFrame's (rows, columns) shape after preprocessing so an
# unexpectedly small dataset is easy to spot.
print(f"Dataframe size after preprocessing: {df.shape}")

In [None]:
# Print the number of missing values remaining in each column.
print(f"Missing values:\n{df.isnull().sum()}")

In [None]:
# Print the available columns next to the selected features so they can be
# compared by eye (no automatic validation is performed here).
print(f"Columns in dataset: {df.columns.tolist()}")
print(f"Selected features: {features}")

In [None]:
# Summarize column dtypes, non-null counts and memory usage.
df.info()

In [None]:
# Show descriptive statistics for the DataFrame's columns.
df.describe()

## Visualizations

In [None]:
# Line chart of the target ('value') against 'date'.
plt.figure(figsize=(10, 5))
# seaborn aggregates rows that share the same x value, so each point
# summarizes all rows for that date.
sns.lineplot(data=df, x='date', y='value', marker='o', label='Energy Consumption')
plt.xlabel("Date")
plt.ylabel("Energy Consumption")
plt.title("Energy Consumption Over Time")
# Rotate the date tick labels so they do not overlap.
plt.xticks(rotation=45)
plt.legend()
plt.show()

In [None]:
# Correlation heatmap over the numeric columns only; 'number' already covers
# every float and integer dtype.
num_df = df.select_dtypes(include='number')
correlations = num_df.corr()

plt.figure(figsize=(10, 5))
sns.heatmap(
    correlations,
    annot=True,
    fmt=".2f",
    cmap="coolwarm",
    linewidths=0.5,
)
plt.title("Feature Correlation Heatmap")
plt.show()

## Model Building

### Splitting data into training and testing sets

In [None]:
# Feature matrix and target vector.
X = df[features]
y = df[target]

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

### Scale data

In [None]:
# Fit the scaler on the training rows only, then apply the same transform to
# both splits. The results are wrapped back into DataFrames so the feature
# names and the original row indices are preserved.
scaler = StandardScaler()
scaler.fit(X_train)

X_train_scaled = pd.DataFrame(
    scaler.transform(X_train),
    index=X_train.index,
    columns=features,
)
X_test_scaled = pd.DataFrame(
    scaler.transform(X_test),
    index=X_test.index,
    columns=features,
)

### Random Forest Regressor

In [None]:
# Random forest with 100 trees; the fixed seed makes training reproducible.
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
# Train on the scaled training features.
rf_model.fit(X_train_scaled, y_train)

### XGBoost Regressor

In [None]:
# Gradient-boosted model with 100 boosting rounds and a 0.1 learning rate;
# the fixed seed makes training reproducible.
xgb_model = XGBRegressor(n_estimators=100, learning_rate=0.1, random_state=42)
# Train on the scaled training features.
xgb_model.fit(X_train_scaled, y_train)

## Model Evaluation

In [None]:
def evaluate_model(model, X_test, y_test, name):
    """Score a fitted model on held-out data and print a short report.

    Args:
        model: Fitted estimator exposing ``predict``.
        X_test: Features to predict on.
        y_test: True target values aligned with ``X_test``.
        name: Label printed in the report header.

    Returns:
        Tuple ``(r2, mae, mse)`` computed from the model's predictions.
    """
    predicted = model.predict(X_test)
    scores = (
        r2_score(y_test, predicted),
        mean_absolute_error(y_test, predicted),
        mean_squared_error(y_test, predicted),
    )
    r2_value, mae_value, mse_value = scores

    print(f"{name} Model:")
    print(f"R² Score: {r2_value:.4f}")
    print(f"Mean Absolute Error (MAE): {mae_value:.4f}")
    print(f"Mean Squared Error (MSE): {mse_value:.4f}\n")
    return scores

# Score both fitted models on the same scaled test split and keep their
# metrics for the model comparison.
rf_r2, rf_mae, rf_mse = evaluate_model(rf_model, X_test_scaled, y_test, "Random Forest")
xgb_r2, xgb_mae, xgb_mse = evaluate_model(xgb_model, X_test_scaled, y_test, "XGBoost")

### Best model


In [None]:
# Candidate models keyed by display name; each entry is
# (fitted model, R², MAE, MSE).
models = {
    "Random Forest": (rf_model, rf_r2, rf_mae, rf_mse),
    "XGBoost": (xgb_model, xgb_r2, xgb_mae, xgb_mse),
}

# Choose the entry with the highest R² (position 1 of each tuple).
best_model_name, best_entry = max(models.items(), key=lambda item: item[1][1])
best_model, best_r2, best_mae, best_mse = best_entry

print(f"Best model selected: {best_model_name}")
print(f"R² Score: {best_r2:.4f}")
print(f"Mean Absolute Error (MAE): {best_mae:.4f}")
print(f"Mean Squared Error (MSE): {best_mse:.4f}")

## Predictions

In [None]:
# Generate XGBoost predictions for the held-out rows.
xgb_predictions = xgb_model.predict(X_test_scaled)

# Put the true and predicted values side by side and preview the first rows.
prediction_results = pd.DataFrame({'Actual': y_test, 'Predicted': xgb_predictions})
print(prediction_results.head())

# Recompute the hold-out metrics directly from these predictions.
holdout_r2 = r2_score(y_test, xgb_predictions)
holdout_mae = mean_absolute_error(y_test, xgb_predictions)
holdout_mse = mean_squared_error(y_test, xgb_predictions)

print("XGBoost Model Performance:")
print(f"R² Score: {holdout_r2:.4f}")
print(f"MAE: {holdout_mae:.4f}")
print(f"MSE: {holdout_mse:.4f}")


In [None]:
# Forecast horizon: 120 monthly steps (10 years).
total_months = 120

# Start with the month AFTER the last observed one. The observed dates are
# month-start timestamps (built with day=1), so the forecast uses month-start
# ('MS') dates as well; this keeps the last historical month out of the
# "future" rows and keeps the date convention consistent with the data.
first_future_month = df['date'].max() + pd.offsets.MonthBegin(1)
future_dates = pd.date_range(start=first_future_month, periods=total_months, freq='MS')

# Hold the non-calendar inputs at their historical mean and vary only the
# calendar columns.
# NOTE(review): 'full_fips' looks like a region code, so its mean may not
# correspond to any real region — confirm this is the intended scenario.
# NOTE(review): both candidate models are tree ensembles, which cannot
# extrapolate beyond the range of 'year' seen in training — confirm the
# forecast is meaningful this far out.
future_features = pd.DataFrame({
    'number_of_accounts': [X['number_of_accounts'].mean()] * total_months,
    'full_fips': [X['full_fips'].mean()] * total_months,
    'month': future_dates.month,
    'year': future_dates.year,
})[features]  # enforce the exact column order the scaler was fitted with

future_features_scaled = pd.DataFrame(scaler.transform(future_features), columns=features)
future_predictions = best_model.predict(future_features_scaled)

future_predictions_df = pd.DataFrame({
    'date': future_dates,
    'predicted_energy_consumption': future_predictions,
})
print(future_predictions_df)

In [None]:
# Scatter of predicted vs. actual values for the XGBoost model on the test set.
plt.figure(figsize=(10, 5))
sns.scatterplot(x=y_test, y=xgb_predictions, color='blue', alpha=0.6)

# Reference diagonal (predicted == actual). Two end points are enough for a
# straight line; this avoids seaborn's per-x aggregation and confidence
# interval estimation, which are unnecessary work for a y = x line.
diag_low, diag_high = y_test.min(), y_test.max()
plt.plot([diag_low, diag_high], [diag_low, diag_high], color='red', linestyle='dashed')

plt.xlabel("Actual Energy Consumption")
plt.ylabel("Predicted Energy Consumption")
plt.title("Actual vs. Predicted Energy Consumption (XGBoost)")
plt.grid(True)
plt.show()
