In [16]:
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

# Load dataset
df = pd.read_csv("long_data_.csv")

# ====== Target Variable ======
target = "Usage"

# ====== Handle Dates ======
df["Dates"] = pd.to_datetime(df["Dates"], errors="coerce")
df["year"] = df["Dates"].dt.year
df["month"] = df["Dates"].dt.month
df["day"] = df["Dates"].dt.day
df = df.drop(columns=["Dates"])  # drop raw datetime

# ====== Handle Categorical Columns ======
categorical_cols = []
for col in ["States", "Regions"]:
    if col in df.columns:
        df[col] = df[col].astype("category")
        categorical_cols.append(col)

# ====== Features & Target ======
X = df.drop(columns=[target])
y = df[target]

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# LightGBM Dataset
train_data = lgb.Dataset(
    X_train, label=y_train, categorical_feature=categorical_cols, free_raw_data=False
)
test_data = lgb.Dataset(
    X_test, label=y_test, categorical_feature=categorical_cols, free_raw_data=False
)

# Parameters
params = {
    "objective": "regression",
    "metric": ["rmse"],
    "boosting_type": "gbdt",
    "num_leaves": 31,
    "learning_rate": 0.05,
    "feature_fraction": 0.9,
    "bagging_fraction": 0.8,
    "bagging_freq": 5,
    "verbose": -1
}

# Train with callbacks
model = lgb.train(
    params,
    train_data,
    valid_sets=[train_data, test_data],
    num_boost_round=1000,
    callbacks=[
        lgb.early_stopping(stopping_rounds=50),
        lgb.log_evaluation(period=100)
    ]
)

# Predictions
y_pred = model.predict(X_test, num_iteration=model.best_iteration)

# Evaluation
import numpy as np
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print(f"RMSE: {rmse:.4f}")
print(f"RÂ² Score: {r2:.4f}")

# Region-wise prediction summary
if "Regions" in df.columns:
    region_preds = X_test.copy()
    region_preds["actual_usage"] = y_test
    region_preds["predicted_usage"] = y_pred
    print(region_preds.groupby("Regions")[["actual_usage", "predicted_usage"]].mean())


Training until validation scores don't improve for 50 rounds
[100]	training's rmse: 21.7152	valid_1's rmse: 23.0204
[200]	training's rmse: 20.9211	valid_1's rmse: 22.3599
[300]	training's rmse: 20.5745	valid_1's rmse: 22.2197
[400]	training's rmse: 20.3378	valid_1's rmse: 22.0093
[500]	training's rmse: 20.142	valid_1's rmse: 21.8593
[600]	training's rmse: 19.9937	valid_1's rmse: 21.7326
[700]	training's rmse: 19.904	valid_1's rmse: 21.7027
[800]	training's rmse: 19.8207	valid_1's rmse: 21.6904
Early stopping, best iteration is:
[755]	training's rmse: 19.8523	valid_1's rmse: 21.6493
RMSE: 21.6493
RÂ² Score: 0.9657
         actual_usage  predicted_usage
Regions                               
ER          65.448141        64.254990
NER          5.734615         5.767246
NR         112.003859       112.655754
SR         163.135342       164.839596
WR         184.959804       185.980177


In [19]:
import pandas as pd
import lightgbm as lgb
from sklearn.metrics import mean_squared_error, r2_score

# Load dataset
df = pd.read_csv("long_data_.csv")

# ====== Target Variable ======
target = "Usage"

# ====== Handle Dates ======
df["Dates"] = pd.to_datetime(df["Dates"], errors="coerce")
df["year"] = df["Dates"].dt.year
df["month"] = df["Dates"].dt.month
df["day"] = df["Dates"].dt.day
df["dayofweek"] = df["Dates"].dt.dayofweek
df["is_weekend"] = (df["Dates"].dt.dayofweek >= 5).astype(int)

# ====== Handle Categorical Columns ======
categorical_cols = []
for col in ["States", "Regions"]:
    if col in df.columns:
        df[col] = df[col].astype("category")
        categorical_cols.append(col)

# ====== Features & Target ======
X = df.drop(columns=[target, "Dates"])
y = df[target]

# LightGBM Dataset (use all data for training)
train_data = lgb.Dataset(
    X, label=y, categorical_feature=categorical_cols, free_raw_data=False
)

# Parameters
params = {
    "objective": "regression",
    "metric": ["rmse"],
    "boosting_type": "gbdt",
    "num_leaves": 31,
    "learning_rate": 0.05,
    "feature_fraction": 0.9,
    "bagging_fraction": 0.8,
    "bagging_freq": 5,
    "verbose": -1
}

# Train full model
model = lgb.train(
    params,
    train_data,
    num_boost_round=500
)

# ===============================
# ðŸ”® Predict Future Dates
# ===============================

# Generate next 90 days (example)
future_dates = pd.date_range(df["Dates"].max() + pd.Timedelta(days=1), periods=1825, freq="D")
future_df = pd.DataFrame({"Dates": future_dates})

# Expand to all regions (so every region gets prediction for each future date)
regions = df["Regions"].unique()
states = df["States"].unique()

future_full = []
for region in regions:
    for state in df.loc[df["Regions"] == region, "States"].unique():
        temp = future_df.copy()
        temp["States"] = state
        temp["Regions"] = region
        # Use stateâ€™s lat/long (taking mean if multiple rows)
        lat = df.loc[df["States"] == state, "latitude"].mean()
        lon = df.loc[df["States"] == state, "longitude"].mean()
        temp["latitude"] = lat
        temp["longitude"] = lon
        future_full.append(temp)

future_full = pd.concat(future_full, ignore_index=True)

# Extract date features
future_full["year"] = future_full["Dates"].dt.year
future_full["month"] = future_full["Dates"].dt.month
future_full["day"] = future_full["Dates"].dt.day
future_full["dayofweek"] = future_full["Dates"].dt.dayofweek
future_full["is_weekend"] = (future_full["Dates"].dt.dayofweek >= 5).astype(int)

# Encode categorical
for col in ["States", "Regions"]:
    future_full[col] = future_full[col].astype("category")

# Drop Dates for prediction
X_future = future_full.drop(columns=["Dates"])

# Predict usage
future_full["predicted_usage"] = model.predict(X_future)

print(future_full.head(20))  # Show first 20 predictions
model.save_model("energy_usage_model.txt")


        Dates  States Regions   latitude  longitude  year  month  day  \
0  2020-05-13  Punjab      NR  31.519974  75.980003  2020      5   13   
1  2020-05-14  Punjab      NR  31.519974  75.980003  2020      5   14   
2  2020-05-15  Punjab      NR  31.519974  75.980003  2020      5   15   
3  2020-05-16  Punjab      NR  31.519974  75.980003  2020      5   16   
4  2020-05-17  Punjab      NR  31.519974  75.980003  2020      5   17   
5  2020-05-18  Punjab      NR  31.519974  75.980003  2020      5   18   
6  2020-05-19  Punjab      NR  31.519974  75.980003  2020      5   19   
7  2020-05-20  Punjab      NR  31.519974  75.980003  2020      5   20   
8  2020-05-21  Punjab      NR  31.519974  75.980003  2020      5   21   
9  2020-05-22  Punjab      NR  31.519974  75.980003  2020      5   22   
10 2020-05-23  Punjab      NR  31.519974  75.980003  2020      5   23   
11 2020-05-24  Punjab      NR  31.519974  75.980003  2020      5   24   
12 2020-05-25  Punjab      NR  31.519974  75.980003

<lightgbm.basic.Booster at 0x1ef7dc0f910>