In [125]:
# Mount Google Drive into the Colab filesystem at /content/drive so the
# project files stored there can be read and written.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [126]:
import os
# Make the project folder the working directory so that relative paths used
# later in the notebook (e.g. 'data/*.csv') resolve inside it.
os.chdir('/content/drive/MyDrive/SuperAI/SolarEnergy')

In [127]:
import pandas as pd
import glob
import numpy as np
from xgboost import XGBRegressor
from sklearn.metrics import root_mean_squared_error
from sklearn.model_selection import train_test_split
import glob

In [128]:
# Collect every CSV file in data/. glob makes no promise about ordering, so
# the paths are sorted to keep the concatenated row order (and therefore the
# later seeded train/validation split) reproducible across runs and filesystems.
file_paths = sorted(glob.glob('data/*.csv'))
if not file_paths:
    # Fail with a clear message instead of pd.concat's generic
    # "No objects to concatenate" error.
    raise FileNotFoundError("No CSV files matched 'data/*.csv'")

# Read each CSV and stack them into a single frame with a fresh 0..n-1 index.
df_list = [pd.read_csv(file) for file in file_paths]
df = pd.concat(df_list, ignore_index=True)

df.head(20)


Unnamed: 0,Time,System Production (W)
0,01/01/2022 00:00,0.0
1,01/01/2022 00:15,0.0
2,01/01/2022 00:30,0.0
3,01/01/2022 00:45,0.0
4,01/01/2022 01:00,0.0
5,01/01/2022 01:15,0.0
6,01/01/2022 01:30,0.0
7,01/01/2022 01:45,0.0
8,01/01/2022 02:00,0.0
9,01/01/2022 02:15,0.0


In [129]:
# Total number of rows after concatenating all loaded CSV files.
len(df)

96288

In [130]:
# Parse "Time" into datetimes. Missing or unparseable entries become NaT and
# the corresponding rows are discarded.
# NOTE(review): sample values such as "01/01/2022 00:00" do not reveal whether
# the source is day-first or month-first; confirm and consider passing an
# explicit format= to pd.to_datetime.
parsed_time = pd.to_datetime(df["Time"], errors='coerce')
df = df.assign(Time=parsed_time).loc[parsed_time.notna()]
len(df)

96288

In [131]:
# Keep only the rows that actually carry a production reading.
has_production = df["System Production (W)"].notna()
df = df.loc[has_production]
len(df)

96288

In [132]:
# ========== 2. Feature Engineering ==========

def create_time_features(df):
    """Derive calendar features from the datetime column ``Time``.

    Added columns: ``hour``, ``minute``, ``dayofyear``, ``is_weekend`` (1 for
    Saturday/Sunday, else 0), ``sin_hour``/``cos_hour`` (24-hour cyclical
    encoding of the hour), and one-hot indicators ``dow_1``..``dow_6`` and
    ``mon_2``..``mon_12``. Monday (``dow_0``) and January (``mon_1``) are the
    dropped baseline levels.

    The indicator columns are always the same set, regardless of which
    weekdays or months occur in ``df``, so frames built from different data
    (e.g. training data and a small prediction set) share identical columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a datetime-typed ``Time`` column. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the original columns plus the features above.
    """
    timestamps = df["Time"]
    hour = timestamps.dt.hour
    dayofweek = timestamps.dt.dayofweek

    # assign() builds a new frame, so the caller's DataFrame is left untouched.
    enriched = df.assign(
        hour=hour,
        minute=timestamps.dt.minute,
        # Fixed category levels make get_dummies emit every indicator column
        # (and drop the same baseline) even when a level is absent from df.
        dayofweek=pd.Categorical(dayofweek, categories=list(range(7))),
        month=pd.Categorical(timestamps.dt.month, categories=list(range(1, 13))),
        dayofyear=timestamps.dt.dayofyear,
        is_weekend=dayofweek.isin([5, 6]).astype(int),
        sin_hour=np.sin(2 * np.pi * hour / 24),
        cos_hour=np.cos(2 * np.pi * hour / 24),
    )

    # One-hot encode dayofweek and month; the first level of each is dropped.
    return pd.get_dummies(
        enriched,
        columns=["dayofweek", "month"],
        prefix=["dow", "mon"],
        drop_first=True,
    )


# Replace the training frame with its feature-enriched version.
df = create_time_features(df)

In [135]:
# Inspect the resulting column names; the feature list used for modelling is taken from these.
df.columns

Index(['Time', 'System Production (W)', 'hour', 'minute', 'dayofyear',
       'is_weekend', 'sin_hour', 'cos_hour', 'dow_1', 'dow_2', 'dow_3',
       'dow_4', 'dow_5', 'dow_6', 'mon_2', 'mon_3', 'mon_4', 'mon_5', 'mon_6',
       'mon_7', 'mon_8', 'mon_9', 'mon_10', 'mon_11', 'mon_12'],
      dtype='object')

In [136]:
# ========== 3. Train/Validation Split ==========
# Model inputs: the raw calendar fields, the cyclical hour encoding, and the
# one-hot day-of-week / month indicators.
base_features = ['hour', 'minute', 'dayofyear', 'is_weekend', 'sin_hour', 'cos_hour']
dow_features = [f'dow_{d}' for d in range(1, 7)]
mon_features = [f'mon_{m}' for m in range(2, 13)]
features = base_features + dow_features + mon_features

y = df["System Production (W)"]
X = df[features]

# Hold out a shuffled 20% of the rows for validation; the seed is fixed so the
# split is repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    shuffle=True,
    random_state=42,
)

In [137]:
from sklearn.ensemble import RandomForestRegressor

In [138]:
# ========== 4. Train Random Forest Model ==========
# 100 trees, each limited to depth 10; seeded for repeatable results.
model = RandomForestRegressor(
    n_estimators=100,
    max_depth=10,
    random_state=42,
).fit(X_train, y_train)

# ========== 5. Evaluate on Validation ==========
# Score the held-out rows and report the root-mean-squared error.
val_preds = model.predict(X_val)
rmse = root_mean_squared_error(y_true=y_val, y_pred=val_preds)
print(f"Validation RMSE: {rmse:.2f} W")

Validation RMSE: 8912.46 W


In [139]:
# ========== 6. Load the submission template and predict ==========
# The template lists the timestamps to predict. Read the file once:
# `submission` is kept as the untouched output skeleton, while `test_df` is an
# independent copy that later cells are free to modify.
submission = pd.read_csv("/content/drive/MyDrive/SuperAI/SolarEnergy/Sample-submission.csv")
test_df = submission.copy()

In [140]:
# Convert the template's timestamps to datetimes (unparseable values become
# NaT), then display the frame.
test_df = test_df.assign(Time=pd.to_datetime(test_df["Time"], errors='coerce'))
test_df

Unnamed: 0,Time,System Production (W)
0,2023-09-16 09:45:00,62059.3281
1,2023-12-28 12:15:00,58862.0000
2,2024-03-06 09:00:00,48442.0000
3,2023-07-01 11:15:00,
4,2023-07-03 08:30:00,
...,...,...
3341,2024-06-13 16:30:00,
3342,2023-12-26 06:30:00,
3343,2023-10-15 15:45:00,
3344,2023-12-02 12:15:00,


In [141]:
# Build the calendar features for the prediction timestamps, select the
# model's input columns, and overwrite the target column with the predictions.
test_df = create_time_features(test_df)
X_test = test_df.loc[:, features]

test_df = test_df.assign(**{"System Production (W)": model.predict(X_test)})

In [142]:
# Display the prediction frame, now holding the engineered features and the predicted production.
test_df

Unnamed: 0,Time,System Production (W),hour,minute,dayofyear,is_weekend,sin_hour,cos_hour,dow_1,dow_2,...,mon_3,mon_4,mon_5,mon_6,mon_7,mon_8,mon_9,mon_10,mon_11,mon_12
0,2023-09-16 09:45:00,37645.456496,9,45,259,1,7.071068e-01,-7.071068e-01,False,False,...,False,False,False,False,False,False,True,False,False,False
1,2023-12-28 12:15:00,54236.728473,12,15,362,0,1.224647e-16,-1.000000e+00,False,False,...,False,False,False,False,False,False,False,False,False,True
2,2024-03-06 09:00:00,36084.156302,9,0,66,0,7.071068e-01,-7.071068e-01,False,True,...,True,False,False,False,False,False,False,False,False,False
3,2023-07-01 11:15:00,59636.501521,11,15,182,1,2.588190e-01,-9.659258e-01,False,False,...,False,False,False,False,True,False,False,False,False,False
4,2023-07-03 08:30:00,32343.023092,8,30,184,0,8.660254e-01,-5.000000e-01,False,False,...,False,False,False,False,True,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3341,2024-06-13 16:30:00,17822.649901,16,30,165,0,-8.660254e-01,-5.000000e-01,False,False,...,False,False,False,True,False,False,False,False,False,False
3342,2023-12-26 06:30:00,166.837481,6,30,360,0,1.000000e+00,6.123234e-17,True,False,...,False,False,False,False,False,False,False,False,False,True
3343,2023-10-15 15:45:00,11302.674772,15,45,288,1,-7.071068e-01,-7.071068e-01,False,False,...,False,False,False,False,False,False,False,True,False,False
3344,2023-12-02 12:15:00,47232.938147,12,15,336,1,1.224647e-16,-1.000000e+00,False,False,...,False,False,False,False,False,False,False,False,False,True


In [143]:
# ========== 7. Save Predictions ==========
# Copy the predicted values into the template (matched on row index) and write
# it out without the index column.
target_col = "System Production (W)"
submission[target_col] = test_df[target_col]
submission.to_csv("submission_completed.csv", index=False)