<h4>IMPORTING NECESSARY LIBRARIES</h4>

In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV
import matplotlib.pyplot as plt

In [7]:
# Load the raw daily weather export. The path is relative, so the notebook
# must be run from the directory that contains the `Data/` folder.
csv_path = 'Data/assam_data.csv'
data = pd.read_csv(csv_path)

In [8]:
# Preview the first five rows of the freshly loaded frame.
data.head()

Unnamed: 0,name,datetime,tempmax,tempmin,temp,feelslikemax,feelslikemin,feelslike,dew,humidity,...,solarenergy,uvindex,severerisk,sunrise,sunset,moonphase,conditions,description,icon,stations
0,assam,2018-12-01,79.8,58.4,68.3,79.8,58.4,68.3,62.5,82.9,...,14.7,7,,2018-12-01T05:52:48,2018-12-01T16:30:27,0.79,Partially cloudy,Partly cloudy throughout the day.,partly-cloudy-day,"42410099999,VEGT,42414099999,42408099999,42516..."
1,assam,2018-12-02,79.7,58.3,68.6,79.7,58.3,68.6,61.8,80.6,...,14.5,7,,2018-12-02T05:53:32,2018-12-02T16:30:29,0.82,Partially cloudy,Becoming cloudy in the afternoon.,partly-cloudy-day,"42410099999,VEGT,42414099999,42408099999,42516..."
2,assam,2018-12-03,80.6,59.5,69.3,81.8,59.5,69.3,62.7,81.0,...,11.5,6,,2018-12-03T05:54:15,2018-12-03T16:30:32,0.86,Partially cloudy,Clearing in the afternoon.,partly-cloudy-day,"42410099999,VEGT,42414099999,42408099999,42516..."
3,assam,2018-12-04,80.6,58.5,68.4,81.8,58.5,68.4,60.9,79.4,...,14.2,6,,2018-12-04T05:54:59,2018-12-04T16:30:37,0.89,Partially cloudy,Partly cloudy throughout the day.,partly-cloudy-day,"42410099999,VEGT,42414099999,42408099999,42516..."
4,assam,2018-12-05,79.8,55.9,67.8,79.8,55.9,67.8,60.7,79.7,...,13.5,6,,2018-12-05T05:55:41,2018-12-05T16:30:44,0.93,Partially cloudy,Becoming cloudy in the afternoon.,partly-cloudy-day,"42410099999,VEGT,42408099999,remote,4251609999..."


<h4>FEATURE ENGINEERING</h4>

In [9]:
# Parse the 'datetime' strings into real timestamps, then derive the calendar
# features from a single `.dt` accessor in one `assign` call.
data['datetime'] = pd.to_datetime(data['datetime'])
date_parts = data['datetime'].dt
data = data.assign(
    year=date_parts.year,
    month=date_parts.month,
    day=date_parts.day,
    day_of_year=date_parts.dayofyear,
)

In [10]:
# Extract time components from sunrise and sunset.
# Each timestamp column is parsed exactly once and reused for every derived
# feature, instead of re-parsing the same strings for each assignment.
sunrise_ts = pd.to_datetime(data['sunrise'])
sunset_ts = pd.to_datetime(data['sunset'])

data['sunrise_hour'] = sunrise_ts.dt.hour
data['sunrise_minute'] = sunrise_ts.dt.minute
data['sunset_hour'] = sunset_ts.dt.hour
data['sunset_minute'] = sunset_ts.dt.minute

# Length of the day in fractional hours (sunset minus sunrise).
data['daylight_hours'] = (sunset_ts - sunrise_ts).dt.total_seconds() / 3600

In [11]:
# Check that the new date and sunrise/sunset columns were appended.
data.head()

Unnamed: 0,name,datetime,tempmax,tempmin,temp,feelslikemax,feelslikemin,feelslike,dew,humidity,...,stations,year,month,day,day_of_year,sunrise_hour,sunrise_minute,sunset_hour,sunset_minute,daylight_hours
0,assam,2018-12-01,79.8,58.4,68.3,79.8,58.4,68.3,62.5,82.9,...,"42410099999,VEGT,42414099999,42408099999,42516...",2018,12,1,335,5,52,16,30,10.6275
1,assam,2018-12-02,79.7,58.3,68.6,79.7,58.3,68.6,61.8,80.6,...,"42410099999,VEGT,42414099999,42408099999,42516...",2018,12,2,336,5,53,16,30,10.615833
2,assam,2018-12-03,80.6,59.5,69.3,81.8,59.5,69.3,62.7,81.0,...,"42410099999,VEGT,42414099999,42408099999,42516...",2018,12,3,337,5,54,16,30,10.604722
3,assam,2018-12-04,80.6,58.5,68.4,81.8,58.5,68.4,60.9,79.4,...,"42410099999,VEGT,42414099999,42408099999,42516...",2018,12,4,338,5,54,16,30,10.593889
4,assam,2018-12-05,79.8,55.9,67.8,79.8,55.9,67.8,60.7,79.7,...,"42410099999,VEGT,42408099999,remote,4251609999...",2018,12,5,339,5,55,16,30,10.584167


In [12]:
# One-hot encode the categorical 'conditions' column: every distinct value
# becomes its own 'cond_<value>' indicator column appended to the frame.
conditions_dummies = data['conditions'].pipe(pd.get_dummies, prefix='cond')
data = pd.concat((data, conditions_dummies), axis='columns')

In [14]:
# Columns removed before modelling, grouped by origin:
#   - raw date/time columns whose components were extracted in earlier cells,
#   - 'conditions', which the one-hot 'cond_*' columns now replace,
#   - the remaining text / metadata columns.
drop_cols = (
    ['name', 'datetime', 'sunrise', 'sunset']
    + ['conditions', 'description', 'icon', 'stations']
    + ['preciptype', 'severerisk']
)
data = data.drop(drop_cols, axis=1)

In [16]:
# (rows, columns) of the frame after the column drop.
data.shape

(2255, 37)

In [17]:
# Preview the frame after the column drop, including the 'cond_*' indicators.
data.head()

Unnamed: 0,tempmax,tempmin,temp,feelslikemax,feelslikemin,feelslike,dew,humidity,precip,precipprob,...,sunrise_hour,sunrise_minute,sunset_hour,sunset_minute,daylight_hours,cond_Clear,cond_Overcast,cond_Partially cloudy,"cond_Rain, Overcast","cond_Rain, Partially cloudy"
0,79.8,58.4,68.3,79.8,58.4,68.3,62.5,82.9,0.0,0,...,5,52,16,30,10.6275,False,False,True,False,False
1,79.7,58.3,68.6,79.7,58.3,68.6,61.8,80.6,0.0,0,...,5,53,16,30,10.615833,False,False,True,False,False
2,80.6,59.5,69.3,81.8,59.5,69.3,62.7,81.0,0.0,0,...,5,54,16,30,10.604722,False,False,True,False,False
3,80.6,58.5,68.4,81.8,58.5,68.4,60.9,79.4,0.0,0,...,5,54,16,30,10.593889,False,False,True,False,False
4,79.8,55.9,67.8,79.8,55.9,67.8,60.7,79.7,0.0,0,...,5,55,16,30,10.584167,False,False,True,False,False


In [18]:
# Impute missing values: every NaN is replaced with the mean of its column.
# NOTE(review): the means are computed over the full frame, i.e. before the
# train/test split, so test rows contribute to the imputed values. Consider
# computing them on the training rows only.
column_means = data.mean()
data = data.fillna(value=column_means)

In [19]:
# Define features and target.
# 'Unnamed: 0' is the row counter that came in with the CSV (first column of
# data.head()); it is not a weather measurement, so it is excluded from the
# feature matrix. errors='ignore' keeps this cell working if the CSV is later
# re-exported without that column.
X = (
    data
    .drop(columns=['solarenergy'])
    .drop(columns=['Unnamed: 0'], errors='ignore')
)
y = data['solarenergy']

# NOTE(review): the recorded test R-squared is 1.00, which usually points to a
# feature that is a near-copy of the target. If the frame contains a solar
# radiation column (hidden behind the '...' in the previews), confirm whether
# it is legitimately available at prediction time before keeping it in X.

<h4>TRAIN TEST SPLIT </h4>

In [20]:
# Hold out 20% of the rows as a test set (fixed seed for reproducibility).
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# Standardise the features. The scaler learns its statistics from the
# training rows only and is then applied unchanged to both partitions.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

<h4>GRID SEARCH CV</h4>

In [21]:
# Search space: 3 values for each of 6 hyper-parameters -> 3**6 = 729
# candidates. With 10-fold CV that is 7290 model fits, so this cell is slow.
param_grid = dict(
    n_estimators=[100, 200, 300],
    max_depth=[3, 5, 7],
    learning_rate=[0.01, 0.1, 0.2],
    subsample=[0.8, 0.9, 1.0],
    colsample_bytree=[0.8, 0.9, 1.0],
    gamma=[0, 0.1, 0.2],
)

# Base estimator whose hyper-parameters are tuned.
xgb = XGBRegressor(random_state=42)

# Exhaustive search scored by negative MSE, using all available cores.
search_options = dict(
    cv=10,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
    verbose=1,
)
grid_search = GridSearchCV(xgb, param_grid, **search_options)

# Run the search on the scaled training data.
grid_search.fit(X_train_scaled, y_train)

# Report the winning combination.
best_params = grid_search.best_params_
print("Best parameters found: ", best_params)

Fitting 10 folds for each of 729 candidates, totalling 7290 fits
Best parameters found:  {'colsample_bytree': 1.0, 'gamma': 0.1, 'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 300, 'subsample': 0.9}


<h4>TRAINING THE MODEL</h4>

In [22]:
# Fit a fresh regressor configured with the hyper-parameters chosen by the grid search.
best_xgb = XGBRegressor(random_state=42, **best_params)
best_xgb.fit(X_train_scaled, y_train)

# Predict on the held-out test set.
y_pred = best_xgb.predict(X_test_scaled)

# Score the predictions with each regression metric in turn.
mse, mae, r2 = (
    metric(y_test, y_pred)
    for metric in (mean_squared_error, mean_absolute_error, r2_score)
)

# Print each metric rounded to two decimals.
for label, value in (
    ("Mean Squared Error", mse),
    ("Mean Absolute Error", mae),
    ("R-squared", r2),
):
    print(f"{label}: {value:.2f}")

Mean Squared Error: 0.02
Mean Absolute Error: 0.10
R-squared: 1.00


In [23]:
import joblib

# Save the model together with the fitted scaler.
# The regressor was trained on StandardScaler-transformed features, so raw
# inputs must go through the same scaler before calling predict(). Without
# the scaler the saved model cannot be used outside this kernel session.
# The scaler goes to its own file so 'best_xgb_model.pkl' still loads as a
# bare estimator.
model_files = joblib.dump(best_xgb, 'best_xgb_model.pkl')
scaler_files = joblib.dump(scaler, 'best_xgb_scaler.pkl')

# joblib.dump returns the list of files it wrote; show both as the cell output.
model_files + scaler_files


['best_xgb_model.pkl']

In [24]:
# Round-trip check: read the persisted model back from disk.
loaded_model = joblib.load('best_xgb_model.pkl')

# Predict on the already-scaled test features and show the first five
# values for comparison with the true targets.
new_predictions = loaded_model.predict(X_test_scaled)
first_five = new_predictions[:5]
print(first_five)


[22.959906   7.9585857 10.164321  12.6792755 16.68639  ]


In [25]:
# True target values for the same five test rows predicted above.
print(y_test.head())

2064    23.2
1263     7.9
2128    10.0
599     12.7
464     16.6
Name: solarenergy, dtype: float64
