In [27]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Visualization defaults: every figure uses a (10, 5) figsize at 300 dpi
# and is rendered inline in the notebook output.
plt.rcParams['figure.figsize'] = (10,5)
plt.rcParams['figure.dpi'] = 300 
%matplotlib inline

# Suppress all Python warnings so cell outputs stay clean.
# NOTE(review): this blanket filter also hides deprecation/future warnings
# raised by the libraries used below -- consider narrowing it.
import warnings 
warnings.filterwarnings('ignore')

In [28]:
# Load the solar power generation dataset from a CSV file located in the
# notebook's working directory, then display it as the cell output.

df = pd.read_csv('solarpowergeneration.csv')
df

Unnamed: 0,distance-to-solar-noon,temperature,wind-direction,wind-speed,sky-cover,visibility,humidity,average-wind-speed-(period),average-pressure-(period),power-generated
0,0.859897,69,28,7.5,0,10.0,75,8.0,29.82,0
1,0.628535,69,28,7.5,0,10.0,77,5.0,29.85,0
2,0.397172,69,28,7.5,0,10.0,70,0.0,29.89,5418
3,0.165810,69,28,7.5,0,10.0,33,0.0,29.91,25477
4,0.065553,69,28,7.5,0,10.0,21,3.0,29.89,30069
...,...,...,...,...,...,...,...,...,...,...
2915,0.166453,63,27,13.9,4,10.0,75,10.0,29.93,6995
2916,0.064020,63,27,13.9,1,10.0,66,15.0,29.91,29490
2917,0.294494,63,27,13.9,2,10.0,68,21.0,29.88,17257
2918,0.524968,63,27,13.9,2,10.0,81,17.0,29.87,677


In [29]:
# Swap hyphens for underscores so every column label is underscore-separated.
df.columns = [label.replace('-', '_') for label in df.columns]

In [30]:
# Parentheses are not valid in Python identifiers either, so the two
# "(period)" columns get plain names that work with attribute-style access
# (e.g. df.avg_wind_speed_period). The result is assigned back instead of
# using inplace=True, which keeps the cell re-runnable and chain-friendly.
df = df.rename(
    columns={
        'average_wind_speed_(period)': 'avg_wind_speed_period',
        'average_pressure_(period)': 'avg_pressure_period',
    }
)
df.head()

Unnamed: 0,distance_to_solar_noon,temperature,wind_direction,wind_speed,sky_cover,visibility,humidity,avg_wind_speed_period,avg_pressure_period,power_generated
0,0.859897,69,28,7.5,0,10.0,75,8.0,29.82,0
1,0.628535,69,28,7.5,0,10.0,77,5.0,29.85,0
2,0.397172,69,28,7.5,0,10.0,70,0.0,29.89,5418
3,0.16581,69,28,7.5,0,10.0,33,0.0,29.91,25477
4,0.065553,69,28,7.5,0,10.0,21,3.0,29.89,30069


In [31]:
# Fill missing average wind speed readings with the column median; the
# median is preferred over the mean because this column has outliers.
#
# The filled Series is assigned back to the DataFrame explicitly. Calling
# fillna(inplace=True) on an attribute-accessed column operates on an
# intermediate Series and is not guaranteed to write through to `df`
# (it silently does nothing under pandas Copy-on-Write).
df['avg_wind_speed_period'] = df['avg_wind_speed_period'].fillna(
    df['avg_wind_speed_period'].median()
)

In [32]:
# Count the missing values in each column to confirm none remain after the fill.
df.isna().sum()

distance_to_solar_noon    0
temperature               0
wind_direction            0
wind_speed                0
sky_cover                 0
visibility                0
humidity                  0
avg_wind_speed_period     0
avg_pressure_period       0
power_generated           0
dtype: int64

In [33]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingRegressor

In [34]:
def initial_model(X_train, y_train, model, X_test=None, y_test=None):
    """Fit ``model`` on the training data and print its train/test scores.

    The printed "Accuracy" values are whatever ``model.score`` returns; for
    scikit-learn regressors such as the GradientBoostingRegressor used in this
    notebook that is the R^2 coefficient, not a classification accuracy.

    Parameters
    ----------
    X_train, y_train :
        Training features and target passed to ``model.fit``.
    model :
        Estimator exposing ``fit(X, y)`` and ``score(X, y)``. It is fitted
        in place.
    X_test, y_test : optional
        Held-out features and target used for the test score. When both are
        omitted, the notebook-level globals ``X_test`` and ``y_test`` are used
        so existing three-argument calls keep working.

    Raises
    ------
    ValueError
        If only one of ``X_test`` / ``y_test`` is supplied.
    NameError
        If the test data is neither passed in nor defined globally.
    """
    if (X_test is None) != (y_test is None):
        raise ValueError("X_test and y_test must be provided together")

    if X_test is None:
        # Backward-compatible fallback to the hidden global state the
        # original implementation relied on; fail before the costly fit.
        scope = globals()
        if 'X_test' not in scope or 'y_test' not in scope:
            raise NameError(
                "X_test / y_test are not defined; pass them explicitly"
            )
        X_test, y_test = scope['X_test'], scope['y_test']

    model.fit(X_train, y_train)
    print("Training Accuracy : ", model.score(X_train, y_train))
    print("Testing Accuracy : ", model.score(X_test, y_test))

def tune_model(param_grid, model,X_train,y_train):
    """Grid-search ``param_grid`` for ``model`` and return the fitted search.

    Runs a 5-fold cross-validated GridSearchCV scored by 'r2' using all
    available cores (n_jobs=-1), prints the best parameter combination and
    returns the fitted GridSearchCV object.
    """
    search = GridSearchCV(
        model,
        param_grid,
        scoring='r2',
        cv=5,
        n_jobs=-1,
    )
    search.fit(X_train, y_train)
    print(search.best_params_)
    return search

def rebuild_tuned_model(grid, X_train, y_train, X_test=None, y_test=None):
    """Refit the best estimator found by ``grid`` and print train/test scores.

    The printed "Accuracy" values are whatever the estimator's ``score``
    method returns; for scikit-learn regressors that is the R^2 coefficient.

    Parameters
    ----------
    grid :
        Fitted search object exposing ``best_estimator_``.
    X_train, y_train :
        Training features and target the best estimator is refitted on.
    X_test, y_test : optional
        Held-out features and target used for the test score. When both are
        omitted, the notebook-level globals ``X_test`` and ``y_test`` are used
        so existing three-argument calls keep working.

    Raises
    ------
    ValueError
        If only one of ``X_test`` / ``y_test`` is supplied.
    NameError
        If the test data is neither passed in nor defined globally.
    """
    if (X_test is None) != (y_test is None):
        raise ValueError("X_test and y_test must be provided together")

    if X_test is None:
        # Backward-compatible fallback to the hidden global state the
        # original implementation relied on; fail before the refit.
        scope = globals()
        if 'X_test' not in scope or 'y_test' not in scope:
            raise NameError(
                "X_test / y_test are not defined; pass them explicitly"
            )
        X_test, y_test = scope['X_test'], scope['y_test']

    model = grid.best_estimator_
    model.fit(X_train, y_train)
    print("Training Accuracy : ", model.score(X_train, y_train))
    print("Testing Accuracy : ", model.score(X_test, y_test))

def splitting_dataset(df):
    """Split ``df`` into features and target by column position.

    Returns a ``(X, y)`` tuple where ``X`` is a DataFrame holding every
    column except the last one and ``y`` is the last column as a Series.
    """
    features, target = df.iloc[:, :-1], df.iloc[:, -1]
    return features, target
    

In [35]:
# Separate the features (every column but the last) from the target (last
# column), then hold out 20% of the rows for testing with a fixed random_state.
X,y = splitting_dataset(df)
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42)

# Baseline: gradient boosting with default hyperparameters.
# initial_model picks up X_test / y_test from the notebook globals defined above.
gbr_final = GradientBoostingRegressor(random_state=42)
initial_model(X_train,y_train,gbr_final)

# Search space: two candidate values for each of five hyperparameters,
# i.e. 2**5 = 32 combinations for the grid search to evaluate.
param_grid = {
    'n_estimators': [100, 300],      # two candidate ensemble sizes
    'learning_rate': [0.05, 0.1],    # small but effective range
    'max_depth': [3, 5],             # avoids excessive tree depth
    'subsample': [0.7, 0.9],         # row subsampling fractions below 1.0
    'max_features': ['sqrt', None]   # tests both feature selection strategies
}

# Run the grid search on the training split; tune_model prints the best
# parameter combination and returns the fitted search object.
grid = tune_model(param_grid, gbr_final, X_train,y_train)

# Refit the best estimator on the training split and print its train/test scores.
rebuild_tuned_model(grid,X_train,y_train)

Training Accuracy :  0.9453990791657144
Testing Accuracy :  0.9010240096349007
{'learning_rate': 0.05, 'max_depth': 5, 'max_features': 'sqrt', 'n_estimators': 300, 'subsample': 0.9}
Training Accuracy :  0.9852949211951333
Testing Accuracy :  0.9103202849070717


In [36]:
# Rebind gbr_final from the untuned baseline to the best estimator found by the grid search.
gbr_final = grid.best_estimator_

In [37]:
import pickle

In [38]:
# Persist the tuned model to 'gbr.pkl'. The context manager closes the file
# handle (flushing the written bytes) even if serialization raises; the
# original left the handle opened inline and never closed it.
# NOTE: only unpickle this file from a trusted source -- pickle.load can
# execute arbitrary code.
with open('gbr.pkl', 'wb') as model_file:
    pickle.dump(gbr_final, model_file)