In [46]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# for visualization
plt.rcParams['figure.figsize'] = (10,5)
plt.rcParams['figure.dpi'] = 300 
%matplotlib inline

# for clear response without warnings
import warnings 
warnings.filterwarnings('ignore')

In [47]:
# importing dataset

df = pd.read_csv('solarpowergeneration.csv')
df

Unnamed: 0,distance-to-solar-noon,temperature,wind-direction,wind-speed,sky-cover,visibility,humidity,average-wind-speed-(period),average-pressure-(period),power-generated
0,0.859897,69,28,7.5,0,10.0,75,8.0,29.82,0
1,0.628535,69,28,7.5,0,10.0,77,5.0,29.85,0
2,0.397172,69,28,7.5,0,10.0,70,0.0,29.89,5418
3,0.165810,69,28,7.5,0,10.0,33,0.0,29.91,25477
4,0.065553,69,28,7.5,0,10.0,21,3.0,29.89,30069
...,...,...,...,...,...,...,...,...,...,...
2915,0.166453,63,27,13.9,4,10.0,75,10.0,29.93,6995
2916,0.064020,63,27,13.9,1,10.0,66,15.0,29.91,29490
2917,0.294494,63,27,13.9,2,10.0,68,21.0,29.88,17257
2918,0.524968,63,27,13.9,2,10.0,81,17.0,29.87,677


In [48]:
df.columns = df.columns.str.replace('-', '_')

In [49]:
df.rename(columns={'average_wind_speed_(period)':'avg_wind_speed_period','average_pressure_(period)':'avg_pressure_period'},inplace=True)   # As paranthesis also errs
df.head()

Unnamed: 0,distance_to_solar_noon,temperature,wind_direction,wind_speed,sky_cover,visibility,humidity,avg_wind_speed_period,avg_pressure_period,power_generated
0,0.859897,69,28,7.5,0,10.0,75,8.0,29.82,0
1,0.628535,69,28,7.5,0,10.0,77,5.0,29.85,0
2,0.397172,69,28,7.5,0,10.0,70,0.0,29.89,5418
3,0.16581,69,28,7.5,0,10.0,33,0.0,29.91,25477
4,0.065553,69,28,7.5,0,10.0,21,3.0,29.89,30069


In [50]:
# Do have outliers

df.avg_wind_speed_period.fillna(df.avg_wind_speed_period.median(),inplace=True)

In [51]:
df.isna().sum()

distance_to_solar_noon    0
temperature               0
wind_direction            0
wind_speed                0
sky_cover                 0
visibility                0
humidity                  0
avg_wind_speed_period     0
avg_pressure_period       0
power_generated           0
dtype: int64

In [52]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from xgboost import XGBRegressor

In [53]:
def initial_model(X_train,y_train,model):
    model.fit(X_train,y_train)
    print("Training Accuracy : ",model.score(X_train,y_train))
    print("Testing Accuracy : ",model.score(X_test,y_test))

def tune_model(param_grid, model,X_train,y_train):
    grid = GridSearchCV(model,param_grid,cv=5,scoring='r2',n_jobs=-1)
    grid.fit(X_train,y_train)
    print(grid.best_params_)
    return grid

def rebuild_tuned_model(grid,X_train,y_train):
    model = grid.best_estimator_
    model.fit(X_train,y_train)
    print("Training Accuracy : ",model.score(X_train,y_train))
    print("Testing Accuracy : ",model.score(X_test,y_test))

def feature_importance(fr_model,X,y):
    fr_model.fit(X,y)
    arr = fr_model.feature_importances_
    f_df = pd.DataFrame({'column_name':X.columns,'feature_importance':arr})
    f_df = f_df.sort_values('feature_importance',ascending=False)
    f_df['cum_sum']=f_df.feature_importance.cumsum()
    return f_df

def get_top_n_features(n,f_df,X):
    top_n = f_df[:n]
    x = X.loc[:,top_n.column_name]
    return x

def splitting_dataset(df):
    X = df.iloc[:,:-1]
    y = df.iloc[:,-1]
    return (X,y)
    

In [54]:
X,y = splitting_dataset(df)
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42)

In [55]:
xgb = XGBRegressor(random_state=42)

f_df = feature_importance(xgb,X,y)
f_df

Unnamed: 0,column_name,feature_importance,cum_sum
0,distance_to_solar_noon,0.506058,0.506058
4,sky_cover,0.285973,0.792031
2,wind_direction,0.069019,0.86105
6,humidity,0.03976,0.90081
3,wind_speed,0.025409,0.926219
7,avg_wind_speed_period,0.023918,0.950137
1,temperature,0.020178,0.970315
8,avg_pressure_period,0.019972,0.990287
5,visibility,0.009713,1.0


In [56]:
x = get_top_n_features(7,f_df,X)
x.head()

Unnamed: 0,distance_to_solar_noon,sky_cover,wind_direction,humidity,wind_speed,avg_wind_speed_period,temperature
0,0.859897,0,28,75,7.5,8.0,69
1,0.628535,0,28,77,7.5,5.0,69
2,0.397172,0,28,70,7.5,0.0,69
3,0.16581,0,28,33,7.5,0.0,69
4,0.065553,0,28,21,7.5,3.0,69


In [57]:
X_train,X_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=42)

xgb = XGBRegressor(random_state=42)
initial_model(X_train,y_train,xgb)

param_grid = {
    'n_estimators': [100, 300, 500],   # Number of boosting rounds
    'learning_rate': [0.01, 0.1, 0.2], # Step size shrinkage
    'max_depth': [3, 5, 7],            # Maximum depth of trees
    'subsample': [0.7, 1.0],           # Fraction of samples per tree
}

grid = tune_model(param_grid, xgb, X_train,y_train)

rebuild_tuned_model(grid,X_train,y_train)

Training Accuracy :  0.9984981000802962
Testing Accuracy :  0.8955760045043292
{'learning_rate': 0.01, 'max_depth': 5, 'n_estimators': 500, 'subsample': 0.7}
Training Accuracy :  0.9608851932673675
Testing Accuracy :  0.9004676266331068


In [58]:
xgb = grid.best_estimator_

In [59]:
import pickle

In [60]:
pickle.dump(xgb,open('xgb.pkl','wb'))