In [1]:
# Import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the dataset
df = pd.read_csv('../data/wind-data.csv')

* `Date/Time` (for 10 minutes intervals)
* `LV ActivePower (kW)`: The power generated by the turbine for that moment
* `Wind Speed (m/s)`: The wind speed at the hub height of the turbine (the wind speed that turbine use for electricity generation)
* `Theoretical_Power_Curve (KWh)`: The theoretical power values that the turbine generates with that wind speed which is given by the turbine manufacturer
* `Wind Direction (°)`: The wind direction at the hub height of the turbine (wind turbines turn to this direction automaticly)

In [3]:
# show the first five row in dataset
df.head()

Unnamed: 0,Date/Time,LV ActivePower (kW),Wind Speed (m/s),Theoretical_Power_Curve (KWh),Wind Direction (°)
0,01 01 2018 00:00,380.047791,5.311336,416.328908,259.994904
1,01 01 2018 00:10,453.769196,5.672167,519.917511,268.641113
2,01 01 2018 00:20,306.376587,5.216037,390.900016,272.564789
3,01 01 2018 00:30,419.645904,5.659674,516.127569,271.258087
4,01 01 2018 00:40,380.650696,5.577941,491.702972,265.674286


In [4]:
# How many rows and columns are there in dataset
print(f"The number of rows {df.shape[0]} and Columns {df.shape[1]}")

The number of rows 50530 and Columns 5


In [5]:
# Drop columns ('Date/Time', 'Theoretical_Power_Curve (KWh)')

df.drop(columns=df[['Date/Time','Theoretical_Power_Curve (KWh)']],inplace=True)

y=df['LV ActivePower (kW)']

df.drop(columns=['LV ActivePower (kW)'],axis=1,inplace=True)

In [6]:
df['Wind Direction (°)']=(df['Wind Direction (°)']-df['Wind Direction (°)'].mean())/(df['Wind Direction (°)'].std())
df['Wind Speed (m/s)']=(df['Wind Speed (m/s)']-df['Wind Speed (m/s)'].mean())/(df['Wind Speed (m/s)'].std())

In [7]:
df.head()

Unnamed: 0,Wind Speed (m/s),Wind Direction (°)
0,-0.531471,1.45871
1,-0.446111,1.551239
2,-0.554015,1.593229
3,-0.449066,1.579245
4,-0.468402,1.519489


#### Check the missing value

In [8]:
df.isnull().sum() 

Wind Speed (m/s)      0
Wind Direction (°)    0
dtype: int64

#### Spliting the dataset into Training and Testing

In [9]:
y_train = y[:42283]
y_test = y[42283:]
X_train = df.iloc[:42283]
X_test = df.iloc[42283:]

#### Model Selection

In [10]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import accuracy_score, r2_score,mean_squared_error

In [11]:
xgr = XGBRegressor()
rf = RandomForestRegressor()
lr = LinearRegression()
dt = DecisionTreeRegressor()
sm = SVR()

In [12]:
model_xg=xgr.fit(X_train,y_train)

model_rf=rf.fit(X_train,y_train)

model_lr=lr.fit(X_train,y_train)

model_dt=dt.fit(X_train,y_train)

model_sm=sm.fit(X_train,y_train)

In [13]:
y_xg = model_xg.predict(X_test)
y_rf = model_rf.predict(X_test)
y_lr = model_lr.predict(X_test)
y_dt = model_dt.predict(X_test)
y_sm = model_sm.predict(X_test)

In [14]:
score = {
    "R2": [
        r2_score(y_test,y_xg), 
        r2_score(y_test,y_rf), 
        r2_score(y_test,y_lr), 
        r2_score(y_test,y_dt), 
        r2_score(y_test,y_sm)
    ],
    "RMSE": [
        np.sqrt(mean_squared_error(y_test,y_xg)),
        np.sqrt(mean_squared_error(y_test,y_rf)),
        np.sqrt(mean_squared_error(y_test,y_lr)),
        np.sqrt(mean_squared_error(y_test,y_dt)),
        np.sqrt(mean_squared_error(y_test,y_sm))
    ]
}

index = ['XGBRegressor','RandomForestRegressor','LinearRegression','DecisionTreeRegressor','SVR']

r2_and_rmse = pd.DataFrame(score, index=index)

In [15]:
r2_and_rmse

Unnamed: 0,R2,RMSE
XGBRegressor,0.837875,552.970144
RandomForestRegressor,0.818536,585.022054
LinearRegression,0.818436,585.182907
DecisionTreeRegressor,0.720398,726.183953
SVR,0.889138,457.264029


In [16]:
params = {
    "learning_rate": [0.05, 0.01, 0.03, 0.1, 0.15, 0.2],
    "n_estimators": [50, 100, 150, 200, 500, 800, 1000, 1500],
    "max_depth": [3, 4, 5, 6, 8, 10, 12, 15, 20, 25],
    "min_child_weight": [1, 3, 5, 7, 10, 15, 20, 25],
    "gamma": [0.0, 0.1, 0.2, 0.3, 0.4],
    "subsample": [0.1, 0.2, 0.3, 0.4, 0.6, 0.8, 1],
    "reg_lambda": [0.0, 0.1, 0.2, 0.3, 0.4, 0.6, 0.8, 1],
    "reg_alpha": [0.0, 0.1, 0.2, 0.3, 0.4],
    "colsample_bytree": [0.3, 0.4, 0.5, 0.7, 0.9],
    "colsample_bylevel": [0.3, 0.4, 0.5, 0.7, 0.9],
}

In [17]:
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from datetime import datetime

In [18]:
def timer(start_time=None):
    if not start_time:
        start_time = datetime.now()
        return start_time
    elif start_time:
        thour, temp_sec = divmod((datetime.now() - start_time).total_seconds(), 3600)
        tmin, tsec = divmod(temp_sec, 60)
        print('\n Time taken: %i hours %i minutes and %s seconds.' % (thour, tmin, round(tsec, 2)))

In [19]:
random_search = RandomizedSearchCV(xgr,param_distributions=params,n_iter=10,n_jobs=-1,cv=5,verbose=3)
start_time = timer(None) # timing starts from this point for "start_time" variable
random_search.fit(X_train,y_train)
timer(start_time) # timing ends here for "start_time" variable

Fitting 5 folds for each of 10 candidates, totalling 50 fits

 Time taken: 0 hours 0 minutes and 33.63 seconds.


In [20]:
random_search.best_estimator_

In [21]:
xg = XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=0.7,
             colsample_bynode=1, colsample_bytree=0.3, gamma=0.2,
             importance_type='gain', learning_rate=0.03, max_delta_step=0,
             max_depth=8, min_child_weight=25, missing=None, n_estimators=800,
             n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
             reg_alpha=0.2, reg_lambda=0.8, scale_pos_weight=1, seed=None,
             silent=None, subsample=0.1, verbosity=1)

x=xgr.fit(X_train,y_train)

y1=x.predict(X_test)

r2_score(y_test,y1)

0.8378748583617237

In [22]:
r = RandomForestRegressor()
params_rf={
    "n_estimators"     : [50, 100, 150, 200, 500, 800,1000,1500] ,
    "max_depth"        : [ 3, 4, 5, 6, 8, 10, 12, 15,20,25]
 }

In [23]:
random_search=RandomizedSearchCV(rf,param_distributions=params_rf,n_iter=10,n_jobs=-1,cv=5,verbose=3)

In [24]:
start_time = timer(None) # timing starts from this point for "start_time" variable
random_search.fit(X_train,y_train)
timer(start_time) # timing ends here for "start_time" variable

Fitting 5 folds for each of 10 candidates, totalling 50 fits

 Time taken: 0 hours 7 minutes and 41.73 seconds.


In [25]:
random_search.best_estimator_

In [26]:
sv = SVR(gamma='auto',C=100,epsilon=0.4)

x = rf.fit(X_train,y_train)

y1 = x.predict(X_test)

r2_score(y_test,y1)

0.8198452740346691