In [23]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns

# Plot styling: switch matplotlib to the "ggplot" style sheet.
plt.style.use("ggplot")

# Display option: never truncate the column list when showing a DataFrame.
pd.set_option("display.max_columns", None)

In [24]:
# Load the full training set.
# TODO (carried over from the original note "change here tomorrow"): revisit this load step.
df = pd.read_csv("../raw_data/trainset_full.csv", low_memory=False)

Below we do feature engineering:

- Use week, month and hour of day as features and derive meta-features from them:
    - highweek: weeks 13, 14, 5, 7, 37, 29, 30 and 31 are the weeks in which "planned" errors occur
    - bimonthpark: measures the interaction between park and bimonth
    - isnight: checks whether it is night (relevant because of noise reduction)
    - isnoon: checks whether the hour is between 7 and 14, because that is when unplanned maintenance usually happens
    - Error: makes the errors binary
    - speed and direction: combine rotor speed with generator speed, and nacelle direction with wind direction, respectively (the code adds the two values; it does not divide by two)

In [25]:
import datetime as dt

# Feature engineering on the training frame: adds calendar features, a binary
# error flag and two combined sensor features as new columns on `df`.

# Parse the timestamps so the `.dt` accessor can be used below.
df["measured_at"] = pd.to_datetime(df.measured_at)

# ISO calendar week stored as an int64 column.
df["week"] = df.measured_at.dt.isocalendar().week.astype("int64")

# Flags for individual weeks / groups of weeks.
df["isweek14"] = df.week == 14
df["isweek13"] = df.week == 13
df["isweek5737"] = df.week.isin([5, 7, 37])
df["isweek293031"] = df.week.isin([29, 30, 31])
# highweek is true for any of the weeks flagged above.
df["highweek"] = df.week.isin([5, 7, 13, 14, 29, 30, 31, 37])

df["month"] = df.measured_at.dt.month
# Integer division groups the months as {1}, {2, 3}, {4, 5}, ..., {10, 11}, {12}.
df["bimonth"] = df.month // 2
# Row-wise "<bimonth>:<park_id>" key. Calling str() on a whole Series yields the
# Series' printed representation as ONE string, which would then be broadcast
# unchanged to every row; astype(str) converts element by element instead.
df["bimonthpark"] = df.bimonth.astype(str) + ":" + df.park_id.astype(str)

df["hourofday"] = df.measured_at.dt.hour
# Night: 18:00 or later, or 05:59 and earlier.
df["isnight"] = (df.hourofday >= 18) | (df.hourofday <= 5)
# "Noon" window: hours 7 through 14, both ends included.
df["isnoon"] = df.hourofday.between(7, 14)

# Binary target: True for every row whose category is not "NO_ERROR".
df["Error"] = df.error_category != "NO_ERROR"

# Combined sensor features. Note: these are plain sums, not averages.
df["speed"] = df.rotor_speed + df.generator_speed
df["direction"] = df.nacelle_direction + df.wind_direction



Features for XGB are listed below; `bimonthpark` is left out because XGB can learn the interaction itself.

In [26]:
# Columns fed to the XGB model. The order is kept fixed because the same list
# is used to build both the training and the test feature matrices.
xgb_attribs = [
    # identifiers and raw measurements
    "turbine_id", "wind_speed", "power",
    "week",
    "temp_environment", "temp_hydraulic_oil", "temp_gear_bearing",
    "cosphi", "blade_angle_avg", "hydraulic_pressure",
    "park_id",
    # engineered features
    "month", "speed", "direction",
    "isnight", "isnoon", "highweek",
]


Label Encoding

In [27]:


# Ordinal-encode the error categories into integer class labels.
from sklearn.preprocessing import LabelEncoder

labelencoder = LabelEncoder()
# Fit on the training labels and encode them in one step; the fitted encoder
# is kept so predictions can be decoded later.
df["EncodedErrors"] = labelencoder.fit_transform(df["error_category"])

# Training feature matrix and target vector for the XGB model.
X_train_xgb = df[xgb_attribs]
y_train_xgb = df["EncodedErrors"]

In [28]:
# Show how many training rows fall into each encoded error class.
y_train_xgb.value_counts()

17    1272609
0       47724
1       21651
11      10224
14       4893
2        3743
6        1771
9        1494
16       1081
3         787
13        495
12        409
4         141
7         108
15         32
8          22
10          4
5           2
Name: EncodedErrors, dtype: int64

XGBClassifier with multiclass log-loss (gradient-boosted trees); the hyperparameters were tuned separately

In [29]:
from xgboost import XGBClassifier

# TODO (carried over from the original notebook): adjust parameters.
# Hyperparameters gathered in one mapping and unpacked into the constructor.
xgb_params = {
    "objective": "multi:softmax",
    "use_label_encoder": False,
    "n_estimators": 97,
    "learning_rate": 0.1,
    "max_depth": 7,
    "gamma": 0.1,
    "alpha": 0.0,
}

xgb = XGBClassifier(**xgb_params)
# Fit on the full training matrix; as the cell's last expression, the fitted
# estimator is also displayed.
xgb.fit(X_train_xgb, y_train_xgb)




XGBClassifier(alpha=0.0, base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
              gamma=0.1, gpu_id=-1, importance_type=None,
              interaction_constraints='', learning_rate=0.1, max_delta_step=0,
              max_depth=7, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=97, n_jobs=12,
              num_parallel_tree=1, objective='multi:softprob', predictor='auto',
              random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=None,
              subsample=1, tree_method='exact', use_label_encoder=False,
              validate_parameters=1, ...)

In [30]:
import pickle

# Persist the fitted classifier to disk next to the notebook.
with open('xgbclassifier.pickle', 'wb+') as model_file:
    pickle.dump(xgb, model_file)

In [31]:
# Read the pickled classifier back from disk.
# NOTE: unpickling can execute arbitrary code, so only load pickle files you trust.
with open('xgbclassifier.pickle', 'rb') as model_file:
    xgb_pickle = pickle.load(model_file)

In [32]:
# Predictions of the reloaded model on the training feature matrix.
fitted_y = xgb_pickle.predict(X_train_xgb)

Load imputed test data, copy, and create test feature matrix

In [None]:
# Read the test set; `tdf_raw` stays untouched while the working copy `tdf`
# receives the engineered columns.
tdf_raw = pd.read_csv("../raw_data/testset_full.csv", low_memory=False)
tdf = tdf_raw.copy()

# Calendar features derived from the timestamp.
tdf["measured_at"] = pd.to_datetime(tdf["measured_at"])
iso_week = tdf["measured_at"].dt.isocalendar().week
tdf["week"] = np.int64(iso_week)
# True for the same set of weeks the training data flags as "highweek".
tdf["highweek"] = tdf["week"].isin([14, 13, 5, 7, 37, 29, 30, 31])
tdf["month"] = tdf["measured_at"].dt.month
tdf["hourofday"] = tdf["measured_at"].dt.hour
# Night: 18:00 or later, or 05:59 and earlier.
tdf["isnight"] = tdf["hourofday"].ge(18) | tdf["hourofday"].le(5)
# "Noon" window: hours 7 through 14, both ends included.
tdf["isnoon"] = tdf["hourofday"].between(7, 14)

# Combined sensor features (plain sums).
tdf["speed"] = tdf["rotor_speed"] + tdf["generator_speed"]
tdf["direction"] = tdf["nacelle_direction"] + tdf["wind_direction"]

# Test feature matrix with the same columns, in the same order, as training.
X_test_xgb = tdf[xgb_attribs]

Predict and decode

In [38]:
# Predict encoded class labels for the test feature matrix.
encoded_predictions = xgb.predict(X_test_xgb)

# Map the integer codes back to the original error-category labels.
y_hat_xgb = labelencoder.inverse_transform(encoded_predictions)

KeyboardInterrupt: 

Save as csv

In [18]:
# Attach the decoded predictions to the untouched test frame.
tdf_raw["error_category"] = y_hat_xgb

# Write the result to disk.
# NOTE(review): the DataFrame index is written as an extra first column by
# default; confirm that the consumer of Prediction.csv expects it.
tdf_raw.to_csv("Prediction.csv")