In [1]:
import pandas as pd
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [2]:
# Read in data
df = pd.read_csv('../data/df_clean.csv', index_col=0, parse_dates=True)

## Categoricals

In [3]:
# Get Categorical columns
categorical = df.select_dtypes(include='object')

# Instationate LabelEncoder, fit and transform on wind_direction cols
wind_dir_coder = LabelEncoder()
wind_dir_coder.fit(df['wind_madrid'])
for col in categorical.filter(regex='wind').columns:
    df[col] = wind_dir_coder.transform(df[col])
    

# Stack condition columns into single col
stacked_conditions = categorical.filter(regex='condition').stack()

# Instantiate Label encoder, fit and transform on condition cols
condition_coder = LabelEncoder()
condition_coder.fit(stacked_conditions)
for col in categorical.filter(regex='condition').columns:
    df[col] = condition_coder.transform(df[col])

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 61040 entries, 2015-01-01 00:00:00 to 2021-12-30 23:00:00
Data columns (total 64 columns):
 #   Column                                       Non-Null Count  Dtype  
---  ------                                       --------------  -----  
 0   generation biomass                           61040 non-null  float64
 1   generation fossil brown coal/lignite         61040 non-null  float64
 2   generation fossil gas                        61040 non-null  float64
 3   generation fossil hard coal                  61040 non-null  float64
 4   generation fossil oil                        61040 non-null  float64
 5   generation hydro pumped storage consumption  61040 non-null  float64
 6   generation hydro run-of-river and poundage   61040 non-null  float64
 7   generation hydro water reservoir             61040 non-null  float64
 8   generation nuclear                           61040 non-null  float64
 9   generation other                     

## Split Data

In [6]:
X_train, X_test, y_train, y_test = train_test_split(df.drop(columns=['price tomorrow']), df['price tomorrow'], test_size=.3,
                                                    random_state=17)
X_test, X_val, y_test, y_val = train_test_split(X_test, y_test, test_size=.5, random_state=17)

## Model

In [24]:
xgb_0 = XGBRegressor(random_state=17)
xgb_0.fit(X_train, y_train)
train_preds_0 = xgb_0.predict(X_train)
val_preds_0 = xgb_0.predict(X_val)


In [35]:
print('r-squared train:', xgb_0.score(X_train,y_train))
print('r-squared val:', xgb_0.score(X_val,y_val))

r-squared train: 0.9871055598257437
r-squared val: 0.9570112146169001
