In [2]:
import os
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
import seaborn as sns #For heatmap

%matplotlib inline

pd.set_option('display.max_rows', 200)
pd.set_option('display.max_columns', 200)

In [1]:
from tsfresh.feature_selection.relevance import calculate_relevance_table
from tsfresh import extract_relevant_features

In [12]:
train_a = pd.read_parquet('data/A/train_targets.parquet')
train_b = pd.read_parquet('data/B/train_targets.parquet')
train_c = pd.read_parquet('data/C/train_targets.parquet')

# Estimated training data for each location
X_train_estimated_a = pd.read_parquet('data/A/X_train_estimated.parquet')
X_train_estimated_b = pd.read_parquet('data/B/X_train_estimated.parquet')
X_train_estimated_c = pd.read_parquet('data/C/X_train_estimated.parquet')

# Observed training data for each location
X_train_observed_a = pd.read_parquet('data/A/X_train_observed.parquet')
X_train_observed_b = pd.read_parquet('data/B/X_train_observed.parquet')
X_train_observed_c = pd.read_parquet('data/C/X_train_observed.parquet')

# Estimated test data for each location
X_test_estimated_a = pd.read_parquet('data/A/X_test_estimated.parquet')
X_test_estimated_b = pd.read_parquet('data/B/X_test_estimated.parquet')
X_test_estimated_c = pd.read_parquet('data/C/X_test_estimated.parquet')

In [13]:
train_merge = pd.read_parquet('data/merge/train_merge.parquet')
print(train_merge.iloc[1:100])

                  time  pv_measurement location
0  2018-12-31 23:00:00             NaN        C
1  2019-01-01 00:00:00             NaN        C
1  2019-01-01 00:00:00          0.0000        B
2  2019-01-01 01:00:00          0.0000        B
2  2019-01-01 01:00:00             NaN        C
3  2019-01-01 02:00:00             NaN        C
3  2019-01-01 02:00:00          0.0000        B
4  2019-01-01 03:00:00          0.0000        B
4  2019-01-01 03:00:00             NaN        C
5  2019-01-01 04:00:00          0.0000        B
5  2019-01-01 04:00:00             NaN        C
6  2019-01-01 05:00:00             NaN        C
6  2019-01-01 05:00:00          0.0000        B
7  2019-01-01 06:00:00             NaN        C
7  2019-01-01 06:00:00          0.0000        B
8  2019-01-01 07:00:00             NaN        C
8  2019-01-01 07:00:00          0.0000        B
9  2019-01-01 08:00:00             NaN        C
9  2019-01-01 08:00:00          0.0000        B
10 2019-01-01 09:00:00             NaN  

#### Dummy preprocessing

Checking lengths

In [14]:
print("X_train_estimated_a shape:", X_train_estimated_a.shape)
print("X_train_observed_a shape:", X_train_observed_a.shape)
print("Observed + estimated length", X_train_estimated_a.shape[0] + X_train_observed_a.shape[0])
print("Target value a shape:", train_a.shape)


X_train_estimated_a shape: (17576, 47)
X_train_observed_a shape: (118669, 46)
Observed + estimated length 136245
Target value a shape: (34085, 2)


In [15]:
print("time range target values: ", min(train_a.loc[:, 'time']), '-', max(train_a.loc[:, 'time']))

time range target values:  2019-06-02 22:00:00 - 2023-04-30 23:00:00
time range values 


Merging observed data on location A with target values. This way the dataset will contain datapoints for the same time values. Makes it easier to look at how observed values effect target.

In [16]:
X_train_observed_a.rename(columns={'date_forecast': 'time'}, inplace=True)
X_train_observed_a.loc[:10, 'time'] #quick check

0    2019-06-02 22:00:00
1    2019-06-02 22:15:00
2    2019-06-02 22:30:00
3    2019-06-02 22:45:00
4    2019-06-02 23:00:00
5    2019-06-02 23:15:00
6    2019-06-02 23:30:00
7    2019-06-02 23:45:00
8    2019-06-03 00:00:00
9    2019-06-03 00:15:00
10   2019-06-03 00:30:00
Name: time, dtype: datetime64[us]

In [17]:
merge_temp = pd.merge(train_a, X_train_observed_a, on='time')
merge_temp_dropped_NaN = merge_temp.dropna()
merge_temp_replaced_Nan_w_zero = merge_temp.fillna(0)
merge_temp_cols = merge_temp.columns

In [18]:
Y_no_Nan = merge_temp_dropped_NaN['pv_measurement']
X_no_Nan = merge_temp_dropped_NaN.drop(columns=['time', 'pv_measurement'])
Y_Nan_is_zeros = merge_temp_replaced_Nan_w_zero['pv_measurement']
X_Nan_is_zeros = merge_temp_replaced_Nan_w_zero.drop(columns=['time', 'pv_measurement'])

In [19]:
print("Y shape NaN rows removed: ", Y_no_Nan.shape)
print("X shape NaN rows removed:", X_no_Nan.shape)
print("Y shape Nan replaced with zero:", Y_Nan_is_zeros.shape)
print("X shape Nan replaced with zero:", X_Nan_is_zeros.shape)

Y shape NaN rows removed:  (485,)
X shape NaN rows removed: (485, 45)
Y shape Nan replaced with zero: (29667,)
X shape Nan replaced with zero: (29667, 45)


Relevance table for dataset where Nan rows are removed:

In [24]:
calculate_relevance_table(X_no_Nan, Y_no_Nan)

Unnamed: 0_level_0,feature,type,p_value,relevant
feature,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
diffuse_rad:W,diffuse_rad:W,real,3.545998e-98,True
clear_sky_rad:W,clear_sky_rad:W,real,2.431672e-95,True
diffuse_rad_1h:J,diffuse_rad_1h:J,real,7.167294e-86,True
clear_sky_energy_1h:J,clear_sky_energy_1h:J,real,3.017082e-83,True
is_day:idx,is_day:idx,binary,3.355588e-82,True
direct_rad:W,direct_rad:W,real,3.982326e-80,True
is_in_shadow:idx,is_in_shadow:idx,binary,3.272072e-79,True
sun_elevation:d,sun_elevation:d,real,1.689667e-78,True
direct_rad_1h:J,direct_rad_1h:J,real,9.210044999999999e-77,True
visibility:m,visibility:m,real,5.373702e-12,True


Relevance table where Nan is replaced with zero:

In [23]:
calculate_relevance_table(X_Nan_is_zeros, Y_Nan_is_zeros)

Unnamed: 0_level_0,feature,type,p_value,relevant
feature,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
absolute_humidity_2m:gm3,absolute_humidity_2m:gm3,real,0.0,True
air_density_2m:kgm3,air_density_2m:kgm3,real,0.0,True
is_in_shadow:idx,is_in_shadow:idx,binary,0.0,True
clear_sky_energy_1h:J,clear_sky_energy_1h:J,real,0.0,True
clear_sky_rad:W,clear_sky_rad:W,real,0.0,True
is_day:idx,is_day:idx,binary,0.0,True
wind_speed_v_10m:ms,wind_speed_v_10m:ms,real,0.0,True
dew_point_2m:K,dew_point_2m:K,real,0.0,True
diffuse_rad:W,diffuse_rad:W,real,0.0,True
diffuse_rad_1h:J,diffuse_rad_1h:J,real,0.0,True


### Same for merge

In [52]:
X_observed = pd.read_parquet('data/merge/X_train_observed_merge.parquet')
X_estimated = pd.read_parquet('data/merge/X_train_estimated_merge.parquet')
Y = pd.read_parquet('data/merge/train_merge.parquet')

X_observed.rename(columns={'date_forecast': 'time'}, inplace=True)

merged_df = pd.merge(X_observed, Y, on='time')
merged_df_no_NaN = merged_df.dropna()

print(merged_df.shape)

X = merged_df.drop(columns=['time', 'pv_measurement'])
Y = merged_df['pv_measurement']

calculate_relevance_table(X, Y)

(7665, 49)


TypeError: ufunc 'isnan' not supported for the input types, and the inputs could not be safely coerced to any supported types according to the casting rule ''safe''

In [46]:
calculate_relevance_table()

KeyError: 'time'

In [45]:
calculate_relevance_table(X, Y)

TypeError: ufunc 'isnan' not supported for the input types, and the inputs could not be safely coerced to any supported types according to the casting rule ''safe''