In [5]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn as sk
from utils import save_processed_data, make_cyclic_features, plot_temperature_over_time, force_save_data
from filterpy.kalman import KalmanFilter
from scipy.stats import entropy
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
import imblearn
from sklearn.preprocessing import SplineTransformer
from sklearn.kernel_approximation import Nystroem
from sklearn.decomposition import PCA
from sklearn.decomposition import KernelPCA

import os
import smogn

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Minimal Processing

Here we just remove id and measurement_time

In [79]:
def minimal_process_and_save(filepath):
    
    df = pd.read_csv(filepath)
    
    # drop id column
    df = df.drop(columns=['measurement_time'])
    save_processed_data(df, filepath, 'minimal')

In [80]:
# Load the data
minimal_process_and_save('data/raw/train.csv')
minimal_process_and_save('data/raw/test.csv')

Saving the file in data/minimal/train.csv
Saving the file in data/minimal/test.csv


## Time Processing

The first feature enrichment step. Extract Year, Month, Day and Hour

In [135]:
def extract_time(filepath):
    
    df = pd.read_csv(filepath)
    
    # measurement_time is of the form 'YYYY-MM-DD HH:MM:SS'
    # Minutes and seconds are always 00
    # Extract the year, month, day, hour
    
    df['measurement_time'] = pd.to_datetime(df['measurement_time'])
    
    df['year'] = df['measurement_time'].dt.year
    df['month'] = df['measurement_time'].dt.month
    df['day'] = df['measurement_time'].dt.day
    df['hour'] = df['measurement_time'].dt.hour
    
    df = df.drop(columns=['measurement_time'])
    
    # Make them categorical
    df['year'] = df['year'].astype('category')
    df['month'] = df['month'].astype('category')
    df['day'] = df['day'].astype('category')
    df['hour'] = df['hour'].astype('category')
    
    save_processed_data(df, filepath, 'basic_time')

In [136]:
extract_time('data/day_of_week/train.csv')
extract_time('data/day_of_week/test.csv')

Saving the file in data/basic_time/train.csv
Saving the file in data/basic_time/test.csv


## Cyclic Processing

Get sin and cos of desired features

In [96]:
def make_cyclic(filepath):
    # Assume we continue from basic_time
    df = pd.read_pickle(filepath)
    
    # Make cyclic features for month, day, hour, wind_direction
    df = make_cyclic_features(df, 'month', 12)
    df = make_cyclic_features(df, 'day', 31) # 31 days in the longest month
    df = make_cyclic_features(df, 'hour', 24)
    df = make_cyclic_features(df, 'wind_direction', 360)
    
    # Drop year
    df = df.drop(columns=['year'])
    
    save_processed_data(df, filepath, 'cyclic')

In [97]:
make_cyclic('data/no_year/train.pkl')
make_cyclic('data/no_year/test.pkl')

Saving the file in data/cyclic/train.pkl
Saving the file in data/cyclic/test.pkl


## Accounting for distribution change via removal

In [137]:
def time_based_removal(filepath, type='year'):
    
    # Continues from basic_time
    # If type is year, remove all data points from 2023
    # If type is month, only keep data points between 08 and 12
    df = pd.read_pickle(filepath)
    
    # Print number of data points originally
    print(f'Original number of data points: {len(df)}')
    
    # Do not change test data
    if 'test' not in filepath:
        if type == 'year':
            df = df[df['year'] != 2023]
        elif type == 'month':
            # Convert month to int
            df['month'] = df['month'].astype(int)
            df = df[(df['month'] >= 8) & (df['month'] <= 11)]
            # Convert back to category
            df['month'] = df['month'].astype('category')
        elif type == 'simple_year':
            # Drop all 2023 data points, then all time features
            df = df[df['year'] != 2023]
            df = df.drop(columns=['month', 'day'])   
    
    if 'test' in filepath:   
        if type == 'simple_year':
            df = df.drop(columns=['month', 'day'])            
    
    # Print number of data points
    print(f'Number of data points: {len(df)}')
    
    save_processed_data(df, filepath, f'no_{type}')

In [138]:
time_based_removal('data/basic_time/train.pkl', 'simple_year')
time_based_removal('data/basic_time/test.pkl', 'simple_year')

Original number of data points: 7047
Number of data points: 5583
Saving the file in data/no_simple_year/train.pkl
Original number of data points: 1762
Number of data points: 1762
Saving the file in data/no_simple_year/test.pkl


## Just make wind and hour cyclic

In [152]:
def make_wind_direction_cyclic(filepath):
    
    # Continues from no_year
    df = pd.read_pickle(filepath)
    
    # Make wind direction cyclic
    df = make_cyclic_features(df, 'wind_direction', 360)
    
    # Make hour cyclic
    df = make_cyclic_features(df, 'hour', 24)
    
    # Make day of week cyclic, 0 is Monday
    df = make_cyclic_features(df, 'day_of_week', 7)
    
    save_processed_data(df, filepath, 'dow_cyclic')

In [153]:
make_wind_direction_cyclic('data/no_simple_year/train.pkl')
make_wind_direction_cyclic('data/no_simple_year/test.pkl')

Saving the file in data/dow_cyclic/train.pkl
Saving the file in data/dow_cyclic/test.pkl


## Day of the week

In [130]:
def get_day_of_week(filepath):
    # Used on raw data
    
    # if extension is not pkl, load the data
    if filepath[-3:] != 'pkl':
        df = pd.read_csv(filepath)
    else:
        df = pd.read_pickle(filepath)
        
    # Convert measurement_time to datetime
    df['measurement_time'] = pd.to_datetime(df['measurement_time'])
    
    df['day_of_week'] = df['measurement_time'].dt.dayofweek
    df['day_of_week'] = df['day_of_week'].astype('category')
    
    save_processed_data(df, filepath, 'day_of_week')

In [131]:
get_day_of_week('data/raw/train.csv')
get_day_of_week('data/raw/test.csv')

Saving the file in data/day_of_week/train.csv
Saving the file in data/day_of_week/test.csv


In [146]:
def holiday_simplification(filepath):
    # Simplify holidays to 1 and non-holidays to 0
    
    df = pd.read_pickle(filepath)
    
    # Set 5 and 6 to 1, rest to 0
    # Create new field
    df['holiday'] = 0
    df.loc[df['day_of_week'] == 5, 'holiday'] = 1
    df.loc[df['day_of_week'] == 6, 'holiday'] = 1
    
    # Drop day_of_week
    df = df.drop(columns=['day_of_week'])
    
    # Set to category
    df['holiday'] = df['holiday'].astype('category')
    
    # Print ratio of holidays
    print(f'Ratio of holidays: {len(df[df["holiday"] == 1]) / len(df)}')
    
    save_processed_data(df, filepath, 'holiday')

In [147]:
# from no_simple_year
holiday_simplification('data/no_simple_year/train.pkl')
holiday_simplification('data/no_simple_year/test.pkl')

Ratio of holidays: 0.28371843095110155
Saving the file in data/holiday/train.pkl
Ratio of holidays: 0.27298524404086266
Saving the file in data/holiday/test.pkl


# Day 2

Holiday seems to get the best results. We will stick with it

## Dumb simplification

In [2]:
def simplify_data(filepath):
    # Simplify data to only include the most important features
    
    df = pd.read_pickle(filepath)

    # Drop sun_radiation_east, sun_radiation_west, sun_radiation_south, sun_radiation_north and sun_radiation_perpendicular
    df = df.drop(columns=['sun_radiation_east', 'sun_radiation_west', 'sun_radiation_south', 'sun_radiation_north', 'sun_radiation_perpendicular', 'clouds', 'wind_direction', 'wind_speed'])
    
    save_processed_data(df, filepath, 'simple')

In [3]:
simplify_data('data/holiday/train.pkl')
simplify_data('data/holiday/test.pkl')

Saving the file in data/simple/train.pkl
Saving the file in data/simple/test.pkl


## Making work hours explicit

In [11]:
def work_hours(filepath):
    # Make hours cyclic
    
    df = pd.read_pickle(filepath)
    
    # Convert hour to int
    df['hour'] = df['hour'].astype(int)
    
    # Add a variable for work hours
    df['work_hours'] = 0
    df.loc[(df['hour'] >= 9) & (df['hour'] <= 17), 'work_hours'] = 1
    
    df = make_cyclic_features(df, 'hour', 24)
    
    save_processed_data(df, filepath, 'work_hours_cyclic')

In [12]:
work_hours('data/simple/train.pkl')
work_hours('data/simple/test.pkl')

Saving the file in data/work_hours_cyclic/train.pkl
Saving the file in data/work_hours_cyclic/test.pkl


In [4]:
# Readd the day_of_week feature, does not seem to be useful
def add_day_of_week(filepath):
    # Used on raw data
    
    # After last / but replace .csv with .pkl
    file_name = filepath.split('/')[-1].replace('.pkl', '.csv')
    
    # I just realized that this is a constant
    df = pd.read_csv(f'data/raw/{file_name}')

        
    # Convert measurement_time to datetime
    df['measurement_time'] = pd.to_datetime(df['measurement_time'])
    
    df['day_of_week'] = df['measurement_time'].dt.dayofweek
    df['day_of_week'] = df['day_of_week'].astype(float)
    
    # Add the day_of_week feature to new_df
    new_df = pd.read_pickle(filepath)
    new_df['day_of_week'] = df['day_of_week']
    
    # Make it cyclic
    new_df = make_cyclic_features(new_df, 'day_of_week', 7)
    
    
    save_processed_data(new_df, filepath, 'returned_day_of_week')

In [5]:
add_day_of_week('data/work_hours_cyclic/train.pkl')
add_day_of_week('data/work_hours_cyclic/test.pkl')

Saving the file in data/returned_day_of_week/train.pkl
Saving the file in data/returned_day_of_week/test.pkl


## Standardization

In [32]:
def standardize(folder_path, type='standard'):
    
    train_path = f'{folder_path}/train.pkl'
    test_path = f'{folder_path}/test.pkl'
    
    df_train = pd.read_pickle(train_path)
    df_test = pd.read_pickle(test_path)
    
    float_columns = df_train.select_dtypes(include=['float64']).columns
    
    # remove target
    float_columns = float_columns.drop('target')
    
    if type == 'standard':
        scaler = sk.preprocessing.StandardScaler()
    elif type == 'minmax':
        scaler = sk.preprocessing.MinMaxScaler()
        # power transforms
    elif type == 'power':
        scaler = sk.preprocessing.PowerTransformer()
    elif type == 'robust':
        scaler = sk.preprocessing.RobustScaler()
    else:
        raise ValueError('Invalid type')
    
    # Fit the scaler on the training data
    scaler.fit(df_train[float_columns])
    
    scaled_train = scaler.transform(df_train[float_columns])
    scaled_test = scaler.transform(df_test[float_columns])
    
    df_train[float_columns] = scaled_train
    df_test[float_columns] = scaled_test
    
    
    save_processed_data(df_train, train_path, f'{type}_scaler')
    save_processed_data(df_test, test_path, f'{type}_scaler')
    
    return df_train, df_test

In [34]:
scaled_train_df, scaled_test_df = standardize('data/standard', 'standard')

Saving the file in data/standard_scaler/train.pkl
Saving the file in data/standard_scaler/test.pkl


## Making categories from sensors

In [None]:
def add_discrete_sensor(filepath):
    # Too much info lost
    
    # Add a discrete sensor feature
    df = pd.read_pickle(filepath)
    
    # For source_1_temperature, 0 to 24, 24 to 38, 38 to all left
    df['discrete_sensor_1'] = 0
    df.loc[df['source_1_temperature'] > 24, 'discrete_sensor_1'] = 1
    df.loc[df['source_1_temperature'] > 38, 'discrete_sensor_1'] = 2
    
    df['discrete_sensor_1'] = df['discrete_sensor_1'].astype('category')
    
    # For source_2_temperature, 0 to 17, 17 to 20, 20 to all left
    df['discrete_sensor_2'] = 0
    df.loc[df['source_2_temperature'] > 17, 'discrete_sensor_2'] = 1
    df.loc[df['source_2_temperature'] > 20, 'discrete_sensor_2'] = 2
    
    df['discrete_sensor_2'] = df['discrete_sensor_2'].astype('category')
    
    # For source_3_temperature, 0 to 17, 17 to 19, 19 to 21, 21 to all left
    df['discrete_sensor_3'] = 0
    df.loc[df['source_3_temperature'] > 17, 'discrete_sensor_3'] = 1
    df.loc[df['source_3_temperature'] > 19, 'discrete_sensor_3'] = 2
    df.loc[df['source_3_temperature'] > 21, 'discrete_sensor_3'] = 3
    
    df['discrete_sensor_3'] = df['discrete_sensor_3'].astype('category')
    
    # For source_4_temperature, 0 to 19, 19 to 20, 20 to 23, 23 to 28, 28 to all left
    df['discrete_sensor_4'] = 0
    df.loc[df['source_4_temperature'] > 19, 'discrete_sensor_4'] = 1
    df.loc[df['source_4_temperature'] > 20, 'discrete_sensor_4'] = 2
    df.loc[df['source_4_temperature'] > 23, 'discrete_sensor_4'] = 3
    df.loc[df['source_4_temperature'] > 28, 'discrete_sensor_4'] = 4
    
    # Drop the original temperature columns
    df = df.drop(columns=['source_1_temperature', 'source_2_temperature', 'source_3_temperature', 'source_4_temperature'])
    
    save_processed_data(df, filepath, 'discrete_sensor')

In [32]:
add_discrete_sensor('data/work_hours_cyclic/train.pkl')
add_discrete_sensor('data/work_hours_cyclic/test.pkl')

Saving the file in data/discrete_sensor/train.pkl
Saving the file in data/discrete_sensor/test.pkl


## Temp difference

In [43]:
def temp_difference_and_mean(filepath):
    # difference between mean_room_temperature and outside_temperature
    df = pd.read_pickle(filepath)
    
    df['temp_diff'] = df['mean_room_temperature'] - df['outside_temperature']
    
    # Drop the original columns
    df = df.drop(columns=['mean_room_temperature', 'outside_temperature'])
    
    # Create a column for mean of sensor 1 and 4 and then 2 and 3
    df['heating_temp'] = (df['source_1_temperature'] + df['source_4_temperature']) / 2
    df['ventilation_temp'] = (df['source_2_temperature'] + df['source_3_temperature']) / 2
    
    # Drop the original columns
    # df = df.drop(columns=['source_1_temperature', 'source_2_temperature', 'source_3_temperature', 'source_4_temperature'])
    
    
    save_processed_data(df, filepath, 'temp_and_mean')

In [44]:
temp_difference_and_mean('data/work_hours_cyclic/train.pkl')
temp_difference_and_mean('data/work_hours_cyclic/test.pkl')

Saving the file in data/temp_and_mean/train.pkl
Saving the file in data/temp_and_mean/test.pkl


In [53]:
# Readd month, get month from basic_time
def add_month(filepath):
    # Used on raw data
    
    df = pd.read_csv('data/basic_time/train.csv')

    
    # Add the month feature to new_df
    new_df = pd.read_pickle(filepath)
    new_df['month'] = df['month']
    
    # Convert month into Dutch seasons. 1 value for each season, use loops 
    new_df['season'] = 0
    for i in range(1, 13):
        if i in [3, 4, 5]:
            new_df.loc[new_df['month'] == i, 'season'] = 1
        elif i in [6, 7, 8]:
            new_df.loc[new_df['month'] == i, 'season'] = 2
        elif i in [9, 10, 11]:
            new_df.loc[new_df['month'] == i, 'season'] = 3
        else:
            new_df.loc[new_df['month'] == i, 'season'] = 4
            
    new_df['season'] = new_df['season'].astype('category')
    
    # month cyclic
    new_df = make_cyclic_features(new_df, 'month', 12)
    
    save_processed_data(new_df, filepath, 'returned_month')

In [54]:
add_month('data/work_hours_cyclic/train.pkl')
add_month('data/work_hours_cyclic/test.pkl')

Saving the file in data/returned_month/train.pkl
Saving the file in data/returned_month/test.pkl


## Special days
Does not seem to help

In [6]:
def add_special_holidays(filepath):
    
    # Account for special days like Christmas, Easter, etc.
    # Month: Day
    special_days = {
        1: [1], # New Year's Day, 1 January
        3: [29, 31], # Good Friday, 29 March and Easter Monday, 31 March
        4: [1, 27], # Easter Sunday, 1 April and King's Day, 27 April
        5: [5, 9, 19, 20], # Liberation Day, 5 May and Ascension Day, 9 May and Whit Sunday, 19 May and Whit Monday, 20 May
        12: [25, 26], # Christmas and boxing day, 25 and 26 December
    }
    
    # Load basic_time
    original_time = pd.read_csv('data/basic_time/train.csv')
    
    # Set holiday to 1 where it is a special day based on time mentioned in original_time
    current_df = pd.read_pickle(filepath)
    
    # Use holiday variable to store special days, it already exists
    for month, days in special_days.items():
        for day in days:
            current_df.loc[(original_time['month'] == month) & (original_time['day'] == day), 'holiday'] = 1
            
    save_processed_data(current_df, filepath, 'special_holidays')
    
    

In [7]:
add_special_holidays('data/work_hours_cyclic/train.pkl')
add_special_holidays('data/work_hours_cyclic/test.pkl')

Saving the file in data/special_holidays/train.pkl
Saving the file in data/special_holidays/test.pkl


In [None]:
def readd_radiations(filepath):
    
    # After last / but replace .csv with .pkl
    file_name = filepath.split('/')[-1].replace('.pkl', '.csv')
    
    original_df = pd.read_csv(f'data/raw/{file_name}')
    
    # Load raw
    df = pd.read_pickle(filepath)
    
    # Add the radiations back
    df['sun_radiation_east'] = original_df['sun_radiation_east']
    df['sun_radiation_west'] = original_df['sun_radiation_west']
    df['sun_radiation_south'] = original_df['sun_radiation_south']
    df['sun_radiation_north'] = original_df['sun_radiation_north']
    df['sun_radiation_perpendicular'] = original_df['sun_radiation_perpendicular']
    
    save_processed_data(df, filepath, 'readded_radiations')

In [9]:
readd_radiations('data/work_hours_cyclic/train.pkl')
readd_radiations('data/work_hours_cyclic/test.pkl')

Saving the file in data/readded_radiations/train.pkl
Saving the file in data/readded_radiations/test.pkl


In [26]:
def drop_sources(filepath):
    # Drop source columns
    
    df = pd.read_pickle(filepath)
    
    df = df.drop(columns=['source_4_temperature'])
    
    save_processed_data(df, filepath, 'simple_hvac')

drop_sources('data/work_hours_cyclic/train.pkl')
drop_sources('data/work_hours_cyclic/test.pkl')

Saving the file in data/simple_hvac/train.pkl
Saving the file in data/simple_hvac/test.pkl


In [25]:
def rounding(filepath):
    # Round all floats except target to X decimals
    
    df = pd.read_pickle(filepath)
    
    float_columns = df.select_dtypes(include=['float64']).columns
    
    # remove target
    filename = filepath.split('/')[-1]
    
    if 'train' in filename:
        float_columns = float_columns.drop('target')
    
    df[float_columns] = df[float_columns].round(4)
    
    # Drop year column
    df = df.drop(columns=['year'])
    
    save_processed_data(df, filepath, 'rounded')
    
rounding('data/work_hours_cyclic/train.pkl')
rounding('data/work_hours_cyclic/test.pkl')

Saving the file in data/rounded/train.pkl
Saving the file in data/rounded/test.pkl


## SMOGN

In [40]:
def apply_smogn(filepath):
    
    # Apply smogn to the data
    df = pd.read_pickle(filepath)
    
    # reset index
    df = df.reset_index(drop=True)
    
    # Apply smogn
    df = smogn.smoter(data = df, y = 'target')
    
    middle_path = '/'.join(filepath.split('/')[1:-1])
    folder_path = '/'.join(filepath.split('/')[:-1])
    save_processed_data(df, f'{folder_path}/smogn.pkl', middle_path)

In [41]:
apply_smogn('data/standard_scaler/train.pkl')

dist_matrix: 100%|##########| 878/878 [01:46<00:00,  8.27it/s]
synth_matrix: 100%|##########| 878/878 [00:01<00:00, 569.82it/s]
r_index: 100%|##########| 157/157 [00:00<00:00, 1143.57it/s]
1       0
2       0
3       0
4       0
       ..
5576    0
5579    0
5580    0
5581    0
5582    0
Name: holiday, Length: 4704, dtype: category
Categories (2, int64): [0, 1]' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  data_new.iloc[:, j] = data_new.iloc[:, j].astype(feat_dtypes_orig[j])
1       NaN
2         1
3         1
4       NaN
       ... 
5576      0
5579      1
5580      1
5581      1
5582      1
Name: work_hours, Length: 4704, dtype: category
Categories (2, int64): [0, 1]' has dtype incompatible with float64, please explicitly cast to a compatible dtype first.
  data_new.iloc[:, j] = data_new.iloc[:, j].astype(feat_dtypes_orig[j])


Saving the file in data/standard_scaler/smogn.pkl


In [46]:
# Make work_hours and holiday categorical in smogn
def make_categorical(filepath):
    
    df = pd.read_pickle(filepath)
    
    df['work_hours'] = df['work_hours'].astype('category')
    df['holiday'] = df['holiday'].astype('category')
    
    save_processed_data(df, filepath, 'smogn')

In [47]:
make_categorical('data/smogn/train.pkl')

Saving the file in data/smogn/train.pkl


In [2]:
def take_sqrt(filepath):
    # Take the square root of the source_x_temperature columns
    # Remove the original columns
    
    df = pd.read_pickle(filepath)
    
    for i in range(1, 5):
        df[f'source_{i}_temperature'] = np.sqrt(df[f'source_{i}_temperature'])
    
    save_processed_data(df, filepath, 'sqrt')

In [4]:
take_sqrt('data/standard/train.pkl')
take_sqrt('data/standard/test.pkl')

Saving the file in data/sqrt/train.pkl
Saving the file in data/sqrt/test.pkl


In [1]:
def readd_columns(filepath):
    
    # After last / but replace .csv with .pkl
    file_name = filepath.split('/')[-1].replace('.pkl', '.csv')
    
    original_df = pd.read_csv(f'data/raw/{file_name}')
    
    # Load raw
    df = pd.read_pickle(filepath)
    
    columns_to_add = ['clouds', 'sun_radiation_west', 'sun_radiation_north', 'sun_radiation_perpendicular', 'sun_radiation_east']
    
    for column in columns_to_add:
        df[column] = original_df[column]
    
    save_processed_data(df, filepath, 'retry_data')

In [4]:
readd_columns('data/standard/train.pkl')
readd_columns('data/standard/test.pkl')

Saving the file in data/retry_data/train.pkl
Saving the file in data/retry_data/test.pkl


In [5]:
def round_target(filepath):
    
    df = pd.read_pickle(filepath)
    
    if 'target' in df.columns:
        df['target'] = df['target'].round(4)
    
    save_processed_data(df, filepath, 'rounded_target')

In [6]:
round_target('data/standard/train.pkl')
round_target('data/standard/test.pkl')

Saving the file in data/rounded_target/train.pkl
Saving the file in data/rounded_target/test.pkl


In [2]:
def readd_datetime(filepath):
    
    # After last / but replace .csv with .pkl
    file_name = filepath.split('/')[-1].replace('.pkl', '.csv')
    
    original_df = pd.read_csv(f'data/raw/{file_name}')
    
    # Load raw
    df = pd.read_pickle(filepath)
    
    df['measurement_time'] = original_df['measurement_time']
    
    save_processed_data(df, filepath, 'standard_date')

In [3]:
readd_datetime('data/standard/train.pkl')
readd_datetime('data/standard/test.pkl')

Saving the file in data/standard_date/train.pkl
Saving the file in data/standard_date/test.pkl


In [43]:
def apply_kalman_filter(filepath):
    # Apply Kalman filter to source_x_temperature columns
    df = pd.read_pickle(filepath)
    
    for i in range(1, 5):
        column = f'source_{i}_temperature'
        
        # Initialize Kalman filter
        kf = KalmanFilter(dim_x=2, dim_z=1)
        kf.x = np.array([df[column].iloc[0], 0])  # initial state (location and velocity)
        kf.F = np.array([[1, 1], [0, 1]])        # state transition matrix
        kf.H = np.array([[1, 0]])                # measurement function
        kf.P *= 1000.                            # covariance matrix
        kf.R = 10                                 # measurement noise
        kf.Q = np.array([[0.1, 0], [0, 0.1]])    # process noise
        
        filtered_values = []
        for value in df[column]:
            kf.predict()
            kf.update(value)
            filtered_values.append(kf.x[0])
        
        df[column] = filtered_values
        
    save_processed_data(df, filepath, 'kalman')

In [78]:
apply_kalman_filter('data/source_1_fixed/train.pkl')
apply_kalman_filter('data/source_1_fixed/test.pkl')

Saving the file in data/kalman/train.pkl
Saving the file in data/kalman/test.pkl


In [79]:
def drop_time(filepath):
    
    df = pd.read_pickle(filepath)
    
    df = df.drop(columns=['measurement_time'])
    
    save_processed_data(df, filepath, 'no_measurement_time')

In [80]:
drop_time('data/kalman/train.pkl')
drop_time('data/kalman/test.pkl')

Saving the file in data/no_measurement_time/train.pkl
Saving the file in data/no_measurement_time/test.pkl


In [9]:
def add_timeline(filepath, n):
    
    # For each source_x_temperature, include the previous n values and the next n values
    # For boundary cases, use the same value
    
    df = pd.read_pickle(filepath)
    
    for i in range(1, 5):
        column = f'source_{i}_temperature'
        
        for j in range(1, n + 1):
            df[f'{column}_prev_{j}'] = df[column].shift(j)
            df[f'{column}_next_{j}'] = df[column].shift(-j)
            
    # Fill Nan values with mean of the column, take care of non-numeric columns
    for column in df.columns:
        if df[column].dtype == 'float64':
            df[column] = df[column].fillna(df[column].mean())
        else:
            df[column] = df[column].fillna(df[column].mode()[0])
            
    # Check number of NaN values
    print(f'Number of NaN values: {df.isna().sum().sum()}')
            
    save_processed_data(df, filepath, f'timeline_{n}')

In [8]:
add_timeline('data/standard/train.pkl', 3)
add_timeline('data/standard/test.pkl', 3)

Number of NaN values: 0
Saving the file in data/timeline_3/train.pkl
Number of NaN values: 0
Saving the file in data/timeline_3/test.pkl


In [71]:
def drop_between_months(filepath):
    
    df = pd.read_pickle(filepath)
    
    # measurement_time must be present and converted to datetime
    df['measurement_time'] = pd.to_datetime(df['measurement_time'])
    
    # Replace source_1_temperature with mean of data set betweem months 7 and 9
    mean_temp = df[(df['measurement_time'].dt.month >= 7) & (df['measurement_time'].dt.month <= 9)]['source_1_temperature'].mean()
    
    # replace selected timeframe with mean
    df.loc[(df['measurement_time'].dt.month >= 7) & (df['measurement_time'].dt.month <= 9), 'source_1_temperature'] = mean_temp
    
    # Drop measurement_time
    # df = df.drop(columns=['measurement_time'])
    
    # Add noise to affected source_1_temperature values
    df.loc[(df['source_1_temperature'] == mean_temp), 'source_1_temperature'] += 10 * np.random.uniform(0.01, 0.999, len(df[df['source_1_temperature'] == mean_temp]))
  
    
    save_processed_data(df, filepath, 'source_1_fixed')

In [72]:
drop_between_months('data/standard_date/train.pkl')

Saving the file in data/source_1_fixed/train.pkl


In [35]:
def agricultural_seasons(filepath):
    
    # After last / but replace .csv with .pkl
    file_name = filepath.split('/')[-1].replace('.pkl', '.csv')
    
    original_df = pd.read_csv(f'data/basic_time/{file_name}')
    
    # Drop 2023 data points
    original_df = original_df[original_df['year'] != 2023]
    
    # Create a new column for agricultural seasons
    # If between 4 to 10 -> 1, else 0
    df = pd.read_pickle(filepath)
    
    df['agricultural_season'] = 0
    
    df.loc[(original_df['month'] >= 4) & (original_df['month'] <= 10), 'agricultural_season'] = 1
    
    df['agricultural_season'] = df['agricultural_season'].astype('category')
    
    save_processed_data(df, filepath, 'agricultural_seasons')

In [36]:
agricultural_seasons('data/degree_day_dynamic/train.pkl')
agricultural_seasons('data/degree_day_dynamic/test.pkl')

Saving the file in data/agricultural_seasons/train.pkl
Saving the file in data/agricultural_seasons/test.pkl


In [33]:
def degree_day_dynamic(filepath):
    
    T_h = 12
    T_c = 15
    
    # T_max and T_min are the maximum and minimum outside_temperature of the day
    # Cooling = max((T_max + T_min) / 2 - T_c, 0)
    # Heating = max(T_h - (T_max + T_min) / 2, 0)
    
    file_name = filepath.split('/')[-1].replace('.pkl', '.csv')
    
    # Used to determine day
    original_df = pd.read_csv(f'data/basic_time/{file_name}')
    
    # Drop year=2023
    original_df = original_df[original_df['year'] != 2023]
    
    df = pd.read_pickle(filepath)
    
    df['degree_day_heating'] = 0.0
    df['degree_day_cooling'] = 0.0
    
    original_df['date'] = pd.to_datetime(original_df[['year', 'month', 'day']])

    for date in original_df['date'].unique():
        day_df = original_df[original_df['date'] == date]
        T_max = day_df['outside_temperature'].max()
        T_min = day_df['outside_temperature'].min()
        
        heating = max(T_h - (T_max + T_min) / 2, 0)
        cooling = max((T_max + T_min) / 2 - T_c, 0)
        
        indices = day_df.index
        df.loc[indices, 'degree_day_heating'] = heating
        df.loc[indices, 'degree_day_cooling'] = cooling
        
    # Drop outside_temperature
    # df = df.drop(columns=['outside_temperature'])
    
    save_processed_data(df, filepath, 'degree_day_dynamic')

In [32]:
degree_day_dynamic('data/standard/train.pkl')
degree_day_dynamic('data/standard/test.pkl')

Saving the file in data/degree_day_dynamic/train.pkl
Saving the file in data/degree_day_dynamic/test.pkl


In [38]:
def readd_month(filepath):
    
    # After last / but replace .csv with .pkl
    file_name = filepath.split('/')[-1].replace('.pkl', '.csv')
    
    original_df = pd.read_csv(f'data/basic_time/{file_name}')
    
    # Drop 2023 data points
    original_df = original_df[original_df['year'] != 2023]
    
    # Load raw
    df = pd.read_pickle(filepath)
    
    df['month'] = original_df['month']
    
    # Make month cyclic
    df = make_cyclic_features(df, 'month', 12)
    
    save_processed_data(df, filepath, 'readded_month')

In [39]:
readd_month('data/standard/train.pkl')
readd_month('data/standard/test.pkl')

Saving the file in data/readded_month/train.pkl
Saving the file in data/readded_month/test.pkl


## Start from the top

In [32]:
def encode_spline_features(train_df, test_df, column_name, n_knots=5, degree=3):
    """
    Encode a column using spline features.

    Parameters:
    train_df (pd.DataFrame): Training DataFrame containing the column to encode.
    test_df (pd.DataFrame): Test DataFrame containing the column to encode.
    column_name (str): Name of the column to encode.
    n_knots (int): Number of knots to use. Default is 5.
    degree (int): Degree of the spline. Default is 3 (cubic spline).
    """

    # Combine train and test data for fitting the spline transformer
    combined_df = pd.concat([train_df[[column_name]], test_df[[column_name]]])

    # Initialize SplineTransformer
    spline = SplineTransformer(n_knots=n_knots, degree=degree, include_bias=False)

    # Fit on combined data
    spline.fit(combined_df)

    # Transform both training and test data
    train_spline = spline.transform(train_df[[column_name]])
    test_spline = spline.transform(test_df[[column_name]])

    # Create DataFrames with spline features
    spline_feature_names = [f"{column_name}_spline_{i}" for i in range(train_spline.shape[1])]
    train_spline_df = pd.DataFrame(train_spline, columns=spline_feature_names, index=train_df.index)
    test_spline_df = pd.DataFrame(test_spline, columns=spline_feature_names, index=test_df.index)

    # Concatenate spline features with original DataFrames
    train_df = pd.concat([train_df.drop(columns=[column_name]), train_spline_df], axis=1)
    test_df = pd.concat([test_df.drop(columns=[column_name]), test_spline_df], axis=1)

    return train_df, test_df

def sum_radiations(df):
    
    # Sum the radiations
    df['total_radiation'] = df['sun_radiation_east'] + df['sun_radiation_west'] + df['sun_radiation_south'] + df['sun_radiation_north'] + df['sun_radiation_perpendicular']
    
    # Drop the original columns
    df = df.drop(columns=['sun_radiation_east', 'sun_radiation_west', 'sun_radiation_south', 'sun_radiation_north', 'sun_radiation_perpendicular'])
    
    return df

def apply_nystrom(train_df, test_df, columns, kernel='rbf', n_components=100):
    """
    Apply the Nystrom method to approximate a kernel matrix.

    Parameters:
    train_df (pd.DataFrame): Training DataFrame.
    test_df (pd.DataFrame): Test DataFrame.
    columns (list): List of columns to apply the Nystrom method on.
    kernel (str): Kernel type to use. Default is 'rbf'.
    n_components (int): Number of components for the Nystrom approximation. Default is 100.
    """
    # Initialize Nystroem transformer
    nystroem = Nystroem(kernel=kernel, n_components=n_components, random_state=42)

    # Combine train and test data for fitting the Nystroem transformer
    combined_df = pd.concat([train_df[columns], test_df[columns]])
    
    # Fit on combined data
    nystroem.fit(combined_df[columns])
    
    # Transform train data
    train_transformed = nystroem.transform(train_df[columns])

    # Transform test data
    test_transformed = nystroem.transform(test_df[columns])

    # Create DataFrames with Nystroem features
    nystroem_feature_names = [f"nystroem_{i}" for i in range(train_transformed.shape[1])]
    train_nystroem_df = pd.DataFrame(train_transformed, columns=nystroem_feature_names, index=train_df.index)
    test_nystroem_df = pd.DataFrame(test_transformed, columns=nystroem_feature_names, index=test_df.index)

    # Concatenate Nystroem features with original DataFrames
    train_df = pd.concat([train_df.drop(columns=columns), train_nystroem_df], axis=1)
    test_df = pd.concat([test_df.drop(columns=columns), test_nystroem_df], axis=1)

    return train_df, test_df

def apply_kernel_pca(train_df, test_df, columns, n_components=5, kernel='rbf'):
    """
    Apply Kernel PCA to reduce dimensionality of specified columns.

    Parameters:
    train_df (pd.DataFrame): Training DataFrame.
    test_df (pd.DataFrame): Test DataFrame.
    columns (list): List of columns to apply Kernel PCA on.
    n_components (int): Number of components to keep. Default is 5.
    kernel (str): Kernel type to use in Kernel PCA. Default is 'rbf'.

    Returns:
    pd.DataFrame, pd.DataFrame: Transformed training and test DataFrames.
    """

    # Combine data for fitting
    combined_data = pd.concat([train_df[columns], test_df[columns]], ignore_index=True)

    # Initialize Kernel PCA
    kpca = KernelPCA(n_components=n_components, kernel=kernel, random_state=42)

    # Fit Kernel PCA
    kpca.fit(combined_data)

    # Transform the data
    train_kpca = kpca.transform(train_df[columns])
    test_kpca = kpca.transform(test_df[columns])

    # Create DataFrames with KPCA components
    kpca_columns = [f"kpca_{i+1}" for i in range(n_components)]
    train_kpca_df = pd.DataFrame(train_kpca, columns=kpca_columns, index=train_df.index)
    test_kpca_df = pd.DataFrame(test_kpca, columns=kpca_columns, index=test_df.index)

    # Drop original columns and add KPCA components
    train_df = pd.concat([train_df.drop(columns=columns), train_kpca_df], axis=1)
    test_df = pd.concat([test_df.drop(columns=columns), test_kpca_df], axis=1)

    return train_df, test_df

In [11]:
# Start from the beginning

# Load the data from raw which is csv
train_raw = pd.read_csv('data/raw/train.csv')
test_raw = pd.read_csv('data/raw/test.csv')

# Convert measurement_time to datetime
train_raw['measurement_time'] = pd.to_datetime(train_raw['measurement_time'])
test_raw['measurement_time'] = pd.to_datetime(test_raw['measurement_time'])

# Added is working hours categorical (9 to 17)
train_raw['work_hours'] = 0
train_raw.loc[(train_raw['measurement_time'].dt.hour >= 9) & (train_raw['measurement_time'].dt.hour <= 17), 'work_hours'] = 1

test_raw['work_hours'] = 0
test_raw.loc[(test_raw['measurement_time'].dt.hour >= 9) & (test_raw['measurement_time'].dt.hour <= 17), 'work_hours'] = 1

# Add is holiday categorical (Saturday and Sunday)
train_raw['holiday'] = 0
train_raw.loc[(train_raw['measurement_time'].dt.dayofweek == 5) | (train_raw['measurement_time'].dt.dayofweek == 6), 'holiday'] = 1

test_raw['holiday'] = 0
test_raw.loc[(test_raw['measurement_time'].dt.dayofweek == 5) | (test_raw['measurement_time'].dt.dayofweek == 6), 'holiday'] = 1


# Add hour column
train_raw['hour'] = train_raw['measurement_time'].dt.hour
test_raw['hour'] = test_raw['measurement_time'].dt.hour

# Encode 'hour' using splines with 24 knots (one for each hour)
train_raw = make_cyclic_features(train_raw, 'hour', 24)
test_raw = make_cyclic_features(test_raw, 'hour', 24)

# Drop measurement_time
train_raw = train_raw.drop(columns=['measurement_time'])
test_raw = test_raw.drop(columns=['measurement_time'])

# Combine train and test data for imputation
combined = pd.concat([train_raw, test_raw], ignore_index=True)

# Initialize the IterativeImputer
imputer = IterativeImputer(random_state=42)

# Fit and transform the combined data
imputed_data = imputer.fit_transform(combined)

# Convert the imputed data back to DataFrame
imputed_df = pd.DataFrame(imputed_data, columns=combined.columns)

# Standardize the data except target, scaling harms performance?
# scaler = sk.preprocessing.StandardScaler()
# float_columns = imputed_df.select_dtypes(include=['float64']).columns
# float_columns = float_columns.drop('target')
# float_columns = float_columns.drop('ID')
# scaler.fit(imputed_df[float_columns])

# scaled_data = scaler.transform(imputed_df[float_columns])
# imputed_df[float_columns] = scaled_data

# Split the combined data back into train and test
train_imputed = imputed_df.iloc[:len(train_raw)].reset_index(drop=True)
test_imputed = imputed_df.iloc[len(train_raw):].reset_index(drop=True)

# Make IDs int
train_imputed['ID'] = train_imputed['ID'].astype(int)
test_imputed['ID'] = test_imputed['ID'].astype(int)

# Make work_hours and holiday categorical
train_imputed['work_hours'] = train_imputed['work_hours'].astype(int)
train_imputed['work_hours'] = train_imputed['work_hours'].astype('category')
train_imputed['holiday'] = train_imputed['holiday'].astype(int)
train_imputed['holiday'] = train_imputed['holiday'].astype('category')

# Make work_hours and holiday categorical for test
test_imputed['work_hours'] = test_imputed['work_hours'].astype(int)
test_imputed['work_hours'] = test_imputed['work_hours'].astype('category')
test_imputed['holiday'] = test_imputed['holiday'].astype(int)
test_imputed['holiday'] = test_imputed['holiday'].astype('category')

# remove target from test
test_imputed = test_imputed.drop(columns=['target'])

# drop wind_speed, wind_direction, clouds
train_imputed = train_imputed.drop(columns=['wind_speed', 'wind_direction', 'clouds'])
test_imputed = test_imputed.drop(columns=['wind_speed', 'wind_direction', 'clouds'])

# Just drop the radiation columns
radiation_columns = ['sun_radiation_east', 'sun_radiation_west', 'sun_radiation_south', 'sun_radiation_north', 'sun_radiation_perpendicular']
train_imputed = train_imputed.drop(columns=radiation_columns)
test_imputed = test_imputed.drop(columns=radiation_columns)

# Save the data
train_imputed.to_pickle('data/pure/train.pkl')
test_imputed.to_pickle('data/pure/test.pkl')

# Add csvs also
train_imputed.to_csv('data/pure/train.csv', index=False)
test_imputed.to_csv('data/pure/test.csv', index=False)

In [12]:
# print data types
print(train_imputed.dtypes)

ID                          int64
target                    float64
source_1_temperature      float64
source_2_temperature      float64
source_3_temperature      float64
source_4_temperature      float64
mean_room_temperature     float64
outside_temperature       float64
work_hours               category
holiday                  category
hour_sin                  float64
hour_cos                  float64
dtype: object


In [44]:
def remove_constant_temperature_readings(train_df, periods=4):
    """
    Remove data points where temperature readings remain constant across consecutive periods.

    Parameters:
    train_df (pd.DataFrame): Training DataFrame.
    periods (int): Number of consecutive periods to check for constant temperatures. Default is 5.

    Returns:
    pd.DataFrame: Filtered DataFrame with constant temperature periods removed.
    """
    temperature_columns = [col for col in train_df.columns if 'source_' in col and 'temperature' in col]
    mask = pd.Series(True, index=train_df.index)

    for col in temperature_columns:
        # Calculate differences between consecutive readings
        diffs = train_df[col].diff()
        
        # Create a rolling window and check if all differences are 0
        is_constant = diffs.rolling(window=periods).apply(lambda x: (x == 0).all())
        
        # Update mask to keep only rows where temperatures are changing
        mask = mask & (is_constant == False)

    return train_df[mask]

const_imputed = remove_constant_temperature_readings(train_imputed)

# Print length of train
print(f'Length of train after removing constant temperature readings: {len(const_imputed)}')

# Save the data
force_save_data(const_imputed, 'data/constant_removed/train')
force_save_data(test_imputed, 'data/constant_removed/test')

Length of train after removing constant temperature readings: 6958
Saving the file in data/constant_removed/train
Saving the file in data/constant_removed/test


In [45]:
def remove_constant_slopes(train_df, window=5, threshold=0.01):
    """
    Remove data points where temperature changes are too linear (constant slope).

    Parameters:
    train_df (pd.DataFrame): Training DataFrame
    window (int): Size of rolling window to check for linear changes
    threshold (float): Maximum allowed variation in second differences to consider slope constant

    Returns:
    pd.DataFrame: Filtered DataFrame with constant slope periods removed
    """
    temperature_columns = [col for col in train_df.columns if 'source_' in col and 'temperature' in col]
    mask = pd.Series(True, index=train_df.index)

    for col in temperature_columns:
        # Calculate first differences (rate of change)
        first_diff = train_df[col].diff()
        
        # Calculate second differences (acceleration)
        second_diff = first_diff.diff()
        
        # Create rolling window to check if second differences are near zero
        is_linear = second_diff.rolling(window=window).apply(
            lambda x: abs(x).mean() < threshold
        )
        
        # Update mask to keep only rows where temperature changes are not too linear
        mask = mask & (is_linear == False)

    return train_df[mask]

slope_imputed = remove_constant_slopes(const_imputed)

# Print length of train
print(f'Length of train after removing constant slope periods: {len(slope_imputed)}')

# Save the data
force_save_data(slope_imputed, 'data/slope_removed/train')
force_save_data(test_imputed, 'data/slope_removed/test')

Length of train after removing constant slope periods: 6824
Saving the file in data/slope_removed/train
Saving the file in data/slope_removed/test
