In [50]:
import pandas as pd
import os
import numpy as np
import lightgbm as lgb
from sklearn.multioutput import MultiOutputRegressor
from sklearn.preprocessing import StandardScaler
import xgboost as xgb
import warnings
warnings.filterwarnings('ignore')

root_path = r'C:\Users\ADE17\Desktop\Masters\Projects\AIChallenge_OWL\SollIch-Hackathon_Daten\Data_Participants3'
def load_data(root_path, mode='train'):
    data_dict = {}
    if mode == 'train':
        x_folder = "Train_X"
        y_folder = "Train_Y"
    else:
        x_folder = "Eval_X"
        y_folder = "Eval_Y"
    x_files = os.listdir(os.path.join(root_path, x_folder))
    y_files = os.listdir(os.path.join(root_path, y_folder))

    for x_file, y_file in zip(x_files, y_files):
        if x_file.endswith('.pq') and y_file.endswith('.pq'):
            path_X = os.path.join(x_folder, x_file)
            path_Y = os.path.join(y_folder, y_file)
            df_X = pd.read_parquet(os.path.join(root_path, path_X))
            df_Y = pd.read_parquet(os.path.join(root_path, path_Y))
            target_col_1 = df_X['ProzessData_ActData_AB1_Temperature_DR1_WaterMixingStage']
            targets = pd.concat([df_Y, target_col_1], axis=1)
            df_X = df_X.drop(['ProzessData_ActData_AB1_Temperature_DR1_WaterMixingStage'], axis=1)
            date = x_file.split('_')[1:]  # Extracting month and day
            date_key = '_'.join(date)[:5]  # Creating the 'MM_DD' format
            data_dict[date_key] = {'features': df_X, 'targets': targets}

    return data_dict

train_data = load_data(root_path)
val_data = load_data(root_path, mode='val')
def custom_weighted_error(true_values, predicted_values):
    absolute_errors = np.abs(true_values - predicted_values)
    
    points = 0
    for error in absolute_errors:
        if error <= 0.05:
            points += 1
        elif 0.05 < error <= 0.1:
            points += 0.5
        elif 0.1 < error <= 0.5:
            points += 0.25
        else:
            points += 0
    
    return points 

def custom_weighted_error_xgb(preds, dtrain):
    true_values = dtrain.get_label()
    
    absolute_errors = np.abs(true_values - preds)
    
    points = 0
    for error in absolute_errors:
        if error <= 0.05:
            points += 1
        elif 0.05 < error <= 0.1:
            points += 0.5
        elif 0.1 < error <= 0.5:
            points += 0.25
        else:
            points += 0
    
    return 'custom_weighted_error', points / len(preds) * 100

def custom_error_duration(preds, dtrain):
    true_values = dtrain.get_label()
    
    preds_array = preds.astype(float)
    
    absolute_errors = np.abs(true_values - preds_array)
    
    error_less_than_01 = np.sum(absolute_errors < 0.1) / len(absolute_errors)
    
    error_intervals = []
    current_interval = 0
    for error in absolute_errors:
        if error > 0.1:
            current_interval += 1
        else:
            if current_interval > 0:
                error_intervals.append(current_interval)
                current_interval = 0
    
    max_intervals_1 = len(absolute_errors) / 2
    max_intervals_2 = len(absolute_errors) / 8
    
    points = 0
    for interval in error_intervals:
        if interval <= 1:
            points += max(0, 0.5 - (interval / max_intervals_1))
        elif 2 <= interval <= 10:
            points += max(0, 0.25 - (interval / max_intervals_2))
    
    # Calculate the error duration metric
    error_duration_points = (1 - error_less_than_01) * 100 + points
    
    return 'custom_error_duration', error_duration_points 
def add_time_columns(df):
    df['Hour'] = df.index.hour
    df['Minute'] = df.index.minute
    df['Second'] = df.index.second
    return df

In [2]:
sample_train_features_1 = add_time_columns(train_data['03_14']['features'])
sample_train_targets_1 = train_data['03_14']['targets']
sample_train_features_2 = add_time_columns(train_data['03_15']['features'])
sample_train_targets_2 = train_data['03_15']['targets']

final_train_feat = pd.concat([sample_train_features_1, sample_train_features_2], axis=0)
final_train_targets = pd.concat([sample_train_targets_1, sample_train_targets_2], axis=0)
final_train_targets_1 = final_train_targets['ProzessData_ActData_AB1_Temperature_DR1_MassMixingStage']
final_train_targets_2 = final_train_targets['ProzessData_ActData_AB1_Temperature_DR1_WaterMixingStage']

sample_val_features = add_time_columns(val_data['03_16']['features'])
sample_val_targets = val_data['03_16']['targets']
sample_val_targets_1 = val_data['03_16']['targets']['ProzessData_ActData_AB1_Temperature_DR1_MassMixingStage']
sample_val_targets_2 = val_data['03_16']['targets']['ProzessData_ActData_AB1_Temperature_DR1_WaterMixingStage']


In [48]:
def add_rolling_features(data, window_sizes):
    extended_data = data.copy()
    
    for window in window_sizes:
        for col in data.columns:
            # Rolling mean
            extended_data[f'{col}_rolling_mean_{window}'] = data[col].rolling(window=window, min_periods=1).mean()
            
            # Rolling standard deviation
            extended_data[f'{col}_rolling_std_{window}'] = data[col].rolling(window=window, min_periods=1).std()
            
            # Rolling maximum
            extended_data[f'{col}_rolling_max_{window}'] = data[col].rolling(window=window, min_periods=1).max()
    
    return extended_data

In [51]:
window_size = [1, 2, 3]
train_features = add_rolling_features(sample_val_features, window_size)
val_features = add_rolling_features()

In [52]:
train_features

Unnamed: 0,ProzessData_ActData_AB1_Analogs_DX1_MassPressure,ProzessData_ActData_AB1_Analogs_GY1_MassLevelTank,ProzessData_ActData_AB1_Current_DV1_Scraper,ProzessData_ActData_AB1_Current_DW1_RiserPumpFwd,ProzessData_ActData_AB1_Speed_DV1_Scraper,ProzessData_ActData_AB1_Speed_DW1_RiserPumpFwd,ProzessData_ActData_AB1_Temperature_DP1_MassHeatingStage,ProzessData_ActData_AB1_Temperature_DP1_WaterHeatingStage,ProzessData_ActData_AB1_Temperature_DQ1_MassCoolingStage,ProzessData_ActData_AB1_Temperature_DQ1_WaterCoolingStage,...,ProzessData_ActData_AB1_Temperature_DX1_MassInfeed_rolling_max_3,Hour_rolling_mean_3,Hour_rolling_std_3,Hour_rolling_max_3,Minute_rolling_mean_3,Minute_rolling_std_3,Minute_rolling_max_3,Second_rolling_mean_3,Second_rolling_std_3,Second_rolling_max_3
2023-03-16 01:00:00+00:00,0.74,64.070000,0.0,0.0,0.0,0.0,41.299999,42.000000,41.599998,42.0,...,41.900002,1.000000,,1.0,0.000000,,0.0,0.0,,0.0
2023-03-16 01:00:01+00:00,0.75,64.070000,0.0,0.0,0.0,0.0,41.299999,42.099998,41.599998,42.0,...,41.900002,1.000000,0.00000,1.0,0.000000,0.000000,0.0,0.5,0.707107,1.0
2023-03-16 01:00:02+00:00,0.74,64.070000,0.0,0.0,0.0,0.0,41.299999,42.099998,41.599998,42.0,...,41.900002,1.000000,0.00000,1.0,0.000000,0.000000,0.0,1.0,1.000000,2.0
2023-03-16 01:00:03+00:00,0.75,64.070000,0.0,0.0,0.0,0.0,41.299999,42.099998,41.599998,42.0,...,41.900002,1.000000,0.00000,1.0,0.000000,0.000000,0.0,2.0,1.000000,3.0
2023-03-16 01:00:04+00:00,0.75,64.070000,0.0,0.0,0.0,0.0,41.299999,42.099998,41.599998,42.0,...,41.900002,1.000000,0.00000,1.0,0.000000,0.000000,0.0,3.0,1.000000,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-03-16 19:59:56+00:00,0.73,63.910000,0.0,0.0,0.0,0.0,41.299999,42.000000,41.700001,42.0,...,41.900002,19.000000,0.00000,19.0,59.000000,0.000000,59.0,55.0,1.000000,56.0
2023-03-16 19:59:57+00:00,0.74,63.919998,0.0,0.0,0.0,0.0,41.299999,42.000000,41.700001,42.0,...,41.900002,19.000000,0.00000,19.0,59.000000,0.000000,59.0,56.0,1.000000,57.0
2023-03-16 19:59:58+00:00,0.73,63.910000,0.0,0.0,0.0,0.0,41.299999,42.000000,41.700001,42.0,...,41.900002,19.000000,0.00000,19.0,59.000000,0.000000,59.0,57.0,1.000000,58.0
2023-03-16 19:59:59+00:00,0.74,63.910000,0.0,0.0,0.0,0.0,41.299999,42.000000,41.700001,42.0,...,41.900002,19.000000,0.00000,19.0,59.000000,0.000000,59.0,58.0,1.000000,59.0


In [10]:
# params = {
#     'objective': 'reg:squarederror',
#     'eta': 0.075,
#     'max_depth': 6,
#     'min_child_weight': 1,
#     'subsample': 1.0,
#     'colsample_bytree': 1.0,
#     'eval_metric': 'mae',
#     'seed': 5
# }
params = {'eval_metric': 'mae'}

# Convert data into DMatrix format for XGBoost
dtrain = xgb.DMatrix(final_train_feat, label=final_train_targets_1)
dvalid = xgb.DMatrix(sample_val_features, label=sample_val_targets_1)

# Training the model
num_round = 1000
early_stopping_rounds = 100
max_time_for_learner = 360  # in seconds

watchlist = [(dtrain, 'train'), (dvalid, 'eval')]
model = xgb.train(params, dtrain, num_boost_round=num_round, evals=watchlist,
                  early_stopping_rounds=early_stopping_rounds,
                  feval=custom_weighted_error_xgb,
                  maximize=False, verbose_eval=True)



[0]	train-mae:24.86623	train-custom_weighted_error:0.00000	eval-mae:24.79254	eval-custom_weighted_error:0.00000
[1]	train-mae:17.40678	train-custom_weighted_error:0.00000	eval-mae:17.35576	eval-custom_weighted_error:0.00000
[2]	train-mae:12.18512	train-custom_weighted_error:0.00000	eval-mae:12.14595	eval-custom_weighted_error:0.00000
[3]	train-mae:8.52982	train-custom_weighted_error:0.00000	eval-mae:8.50094	eval-custom_weighted_error:0.00000
[4]	train-mae:5.97109	train-custom_weighted_error:0.00000	eval-mae:5.94706	eval-custom_weighted_error:0.00000
[5]	train-mae:4.17994	train-custom_weighted_error:0.00000	eval-mae:4.16069	eval-custom_weighted_error:0.00000
[6]	train-mae:2.92610	train-custom_weighted_error:0.00000	eval-mae:2.91042	eval-custom_weighted_error:0.00000
[7]	train-mae:2.04840	train-custom_weighted_error:0.00000	eval-mae:2.03517	eval-custom_weighted_error:0.00000
[8]	train-mae:1.43397	train-custom_weighted_error:0.00000	eval-mae:1.42234	eval-custom_weighted_error:0.00036
[9]	

In [29]:
preds = model.predict(xgb.DMatrix(sample_val_features))
print(custom_weighted_error(sample_val_targets_1, preds.round(2))/len(preds) * 100)

95.79903802575986


In [41]:
window_size = 20 # Adjust the window size as needed
smoothed_preds = pd.Series(preds).rolling(window=window_size, min_periods=1).mean()

In [42]:
print(custom_weighted_error(sample_val_targets_1.to_numpy(), smoothed_preds) /len(preds) * 100)

95.82791187263344


In [43]:
import matplotlib.pyplot as plt
%matplotlib qt
plt.plot(sample_val_targets_1.to_numpy().flatten(), label='true')
plt.plot(smoothed_preds)
plt.legend()

<matplotlib.legend.Legend at 0x1f9ec052be0>

In [47]:
import matplotlib.pyplot as plt
%matplotlib qt
plt.plot(np.abs(sample_val_targets_1.to_numpy().flatten() - preds), label='normal')
plt.plot(np.abs(sample_val_targets_1.to_numpy().flatten() - smoothed_preds), label='smooth')
plt.legend()

<matplotlib.legend.Legend at 0x1f9eec38eb0>