In [1]:
import pandas as pd
import os
import numpy as np
import lightgbm as lgb
from sklearn.multioutput import MultiOutputRegressor
from sklearn.preprocessing import StandardScaler
import xgboost as xgb
import warnings
warnings.filterwarnings('ignore')

root_path = r'C:\Users\ADE17\Desktop\Masters\Projects\AIChallenge_OWL\SollIch-Hackathon_Daten\Data_Participants3'
def load_data(root_path, mode='train'):
    data_dict = {}
    if mode == 'train':
        x_folder = "Train_X"
        y_folder = "Train_Y"
    else:
        x_folder = "Eval_X"
        y_folder = "Eval_Y"
    x_files = os.listdir(os.path.join(root_path, x_folder))
    y_files = os.listdir(os.path.join(root_path, y_folder))

    for x_file, y_file in zip(x_files, y_files):
        if x_file.endswith('.pq') and y_file.endswith('.pq'):
            path_X = os.path.join(x_folder, x_file)
            path_Y = os.path.join(y_folder, y_file)
            df_X = pd.read_parquet(os.path.join(root_path, path_X))
            df_Y = pd.read_parquet(os.path.join(root_path, path_Y))
            # target_col_1 = df_X['ProzessData_ActData_AB1_Temperature_DR1_WaterMixingStage']
            targets = df_Y
            df_X = df_X.drop(['ProzessData_ActData_AB1_Temperature_DR1_WaterMixingStage'], axis=1)
            date = x_file.split('_')[1:]  # Extracting month and day
            date_key = '_'.join(date)[:5]  # Creating the 'MM_DD' format
            data_dict[date_key] = {'features': df_X, 'targets': targets}

    return data_dict

train_data = load_data(root_path)
val_data = load_data(root_path, mode='val')
def custom_weighted_error(true_values, predicted_values):
    absolute_errors = np.abs(true_values - predicted_values)
    
    points = 0
    for error in absolute_errors:
        if error <= 0.05:
            points += 1
        elif 0.05 < error <= 0.1:
            points += 0.5
        elif 0.1 < error <= 0.5:
            points += 0.25
        else:
            points += 0
    
    return points 

def custom_weighted_error_xgb(preds, dtrain):
    true_values = dtrain.get_label()
    
    absolute_errors = np.abs(true_values - preds)
    
    points = 0
    for error in absolute_errors:
        if error <= 0.05:
            points += 1
        elif 0.05 < error <= 0.1:
            points += 0.5
        elif 0.1 < error <= 0.5:
            points += 0.25
        else:
            points += 0
    
    return 'custom_weighted_error', points / len(preds) * 100

def custom_error_duration(preds, dtrain):
    true_values = dtrain.get_label()
    
    preds_array = preds.astype(float)
    
    absolute_errors = np.abs(true_values - preds_array)
    
    error_less_than_01 = np.sum(absolute_errors < 0.1) / len(absolute_errors)
    
    error_intervals = []
    current_interval = 0
    for error in absolute_errors:
        if error > 0.1:
            current_interval += 1
        else:
            if current_interval > 0:
                error_intervals.append(current_interval)
                current_interval = 0
    
    max_intervals_1 = len(absolute_errors) / 2
    max_intervals_2 = len(absolute_errors) / 8
    
    points = 0
    for interval in error_intervals:
        if interval <= 1:
            points += max(0, 0.5 - (interval / max_intervals_1))
        elif 2 <= interval <= 10:
            points += max(0, 0.25 - (interval / max_intervals_2))
    
    # Calculate the error duration metric
    error_duration_points = (1 - error_less_than_01) * 100 + points
    
    return 'custom_error_duration', error_duration_points 
def add_time_columns(df):
    df['Hour'] = df.index.hour
    df['Minute'] = df.index.minute
    df['Second'] = df.index.second
    return df
def calculate_points(estimated_values):
    thresholds = [(0, 1), (2, 10)]
    point_values = [0.5, 0.25]
    num_estimates = len(estimated_values)
    values_below_threshold = sum(1 for value in estimated_values if abs(value) <= 0.1)
    thresh_arr = [1 if abs(i)>0.1 else 0 for i in estimated_values]
    #print(thresh_arr)
    err_points = 0  # sum for abserr > 0.1
    count = 0
    for i in thresh_arr:
        if i == 1:
            count = count +1
        else:
            if count == 1:
                err_points = err_points + 0.5
            elif count <= 10 and count >=2:
                err_points = err_points + (0.25 * count)
            count = 0
    if count == 1:
        err_points = err_points + 0.5
    elif count <= 10 and count >=2:
        err_points = err_points + (0.25 * count)
    #print (err_points)
    return (values_below_threshold + err_points) * 100 /  num_estimates
def calculate_points(estimated_values):
    num_estimates = len(estimated_values)
    values_below_threshold = sum(1 for value in estimated_values if abs(value) <= 0.1)
    thresh_arr = [1 if abs(i)>0.1 else 0 for i in estimated_values]
    #print(thresh_arr)
    err_points = 0  # sum for abserr > 0.1
    count = 0
    for i in thresh_arr:
        if i == 1:
            count = count +1
        else:
            if count == 1:
                err_points = err_points + 0.5
            elif count <= 10 and count >=2:
                err_points = err_points + (0.25 * count)
            count = 0
    if count == 1:
        err_points = err_points + 0.5
    elif count <= 10 and count >=2:
        err_points = err_points + (0.25 * count)
    #print (err_points)
    return (values_below_threshold + err_points) * 100 /  num_estimates

In [2]:
notime_sample_train_targets_1 = train_data['04_11']['features']
sample_train_features_1 = add_time_columns(train_data['04_11']['features'])
sample_train_targets_1 = train_data['04_11']['targets']

no_time_sample_train_targets_2 = train_data['04_12']['features']
sample_train_features_2 = add_time_columns(train_data['04_12']['features'])
sample_train_targets_2 = train_data['04_12']['targets']

notime_final_train_feat = pd.concat([notime_sample_train_targets_1, no_time_sample_train_targets_2], axis=0)
final_train_feat = pd.concat([sample_train_features_1, sample_train_features_2], axis=0)
final_train_targets = pd.concat([sample_train_targets_1, sample_train_targets_2], axis=0)
final_train_targets = final_train_targets['ProzessData_ActData_AB1_Temperature_DR1_MassMixingStage']

notime_sample_val_features = val_data['04_13']['features']
sample_val_features = add_time_columns(val_data['04_13']['features'])
# sample_val_targets = val_data['03_16']['targets']
sample_val_targets = val_data['04_13']['targets']['ProzessData_ActData_AB1_Temperature_DR1_MassMixingStage']


In [3]:
def add_rolling_features(data, window_sizes):
    extended_data = data.copy()
    
    for window in window_sizes:
        for col in data.columns:
            # Rolling mean
            extended_data[f'{col}_rolling_mean_{window}'] = data[col].rolling(window=window, min_periods=1).mean()
            
            # Rolling standard deviation
            extended_data[f'{col}_rolling_std_{window}'] = data[col].rolling(window=window, min_periods=1).std()
            
            # Rolling maximum
            extended_data[f'{col}_rolling_max_{window}'] = data[col].rolling(window=window, min_periods=1).max()
    
    return extended_data

In [4]:
window_size = [1, 2, 3]
train_features = add_rolling_features(final_train_feat, window_size)
val_features = add_rolling_features(sample_val_features, window_size)
notime_train_features = add_time_columns(add_rolling_features(notime_final_train_feat, window_size))
notime_val_features = add_time_columns(add_rolling_features(notime_sample_val_features, window_size))

In [5]:
# params = {
#     'objective': 'reg:squarederror',
#     'eta': 0.075,
#     'max_depth': 6,
#     'min_child_weight': 1,
#     'subsample': 1.0,
#     'colsample_bytree': 1.0,
#     'eval_metric': 'mae',
#     'seed': 5
# }
params = {'eval_metric': 'mae'}

# Convert data into DMatrix format for XGBoost
dtrain = xgb.DMatrix(train_features, label=final_train_targets)
dvalid = xgb.DMatrix(val_features, label=sample_val_targets)
# dtrain = xgb.DMatrix(notime_train_features, label=final_train_targets)
# dvalid = xgb.DMatrix(notime_val_features, label=sample_val_targets)

# Training the model
num_round = 1000
early_stopping_rounds = 100
max_time_for_learner = 360  # in seconds

watchlist = [(dtrain, 'train'), (dvalid, 'eval')]
model = xgb.train(params, dtrain, num_boost_round=num_round, evals=watchlist,
                  early_stopping_rounds=early_stopping_rounds,
                  feval=custom_weighted_error_xgb,
                  maximize=False, verbose_eval=True)

[0]	train-mae:24.79517	train-custom_weighted_error:0.00000	eval-mae:25.00784	eval-custom_weighted_error:0.00000
[1]	train-mae:17.35711	train-custom_weighted_error:0.00000	eval-mae:17.52536	eval-custom_weighted_error:0.00000
[2]	train-mae:12.15037	train-custom_weighted_error:0.00000	eval-mae:12.28875	eval-custom_weighted_error:0.00000
[3]	train-mae:8.50553	train-custom_weighted_error:0.00000	eval-mae:8.62462	eval-custom_weighted_error:0.00000
[4]	train-mae:5.95409	train-custom_weighted_error:0.00000	eval-mae:6.05399	eval-custom_weighted_error:0.00000
[5]	train-mae:4.16803	train-custom_weighted_error:0.00000	eval-mae:4.21811	eval-custom_weighted_error:0.00000
[6]	train-mae:2.91775	train-custom_weighted_error:0.00000	eval-mae:2.93335	eval-custom_weighted_error:0.04057
[7]	train-mae:2.04254	train-custom_weighted_error:0.00000	eval-mae:2.03373	eval-custom_weighted_error:0.03363
[8]	train-mae:1.42988	train-custom_weighted_error:0.00036	eval-mae:1.40885	eval-custom_weighted_error:0.05300
[9]	

In [6]:
preds = model.predict(xgb.DMatrix(val_features))
print(custom_weighted_error(sample_val_targets, preds.round(2))/len(preds) * 100)

63.15733688103975


In [7]:
window_size = 10# Adjust the window size as needed
smoothed_preds = pd.Series(preds.round(2)).rolling(window=window_size, min_periods=1).mean()

In [8]:
print(custom_weighted_error(sample_val_targets.to_numpy(), smoothed_preds) /len(preds) * 100)

62.83680063156971


In [9]:
abs_errors_array = np.abs(smoothed_preds - sample_val_targets.to_numpy())

In [10]:
calculate_points(abs_errors_array)

77.74776684551395

In [12]:
import matplotlib.pyplot as plt
%matplotlib qt
plt.plot(sample_val_targets.to_numpy().flatten(), label='true')
# plt.plot(preds.round(2), label='preds')
plt.plot(smoothed_preds, label='smooth')

plt.legend()

<matplotlib.legend.Legend at 0x1ad00c6b6a0>

In [47]:
import matplotlib.pyplot as plt
%matplotlib qt
plt.plot(np.abs(sample_val_targets_1.to_numpy().flatten() - preds), label='normal')
plt.plot(np.abs(sample_val_targets_1.to_numpy().flatten() - smoothed_preds), label='smooth')
plt.legend()

<matplotlib.legend.Legend at 0x1f9eec38eb0>