In [106]:
import pandas as pd
import os
import numpy as np
import lightgbm as lgb
from sklearn.multioutput import MultiOutputRegressor
from sklearn.preprocessing import StandardScaler
import xgboost as xgb

# Root folder of the data set; load_data expects Train_X/Train_Y/Eval_X/Eval_Y
# sub-folders inside it. Set the SOLLICH_DATA_ROOT environment variable to run
# the notebook on another machine; the original local path remains the default.
root_path = os.environ.get(
    'SOLLICH_DATA_ROOT',
    r'C:\Users\ADE17\Desktop\Masters\Projects\AIChallenge_OWL\SollIch-Hackathon_Daten\Data_Participants3',
)
def load_data(root_path, mode='train'):
    """Load paired feature/target parquet files into a dict keyed by 'MM_DD'.

    Parameters
    ----------
    root_path : str
        Folder containing the ``Train_X``/``Train_Y`` and ``Eval_X``/``Eval_Y``
        sub-folders.
    mode : str, default 'train'
        ``'train'`` reads the ``Train_*`` folders; any other value reads the
        ``Eval_*`` folders.

    Returns
    -------
    dict
        ``{date_key: {'features': DataFrame, 'targets': DataFrame}}``.
        The WaterMixingStage temperature column is moved from the X file into
        ``targets`` (appended after the columns of the Y file).
    """
    water_col = 'ProzessData_ActData_AB1_Temperature_DR1_WaterMixingStage'

    if mode == 'train':
        x_folder, y_folder = "Train_X", "Train_Y"
    else:
        x_folder, y_folder = "Eval_X", "Eval_Y"
    x_dir = os.path.join(root_path, x_folder)
    y_dir = os.path.join(root_path, y_folder)

    # os.listdir returns entries in arbitrary order, so keep only the parquet
    # files *before* pairing and sort both listings. Otherwise a stray file or
    # a different listing order silently pairs features with the wrong targets.
    # NOTE(review): pairing still relies on X and Y file names sorting into the
    # same date order -- confirm against the actual naming scheme.
    x_files = sorted(name for name in os.listdir(x_dir) if name.endswith('.pq'))
    y_files = sorted(name for name in os.listdir(y_dir) if name.endswith('.pq'))

    data_dict = {}
    for x_file, y_file in zip(x_files, y_files):
        df_X = pd.read_parquet(os.path.join(x_dir, x_file))
        df_Y = pd.read_parquet(os.path.join(y_dir, y_file))

        # The water-stage temperature is a prediction target, not an input.
        targets = pd.concat([df_Y, df_X[water_col]], axis=1)
        features = df_X.drop(columns=[water_col])

        # Everything after the first '_' in the file name, cut to 'MM_DD'.
        date_key = '_'.join(x_file.split('_')[1:])[:5]
        data_dict[date_key] = {'features': features, 'targets': targets}

    return data_dict

# Load both splits from the same root folder. Any mode other than 'train'
# makes load_data read the Eval_X / Eval_Y folders.
train_data, val_data = [load_data(root_path, mode=split) for split in ('train', 'val')]
def custom_weighted_error(true_values, predicted_values):
    """Return the total points earned by a set of predictions.

    Each sample scores by its absolute error ``e``:

    * ``e <= 0.05``        -> 1 point
    * ``0.05 < e <= 0.1``  -> 0.5 points
    * ``0.1 < e <= 0.5``   -> 0.25 points
    * otherwise (incl. NaN) -> 0 points

    Parameters
    ----------
    true_values, predicted_values : array-like
        Compared element by element, by position. Inputs are converted to
        float64 arrays first, so two pandas objects with different indexes are
        not label-aligned into NaNs, and the thresholds are applied at the same
        precision whatever the input dtype.

    Returns
    -------
    float
        Sum of points (not normalised); divide by the number of samples to get
        a fraction.
    """
    truth = np.asarray(true_values, dtype=np.float64)
    predicted = np.asarray(predicted_values, dtype=np.float64)
    absolute_errors = np.abs(truth - predicted)

    # Count the samples in each scoring band instead of looping in Python.
    full = np.count_nonzero(absolute_errors <= 0.05)
    half = np.count_nonzero((absolute_errors > 0.05) & (absolute_errors <= 0.1))
    quarter = np.count_nonzero((absolute_errors > 0.1) & (absolute_errors <= 0.5))

    return full * 1.0 + half * 0.5 + quarter * 0.25

def custom_weighted_error_xgb(preds, dtrain):
    """XGBoost custom metric: percentage of the maximum achievable points.

    Each sample scores 1 / 0.5 / 0.25 / 0 points for an absolute error of
    ``<= 0.05`` / ``<= 0.1`` / ``<= 0.5`` / anything larger (or NaN).

    Parameters
    ----------
    preds : array-like
        Model predictions.
    dtrain : object with ``get_label()``
        Data set the predictions were made on (an ``xgboost.DMatrix`` in the
        training loop); its labels are the ground truth.

    Returns
    -------
    tuple[str, float]
        ``('custom_weighted_error', score)`` where ``score`` lies in [0, 100].
        Higher is better, so use ``maximize=True`` when this metric drives
        early stopping. Empty input scores 0.0 instead of dividing by zero.
    """
    true_values = np.asarray(dtrain.get_label(), dtype=np.float64)
    predictions = np.asarray(preds, dtype=np.float64)
    absolute_errors = np.abs(true_values - predictions)

    n_samples = absolute_errors.size
    if n_samples == 0:
        return 'custom_weighted_error', 0.0

    # Count the samples in each scoring band instead of looping in Python.
    full = np.count_nonzero(absolute_errors <= 0.05)
    half = np.count_nonzero((absolute_errors > 0.05) & (absolute_errors <= 0.1))
    quarter = np.count_nonzero((absolute_errors > 0.1) & (absolute_errors <= 0.5))
    points = full * 1.0 + half * 0.5 + quarter * 0.25

    return 'custom_weighted_error', points / n_samples * 100

def custom_error_duration(preds, dtrain):
    """XGBoost custom metric combining the error rate with error-run lengths.

    The value is ``(1 - f) * 100 + points`` where

    * ``f`` is the fraction of samples with absolute error ``< 0.1``;
    * ``points`` sums, over every run of consecutive samples with absolute
      error ``> 0.1``:
        - run length 1:      ``max(0, 0.5  - length / (n / 2))``
        - run length 2..10:  ``max(0, 0.25 - length / (n / 8))``
        - longer runs:       nothing
      with ``n`` the number of samples.

    A run that lasts until the final sample is counted like any other run
    (previously it was dropped because it was never closed).

    NOTE(review): an error of exactly 0.1 counts neither as "below 0.1" nor as
    part of a run, because the two comparisons are strict -- confirm intent.

    Parameters
    ----------
    preds : array-like
        Model predictions (1-D).
    dtrain : object with ``get_label()``
        Data set providing the ground-truth labels.

    Returns
    -------
    tuple[str, float]
        ``('custom_error_duration', value)``; ``nan`` for empty input.
    """
    true_values = np.asarray(dtrain.get_label(), dtype=float)
    preds_array = np.asarray(preds, dtype=float)
    absolute_errors = np.abs(true_values - preds_array)

    n_samples = len(absolute_errors)
    if n_samples == 0:
        # The fractions below are undefined without samples.
        return 'custom_error_duration', float('nan')

    error_less_than_01 = np.count_nonzero(absolute_errors < 0.1) / n_samples

    # Lengths of consecutive runs with error > 0.1. Padding with False on both
    # sides guarantees every run has a rising and a falling edge, including
    # runs that touch the first or the last sample.
    exceeds = absolute_errors > 0.1
    edges = np.diff(np.concatenate(([False], exceeds, [False])).astype(np.int8))
    run_starts = np.flatnonzero(edges == 1)
    run_ends = np.flatnonzero(edges == -1)
    error_intervals = (run_ends - run_starts).tolist()

    max_intervals_1 = n_samples / 2
    max_intervals_2 = n_samples / 8

    points = 0.0
    for interval in error_intervals:
        if interval <= 1:
            points += max(0, 0.5 - (interval / max_intervals_1))
        elif 2 <= interval <= 10:
            points += max(0, 0.25 - (interval / max_intervals_2))

    error_duration_points = (1 - error_less_than_01) * 100 + points

    return 'custom_error_duration', float(error_duration_points)
def add_time_columns(df):
    """Add 'Hour', 'Minute' and 'Second' columns derived from the index.

    The frame is modified in place and the same object is returned. The index
    must expose ``hour``, ``minute`` and ``second`` attributes (for example a
    ``DatetimeIndex``).
    """
    timestamps = df.index
    for column, component in (('Hour', 'hour'), ('Minute', 'minute'), ('Second', 'second')):
        df[column] = getattr(timestamps, component)
    return df

In [60]:
# Names of the two target columns.
mass_col = 'ProzessData_ActData_AB1_Temperature_DR1_MassMixingStage'
water_col = 'ProzessData_ActData_AB1_Temperature_DR1_WaterMixingStage'

# Training days. add_time_columns adds Hour/Minute/Second to the stored
# feature frames in place and returns the same frames.
train_day_1, train_day_2 = train_data['03_14'], train_data['03_15']
sample_train_features_1 = add_time_columns(train_day_1['features'])
sample_train_targets_1 = train_day_1['targets']
sample_train_features_2 = add_time_columns(train_day_2['features'])
sample_train_targets_2 = train_day_2['targets']

# Stack both days row-wise and split the targets per column.
final_train_feat = pd.concat([sample_train_features_1, sample_train_features_2])
final_train_targets = pd.concat([sample_train_targets_1, sample_train_targets_2])
final_train_targets_1 = final_train_targets[mass_col]
final_train_targets_2 = final_train_targets[water_col]

# Validation day.
val_day = val_data['03_16']
sample_val_features = add_time_columns(val_day['features'])
sample_val_targets = val_day['targets']
sample_val_targets_1 = sample_val_targets[mass_col]
sample_val_targets_2 = sample_val_targets[water_col]


In [122]:
# Previously tried explicit configuration, kept for reference:
# params = {
#     'objective': 'reg:squarederror',
#     'eta': 0.075,
#     'max_depth': 6,
#     'min_child_weight': 1,
#     'subsample': 1.0,
#     'colsample_bytree': 1.0,
#     'eval_metric': 'mae',
#     'seed': 5
# }
params = {'eval_metric': 'mae'}

# Convert data into DMatrix format for XGBoost (target: MassMixingStage temperature)
dtrain = xgb.DMatrix(final_train_feat, label=final_train_targets_1)
dvalid = xgb.DMatrix(sample_val_features, label=sample_val_targets_1)

# Training the model
num_round = 1000
early_stopping_rounds = 100
max_time_for_learner = 360  # in seconds; not passed to xgb.train below

watchlist = [(dtrain, 'train'), (dvalid, 'eval')]
# custom_weighted_error_xgb is a score (higher is better). Early stopping
# follows the last metric reported for the last eval set, which is this custom
# metric, so it must be maximised. The comma after the feval argument was
# missing before, which made this cell a SyntaxError.
# NOTE(review): newer xgboost releases name this argument `custom_metric`;
# switch if the installed version rejects `feval`.
model = xgb.train(params, dtrain, num_boost_round=num_round, evals=watchlist,
                  early_stopping_rounds=early_stopping_rounds,
                  feval=custom_weighted_error_xgb,
                  maximize=True, verbose_eval=True)

[0]	train-mae:24.86623	eval-mae:24.79254
[1]	train-mae:17.40678	eval-mae:17.35576
[2]	train-mae:12.18512	eval-mae:12.14595
[3]	train-mae:8.52982	eval-mae:8.50094
[4]	train-mae:5.97109	eval-mae:5.94706
[5]	train-mae:4.17994	eval-mae:4.16069
[6]	train-mae:2.92610	eval-mae:2.91042
[7]	train-mae:2.04840	eval-mae:2.03517
[8]	train-mae:1.43397	eval-mae:1.42234
[9]	train-mae:1.00389	eval-mae:0.99459
[10]	train-mae:0.70280	eval-mae:0.69457
[11]	train-mae:0.49202	eval-mae:0.48528
[12]	train-mae:0.34454	eval-mae:0.33967
[13]	train-mae:0.24138	eval-mae:0.23826
[14]	train-mae:0.16930	eval-mae:0.16709
[15]	train-mae:0.11911	eval-mae:0.11785
[16]	train-mae:0.08458	eval-mae:0.08395
[17]	train-mae:0.06113	eval-mae:0.06064
[18]	train-mae:0.04525	eval-mae:0.04492
[19]	train-mae:0.03480	eval-mae:0.03428
[20]	train-mae:0.02824	eval-mae:0.02755
[21]	train-mae:0.02360	eval-mae:0.02334
[22]	train-mae:0.02071	eval-mae:0.02061
[23]	train-mae:0.01863	eval-mae:0.01861
[24]	train-mae:0.01720	eval-mae:0.01749
[25]

In [121]:
# Score the validation predictions: points earned as a percentage of the
# maximum (one point per sample).
preds = model.predict(xgb.DMatrix(sample_val_features))
val_points = custom_weighted_error(sample_val_targets_1, preds)
print(val_points / len(preds) * 100)

95.21863715442757


In [123]:
# Same evaluation as the previous evaluation cell, re-run against the current
# `model`: points earned as a percentage of the maximum (one point per sample).
preds = model.predict(xgb.DMatrix(sample_val_features))
val_points = custom_weighted_error(sample_val_targets_1, preds)
print(val_points / len(preds) * 100)

95.55525504013099


In [127]:
import matplotlib.pyplot as plt

# Plot the validation predictions in sample order. The explicit figure/axes
# interface with title and labels lets the figure stand on its own, and
# plt.show() keeps the Line2D repr out of the cell output.
fig, ax = plt.subplots()
ax.plot(preds)
ax.set(
    title='XGBoost predictions on validation day 03_16',
    xlabel='Sample index',
    ylabel='Predicted MassMixingStage temperature',
)
plt.show()

[<matplotlib.lines.Line2D at 0x27ad3057f70>]