In [1]:
import pandas as pd
import os
import numpy as np
import lightgbm as lgb
from sklearn.multioutput import MultiOutputRegressor
from sklearn.preprocessing import StandardScaler
import xgboost as xgb

In [2]:
root_path = r'C:\Users\ADE17\Desktop\Masters\Projects\AIChallenge_OWL\SollIch-Hackathon_Daten\Data_Participants3'
def load_data(root_path, mode='train'):
    data_dict = {}
    if mode == 'train':
        x_folder = "Train_X"
        y_folder = "Train_Y"
    else:
        x_folder = "Eval_X"
        y_folder = "Eval_Y"
    x_files = os.listdir(os.path.join(root_path, x_folder))
    y_files = os.listdir(os.path.join(root_path, y_folder))

    for x_file, y_file in zip(x_files, y_files):
        if x_file.endswith('.pq') and y_file.endswith('.pq'):
            path_X = os.path.join(x_folder, x_file)
            path_Y = os.path.join(y_folder, y_file)
            df_X = pd.read_parquet(os.path.join(root_path, path_X))
            df_Y = pd.read_parquet(os.path.join(root_path, path_Y))
            # target_col_1 = df_X['ProzessData_ActData_AB1_Temperature_DR1_WaterMixingStage']
            targets = df_Y
            # df_X = df_X.drop(['ProzessData_ActData_AB1_Temperature_DR1_WaterMixingStage'], axis=1)
            date = x_file.split('_')[1:]  # Extracting month and day
            date_key = '_'.join(date)[:5]  # Creating the 'MM_DD' format
            data_dict[date_key] = {'features': df_X, 'targets': targets}

    return data_dict

train_data = load_data(root_path)
val_data = load_data(root_path, mode='val')
def custom_weighted_error(true_values, predicted_values):
    absolute_errors = np.abs(true_values - predicted_values)
    
    points = 0
    for error in absolute_errors:
        if error <= 0.05:
            points += 1
        elif 0.05 < error <= 0.1:
            points += 0.5
        elif 0.1 < error <= 0.5:
            points += 0.25
        else:
            points += 0
    
    return points 

def custom_weighted_error_xgb(preds, dtrain):
    true_values = dtrain.get_label()
    
    absolute_errors = np.abs(true_values - preds)
    
    points = 0
    for error in absolute_errors:
        if error <= 0.05:
            points += 1
        elif 0.05 < error <= 0.1:
            points += 0.5
        elif 0.1 < error <= 0.5:
            points += 0.25
        else:
            points += 0
    
    return 'custom_weighted_error', points / len(preds) * 100

def custom_error_duration(preds, dtrain):
    true_values = dtrain.get_label()
    
    preds_array = preds.astype(float)
    
    absolute_errors = np.abs(true_values - preds_array)
    
    error_less_than_01 = np.sum(absolute_errors < 0.1) / len(absolute_errors)
    
    error_intervals = []
    current_interval = 0
    for error in absolute_errors:
        if error > 0.1:
            current_interval += 1
        else:
            if current_interval > 0:
                error_intervals.append(current_interval)
                current_interval = 0
    
    max_intervals_1 = len(absolute_errors) / 2
    max_intervals_2 = len(absolute_errors) / 8
    
    points = 0
    for interval in error_intervals:
        if interval <= 1:
            points += max(0, 0.5 - (interval / max_intervals_1))
        elif 2 <= interval <= 10:
            points += max(0, 0.25 - (interval / max_intervals_2))
    
    # Calculate the error duration metric
    error_duration_points = (1 - error_less_than_01) * 100 + points
    
    return 'custom_error_duration', error_duration_points 
def add_time_columns(df):
    df['Hour'] = df.index.hour
    df['Minute'] = df.index.minute
    df['Second'] = df.index.second
    return df
def create_time_series_windows(data, window_size):
    windows_X = []
    for i in range(len(data) - window_size):
        window = data.iloc[i:i + window_size]

        windows_X.append(window.values.flatten())

    return np.array(windows_X)

In [14]:
sample_train_features_1 = add_time_columns(train_data['03_14']['features'])
sample_train_targets_1 = train_data['03_14']['targets']
sample_train_features_2 = add_time_columns(train_data['03_15']['features'])
sample_train_targets_2 = train_data['03_15']['targets']

final_train_feat = pd.concat([sample_train_features_1, sample_train_features_2], axis=0)
final_train_targets = pd.concat([sample_train_targets_1, sample_train_targets_2], axis=0)

sample_val_features = add_time_columns(val_data['03_16']['features'])
sample_val_targets = val_data['03_16']['targets']

In [66]:
params = {
    'objective': 'reg:squarederror',
    'eta': 0.075,
    'max_depth': 6,
    'min_child_weight': 1,
    'subsample': 1.0,
    'colsample_bytree': 1.0,
    'eval_metric': 'rmse',
    'seed': 5
}

# Convert data into DMatrix format for XGBoost
dtrain = xgb.DMatrix(final_train_feat, label=final_train_targets)
dvalid = xgb.DMatrix(sample_val_features, label=sample_val_targets)

# Training the model
num_round = 1000
early_stopping_rounds = 100
max_time_for_learner = 360  # in seconds

watchlist = [(dtrain, 'train'), (dvalid, 'eval')]
model = xgb.train(params, dtrain, num_boost_round=num_round, evals=watchlist,
                  early_stopping_rounds=early_stopping_rounds,
                  feval=custom_weighted_error_xgb,
                  maximize=False, verbose_eval=True)



[0]	train-rmse:33.32636	train-custom_weighted_error:0.00000	eval-rmse:33.24421	eval-custom_weighted_error:0.00000
[1]	train-rmse:30.82710	train-custom_weighted_error:0.00000	eval-rmse:30.75050	eval-custom_weighted_error:0.00000
[2]	train-rmse:28.51529	train-custom_weighted_error:0.00000	eval-rmse:28.44378	eval-custom_weighted_error:0.00000
[3]	train-rmse:26.37685	train-custom_weighted_error:0.00000	eval-rmse:26.31114	eval-custom_weighted_error:0.00000
[4]	train-rmse:24.39879	train-custom_weighted_error:0.00000	eval-rmse:24.33754	eval-custom_weighted_error:0.00000
[5]	train-rmse:22.56907	train-custom_weighted_error:0.00000	eval-rmse:22.51123	eval-custom_weighted_error:0.00000
[6]	train-rmse:20.87659	train-custom_weighted_error:0.00000	eval-rmse:20.82343	eval-custom_weighted_error:0.00000
[7]	train-rmse:19.31101	train-custom_weighted_error:0.00000	eval-rmse:19.26103	eval-custom_weighted_error:0.00000
[8]	train-rmse:17.86286	train-custom_weighted_error:0.00000	eval-rmse:17.81667	eval-cust

In [67]:
preds = model.predict(xgb.DMatrix(sample_val_features))
custom_weighted_error(sample_val_targets.to_numpy().flatten(), preds) / len(preds) * 100

95.85605473604187

In [68]:
def calculate_points(estimated_values):
    thresholds = [(0, 1), (2, 10)]
    point_values = [0.5, 0.25]

    num_estimates = len(estimated_values)

    values_below_threshold = sum(1 for value in estimated_values if abs(value) < 0.1)
    percentage_below_threshold = values_below_threshold / num_estimates * 100

    points_from_percentage = percentage_below_threshold

    total_points_intervals = 0
    for i, (low, high) in enumerate(thresholds):
        count_intervals = sum(1 for value in estimated_values if abs(value) > 0.1 and low <= value <= high)
        
        max_possible_intervals = num_estimates / (2 ** i) if i < len(point_values) else 0
        
        if max_possible_intervals > 0:
            points_for_range = 100 - (count_intervals / max_possible_intervals * 100)
            total_points_intervals += points_for_range * point_values[i]

    total_points = points_from_percentage + total_points_intervals

    return total_points

In [70]:
calculate_points(absolute_errors)

170.32901565766582

In [74]:
absolute_errors = np.abs(sample_val_targets.to_numpy().flatten() - preds)
from sklearn.metrics import mean_squared_error

In [80]:
import matplotlib.pyplot as plt
%matplotlib qt
plt.plot(preds)
plt.plot(sample_val_targets.to_numpy().flatten())

[<matplotlib.lines.Line2D at 0x24206552190>]

In [81]:
import matplotlib.pyplot as plt
%matplotlib qt
plt.plot(absolute_errors)

[<matplotlib.lines.Line2D at 0x242065e41f0>]

In [77]:
mean_squared_error(sample_val_targets.to_numpy().flatten(), preds)

0.0022303796236327893