This notebook shows the generation methodology of the bpmi challenge energy data generation.

First a simple pandas dataframe is created, symbolizing one machine that is operating every process

In [1]:
import pandas as pd
from six import iteritems

# From a start date, end date, and frequency
start = '2018-01-01'
end = '2018-12-31'
dti = pd.date_range(start, end, freq='15min')

Add to each data point a periodic signal based on two factors; the time of day based on a broadened sine wave, which peaks after lunch hours and reaches the minimum 12h later. The amplitude is modified based on the workday, so it is diminished by 60% on weekends.

In [2]:
import pandas as pd
import numpy as np
from datetime import datetime

def generate_sine_wave(timestamps, amplitude=1.0, vertical_shift=0.0, peak_hour_shift=14.0):
    """
    Generate a sine wave signal based on timestamps with a 24-hour period.

    Parameters:
    -----------
    timestamps : list-like or pandas Series
        Timestamps in format 'YYYY:MM:DD:hh:mm:ss'
    amplitude : float, optional (default=1.0)
        The amplitude of the sine wave
    vertical_shift : float, optional (default=0.0)
        Vertical shift of the entire wave (moves the wave up or down)
    peak_hour_shift : float, optional (default=14.0)
        Hour of the day when the peak should occur (default 14.0 = 2 PM)
        Can be fractional for fine-tuning

    Returns:
    --------
    pandas.Series : Sine wave values corresponding to input timestamps
    """

    # Convert timestamps to datetime objects
    if isinstance(timestamps, str):
        timestamps = [timestamps]

    datetime_objects = pd.Series([
        datetime.strptime(ts, '%Y:%m:%d:%H:%M:%S')
        for ts in timestamps
    ])

    # Extract hours and minutes as fractional hours
    hours = datetime_objects.dt.hour + datetime_objects.dt.minute/60 + datetime_objects.dt.second/3600

    # Calculate phase shift based on peak_hour_shift
    # We want the peak at peak_hour_shift hours
    phase_shift = (peak_hour_shift - 6) * (2 * np.pi / 24)  # -6 because sine peaks at π/2

    # Generate sine wave
    # Period is 24 hours = 2π
    angular_freq = 2 * np.pi / 24
    sine_wave = amplitude * np.sin(angular_freq * hours - phase_shift) + vertical_shift
    sine_wave_series = pd.Series(sine_wave)
    sine_wave_series.index = datetime_objects
    return sine_wave_series

def generate_timestamp_range(start_timestamp, end_timestamp, freq='15min'):
    """
    Generate a range of timestamps between start and end times.

    Parameters:
    -----------
    start_timestamp : str
        Start timestamp in format 'YYYY:MM:DD:hh:mm:ss'
    end_timestamp : str
        End timestamp in format 'YYYY:MM:DD:hh:mm:ss'
    freq : str, optional (default='1H')
        Frequency of timestamps (pandas frequency string)

    Returns:
    --------
    list : List of timestamp strings in the specified format
    """
    start_dt = datetime.strptime(start_timestamp, '%Y:%m:%d:%H:%M:%S')
    end_dt = datetime.strptime(end_timestamp, '%Y:%m:%d:%H:%M:%S')

    date_range = pd.date_range(start=start_dt, end=end_dt, freq=freq)
    return [dt.strftime('%Y:%m:%d:%H:%M:%S') for dt in date_range]

# Example usage
if __name__ == "__main__":
    # Generate timestamps for 3 days with hourly intervals
    timestamps = generate_timestamp_range(
        '2018:01:01:00:00:00',
        '2018:12:31:23:59:59',
        freq='15min'
    )

    # Generate different sine waves with various parameters

    normal_wave = generate_sine_wave(timestamps)
    shifted_up = generate_sine_wave(timestamps, vertical_shift=2.0)
    larger_amplitude = generate_sine_wave(timestamps, amplitude=2.0)
    peak_at_15 = generate_sine_wave(timestamps, peak_hour_shift=15.0)  # Peak at 3 PM

    # Create DataFrame with all waves
    results = pd.DataFrame({
        'normal': normal_wave,
        'shifted_up': shifted_up,
        'larger_amplitude': larger_amplitude,
        'peak_at_15': peak_at_15,
    })
    results.index = timestamps

    print(results.head(24))

                       normal  shifted_up  larger_amplitude  peak_at_15
2018:01:01:00:00:00 -0.866025    1.133975         -1.732051   -0.707107
2018:01:01:00:15:00 -0.896873    1.103127         -1.793745   -0.751840
2018:01:01:00:30:00 -0.923880    1.076120         -1.847759   -0.793353
2018:01:01:00:45:00 -0.946930    1.053070         -1.893860   -0.831470
2018:01:01:01:00:00 -0.965926    1.034074         -1.931852   -0.866025
2018:01:01:01:15:00 -0.980785    1.019215         -1.961571   -0.896873
2018:01:01:01:30:00 -0.991445    1.008555         -1.982890   -0.923880
2018:01:01:01:45:00 -0.997859    1.002141         -1.995718   -0.946930
2018:01:01:02:00:00 -1.000000    1.000000         -2.000000   -0.965926
2018:01:01:02:15:00 -0.997859    1.002141         -1.995718   -0.980785
2018:01:01:02:30:00 -0.991445    1.008555         -1.982890   -0.991445
2018:01:01:02:45:00 -0.980785    1.019215         -1.961571   -0.997859
2018:01:01:03:00:00 -0.965926    1.034074         -1.931852   -1

On top of the periodic signal, the energy generator receives a significant amount of noise in order to account for energy consumption unrelated to the economic processes of the machines like OS calculations, overhead storage, appliances that are not related to customer processes, updates or similar occurences.

In [3]:
import random

def noise_gen(noise_scaling=1):
    while True:
        yield noise_scaling * random.uniform(80,90)

def apply_noise(value, noise):
    """
    Applies noise to a given value.

    Parameters:
    -----------
    value : float
        The original value to which noise will be applied
    noise : float
        The noise factor to apply

    Returns:
    --------
    float : The value after applying noise
    """
    return value * noise

The relevant component of the signal generation is the actual amount of energy that is typically consumed by the specific processes that are running in the current time interval. In this step the number of running processes is computed using the event log and for each process event the postulated energy consumption is added in accordance with the event type. Additionally, a small amount of surplus energy is added whenever an event has been processed but is still running, or some events are surpassed by follow up events. This is done in order to account for reading, saving and communication processing that occurs whenever states have to change or previous steps have to be saved for later processing.

In [4]:
from math import floor
import event_object
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
from typing import List, Dict

import pandas as pd
import numpy as np
from datetime import datetime, timedelta
from typing import List, Dict, Any, Union

def distribute_event_costs_series(energy_series: pd.Series,
                           events: List['EventObject'],
                           cost_mapping: Dict[str, float],
                           timestamp_format: str = '%Y:%m:%d:%H:%M:%S') -> pd.DataFrame:
    """
    Distributes the cost of events across energy measurement intervals based on event duration.
    Works with a pandas Series with timestamps as index and energy values as data.

    Parameters:
    -----------
    energy_series : pd.Series
        Series with timestamps as index and energy measurements as values
    events : List[EventObject]
        List of EventObject instances with timestamp and duration attributes
    cost_mapping : Dict[str, float]
        Dictionary mapping event types to their base costs
    timestamp_format : str, default='%Y:%m:%d:%H:%M:%S'
        Format string for parsing timestamps if they are strings

    Returns:
    --------
    pd.DataFrame
        DataFrame with columns for timestamp, original energy values, event costs,
        and total (energy + cost)
    """
    # Make a copy of the input series to avoid modifying the original
    series_copy = energy_series.copy()

    # Convert index to datetime if it isn't already
    if not isinstance(series_copy.index, pd.DatetimeIndex):
        # If index is not already datetime, attempt to convert
        try:
            # Try to convert using the specified format
            timestamps = pd.to_datetime(series_copy.index, format=timestamp_format)
            series_copy.index = timestamps
        except ValueError:
            # Fall back to pandas default parser
            try:
                timestamps = pd.to_datetime(series_copy.index)
                series_copy.index = timestamps
            except:
                raise ValueError(
                    f"Could not parse timestamps with format '{timestamp_format}'. "
                    "Please check your data or provide the correct format."
                )

    # Create DataFrame from Series, handling unnamed index
    if series_copy.index.name:
        # Use the existing index name
        timestamp_col_name = series_copy.index.name
    else:
        # Use a default name if index has no name
        timestamp_col_name = 'timestamp'

    # Reset index to convert index to column and name it appropriately
    result_df = series_copy.reset_index()

    # Rename the index column if it's unnamed
    if result_df.columns[0] == 'index':
        result_df.rename(columns={'index': timestamp_col_name}, inplace=True)

    # Rename the values column to 'energy'
    if series_copy.name:
        energy_col_name = series_copy.name
    else:
        energy_col_name = 'energy'
        result_df.rename(columns={0: energy_col_name}, inplace=True)

    # Ensure the timestamp column is in datetime format
    timestamp_col = result_df.columns[0]  # First column contains timestamps
    if not pd.api.types.is_datetime64_any_dtype(result_df[timestamp_col]):
        result_df[timestamp_col] = pd.to_datetime(result_df[timestamp_col])

    # Sort the dataframe by timestamp
    result_df = result_df.sort_values(by=timestamp_col)

    # Add a new column for event costs, initialized to 0
    result_df['event_cost'] = 0.0

    # Calculate the interval duration (assuming equal intervals)
    if len(result_df) > 1:
        interval_duration = (result_df[timestamp_col].iloc[1] -
                            result_df[timestamp_col].iloc[0]).total_seconds()
    else:
        raise ValueError("Energy Series must have at least two elements to determine interval duration")

    # copy result_df three times for each basecost_scaling
    result_df_list = []
    result_df_list.append(result_df)
    result_df_list.append(result_df.copy())
    result_df_list.append(result_df.copy())
    # Process each event
    for event_type in events:
        # Check if the event type has a defined cost
        processed_types = 0
        print(f"Processing {event_type}: {processed_types/len(events)*100}% processed")
        if event_type not in cost_mapping:
            continue
        print(f"{len(events[event_type])} to process for {event_type}")
        base_cost = cost_mapping[event_type]
        for event in events[event_type]:
            # Ensure event timestamp is in datetime format
            event_start = event.timestamp
            if isinstance(event_start, str):
                try:
                    event_start = datetime.strptime(event_start, timestamp_format)
                except ValueError:
                    # Try default parsing if format doesn't match
                    event_start = pd.to_datetime(event_start)

            event_start = event_start
            event_end = event_start + pd.Timedelta(seconds=event.duration)
                    # Ensure event timestamps are timezone-naive
            if event_start.tz is not None:
                event_start = event_start.tz_localize(None)
            if event_end.tz is not None:
                event_end = event_end.tz_localize(None)


        # Find all intervals that overlap with the event

            interval_ends = pd.Series(
                index=result_df.index,
                data=list(result_df.index[1:]) + [result_df.index[-1] + floor(event.duration/pd.Timedelta(seconds=interval_duration).total_seconds())
    ])

            #mask = (result_df[timestamp_col] <= event_end) & (interval_ends >= event_start)

            overlapping_intervals = result_df[
                (result_df[timestamp_col] <= event_end.to_datetime64()) &
                (result_df[timestamp_col].shift(-1, fill_value=result_df[timestamp_col].iloc[-1] +
                                         timedelta(seconds=interval_duration)) >= event_start.to_datetime64())
            ]

            if overlapping_intervals.empty:
                continue

        # Calculate the overlap duration and distribute cost for each interval
            for idx, row in overlapping_intervals.iterrows():
                interval_start = row[timestamp_col]

                # Calculate interval end (using next row's timestamp or adding interval duration for the last row)
                if idx < result_df.index.max():
                    next_idx = result_df.index[result_df.index.get_loc(idx) + 1]
                    interval_end = result_df.loc[next_idx, timestamp_col]
                else:
                    interval_end = interval_start + timedelta(seconds=interval_duration)

                # Calculate overlap between event and this interval
                overlap_start = max(event_start, interval_start)
                overlap_end = min(event_end, interval_end)
                overlap_duration = (overlap_end - overlap_start).total_seconds()

                # Skip if there's no overlap
                if overlap_duration <= 0:
                    continue

                # Calculate the proportion of event duration in this interval
                proportion = overlap_duration / event.duration

                # Distribute cost proportionally
                cost_in_interval = base_cost * proportion

                # Add to the event_cost column
                base_cost_scalings = [0.1, 1, 10]
                for result_df, base_cost_scaling in zip(result_df_list, base_cost_scalings):
                    result_df.at[idx, 'event_cost'] += cost_in_interval * base_cost_scaling

        # Update the processed types count
        processed_types += 1

    # Add a column with the total (energy + event cost)
    for dataframe in result_df_list:
        dataframe['total'] = dataframe[energy_col_name] + dataframe['event_cost']

    return result_df_list


# Example usage:
"""
# Create sample energy dataframe
timestamps = pd.date_range(start='2023-01-01', periods=24, freq='1H')
energy_df = pd.DataFrame({
    'timestamp': timestamps,
    'energy': np.random.uniform(10, 20, size=24)  # Random energy values
})

# Example cost mapping
cost_mapping = {
    'login': 5.0,
    'logout': 2.0,
    'purchase': 10.0,
    'search': 3.0
}

# Assume 'events' is your list of EventObject instances

# Calculate and add event costs
result = distribute_event_costs(energy_df, events, cost_mapping)
print(result)
"""


"\n# Create sample energy dataframe\ntimestamps = pd.date_range(start='2023-01-01', periods=24, freq='1H')\nenergy_df = pd.DataFrame({\n    'timestamp': timestamps,\n    'energy': np.random.uniform(10, 20, size=24)  # Random energy values\n})\n\n# Example cost mapping\ncost_mapping = {\n    'login': 5.0,\n    'logout': 2.0,\n    'purchase': 10.0,\n    'search': 3.0\n}\n\n# Assume 'events' is your list of EventObject instances\n\n# Calculate and add event costs\nresult = distribute_event_costs(energy_df, events, cost_mapping)\nprint(result)\n"

With all elements present the choices of the energy generator can be made. The energy generator will take the following parameters:
- **01.01.2018**: The start date of the energy generation period.
- **31.12.2018**: The end date of the energy generation period.
- **1**: The number of devices to be simulated.
- **1**: The vertical shift of the sine wave.

In order to gauge the efficacy of the energy evaluation the following parameters will be differed in each run:
- **0.1 - 10**: The scaling factor for the noise generator.
- **0.1 - 10**: The scaling factor for the event base costs.
- **0.5 - 1.5**: The amplitude of the sine wave.
- **15min - 1h**: The event cost sampling rate.

The values are scaled by a factor each repetition to encapsule a wide range of possible values. The concrete values are listed within the parameter lists. Whenever one parameter is changed, the others are fixed at the median value. As each process and implementation differ widely, this way the research will help to identify if an application of the method can be considered.

In [5]:
import pickle
from event_object import EventObject
import pandas as pd
import numpy as np
from datetime import datetime, timedelta

#data
events = event_object.read_event_objects_from_pickle("event_objects_all_variants.pkl")
# fixed parameters
start = '2018:01:01:00:00:00'
end = '2018:12:31:23:59:59'
vertical_shift = 1.1
measurements = 1

#experiment parameters
noise_scaling_list = [0.1, 1, 10]
event_base_cost_scaling_list = [0.1, 1, 10]
amplitude_scaling_list = [0.5, 1, 1.5]
sampling_rate_list = ['15m', '30min', '1h']

#Basecosts dictionary
__BASECOSTS__ = {
    "Record Goods Receipt": 99,
    "Create Purchase Order Item": 98,
    "Record Invoice Receipt": 97,
    "Vendor creates invoice": 54,
    "Clear invoice": 56,
    "Record Service Entry Sheet": 81,
    "Remove Payment Block": 59,
    "Create Purchase Requisition Item": 11,
    "Receive Order Confirmation": 66,
    "Change Quantity": 2,
    "Change Price": 1,
    "Delete Purchase Order item": 8,
    "Cancel Invoice Receipt": 9,
    "Change Approval for Purchase Order": 21,
    "Vendor creates debit memo": 20,
    "Change Delivery Indicator": 52,
    "Cancel Goods Receipt": 10,
    "Release Purchase Order": 60,
    "SRM: In Transfer to Execution Syst.": 87,
    "SRM: Created": 78,
    "SRM: Complete": 77,
    "SRM: Awaiting Approval": 63,
    "SRM Document Completed": 70,
    "SRM: Ordered": 44,
    "SRM Change was Transmitted": 92,
    "Reactivate Purchase Order Item": 57,
    "Block Purchase Order Item": 31,
    "Cancel Subsequent Invoice": 32,
    "Change Storage Location": 36,
    "Update Order Confirmation": 22,
    "Record Subsequent Invoice": 40,
    "Release Purchase Requisition": 62,
    "Set Payment Block": 26,
    "SRM: Deleted": 5,
    "Change Currency": 150,
    "Change Final Invoice Indicator": 144,
    "SRM: Transaction Completed": 121,
    "SRM: Incomplete": 111,
    "SRM: Held": 88,
    "Change payment term": 4,
    "Change Rejection Indicator": 200
}

def gen_experiment_parameters():
    '''
    Generates the parameters for the experiment. The parameters are:
    - noise_scaling
    - event_base_cost_scaling
    - amplitude_scaling
    - sampling_rate
    The parameters are generated in a way that each parameter is fixed at the median value of the other parameters, while changing only one parameter through the list.
    :return:
    A list of parameters containing [noise_scaling, event_base_cost_scaling, amplitude_scaling, sampling_rate]
    '''
    for noise in noise_scaling_list:
        yield [noise, 1, '30min']

    for amplitude_scaling in amplitude_scaling_list:
        yield [1, amplitude_scaling, '30min']

    for sampling_rate in sampling_rate_list:
        yield [1, 1, sampling_rate]

    return None


#dataframe
def run_experiment(start, end, vertical_shift,
                   noise_scaling,
                   amplitude_scaling, sampling_rate):
    """
    Run the experiment with the given parameters.
    """
    # Generate timestamps
    dti = generate_timestamp_range(start, end, sampling_rate)
    # Generate sine wave
    dti = generate_sine_wave(dti, amplitude=amplitude_scaling,
                                    vertical_shift=vertical_shift)
    dti.rename('timestamp')
    # Generate noise
    for index, values in dti.items():
        noise = next(noise_gen(noise_scaling))
        dti[index] = apply_noise(values, noise)



    #add event consumption
    dtis = distribute_event_costs_series(dti, events, __BASECOSTS__)

    return dtis

# Run the experiment with all parameters

parameters = list(gen_experiment_parameters())
for parameter in parameters:
    result_dfs = run_experiment(start, end, vertical_shift,parameter[0], parameter[1], parameter[2])
    # Save the results to CSV files
    for result, basecost in zip(result_dfs, event_base_cost_scaling_list):
        with open(f"experiment_{parameter[0]}_{basecost}_{parameter[1]}_{parameter[2]}.csv", "w") as f:
            result.to_csv(f, index=False)
            print(f"Experiment with parameters {parameter} finished.")

    #end

Processing SRM: Created: 0.0% processed
1625 to process for SRM: Created
Processing SRM: Complete: 0.0% processed
1625 to process for SRM: Complete
Processing SRM: Awaiting Approval: 0.0% processed
1625 to process for SRM: Awaiting Approval
Processing SRM: Document Completed: 0.0% processed
Processing SRM: In Transfer to Execution Syst.: 0.0% processed
1687 to process for SRM: In Transfer to Execution Syst.
Processing SRM: Ordered: 0.0% processed
1625 to process for SRM: Ordered
Processing SRM: Change was Transmitted: 0.0% processed
Processing Create Purchase Order Item: 0.0% processed
251352 to process for Create Purchase Order Item
Processing Vendor creates invoice: 0.0% processed
216700 to process for Vendor creates invoice
Processing Record Goods Receipt: 0.0% processed
305837 to process for Record Goods Receipt
Processing Record Invoice Receipt: 0.0% processed
218375 to process for Record Invoice Receipt
Processing Clear Invoice: 0.0% processed
Processing Record Service Entry Shee

KeyboardInterrupt: 

To reduce the waiting time a reindex and sum of the dataframes is possible to illustrate how behavior of the data changes when the measurement interval is changed. Also the changes of the periodic signal can be converted by multiplication/addition of the non event energy values. Consider that this does conserve the random noise of the original dataframe and a generation from the method above will apply the noise again during generation.

In [7]:
def convert_30min_to_1h_reset_index(df, timestamp_column='timestamp'):
    """
    Convert a pandas DataFrame with 30-minute interval timestamps in a column to 1-hour intervals
    by summing pairs of consecutive rows.

    Parameters:
    df (pandas.DataFrame): DataFrame with timestamps in a column at 30-minute intervals
    timestamp_column (str): Name of the column containing timestamp values

    Returns:
    pandas.DataFrame: DataFrame with timestamps at 1-hour intervals
    """
    # Create a copy of the input DataFrame
    df_copy = df.copy()

    # Ensure the timestamp column is datetime type
    df_copy[timestamp_column] = pd.to_datetime(df_copy[timestamp_column])

    # Set the timestamp column as the index temporarily
    df_indexed = df_copy.set_index(timestamp_column)

    # Resample to hourly intervals and sum
    df_hourly = df_indexed.resample('1h').sum()

    # Reset the index to move timestamps back to a column
    df_hourly_reset = df_hourly.reset_index()

    return df_hourly_reset

def resize_sine_wave(df, sine_amplitude_scalar, timestamp_column='timestamp'):
    """
    Resize a DataFrame with sine wave values to a new amplitude.

    Parameters:
    df (pandas.DataFrame): DataFrame with sine wave values
    sine_amplitude_scalar (float): Scalar to resize the sine wave amplitude
    timestamp_column (str): Name of the column containing timestamp values

    Returns:
    pandas.DataFrame: Resized DataFrame with new sine amplitude
    """
    # Ensure the timestamp column is datetime type
    df[timestamp_column] = pd.to_datetime(df[timestamp_column])

    # adjust total energy consumption by adding the sine wave energy
    df['total'] = df['total'] + df['energy'] * (1-sine_amplitude_scalar)

    # adjust energy values by multiplying with the sine amplitude scalar
    df['energy'] = df['energy'] * sine_amplitude_scalar


    return df

resample_experiments = []
with open('experiment_1_0.1_1_30min.csv', 'r') as f:
    resample_experiments.append(pd.read_csv(f))
with open('experiment_1_1_1_30min.csv', 'r') as f:
    resample_experiments.append(pd.read_csv(f))
with open('experiment_1_10_1_30min.csv', 'r') as f:
    resample_experiments.append(pd.read_csv(f))

for dataframe, basecost_name in zip(resample_experiments, [0.1, 1, 10]):
    dataframe = convert_30min_to_1h_reset_index(dataframe)
    with open(f"resample_experiment_1_{basecost_name}_1_to1h.csv", "w") as f:
        dataframe.to_csv(f, index=False)
        print(f"Resize for {basecost_name} finished.")

scale_amplitude_experiments = []
with open('experiment_1_0.1_1_30min.csv', 'r') as f:
    scale_amplitude_experiments.append(pd.read_csv(f))
with open('experiment_1_1_1_30min.csv', 'r') as f:
    scale_amplitude_experiments.append(pd.read_csv(f))
with open('experiment_1_10_1_30min.csv', 'r') as f:
    scale_amplitude_experiments.append(pd.read_csv(f))

for scale_dataframe, basecost_name in zip(scale_amplitude_experiments, [0.1, 1, 10]):
    scale_dataframe = resize_sine_wave(scale_dataframe, 1.5)
    with open(f"rescale_experiment_1_{basecost_name}_to1.5_30min.csv", "w") as f:
        scale_dataframe.to_csv(f, index=False)
        print(f"Rescale for {basecost_name} finished.")

Resize for 0.1 finished.
Resize for 1 finished.
Resize for 10 finished.
Resize for 0.1 finished.
Resize for 1 finished.
Resize for 10 finished.
