In [None]:
import pandas as pd

### Aux functions

In [None]:
def expand_by_probability_deterministic(df: pd.DataFrame,
                                        prob_col: str = "prob",
                                        total_rows: int = 1000) -> pd.DataFrame:
    """Expand ``df`` by repeating each row in proportion to its probability.

    Every row is repeated ``round(prob * total_rows)`` times. Copies of the same
    source row are contiguous and follow the original row order. The probability
    column is dropped from the result and the index is reset to 0..n-1.

    Parameters
    ----------
    df : pd.DataFrame
        Input rows; must contain the column named by ``prob_col``.
    prob_col : str
        Name of the column holding the per-row probability (numeric, or strings
        that parse as numbers).
    total_rows : int
        Target number of rows in the output. The actual length may differ
        slightly when the probabilities do not sum to 1 or when individual
        products ``prob * total_rows`` are not whole numbers.

    Returns
    -------
    pd.DataFrame
        The expanded frame without ``prob_col``.

    Raises
    ------
    KeyError
        If ``prob_col`` is not a column of ``df``.
    ValueError
        If ``total_rows`` is negative, or if any probability is missing,
        non-numeric, negative or infinite.
    """
    target = int(total_rows)
    if target < 0:
        raise ValueError(f"total_rows must be non-negative, got {total_rows!r}")

    # Non-numeric entries are coerced to NaN so that all bad values can be
    # reported in one explicit error instead of an opaque integer-cast failure.
    p = pd.to_numeric(df[prob_col], errors="coerce")
    invalid = p.isna() | (p < 0) | (p == float("inf"))
    if invalid.any():
        raise ValueError(
            f"Column {prob_col!r} must contain finite, non-negative numbers; "
            f"found {int(invalid.sum())} invalid value(s)."
        )

    # Number of copies per source row. Round to the nearest integer BEFORE casting:
    # a plain int cast truncates, so a product such as 0.29 * 100 == 28.999999999999996
    # would silently lose a row. (Series.round rounds exact halves to the even neighbour.)
    scaled = (p * target).round().astype(int)

    # Repeat on a fresh 0..n-1 index rather than on the caller's labels: selecting
    # by label would return every row sharing a duplicated label, inflating the output.
    base = df.drop(columns=prob_col).reset_index(drop=True)
    repeated_index = base.index.repeat(scaled.to_numpy())
    return base.loc[repeated_index].reset_index(drop=True)

### Main code

In [None]:
# Load the workload specification table (one row per pod type, with its probability).
orig_specs = pd.read_csv('./specs_2/specs workload EFRA.csv')

# Quick sanity checks: column dtypes / null counts, and the total probability mass.
orig_specs.info()
prob_sum = orig_specs['prob'].sum()
print(f"Sum of probabilities: {prob_sum}")

# Show the loaded specification.
display(orig_specs)

In [None]:
# Create a synthetic workload: each spec row is repeated in proportion to its probability.
simulated_workload = expand_by_probability_deterministic(orig_specs, 'prob', 1000)

# Shuffle the pods. The seed is fixed so that re-running the notebook from a fresh kernel
# regenerates exactly the same pod order (and therefore the same exported workload).
SHUFFLE_SEED = 42
simulated_workload = simulated_workload.sample(frac=1, random_state=SHUFFLE_SEED).reset_index(drop=True)

# Rename the pods after their (post-shuffle) position, zero-padded to at least 4 digits.
simulated_workload['name'] = 'openb-pod-' + simulated_workload.index.astype(str).str.zfill(4)

simulated_workload

In [None]:
# Initialize the remaining columns expected by the simulator.
# Use constant values -- they will be ignored during the simulations anyways...
#
# `assign` sets each column explicitly by name (string columns stay strings, integer
# columns stay integers) instead of relying on `.loc` enlargement with a broadcast tuple.
simulated_workload = simulated_workload.assign(
    qos='LS',
    pod_phase='Running',
    creation_time=0,
    deletion_time=20,
    scheduled_time=10,
)

# Export the workload without the positional index column.
simulated_workload.to_csv('./openb_pod_list_default.csv', index = False)
simulated_workload