In [41]:
import pandas as pd 

# Column names for the 18 fields of a Standard Workload Format (SWF) record, in file order.
# Field reference: https://www.cs.huji.ac.il/labs/parallel/workload/swf.html
columns = [
    'jobId',            # field 1:  job number
    'submitDate',       # field 2:  submit time
    'waitTime',         # field 3:  wait time
    'runTime',          # field 4:  run time
    'proc alloc',       # field 5:  allocated processors
    'cpu used',         # field 6:  CPU time used
    'mem used',         # field 7:  used memory
    'proc req',         # field 8:  requested processors
    'user est',         # field 9:  requested (user-estimated) time
    'mem req',          # field 10: requested memory
    'status',           # field 11: status
    'userId',           # field 12: user id
    'groupId',          # field 13: group id
    'exe num',          # field 14: executable number
    'queueId',          # field 15: queue number
    'partitionNumber',  # field 16: partition number
    'prevJobId',        # field 17: preceding job number
    'think time',       # field 18: think time from preceding job
]

# Read the tab-separated trace, skipping the first 58 lines of the file
# (presumably the SWF header comments -- TODO confirm), then discard the
# fields that the rest of the notebook does not use.
df = pd.read_csv(
    "../data/KIT-FH2-2016-1.swf",
    names=columns,
    sep="\t",
    skiprows=58,
    index_col=False,
).drop(columns=['think time', 'prevJobId', 'exe num', 'cpu used', 'mem used'])
df

Unnamed: 0,jobId,submitDate,waitTime,runTime,proc alloc,proc req,user est,mem req,status,userId,groupId,queueId,partitionNumber
0,1,0,4,14400,100,100,14400,-1,1,1,1,1,1
1,2,37,0,14400,100,100,14400,-1,1,1,1,1,1
2,3,76068,9,45,20,20,3600,-1,1,2,2,1,1
3,4,108213,16386,154,1,1,3600,-1,1,3,3,1,1
4,5,108300,16977,50,1,1,3600,-1,1,3,3,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
114350,114351,50332433,3,60,1,1,600,-1,1,45,45,1,1
114351,114352,50332603,1,92,20,20,600,-1,1,45,45,1,1
114352,114353,50332628,3,49,20,20,300,-1,1,150,150,1,1
114353,114354,50332673,7,65,1,1,600,-1,1,45,45,1,1


## Convert submitDate to real date 

In [42]:
import pandas as pd
import datetime
import pytz

# Reference instant that the relative submit times (in seconds) are measured from.
# NOTE(review): presumably the trace start time taken from the SWF header -- confirm against the file.
reference_date_str = "Wed Jun 01 02:12:45 2016"
reference_date_format = "%a %b %d %H:%M:%S %Y"
timezone = "Europe/Berlin"

# Convert the reference date to a timezone-aware datetime object
reference_date = datetime.datetime.strptime(reference_date_str, reference_date_format)
reference_date = pytz.timezone(timezone).localize(reference_date)


def seconds_to_date(seconds):
    """Convert an offset in seconds from ``reference_date`` to an absolute time.

    Parameters
    ----------
    seconds : number or pandas.Series of numbers
        Offset(s) in seconds relative to ``reference_date``.

    Returns
    -------
    pandas.Timestamp or pandas.Series
        Timezone-aware timestamp(s) in the zone of ``reference_date``. A scalar
        input yields a single ``Timestamp`` (a ``datetime`` subclass); a Series
        input is converted in one vectorised operation.
    """
    return pd.Timestamp(reference_date) + pd.to_timedelta(seconds, unit='s')


# Convert the whole column at once instead of calling a Python function per row.
# The dtype guard keeps the cell re-runnable: once 'submitDate' already holds
# datetimes, converting it a second time would raise instead of being a no-op.
if not pd.api.types.is_datetime64_any_dtype(df['submitDate']):
    df['submitDate'] = seconds_to_date(df['submitDate'])

# Show the result
df

Unnamed: 0,jobId,submitDate,waitTime,runTime,proc alloc,proc req,user est,mem req,status,userId,groupId,queueId,partitionNumber
0,1,2016-06-01 02:12:45+02:00,4,14400,100,100,14400,-1,1,1,1,1,1
1,2,2016-06-01 02:13:22+02:00,0,14400,100,100,14400,-1,1,1,1,1,1
2,3,2016-06-01 23:20:33+02:00,9,45,20,20,3600,-1,1,2,2,1,1
3,4,2016-06-02 08:16:18+02:00,16386,154,1,1,3600,-1,1,3,3,1,1
4,5,2016-06-02 08:17:45+02:00,16977,50,1,1,3600,-1,1,3,3,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
114350,114351,2018-01-04 14:26:38+01:00,3,60,1,1,600,-1,1,45,45,1,1
114351,114352,2018-01-04 14:29:28+01:00,1,92,20,20,600,-1,1,45,45,1,1
114352,114353,2018-01-04 14:29:53+01:00,3,49,20,20,300,-1,1,150,150,1,1
114353,114354,2018-01-04 14:30:38+01:00,7,65,1,1,600,-1,1,45,45,1,1


In [43]:
# TODO: validate this logic.
# Start Date = submit time + waitTime, End Date = Start Date + runTime; both
# offsets are interpreted as seconds.
df["Start Date"] = df["submitDate"].add(pd.to_timedelta(df["waitTime"], unit="s"))
df["End Date"] = df["Start Date"].add(pd.to_timedelta(df["runTime"], unit="s"))
df

Unnamed: 0,jobId,submitDate,waitTime,runTime,proc alloc,proc req,user est,mem req,status,userId,groupId,queueId,partitionNumber,Start Date,End Date
0,1,2016-06-01 02:12:45+02:00,4,14400,100,100,14400,-1,1,1,1,1,1,2016-06-01 02:12:49+02:00,2016-06-01 06:12:49+02:00
1,2,2016-06-01 02:13:22+02:00,0,14400,100,100,14400,-1,1,1,1,1,1,2016-06-01 02:13:22+02:00,2016-06-01 06:13:22+02:00
2,3,2016-06-01 23:20:33+02:00,9,45,20,20,3600,-1,1,2,2,1,1,2016-06-01 23:20:42+02:00,2016-06-01 23:21:27+02:00
3,4,2016-06-02 08:16:18+02:00,16386,154,1,1,3600,-1,1,3,3,1,1,2016-06-02 12:49:24+02:00,2016-06-02 12:51:58+02:00
4,5,2016-06-02 08:17:45+02:00,16977,50,1,1,3600,-1,1,3,3,1,1,2016-06-02 13:00:42+02:00,2016-06-02 13:01:32+02:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
114350,114351,2018-01-04 14:26:38+01:00,3,60,1,1,600,-1,1,45,45,1,1,2018-01-04 14:26:41+01:00,2018-01-04 14:27:41+01:00
114351,114352,2018-01-04 14:29:28+01:00,1,92,20,20,600,-1,1,45,45,1,1,2018-01-04 14:29:29+01:00,2018-01-04 14:31:01+01:00
114352,114353,2018-01-04 14:29:53+01:00,3,49,20,20,300,-1,1,150,150,1,1,2018-01-04 14:29:56+01:00,2018-01-04 14:30:45+01:00
114353,114354,2018-01-04 14:30:38+01:00,7,65,1,1,600,-1,1,45,45,1,1,2018-01-04 14:30:45+01:00,2018-01-04 14:31:50+01:00


## Add count data

In [55]:
# Build the count data from the first 10 jobs only.
# An explicit copy is passed because the callee apparently assigns a "Period"
# column on the frame it receives; doing that on the bare `df.iloc[:10]` slice
# triggers pandas' SettingWithCopyWarning and leaves it ambiguous whether the
# parent `df` is touched.
df_count = create_count_data(df.iloc[:10].copy(), interval_length=30)
df_count

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, "Period"] = np.array(


Unnamed: 0,Period,Sessions
0,2016-06-01 02:00:00+02:00,3500
1,2016-06-01 02:30:00+02:00,6000
2,2016-06-01 03:00:00+02:00,6000
3,2016-06-01 03:30:00+02:00,6000
4,2016-06-01 04:00:00+02:00,6000
...,...,...
164,2016-06-04 12:00:00+02:00,30000
165,2016-06-04 12:30:00+02:00,30000
166,2016-06-04 13:00:00+02:00,30000
167,2016-06-04 13:30:00+02:00,30000


In [47]:
# Re-inspect the full frame; the displayed output now also carries a "Period" column.
# NOTE(review): presumably added by an earlier create_count_data call on the whole
# frame (this cell's execution count is out of order) -- confirm on a fresh kernel.
df

Unnamed: 0,jobId,submitDate,waitTime,runTime,proc alloc,proc req,user est,mem req,status,userId,groupId,queueId,partitionNumber,Start Date,End Date,Period
0,1,2016-06-01 02:12:45+02:00,4,14400,100,100,14400,-1,1,1,1,1,1,2016-06-01 02:12:49+02:00,2016-06-01 06:12:49+02:00,"DatetimeIndex(['2016-06-01 02:12:49+02:00', '2..."
1,2,2016-06-01 02:13:22+02:00,0,14400,100,100,14400,-1,1,1,1,1,1,2016-06-01 02:13:22+02:00,2016-06-01 06:13:22+02:00,"DatetimeIndex(['2016-06-01 02:13:22+02:00', '2..."
2,3,2016-06-01 23:20:33+02:00,9,45,20,20,3600,-1,1,2,2,1,1,2016-06-01 23:20:42+02:00,2016-06-01 23:21:27+02:00,"DatetimeIndex(['2016-06-01 23:20:42+02:00'], d..."
3,4,2016-06-02 08:16:18+02:00,16386,154,1,1,3600,-1,1,3,3,1,1,2016-06-02 12:49:24+02:00,2016-06-02 12:51:58+02:00,"DatetimeIndex(['2016-06-02 12:49:24+02:00', '2..."
4,5,2016-06-02 08:17:45+02:00,16977,50,1,1,3600,-1,1,3,3,1,1,2016-06-02 13:00:42+02:00,2016-06-02 13:01:32+02:00,"DatetimeIndex(['2016-06-02 13:00:42+02:00'], d..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
114350,114351,2018-01-04 14:26:38+01:00,3,60,1,1,600,-1,1,45,45,1,1,2018-01-04 14:26:41+01:00,2018-01-04 14:27:41+01:00,"DatetimeIndex(['2018-01-04 14:26:41+01:00', '2..."
114351,114352,2018-01-04 14:29:28+01:00,1,92,20,20,600,-1,1,45,45,1,1,2018-01-04 14:29:29+01:00,2018-01-04 14:31:01+01:00,"DatetimeIndex(['2018-01-04 14:29:29+01:00', '2..."
114352,114353,2018-01-04 14:29:53+01:00,3,49,20,20,300,-1,1,150,150,1,1,2018-01-04 14:29:56+01:00,2018-01-04 14:30:45+01:00,"DatetimeIndex(['2018-01-04 14:29:56+01:00'], d..."
114353,114354,2018-01-04 14:30:38+01:00,7,65,1,1,600,-1,1,45,45,1,1,2018-01-04 14:30:45+01:00,2018-01-04 14:31:50+01:00,"DatetimeIndex(['2018-01-04 14:30:45+01:00', '2..."


In [46]:
def static_censor(df_in, censor_threshold = [2,2,2,2,2,2,2,2], min_tot_plugs = [2,2,2,2,2,2,2,2], censor_scheme = 1, 
                  to_train_data = False, cap_observed_values = False):
    """Return a copy of ``df_in`` with censoring columns added for each session column.

    The first 8 columns of ``df_in`` are treated as observed session counts and
    the columns from position 8 onwards as the matching capacities (number of
    plugs); the i-th session column is paired with the i-th capacity column.

    For every pair two columns are added:

    * ``<session>_CENSORED``: the censoring level tau. It starts as the capacity
      and is replaced, on rows where the capacity exceeds ``min_tot_plugs[i]``, by
      ``censor_threshold[i]`` (scheme 1) or ``capacity - censor_threshold[i]``
      (scheme 2).
    * ``<session>_IS_CENSORED``: starts as 1.0 and is set to 0 on rows where the
      observed value is greater than or equal to tau.

    Parameters
    ----------
    df_in : pandas.DataFrame
        Input frame; it is not modified.
    censor_threshold : sequence of numbers, one per session column
        Scheme 1: the value of tau itself. Scheme 2: the amount subtracted from
        the capacity to obtain tau.
    min_tot_plugs : sequence of numbers, one per session column
        Censoring is only applied on rows whose capacity is strictly greater
        than this value.
    censor_scheme : int
        1 for the static scheme, 2 for the dynamic scheme. With any other value
        the two columns above are still added but keep their initial values.
    to_train_data : bool
        If true (schemes 1 and 2 only), the observed value is first saved as
        ``<session>_TRUE``, then overwritten by tau on rows flagged 0; the
        capacity column is overwritten by tau and renamed ``<session>_TAU``, and
        the ``_CENSORED`` / ``_IS_CENSORED`` helper columns are dropped.
    cap_observed_values : bool
        If true, observed values larger than the capacity are first truncated
        to the capacity.

    Returns
    -------
    pandas.DataFrame
        A new frame with the additional columns described above.

    Raises
    ------
    IndexError
        If ``df_in`` has fewer capacity columns than session columns, or if
        ``censor_threshold`` / ``min_tot_plugs`` are too short for the scheme used.
    """
    # The default lists are only ever read, never mutated, so sharing them
    # between calls is harmless.
    df_out = df_in.copy()

    session_cols = df_in.columns[0:8].values
    capacity_cols = df_in.columns[8:].values
    if len(capacity_cols) < len(session_cols):
        # Fail before doing any work, with a message that names the real problem,
        # instead of an opaque out-of-bounds error halfway through the loop.
        raise IndexError(
            f"expected at least {len(session_cols)} capacity columns after the "
            f"session columns, found {len(capacity_cols)}"
        )

    n_rows = len(df_out)

    for i, session_name in enumerate(session_cols):
        cap_name = capacity_cols[i]
        tau_col = session_name + '_CENSORED'
        flag_col = session_name + '_IS_CENSORED'

        if cap_observed_values:
            # Truncate observations to at most the number of available plugs.
            over_capacity = df_out[session_name] > df_out[cap_name]
            df_out.loc[over_capacity, session_name] = df_out[cap_name]

        # Defaults: tau equals the capacity and every row is flagged 1.
        df_out[tau_col] = df_out[cap_name]
        df_out[flag_col] = np.ones(n_rows)

        if censor_scheme not in (1, 2):
            # Unknown scheme: keep the default columns and move on.
            continue

        if censor_scheme == 1:
            # Static scheme: tau is the user-defined threshold itself.
            tau = censor_threshold[i]
        else:
            # Dynamic scheme: tau follows the capacity, so it changes whenever
            # the plug capacity changes.
            tau = df_out[cap_name] - censor_threshold[i]

        # Censoring only applies once the capacity exceeds the minimum.
        df_out.loc[df_out[cap_name] > min_tot_plugs[i], tau_col] = tau
        # Any observation equal to or above tau is treated as censored (flag 0).
        df_out.loc[df_out[session_name] >= df_out[tau_col], flag_col] = 0

        if to_train_data:
            # Keep the real observation as the true label, then replace the
            # observed value by tau wherever the row is flagged as censored.
            df_out[session_name + '_TRUE'] = df_out[session_name]
            df_out.loc[df_out[flag_col] == 0, session_name] = df_out[tau_col]
            df_out[cap_name] = df_out[tau_col]

            df_out = df_out.rename(columns={cap_name: session_name + '_TAU'})
            df_out = df_out.drop(columns=[tau_col, flag_col])

    return df_out