## Preprocessing & pipelines

### Imports

Here we import the data, and do the basic initial transformations like creating the unix time column

In [1]:
import numpy as np
import pandas as pd
import os
import psutil

np.random.seed(42)

%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)
plt.rcParams["figure.figsize"] = [16, 11]
plt.rcParams["figure.autolayout"] = True

import time
import datetime
pd.set_option('display.float_format', lambda x: '%.3f' % x)

In [2]:
# helper functions

def data_split(df):
    """returns 10% of the data"""
    return df[: int((len(df)/10))]


def EventTime(data):
    """function to add the time of the event"""
    for i in list(data["case concept:name"].unique()):
        data.loc[data["case concept:name"] == i, "nextTIME"] = data.loc[data["case concept:name"] == i, 
                                                                "event time:timestamp"].shift(-1)
    return data

def PrevEventTime(data):
    """function to add the time of the event"""
    for i in list(data["case concept:name"].unique()):
        data.loc[data["case concept:name"] == i, "prevTIME"] = data.loc[data["case concept:name"] == i, 
                                                                "event time:timestamp"].shift(1)
    return data

def next_event(data, lst, case, nxt, name):
    """function to add the next event of a trace"""
    for i in lst:
        data.loc[data[case] == i, nxt] = data.loc[data[case] == i, name].shift(-1)
    return data

def prev_event(data, lst, case, nxt, name):
    """function to add the next event of a trace"""
    for i in lst:
        data.loc[data[case] == i, nxt] = data.loc[data[case] == i, name].shift(1)
    return data

def UnixTime(df, col="timestamp", newcol="Unix"):
    """Adds a new column to the dataframe containing the UNIX time of the timestamp"""
    cop = df.copy()
    cop[col] = pd.to_datetime(cop[col], dayfirst=True)
    unixTransform = lambda x: time.mktime(x.timetuple())
    df[newcol] = cop[col].apply(unixTransform)

In [3]:
path = r"D:/University/Year 2/Q3/DBL/Data/"
train = pd.read_csv(f'{path}BPI Challenge 2017/BPI_Challenge_2017-training.csv')
test = pd.read_csv(f'{path}BPI Challenge 2017/BPI_Challenge_2017-test.csv')


data = pd.concat([train, test])

In [4]:
# strip whitespace of column names and add extra time columns
data.columns = data.columns.str.strip()
data["timestamp"] = data["event time:timestamp"].copy()
data["event time:timestamp"] = pd.to_datetime(data["event time:timestamp"], dayfirst=True)
data['time of day'] = data["timestamp"].str.split(expand=True)[1]

In [5]:
data["weekday"] = data["event time:timestamp"].dt.day_name()

In [6]:
UnixTime(data)

In [7]:
data.sort_values(by=['Unix'], inplace=True)
data.head()

Unnamed: 0,eventID,case LoanGoal,case ApplicationType,case concept:name,case RequestedAmount,event Action,event org:resource,event concept:name,event EventOrigin,event EventID,event lifecycle:transition,event time:timestamp,timestamp,time of day,weekday,Unix
0,0,Existing loan takeover,New credit,Application_652823628,20000.0,Created,User_1,A_Create Application,Application,Application_652823628,complete,2016-01-01 10:51:15.304,01-01-2016 10:51:15.304,10:51:15.304,Friday,1451641875.0
1,1,Existing loan takeover,New credit,Application_652823628,20000.0,statechange,User_1,A_Submitted,Application,ApplState_1582051990,complete,2016-01-01 10:51:15.352,01-01-2016 10:51:15.352,10:51:15.352,Friday,1451641875.0
2,2,Existing loan takeover,New credit,Application_652823628,20000.0,Created,User_1,W_Handle leads,Workflow,Workitem_1298499574,schedule,2016-01-01 10:51:15.774,01-01-2016 10:51:15.774,10:51:15.774,Friday,1451641875.0
3,3,Existing loan takeover,New credit,Application_652823628,20000.0,Deleted,User_1,W_Handle leads,Workflow,Workitem_1673366067,withdraw,2016-01-01 10:52:36.392,01-01-2016 10:52:36.392,10:52:36.392,Friday,1451641956.0
4,4,Existing loan takeover,New credit,Application_652823628,20000.0,Created,User_1,W_Complete application,Workflow,Workitem_1493664571,schedule,2016-01-01 10:52:36.403,01-01-2016 10:52:36.403,10:52:36.403,Friday,1451641956.0


In [8]:
# test = data[: 100].copy()

In [9]:
# drop useless columns
def dropper(df, lbls=["eventID", "event EventID", "timestamp"]):
    df.drop(labels=lbls, axis=1, inplace=True)

dropper(data)

In [10]:
data

Unnamed: 0,case LoanGoal,case ApplicationType,case concept:name,case RequestedAmount,event Action,event org:resource,event concept:name,event EventOrigin,event lifecycle:transition,event time:timestamp,time of day,weekday,Unix
0,Existing loan takeover,New credit,Application_652823628,20000.000,Created,User_1,A_Create Application,Application,complete,2016-01-01 10:51:15.304,10:51:15.304,Friday,1451641875.000
1,Existing loan takeover,New credit,Application_652823628,20000.000,statechange,User_1,A_Submitted,Application,complete,2016-01-01 10:51:15.352,10:51:15.352,Friday,1451641875.000
2,Existing loan takeover,New credit,Application_652823628,20000.000,Created,User_1,W_Handle leads,Workflow,schedule,2016-01-01 10:51:15.774,10:51:15.774,Friday,1451641875.000
3,Existing loan takeover,New credit,Application_652823628,20000.000,Deleted,User_1,W_Handle leads,Workflow,withdraw,2016-01-01 10:52:36.392,10:52:36.392,Friday,1451641956.000
4,Existing loan takeover,New credit,Application_652823628,20000.000,Created,User_1,W_Complete application,Workflow,schedule,2016-01-01 10:52:36.403,10:52:36.403,Friday,1451641956.000
...,...,...,...,...,...,...,...,...,...,...,...,...,...
241054,Existing loan takeover,New credit,Application_637536789,17250.000,statechange,User_131,A_Validating,Application,complete,2017-02-01 15:00:30.347,15:00:30.347,Wednesday,1485957630.000
241055,Existing loan takeover,New credit,Application_637536789,17250.000,Released,User_131,W_Validate application,Workflow,suspend,2017-02-01 15:01:24.191,15:01:24.191,Wednesday,1485957684.000
241056,"Other, see explanation",New credit,Application_16193774,45000.000,Released,User_56,W_Call incomplete files,Workflow,suspend,2017-02-01 15:09:03.331,15:09:03.331,Wednesday,1485958143.000
241057,Existing loan takeover,New credit,Application_586083090,50000.000,Obtained,User_100,W_Validate application,Workflow,resume,2017-02-01 15:10:52.793,15:10:52.793,Wednesday,1485958252.000


In [11]:
data["Unix"] = data["Unix"].astype(int)

### Event/time adder

In [12]:
# assign long column names to variables for easier use
cases = "case concept:name"
reg_date = "case REG_DATE"
amount_req = "case AMOUNT_REQ"
event_name = "event concept:name"
lifecycle = "event lifecycle:transition"
tmstmp = "event time:timestamp"
nxt_event = "next event"
dtime = "delta time"

In [13]:
# event shifting
data[nxt_event] = data.groupby('case concept:name')['event concept:name'].shift(-1)
data["prev event"] = data.groupby('case concept:name')['event concept:name'].shift(1)

In [14]:
# time shifting
data["nextUnix"] = data.groupby('case concept:name')['Unix'].shift(-1)
data["prevUnix"] = data.groupby('case concept:name')['Unix'].shift(1)
data["nextTime"] = data.groupby('case concept:name')['event time:timestamp'].shift(-1)
data["prevTime"] = data.groupby('case concept:name')['event time:timestamp'].shift(1)

In [15]:
len(data[data["prevTime"].isna()])/len(data)

0.026207988741269617

In [16]:
len(data[data["prevTime"].isna()])

31509

In [17]:
len(data)

1202267

In [18]:
import math

# Adding time features
data['day'] = data['event time:timestamp'].dt.day;
data['month'] = data['event time:timestamp'].dt.month;
data['hour'] = data['event time:timestamp'].dt.hour;
data['day_of_week'] = data['event time:timestamp'].dt.weekday;

# Cyclical encoding
data["hour"] = 2 * math.pi * data["hour"] / data["hour"].max()
data["hour_cos"] = np.cos(data["hour"])
data["hour_sin"] = np.sin(data["hour"])
data["day_of_week"] = 2 * math.pi * data["day_of_week"] / data["day_of_week"].max()
data["day_of_week_cos"] = np.cos(data["day_of_week"])
data["day_of_week_sin"] = np.sin(data["day_of_week"])

In [28]:
data['day_of_week'].unique()

array([4.1887902 , 5.23598776, 6.28318531, 0.        , 1.04719755,
       2.0943951 , 3.14159265])

In [20]:
data

Unnamed: 0,case LoanGoal,case ApplicationType,case concept:name,case RequestedAmount,event Action,event org:resource,event concept:name,event EventOrigin,event lifecycle:transition,event time:timestamp,...,nextTime,prevTime,day,month,hour,day_of_week,hour_cos,hour_sin,day_of_week_cos,day_of_week_sin
0,Existing loan takeover,New credit,Application_652823628,20000.000,Created,User_1,A_Create Application,Application,complete,2016-01-01 10:51:15.304,...,2016-01-01 10:51:15.352,NaT,1,1,2.732,4.189,-0.917,0.398,-0.500,-0.866
1,Existing loan takeover,New credit,Application_652823628,20000.000,statechange,User_1,A_Submitted,Application,complete,2016-01-01 10:51:15.352,...,2016-01-01 10:51:15.774,2016-01-01 10:51:15.304,1,1,2.732,4.189,-0.917,0.398,-0.500,-0.866
2,Existing loan takeover,New credit,Application_652823628,20000.000,Created,User_1,W_Handle leads,Workflow,schedule,2016-01-01 10:51:15.774,...,2016-01-01 10:52:36.392,2016-01-01 10:51:15.352,1,1,2.732,4.189,-0.917,0.398,-0.500,-0.866
3,Existing loan takeover,New credit,Application_652823628,20000.000,Deleted,User_1,W_Handle leads,Workflow,withdraw,2016-01-01 10:52:36.392,...,2016-01-01 10:52:36.403,2016-01-01 10:51:15.774,1,1,2.732,4.189,-0.917,0.398,-0.500,-0.866
4,Existing loan takeover,New credit,Application_652823628,20000.000,Created,User_1,W_Complete application,Workflow,schedule,2016-01-01 10:52:36.403,...,2016-01-01 10:52:36.413,2016-01-01 10:52:36.392,1,1,2.732,4.189,-0.917,0.398,-0.500,-0.866
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
241054,Existing loan takeover,New credit,Application_637536789,17250.000,statechange,User_131,A_Validating,Application,complete,2017-02-01 15:00:30.347,...,2017-02-01 15:01:24.191,2017-02-01 15:00:30.275,1,2,4.098,2.094,-0.577,-0.817,-0.500,0.866
241055,Existing loan takeover,New credit,Application_637536789,17250.000,Released,User_131,W_Validate application,Workflow,suspend,2017-02-01 15:01:24.191,...,NaT,2017-02-01 15:00:30.347,1,2,4.098,2.094,-0.577,-0.817,-0.500,0.866
241056,"Other, see explanation",New credit,Application_16193774,45000.000,Released,User_56,W_Call incomplete files,Workflow,suspend,2017-02-01 15:09:03.331,...,NaT,2017-02-01 14:50:56.564,1,2,4.098,2.094,-0.577,-0.817,-0.500,0.866
241057,Existing loan takeover,New credit,Application_586083090,50000.000,Obtained,User_100,W_Validate application,Workflow,resume,2017-02-01 15:10:52.793,...,2017-02-01 15:11:03.499,2017-02-01 14:54:03.544,1,2,4.098,2.094,-0.577,-0.817,-0.500,0.866


In [21]:
# lst_events = data[cases].unique().tolist()
# next_event(data, lst_events, cases, nxt_event, event_name);
# EventTime(data);

In [22]:
data.to_pickle("processed2017.pkl")

In [23]:
# data["timestamp"] = data["event time:timestamp"].copy()
# data['time of day'] = data["timestamp"].str.split(expand=True)[1]
# data["event time:timestamp"] = pd.to_datetime(data["event time:timestamp"], dayfirst=True)

In [24]:
test = data[: 100]
test

Unnamed: 0,case LoanGoal,case ApplicationType,case concept:name,case RequestedAmount,event Action,event org:resource,event concept:name,event EventOrigin,event lifecycle:transition,event time:timestamp,...,nextTime,prevTime,day,month,hour,day_of_week,hour_cos,hour_sin,day_of_week_cos,day_of_week_sin
0,Existing loan takeover,New credit,Application_652823628,20000.000,Created,User_1,A_Create Application,Application,complete,2016-01-01 10:51:15.304,...,2016-01-01 10:51:15.352,NaT,1,1,2.732,4.189,-0.917,0.398,-0.500,-0.866
1,Existing loan takeover,New credit,Application_652823628,20000.000,statechange,User_1,A_Submitted,Application,complete,2016-01-01 10:51:15.352,...,2016-01-01 10:51:15.774,2016-01-01 10:51:15.304,1,1,2.732,4.189,-0.917,0.398,-0.500,-0.866
2,Existing loan takeover,New credit,Application_652823628,20000.000,Created,User_1,W_Handle leads,Workflow,schedule,2016-01-01 10:51:15.774,...,2016-01-01 10:52:36.392,2016-01-01 10:51:15.352,1,1,2.732,4.189,-0.917,0.398,-0.500,-0.866
3,Existing loan takeover,New credit,Application_652823628,20000.000,Deleted,User_1,W_Handle leads,Workflow,withdraw,2016-01-01 10:52:36.392,...,2016-01-01 10:52:36.403,2016-01-01 10:51:15.774,1,1,2.732,4.189,-0.917,0.398,-0.500,-0.866
4,Existing loan takeover,New credit,Application_652823628,20000.000,Created,User_1,W_Complete application,Workflow,schedule,2016-01-01 10:52:36.403,...,2016-01-01 10:52:36.413,2016-01-01 10:52:36.392,1,1,2.732,4.189,-0.917,0.398,-0.500,-0.866
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,Car,New credit,Application_1806387393,5000.000,Created,User_1,W_Handle leads,Workflow,schedule,2016-01-01 18:58:26.524,...,2016-01-02 10:38:15.267,2016-01-01 18:58:26.216,1,1,4.917,4.189,0.203,-0.979,-0.500,-0.866
97,Not speficied,New credit,Application_1111870538,5000.000,statechange,User_1,A_Submitted,Application,complete,2016-01-01 20:13:17.442,...,2016-01-01 20:13:17.750,NaT,1,1,5.464,4.189,0.683,-0.731,-0.500,-0.866
98,Not speficied,New credit,Application_1111870538,5000.000,Created,User_1,W_Handle leads,Workflow,schedule,2016-01-01 20:13:17.750,...,2016-01-01 20:13:17.386,2016-01-01 20:13:17.442,1,1,5.464,4.189,0.683,-0.731,-0.500,-0.866
96,Not speficied,New credit,Application_1111870538,5000.000,Created,User_1,A_Create Application,Application,complete,2016-01-01 20:13:17.386,...,2016-01-01 20:13:53.205,2016-01-01 20:13:17.750,1,1,5.464,4.189,0.683,-0.731,-0.500,-0.866


In [26]:
from sklearn.base import BaseEstimator, TransformerMixin

class Tester(BaseEstimator, TransformerMixin):

    def __init__(self):
        self.

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):


IndentationError: expected an indented block (Temp/ipykernel_7732/2623523266.py, line 8)