In [4]:
import datetime
import pandas as pd
import numpy as np

In [None]:
def get_data(train, test):
    """
    Load the train and test CSV files into one combined dataframe.

    param train: path to the training csv file
    param test: path to the test csv file
    return: both files stacked (train rows first) with surrounding
            whitespace stripped from the column names; when the
            module-level DEMO flag is truthy the result is additionally
            passed through data_split
    """
    frames = [pd.read_csv(path) for path in (train, test)]
    combined = pd.concat(frames)
    combined.columns = combined.columns.str.strip()
    if DEMO:
        combined = data_split(combined)
    return combined

def data_split(df):
    """Return the leading tenth of the rows (row count rounded down)."""
    cutoff = len(df) // 10
    return df[:cutoff]

In [11]:
def preprocess(data, labels=["event time:timestamp", "time of day", "weekday"], time=True):
    """
    Run the full preprocessing pipeline on the dataframe.

    param data: dataframe to preprocess; the helper steps modify it in place
    param labels: column names handed to time_process (timestamp column,
                  time-of-day column, weekday column)
    param time: when truthy the timestamp processing is run; otherwise
                it is skipped and a notice is printed
    return: the same dataframe object that was passed in
    """
    if time:
        time_process(data, labels)
    else:
        print("TIME = FALSE")

    feature_process(data, time)
    return data

#### time

In [4]:
def time_process(data, labels):
    """
    Timestamp clean-up stage of the preprocessing (works in place).

    param data: dataframe containing the "event time:timestamp" column
    param labels: column names forwarded to time_features
    """
    # Step 1: cut the fractional seconds off the raw timestamp strings.
    remove_ms(data)
    # Step 2: parse the timestamps and derive the extra time columns.
    time_features(data, labels)
    # Step 3: put the rows in chronological order.
    sort_time(data)

def time_features(data, labels):
    """
    Parse the timestamp column and derive two columns from it, in place.

    param data: dataframe holding the timestamp strings
    param labels: sequence where labels[0] names the timestamp column
                  (overwritten with parsed datetimes), labels[1] receives
                  the time of day and labels[2] the weekday name
    """
    stamp_col = labels[0]
    data[stamp_col] = f_memoize_dt(data[stamp_col])
    data[labels[1]] = data[stamp_col].dt.time
    data[labels[2]] = data[stamp_col].dt.day_name()

def sort_time(data):
    """
    Sort the data chronologically, in place, to prevent future data
    from leaking in, and renumber the index from 0.

    A stable sort is used on purpose: rows that share the same timestamp
    keep the relative order they had before sorting. The default
    (quicksort) gives no such guarantee, which could shuffle events that
    happened within the same second and corrupt the previous/next-event
    features derived afterwards.

    param data: dataframe with an "event time:timestamp" column
    return: None (the dataframe is modified in place)
    """
    data.sort_values(by=["event time:timestamp"], kind="mergesort", inplace=True)
    data.reset_index(inplace=True, drop=True)

def remove_ms(df):
    """
    Strip the fractional-second part from the timestamp strings.

    Everything from the first '.' onwards is discarded in each value of
    the 'event time:timestamp' column. The column is overwritten in place
    and the same dataframe is returned.
    """
    column = 'event time:timestamp'
    df[column] = df[column].apply(lambda stamp: stamp.partition('.')[0])
    return df


def f_memoize_dt(s, fmt="%d-%m-%Y %H:%M:%S"):
    """
    Memoization technique to convert a Series of strings to datetimes.

    Each distinct string is parsed only once and the result is mapped
    back onto every occurrence, which is cheaper than parsing row by row
    when many rows share the same timestamp.

    param s: Series of timestamp strings
    param fmt: strptime format of the strings (defaults to the
               day-month-year layout this notebook's data uses)
    return: Series aligned with ``s`` holding the parsed datetimes;
            missing input values are left missing instead of being
            handed to strptime
    """
    parsed = {
        stamp: datetime.datetime.strptime(stamp, fmt)
        for stamp in s.dropna().unique()
    }
    return s.map(parsed)


def dropper(df, lbls=["event EventID", "timestamp"]):
    """
    Remove the given columns from the dataframe in place.

    param df: dataframe to modify
    param lbls: column name(s) to drop
    return: None
    """
    df.drop(columns=lbls, inplace=True)


#### events

In [7]:
def feature_process(data, time=True):
    """
    Clean the data and add features, with a separate branch that skips
    the time-based features.

    param data: dataframe to enrich; the helper steps modify it in place
    param time: when truthy the time features and their cyclical
                encoding are added; otherwise a notice is printed
    """
    log_data(data)
    trace_data(data)
    if not time:
        print("TIME = FALSE")
    else:
        add_time_features(data)
        cyclical_time(data)
    cleaner(data)
    
def cleaner(data):
    """
    Clean some columns of the dataframe in place.

    - "case RequestedAmount" is cast to int.
    - Missing neighbours of an event are replaced by placeholder labels:
      "LAST EVENT" for a missing next event, "FIRST EVENT" for a missing
      previous event / second-previous event / previous lifecycle.

    The filled column is assigned back to the frame rather than calling
    ``fillna(inplace=True)`` on ``data[col]``: that chained form acts on
    a temporary Series under pandas Copy-on-Write and would leave the
    dataframe unchanged.

    param data: dataframe containing the columns listed above
    return: None
    """
    data["case RequestedAmount"] = data["case RequestedAmount"].astype(int)

    placeholders = {
        "next_event": "LAST EVENT",
        "prev_event": "FIRST EVENT",
        "2prev_event": "FIRST EVENT",
        "prev_lifecycle": "FIRST EVENT",
    }
    for column, placeholder in placeholders.items():
        data[column] = data[column].fillna(placeholder)
    

def log_data(data):
    """
    Add log-level neighbour columns to the dataframe, in place.

    For every row the event name and timestamp of the row directly
    before and directly after it (in row order, regardless of case) are
    stored in new "*_log" columns.
    """
    # (new column, source column, shift offset): +1 looks one row back,
    # -1 looks one row ahead.
    shifts = (
        ("prev_event_log", "event concept:name", 1),
        ("next_event_log", "event concept:name", -1),
        ("prev_time_log", "event time:timestamp", 1),
        ("next_time_log", "event time:timestamp", -1),
    )
    for target, source, offset in shifts:
        data[target] = data[source].shift(offset)


def trace_data(data):
    """
    Add trace-level neighbour columns to the dataframe, in place.

    Rows are grouped by "case concept:name" and, within each group, the
    values of neighbouring rows are copied into new columns: previous,
    second-previous and next event name, previous lifecycle transition,
    and previous / next timestamp. Rows without such a neighbour inside
    their group get a missing value.
    """
    case_col = "case concept:name"
    # (new column, source column, shift offset within the case)
    shifts = (
        # events
        ("prev_event", "event concept:name", 1),
        ("2prev_event", "event concept:name", 2),
        ("next_event", "event concept:name", -1),
        ("prev_lifecycle", "event lifecycle:transition", 1),
        # time
        ("prev_time", "event time:timestamp", 1),
        ("next_time", "event time:timestamp", -1),
    )
    for target, source, offset in shifts:
        data[target] = data.groupby(case_col)[source].shift(offset)


def add_time_features(data):
    """
    Add the time features needed to train models, in place.

    Requires a datetime "event time:timestamp" column and a "next_time"
    column (as created by trace_data). Columns written:

    - day, month, hour, day_of_week: calendar parts of the timestamp
    - avg_time: mean gap in seconds between consecutive rows (the first
      row counts as a gap of zero); the same value is stored on every row
    - completion_time: seconds between the event and "next_time"; missing
      where "next_time" is missing

    Durations are converted with ``total_seconds()``. The ``.dt.seconds``
    accessor only returns the seconds *component* (0-86399) and silently
    discards whole days, so any gap of a day or more would wrap around.

    param data: dataframe to extend
    return: None
    """
    stamps = data['event time:timestamp']

    data['day'] = stamps.dt.day
    data['month'] = stamps.dt.month
    data['hour'] = stamps.dt.hour
    data['day_of_week'] = stamps.dt.weekday

    # Gap to the previous row; the first row has no predecessor -> 0s.
    gaps = stamps.diff().fillna(pd.Timedelta(seconds=0))
    data["avg_time"] = gaps.mean().total_seconds()

    data["completion_time"] = (data["next_time"] - stamps).dt.total_seconds()

def cyclical_time(data):
    """
    Add the cyclical (sine / cosine) time features, in place.

    The "hour" and "day_of_week" columns are replaced by their angle on
    the unit circle (radians) and the columns hour_cos, hour_sin,
    day_of_week_cos and day_of_week_sin are added.

    The angle is computed with the true period of each cycle (24 hours,
    7 days). Dividing by the largest observed value instead would map
    the last slot of the cycle to a full turn, making hour 23 identical
    to hour 0 and day 6 identical to day 0, and would make the encoding
    depend on which values happen to be present in the data.

    param data: dataframe with numeric "hour" (0-23) and "day_of_week"
                (0-6) columns
    return: None
    """
    hours_per_day = 24
    days_per_week = 7

    data["hour"] = 2 * np.pi * data["hour"] / hours_per_day
    data["hour_cos"] = np.cos(data["hour"])
    data["hour_sin"] = np.sin(data["hour"])

    data["day_of_week"] = 2 * np.pi * data["day_of_week"] / days_per_week
    data["day_of_week_cos"] = np.cos(data["day_of_week"])
    data["day_of_week_sin"] = np.sin(data["day_of_week"])

### Split

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder

In [8]:
def features_split(data, time=True):
    """
    Split a dataframe into train and test sets and encode the
    categorical columns so that they can be fed to a model.

    param data: preprocessed dataframe
    param time: forwarded to split to select which target columns are used
    return: tuple (X_train, X_test, y_train, y_test); the feature frames
            have been encoded in place by encoder
    """
    parts = split(data, time)
    # encoder works on the frames in place, so its return value is not needed.
    encoder(data, *parts)
    return parts

def split(data, time):
    """
    Split the data for model training, with a separate branch for the
    variant that does not use time features.

    Rows with a missing target are dropped, the remaining rows are split
    70/30 without shuffling (so the test set is the tail of the data),
    and every case that appears on both sides of the split is removed
    from both sets.

    param data: preprocessed dataframe
    param time: when truthy the targets are next_event, next_time and
                completion_time; otherwise only next_event
    return: tuple (X_train, X_test, y_train, y_test)
    """
    if time:
        y_labels = [
                "next_event",
                "next_time",
                "completion_time"
               ]
    else:
        y_labels = ["next_event"]

    new_data = data.dropna(subset=y_labels)

    train, test = train_test_split(new_data,
                random_state=42, shuffle=False, test_size=0.3)

    # del_intersection returns new, filtered frames; its result must be
    # captured, otherwise the overlapping cases stay in both sets.
    train, test = del_intersection(train, test)

    X_train, X_test = train.drop(y_labels, axis=1), test.drop(y_labels, axis=1)
    y_train, y_test = train[y_labels], test[y_labels]
    return X_train, X_test, y_train, y_test

def encoder(data, X_train, X_test, y_train, y_test):
    """
    Label-encode the categorical feature columns of X_train and X_test,
    in place.

    A single LabelEncoder is re-fitted for each group of columns:

    - event names ("event concept:name", "prev_event", "2prev_event"):
      fitted on the distinct "prev_event" values of X_train plus
      "LAST EVENT"
    - lifecycle ("event lifecycle:transition", "prev_lifecycle"):
      fitted on the distinct "prev_lifecycle" values of ``data``
    - "event Action" and "event org:resource": each fitted on its own
      distinct values in ``data``

    NOTE(review): the event-name vocabulary comes from one training
    column only; presumably every value of the other event columns also
    occurs there - confirm, since the encoder cannot transform values it
    was not fitted on.

    param data: full dataframe, used as vocabulary source
    param X_train, X_test: feature frames to encode (modified in place)
    param y_train, y_test: targets, passed through untouched
    return: tuple (X_train, X_test, y_train, y_test)
    """
    event_encoder = LabelEncoder()
    feature_frames = (X_train, X_test)

    # --- event names -----------------------------------------------------
    event_columns = ["event concept:name", "prev_event", "2prev_event"]
    event_encoder.fit(X_train["prev_event"].unique().tolist() + ["LAST EVENT"])
    for frame in feature_frames:
        frame[event_columns] = frame[event_columns].apply(event_encoder.transform)

    # --- lifecycle transitions -------------------------------------------
    lifecycle_columns = ["event lifecycle:transition", 'prev_lifecycle']
    event_encoder.fit(data["prev_lifecycle"].unique())
    for frame in feature_frames:
        frame[lifecycle_columns] = frame[lifecycle_columns].apply(event_encoder.transform)

    # --- columns that carry their own vocabulary --------------------------
    for column in ("event Action", "event org:resource"):
        event_encoder.fit(data[column].unique())
        for frame in feature_frames:
            frame[column] = frame[[column]].apply(event_encoder.transform)

    return X_train, X_test, y_train, y_test


def del_intersection(train, test):
    """
    Remove every case that occurs in both the train and the test set.

    The inputs are not modified; filtered copies are returned, so the
    caller has to use the return value.

    param train: training dataframe with a 'case concept:name' column
    param test: test dataframe with a 'case concept:name' column
    return: tuple (train, test) without the rows of the shared cases
    """
    case_col = 'case concept:name'
    train_cases = set(train[case_col].unique().tolist())
    test_cases = set(test[case_col].unique().tolist())
    shared_cases = train_cases & test_cases

    keep_train = ~train[case_col].isin(shared_cases)
    keep_test = ~test[case_col].isin(shared_cases)
    return train[keep_train], test[keep_test]