In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
%run functions/preprocess.py
%run functions/linear_regression.py

# Print NumPy floats in fixed-point notation rather than scientific notation.
np.set_printoptions(formatter={'float_kind': '{:f}'.format})
# Render pandas floats with three decimals.
pd.set_option('display.float_format', lambda x: '%.3f' % x)
# Matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8*' and
# later releases dropped the old names, so use whichever this install offers.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')


## Loading the data

In [2]:
# Columns kept from the encoded data. Order matters: the model cells refer to
# the flattened window features by position.
cols = [
    'eventID', 'case concept:name', 'case RequestedAmount', 'day',
    'hour_cos', 'hour_sin', 'day_of_week_cos', 'day_of_week_sin',
    'event_no', 'event_of_day',
    # eventname_* columns
    *[f'eventname_{event}' for event in (
        'A_Cancelled', 'A_Complete', 'A_Concept', 'A_Create Application',
        'A_Denied', 'A_Incomplete', 'A_Pending', 'A_Validating',
        'O_Cancelled', 'O_Create Offer', 'O_Created', 'O_Refused',
        'O_Returned', 'O_Sent (mail and online)', 'O_Sent (online only)',
        'W_Call after offers', 'W_Call incomplete files',
        'W_Complete application', 'W_Handle leads',
        'W_Shortened completion ',  # trailing space is part of the name
        'W_Validate application',
    )],
    # lifecycle_* columns
    *[f'lifecycle_{state}' for state in (
        'resume', 'schedule', 'start', 'suspend', 'withdraw',
    )],
    # target: last column, so it ends up last in every flattened window
    'time_diff',
]


In [3]:
# NOTE: unpickling can execute arbitrary code -- only load pickles from a
# trusted source.
# Load the stored log, pass it through encode() and time_diff() (outliers
# kept), then keep only the columns listed in `cols`.
df_data = time_diff(
    encode(pd.read_pickle('processed2017.pkl')),
    outlier='keep',
)[cols]


In [4]:
def sliding_window(window_size, df, X, Y):
    """Append flattened rolling windows of ``df`` to ``X`` and ``Y`` in place.

    The identifier columns ('case concept:name' and 'eventID') are dropped.
    Every run of ``window_size`` consecutive rows is then flattened row by
    row into one list; the final value of that list (the last column of the
    newest row) is appended to ``Y`` and the remaining values to ``X``.
    A frame with fewer than ``window_size`` rows contributes nothing.

    Args:
        window_size (int): number of consecutive rows per window (>= 1)
        df (pd.DataFrame): rows to window; must contain both identifier
            columns and have the target as its last column
        X (list): receives one flat list of inputs per window
        Y (list): receives one target value per window

    Returns:
        None: results are delivered by mutating ``X`` and ``Y``

    Raises:
        ValueError: if ``window_size`` is smaller than 1
    """
    if window_size < 1:
        raise ValueError(f'window_size must be >= 1, got {window_size}')

    # Convert once and slice the array; materialising a DataFrame per window
    # through ``rolling`` yields the same rows at a much higher cost.
    values = df.drop(['case concept:name', 'eventID'], axis=1).to_numpy()
    for end in range(window_size, len(values) + 1):
        # ravel() flattens in row-major order: oldest row first, newest last
        flat = list(values[end - window_size:end].ravel())
        # split into X and Y: the last value of the newest row is the target
        Y.append(flat.pop())
        X.append(flat)
    return None


def del_id(i):
    """Delete three entries from the list ``i`` in place.

    Positions 0, 35 and 70 are removed one after another, so each later
    position is evaluated after the earlier deletions have shifted the
    remaining items (the entries removed are the original 0, 36 and 72).
    Presumably these are per-event identifier slots -- TODO confirm.

    Args:
        i (list): sequence to strip; mutated in place

    Returns:
        None
    """
    for position in (0, 35, 70):
        del i[position]
    return None


In [5]:
from sklearn.model_selection import train_test_split
np.random.seed(1587)
# Ordered (unshuffled) 70/30 split of the rows, handed straight to
# del_intersection for post-processing of both partitions.
# NOTE(review): presumably del_intersection removes cases that appear on both
# sides of the split -- confirm against its definition.
df_train, df_test = del_intersection(
    *train_test_split(df_data, test_size=0.3, shuffle=False))


In [6]:
# Number of distinct cases in the train and test partitions.
tuple(len(part['case concept:name'].unique()) for part in (df_train, df_test))


(20499, 8656)

In [7]:
# print(f'lag 3:{np.tile(df_data.columns[1:].to_numpy(),3)[[1, 11, 14, 18, 26, 31]]}\n')
# print(f'lag 2:{np.tile(df_data.columns[1:].to_numpy(),3)[[41, 47, 50, 54, 67, 68]]}\n')
# print(f'lag 1:{np.tile(df_data.columns[1:].to_numpy(),3)[[81]]}')


---

## Training

In [8]:
X = []
Y = []
# transform into windows, one case at a time.
# Iterating the groupby (rather than calling .apply only for its side effects)
# always hands sliding_window the whole group, including the
# 'case concept:name' column it drops itself; recent pandas releases deprecate
# passing grouping columns to apply. It also keeps apply's result from being
# echoed as the cell output.
for case_id, case_events in df_train.groupby('case concept:name'):
    sliding_window(3, case_events, X, Y)
# for i in X:
#     del_id(i)


In [9]:
# Wrap the collected windows and targets in DataFrames; the single target
# column is relabelled from 0 to 'y'.
X_train, Y_train = (
    pd.DataFrame(X),
    pd.DataFrame(Y).rename({0: 'y'}, axis='columns'),
)
# del X, Y


In [10]:
import statsmodels.api as sm
# Integer labels of the flattened-window columns left out of the regression.
drop = [1, 11, 14, 18, 26, 31, 41, 47, 50, 54, 67, 68, 81]

# Fit ordinary least squares on the remaining columns. The design matrix is
# passed as-is; no constant column is added in this cell.
model = sm.OLS(Y_train, X_train.drop(columns=drop)).fit()
# Release the training frames once the model is fitted.
del Y_train, X_train


In [11]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error


# error = cv[0]

# mse = np.mean([mean_squared_error(i[0], i[1]) for i in error])
# r2 = np.mean([r2_score(i[0], i[1]) for i in error])
# mae = np.mean([mean_absolute_error(i[0], i[1]) for i in error])
# rmse = np.mean([mean_squared_error(i[0], i[1], squared=False) for i in error])
# print(f'mse: {np.exp(mse)}\nr^2: {r2}\nrmse: {np.exp(rmse)}\nmae:{np.exp(mae)}')


---

## Results

In [12]:
X = []
Y = []

# transform into windows, one case at a time.
# Iterating the groupby (rather than calling .apply only for its side effects)
# always hands sliding_window the whole group, including the
# 'case concept:name' column it drops itself; recent pandas releases deprecate
# passing grouping columns to apply. It also keeps apply's result from being
# echoed as the cell output.
for case_id, case_events in df_test.groupby('case concept:name'):
    sliding_window(3, case_events, X, Y)
# X = [np.delete(i, [0,36,72]) for i in X]
# for i in X:
#     del_id(i)


In [13]:
# Build the test design matrix and drop the insignificant features listed in
# `drop`, the same list applied to the training matrix.
X_test = pd.DataFrame(X).drop(columns=drop)
# predict on the test set; one plain Python float per window
y_pred = model.predict(X_test).tolist()


In [14]:
# Match the order of the predictions with the rows they belong to.
# The first two rows of every case have no full 3-row window, so they carry
# no prediction and are left out of the error frame.
first_two_idx = df_test.groupby(['case concept:name']).head(2).index
error_test = df_test.drop(index=first_two_idx)
# Row labels case by case, in the groupby's iteration order.
case_groups = error_test.groupby('case concept:name').groups
order = [label for labels in case_groups.values() for label in labels]
error_test.loc[order, 'predicted'] = y_pred
df_test.loc[order, 'predicted'] = y_pred
# del X


In [15]:
# Score the test-set predictions against the observed time_diff values.
observed = error_test['time_diff']
predicted = error_test['predicted']
mse = mean_squared_error(observed, predicted)
r2 = r2_score(observed, predicted)
mae = mean_absolute_error(observed, predicted)
# RMSE is the square root of MSE. Computing it directly avoids the
# `squared=False` argument, which newer scikit-learn releases no longer accept.
rmse = np.sqrt(mse)
# NOTE(review): mse/rmse/mae are exponentiated before printing, presumably
# because time_diff is log-transformed -- confirm. exp() of an error metric is
# not the same as that metric on the original scale.
print(f'mse: {np.exp(mse)}\nr^2: {r2}\nrmse: {np.exp(rmse)}\nmae:{np.exp(mae)}')
# del X_test, Y, y_pred


mse: 861.5897711832736
r^2: 0.639614211608512
rmse: 13.460577531402812
mae:5.571917618468414


In [16]:
# df_test.to_pickle('complex_linear_predictions.pkl')


In [17]:
# y_true = pd.DataFrame(Y)
# y_pred = pd.DataFrame(y_pred)
# df_error = pd.concat([y_true, y_pred], axis=1)
# df_error.columns = ['y_true', 'y_pred']
# df_error.reset_index(inplace=True)
# df_error['error'] = df_error['y_true'] - df_error['y_pred']

# fig, ax = plt.subplots(figsize=(10, 7))
# # df_error.plot.hist(x='index', y='error', figsize=(10,5), ax=ax, bins=10)
# sns.histplot(data=df_error, x='error', ax=ax, color='salmon')
# ax.set_title('Error distribution with log transform', size=30)
# ax.set_ylabel('count', fontsize=20)
# ax.set_xlabel('Error in e^seconds', fontsize=20)
# plt.yticks(fontsize=20)
# plt.xticks(fontsize=20)


In [18]:
# model.summary()


In [19]:
# from statsmodels.stats.outliers_influence import variance_inflation_factor


# # the independent variables set
# X = df_train.drop(['time_diff', 'case concept:name'], axis=1)

# # VIF dataframe
# vif_data = pd.DataFrame()
# vif_data["feature"] = X.columns

# # calculating VIF for each feature
# vif_data["VIF"] = [variance_inflation_factor(X.values, i)
#                    for i in range(len(X.columns))]

# print(vif_data)
