In [1]:
import pandas as pd
import numpy as np

### Loading in data

In [2]:
# Read train_set.csv and test_set.csv from the working directory into DataFrames.
train_data = pd.read_csv('train_set.csv')
test_data = pd.read_csv('test_set.csv')

In [3]:
# Display the training frame; pandas abbreviates it to the first and last rows.
train_data

Unnamed: 0,eventID,case concept:name,case REG_DATE,case AMOUNT_REQ,event concept:name,event lifecycle:transition,event time:timestamp
0,0,173688,2011-10-01T00:38:44.546+02:00,20000,A_SUBMITTED,COMPLETE,01-10-2011 00:38:44.546
1,1,173688,2011-10-01T00:38:44.546+02:00,20000,A_PARTLYSUBMITTED,COMPLETE,01-10-2011 00:38:44.880
2,2,173688,2011-10-01T00:38:44.546+02:00,20000,A_PREACCEPTED,COMPLETE,01-10-2011 00:39:37.906
3,3,173688,2011-10-01T00:38:44.546+02:00,20000,W_Completeren aanvraag,SCHEDULE,01-10-2011 00:39:38.875
4,4294967296,173691,2011-10-01T08:08:58.256+02:00,5000,A_SUBMITTED,COMPLETE,01-10-2011 08:08:58.256
...,...,...,...,...,...,...,...
214372,38835094290529,201854,2012-01-18T02:09:07.029+01:00,50000,O_CANCELLED,COMPLETE,14-03-2012 15:30:19.361
214373,38835094290528,201854,2012-01-18T02:09:07.029+01:00,50000,A_CANCELLED,COMPLETE,14-03-2012 15:30:19.361
214374,38835094290530,201854,2012-01-18T02:09:07.029+01:00,50000,W_Nabellen incomplete dossiers,COMPLETE,14-03-2012 15:30:23.187
214375,35858681954366,199678,2012-01-10T19:16:52.800+01:00,30000,W_Nabellen offertes,START,14-03-2012 15:36:15.299


In [4]:
# Summary statistics of the numeric columns, with the default quartiles spelled out.
train_data.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,eventID,case concept:name,case AMOUNT_REQ
count,214377.0,214377.0,214377.0
mean,22682390000000.0,190155.106742,15349.297327
std,13240090000000.0,9592.111876,12179.774398
min,0.0,173688.0,0.0
25%,10909220000000.0,181658.0,6500.0
50%,23283020000000.0,190591.0,10300.0
75%,34041910000000.0,198373.0,20000.0
max,44959720000000.0,206321.0,99999.0


### Findings 

Findings so far:

    No NaN values in the data
    10469 unique cases in train
    2618 unique cases in test
    20 distinct case AMOUNT_REQ values lie more than 3 standard deviations from the mean, but they are not really outliers: they are simply large requested amounts
    24 different event concepts
    3 different event lifecycle transitions (START, COMPLETE, SCHEDULE)
    

#### Number of unique cases 

In [5]:
# Number of distinct case identifiers (missing values, if any, are not counted).
train_data['case concept:name'].nunique(dropna=True)

10469

#### Count NaN values

In [6]:
# Missing-value count per column (summing the boolean mask down the rows).
train_data.isna().sum(axis=0)

eventID                       0
case concept:name             0
case REG_DATE                 0
case AMOUNT_REQ               0
event concept:name            0
event lifecycle:transition    0
event time:timestamp          0
dtype: int64

#### Find outliers 

In [7]:
def outliers(data, columns:list):
    '''Collect the distinct outlier values found in the given columns.

    A value counts as an outlier when it lies more than three standard
    deviations (pandas' default sample standard deviation) away from the
    mean of its column. Nothing is removed or replaced: ``data`` is left
    untouched and the offending values are only reported.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to inspect.
    columns : list
        Columns to check. Integers are treated as column positions;
        strings are treated as column labels. Only numeric columns are
        supported.

    Returns
    -------
    list
        Each distinct outlier value once, as a plain Python scalar, in order
        of first appearance and de-duplicated across all requested columns.
    '''
    # A dict preserves insertion order and gives constant-time duplicate checks.
    found = {}
    for column in columns:
        values = data[column] if isinstance(column, str) else data.iloc[:, column]
        # Compute the bounds once per column rather than once per row.
        center = values.mean()
        spread = 3 * values.std()
        outside = (values < center - spread) | (values > center + spread)
        # NaN compares False on both sides, so missing values are never reported.
        found.update(dict.fromkeys(values[outside].unique().tolist()))
    return list(found)

In [8]:
# Bind the result to its own name: assigning it to `outliers` would overwrite
# the function and make this cell fail with a TypeError when it is re-run.
amount_col = train_data.columns.get_loc('case AMOUNT_REQ')
amount_outliers = sorted(outliers(train_data, [amount_col]))
amount_outliers

[52000,
 53000,
 55000,
 56000,
 57000,
 57600,
 59400,
 60000,
 62537,
 64000,
 65000,
 68000,
 69000,
 70000,
 72000,
 75000,
 80000,
 90000,
 99000,
 99999]

#### Finding all event types 

In [10]:
# Distinct event types in order of first appearance. Series.unique() does the
# de-duplication in one vectorised pass instead of a Python loop over every row.
events = train_data['event concept:name'].unique().tolist()
print(events)
len(events)

['A_SUBMITTED', 'A_PARTLYSUBMITTED', 'A_PREACCEPTED', 'W_Completeren aanvraag', 'A_DECLINED', 'W_Afhandelen leads', 'A_ACCEPTED', 'O_SELECTED', 'A_FINALIZED', 'O_CREATED', 'O_SENT', 'W_Nabellen offertes', 'O_CANCELLED', 'A_CANCELLED', 'W_Beoordelen fraude', 'O_SENT_BACK', 'W_Valideren aanvraag', 'W_Nabellen incomplete dossiers', 'O_ACCEPTED', 'A_APPROVED', 'A_ACTIVATED', 'A_REGISTERED', 'O_DECLINED', 'W_Wijzigen contractgegevens']


24