In [541]:
import os
import sys
# Put the sibling "HD360 Git/py" directory at the front of the module search path.
# NOTE(review): presumably this is where the local `event_seq` module imported by a
# later cell lives; the path is relative to the notebook's working directory — confirm.
sys.path.insert(0, os.path.abspath('../HD360 Git/py'))

In [544]:
import pandas as pd
from IPython.display import display, Javascript, HTML
import json
from sklearn import cluster
import event_seq as es

In [545]:
# Register the RequireJS module paths 'd3' (d3 v4) and 'tip' (d3-tip) in the notebook
# front end. NOTE(review): both are fetched from remote URLs, so this needs network access.
display(Javascript("require.config({paths: {d3: 'https://d3js.org/d3.v4.min', tip: 'https://bl.ocks.org/davegotz/raw/bd54b56723c154d25eedde6504d30ad7/b63d30f77840a908c40ad3bbcfaf8baa90ae1f14/d3-tip.js'}});"))

<IPython.core.display.Javascript object>

# Event Data

In [546]:
# Load the raw event log, discard the columns that are not used further on, and
# rename the remaining ones to the seq_id / event_type / event_attribute / timestamp
# names used by the rest of the notebook.
df_events = (
    pd.read_csv('data/events_small.csv')
    .drop(columns=['index', 'enhedsnummer', 'gyldigtil'])
    .rename(columns={
        'cvrnummer': 'seq_id',
        'eventtype': 'event_type',
        'attribute': 'event_attribute',
        'gyldigfra': 'timestamp',
    })
)
df_events.head()

Unnamed: 0,seq_id,event_type,event_attribute,timestamp
0,86299410,navn,732519,1979-05-07
1,86299410,penhed,1002743637,1979-05-07
2,86299410,virksomhedsstatus,1,1979-05-07
3,86299410,virksomhedsstatus,9,2011-06-22
4,86299410,virksomhedsstatus,11,2011-10-31


In [547]:
# Load the company-status lookup table (status id -> status name) and preview it.
# NOTE(review): df_status_types is not referenced by any later cell visible here.
df_status_types = pd.read_csv('data/virksomhedsstatus.csv')
df_status_types.head()

Unnamed: 0,index,virksomhedsstatusid,virksomhedsstatus
0,0,6864,AKTIV
1,1,1,NORMAL
2,2,11844,OPLØST
3,3,19,OPLØST EFTER ERKLÆRING
4,4,11,OPLØST EFTER FRIVILLIG LIKVIDATION


In [548]:
# Values of event_attribute on 'virksomhedsstatus' events that mark a company as failing.
# NOTE(review): the meaning of codes 4, 10, 3 and 2 is not shown in this notebook —
# confirm against data/virksomhedsstatus.csv.
failing_status_codes = [4, 10, 3, 2]

# Earliest failing-status event per company (seq_id).
is_failing_event = (
    (df_events['event_type'] == 'virksomhedsstatus')
    & df_events['event_attribute'].isin(failing_status_codes)
)
first_failure = (
    df_events[is_failing_event]
    # Parse the dates up front so the column is a real datetime column; filling
    # a string column with a Timestamp later would give a mixed-type column.
    .assign(timestamp=lambda d: pd.to_datetime(d['timestamp']))
    .sort_values('timestamp')
    .groupby('seq_id', as_index=False)
    .first()
    .drop(columns=['event_type', 'event_attribute'])
    .assign(outcome='failing')
)

# One row per company; companies without a failing event have missing values
# in 'timestamp' and 'outcome' after the left merge.
df_outcome = (
    df_events.drop_duplicates(subset=['seq_id'])[['seq_id']]
    .merge(first_failure, how='left', on='seq_id')
)

# Companies that never failed are labelled 'normal' and given the current time as
# their outcome timestamp. The filled columns are assigned back explicitly:
# calling fillna(inplace=True) on a column selection is chained assignment and
# does not reliably update the parent frame.
observed_until = pd.Timestamp.today()
df_outcome['timestamp'] = df_outcome['timestamp'].fillna(observed_until)
df_outcome['outcome'] = df_outcome['outcome'].fillna('normal')
df_outcome = df_outcome.rename(columns={'timestamp': 'timestamp_o'})
df_outcome.head()

Unnamed: 0,seq_id,timestamp_o,outcome
0,86299410,2019-09-30 12:35:21.053299,normal
1,71312518,2019-09-30 12:35:21.053299,normal
2,61017313,2019-09-30 12:35:21.053299,normal
3,15790342,2019-09-30 12:35:21.053299,normal
4,11935281,2008-02-04,failing


In [549]:
# Attach each company's outcome to its events, parse both date columns, and keep
# only the events that happened strictly before the outcome timestamp.
df_filtered = (
    df_events
    .merge(df_outcome, how='left', on='seq_id')
    .assign(
        timestamp=lambda d: pd.to_datetime(d['timestamp']),
        timestamp_o=lambda d: pd.to_datetime(d['timestamp_o']),
    )
    .loc[lambda d: d['timestamp'] < d['timestamp_o']]
)
df_filtered.head()

Unnamed: 0,seq_id,event_type,event_attribute,timestamp,timestamp_o,outcome
0,86299410,navn,732519,1979-05-07,2019-09-30 12:35:21.053299,normal
1,86299410,penhed,1002743637,1979-05-07,2019-09-30 12:35:21.053299,normal
2,86299410,virksomhedsstatus,1,1979-05-07,2019-09-30 12:35:21.053299,normal
3,86299410,virksomhedsstatus,9,2011-06-22,2019-09-30 12:35:21.053299,normal
4,86299410,virksomhedsstatus,11,2011-10-31,2019-09-30 12:35:21.053299,normal


# Features

In [550]:
def counts(df):
    """Count the rows of ``df`` for each distinct ``event_type``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an ``event_type`` column.

    Returns
    -------
    pandas.Series
        Row counts indexed by ``event_type``.
    """
    by_type = df.groupby('event_type')
    return by_type.size()

In [551]:
# Feature matrix: number of events of each type per company (seq_id) and calendar year.
#
# The counting is done in a single groupby over (seq_id, year, event_type) and then
# spread into one column per event type. This avoids groupby.apply, whose return
# shape (Series vs. DataFrame) depends on whether every group happens to contain
# the same event types, and avoids relying on the implicit column name 0.
# Combinations with no events get 0; the cast to float keeps the dtype the
# pivot-based version produced.
# NOTE(review): freq='Y' (year-end bins) is kept as in the original; recent pandas
# releases prefer the spelling 'YE' — switch when the pinned pandas version allows.
df_counts = (
    df_filtered
    .groupby(['seq_id', pd.Grouper(key='timestamp', freq='Y'), 'event_type'])
    .size()
    .unstack('event_type', fill_value=0)
    .astype(float)
)
df_counts.head()

Unnamed: 0_level_0,event_type,bibranche1,binavn,elektroniskpost,hovedbranche,navn,obligatoriskemail,penhed,regnummer,telefaxnummer,telefonnummer,virksomhedsform,virksomhedsstatus
seq_id,timestamp,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
10036518,2000-12-31,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0
10036518,2001-12-31,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
10036518,2003-12-31,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
10036518,2008-12-31,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10036518,2009-12-31,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


# Clustering

In [552]:
# Cluster the yearly event-count vectors into 5 groups.
# random_state is fixed so that the cluster labels are the same on every
# Restart & Run All; without it the centroid initialisation is random and the
# labels (and everything derived from them) change between runs.
kmeans = cluster.KMeans(n_clusters=5, n_init=20, random_state=0)
result = kmeans.fit_predict(df_counts)

In [553]:
# Turn the clustered yearly counts into an event table: the cluster label is stored
# in 'event_type', seq_id/timestamp come back as ordinary columns, outcomes are
# joined in, and rows whose timestamp is not strictly before the outcome are dropped.
# NOTE(review): 'timestamp' here is the year-bin label produced by the yearly
# grouping, so the comparison is against that label rather than the event dates — confirm
# this is intended.
df_clustered = (
    df_counts
    .assign(event_type=result)
    .reset_index()
    .merge(df_outcome, how='left', on='seq_id')
    .assign(
        timestamp=lambda d: pd.to_datetime(d['timestamp']),
        timestamp_o=lambda d: pd.to_datetime(d['timestamp_o']),
    )
    .loc[lambda d: d['timestamp'] < d['timestamp_o']]
)
df_clustered.head()

Unnamed: 0,seq_id,timestamp,bibranche1,binavn,elektroniskpost,hovedbranche,navn,obligatoriskemail,penhed,regnummer,telefaxnummer,telefonnummer,virksomhedsform,virksomhedsstatus,event_type,timestamp_o,outcome
0,10036518,2000-12-31,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,2,2019-09-30 12:35:21.053299,normal
1,10036518,2001-12-31,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1,2019-09-30 12:35:21.053299,normal
2,10036518,2003-12-31,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1,2019-09-30 12:35:21.053299,normal
3,10036518,2008-12-31,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,2019-09-30 12:35:21.053299,normal
4,10036518,2009-12-31,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,3,2019-09-30 12:35:21.053299,normal


# Aggregation

In [559]:
# Aggregate the filtered events with the local event_seq module.
# NOTE(review): the structure of the result is not visible here; it is later
# serialised with json.dumps, so it is presumably JSON-compatible — confirm.
aggregated = es.aggregate(df_filtered)

# aggregated = es.aggregate(df_clustered)

# Visualization

In [560]:
# Send the visualization's script (js/events.js) and its styling
# (css/events.css.html) to the notebook front end.
# NOTE(review): presumably events.js defines the 'events' RequireJS module that the
# drawing cell below requires — confirm.
display(Javascript(filename="js/events.js"))
display(HTML(filename="css/events.css.html"))

<IPython.core.display.Javascript object>

In [561]:
def draw_event_hiearchy(data, width=1000, height=800):
    """Render ``data`` in the cell output via the 'events' RequireJS module.

    ``data`` is serialised with ``json.dumps`` and embedded, together with the
    requested size, in a JavaScript snippet that calls
    ``events(element.get(0), <data>, <width>, <height>)``.

    Parameters
    ----------
    data : JSON-serialisable object
        Payload handed to the front-end ``events`` function.
    width, height : int
        Size arguments passed to ``events``; formatted with ``%d``.
    """
    payload = json.dumps(data)
    script = """
        (function(element){
            require(['events'], function(events) {
                events(element.get(0), %s, %d, %d);
            });
        })(element);
    """ % (payload, width, height)
    display(Javascript(script))

In [562]:
# Draw the aggregated event sequences at the default size (1000 x 800).
draw_event_hiearchy(aggregated)

<IPython.core.display.Javascript object>