# 1. Required packages

For the interactivity I'm using the _plotly_ library and _Jupyter Widgets_. 
Together they make it possible to prototype a dashboard directly in the notebook. Hence, for correct execution of the current notebook one has to install the following packages: 
   - [plotly](https://plot.ly/python/) + [cufflinks](https://plot.ly/ipython-notebooks/cufflinks/)
   - [ipywidgets](https://github.com/jupyter-widgets/ipywidgets)

and activate the required extensions for Jupyter. Feel free to do it manually or run the following script.

In [None]:
# %%bash
# pip install plotly
# pip install cufflinks
# pip install ipywidgets
# jupyter nbextension install --py --sys-prefix widgetsnbextension
# jupyter nbextension install --py --sys-prefix plotlywidget
# jupyter nbextension enable --py --sys-prefix widgetsnbextension
# jupyter nbextension enable --py --sys-prefix plotlywidget

To validate the correct installation and activation of the required packages please execute:

In [1]:
%%bash 
jupyter nbextension list

Known nbextensions:
  config dir: /anaconda3/envs/DL/etc/jupyter/nbconfig
    notebook section
      plotlywidget/extension [32m enabled [0m
      jupyter-js-widgets/extension [32m enabled [0m


      - Validating: [32mOK[0m
      - Validating: [32mOK[0m


In [2]:
# `display` and `HTML` belong to the public IPython.display API; importing
# them from IPython.core.display is deprecated.
from IPython.display import display, HTML
# Widen the notebook container to 90% of the page width.
display(HTML("<style>.container { width:90% !important; }</style>"))

# 2. Data I/O
We assume that one has already obtained the embeddings for the researched financial statement network. 

In [3]:
%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [4]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from NetEmbs import *

# Selects which of the two loading branches below is executed:
# "SimulatedData" or "RealData".
MODE = "SimulatedData"
# Database path handed to upload_data() in the simulated-data branch.
DB_PATH = "Simulation/FSN_Data.db"


if MODE == "SimulatedData":
    # Folder of the run to inspect; the embeddings are read from its "cache/Embeddings.pkl".
    EMBS_PATH = "UvA/SensitivityAnalysis/_versionMetaDiff_directionCOMBI_walks20_pressure30_window2_1hopFraction0/TFsteps100000batch64_emb32/"
#     EMBS_PATH = "Simulation_versionMetaDiff_directionCOMBI_walks30_pressure30_window3/TFsteps100000batch64_emb32/"
    # NOTE(review): unpickling can execute arbitrary code - only load files you trust.
    embs = pd.read_pickle(EMBS_PATH+"cache/Embeddings.pkl")
    print("Embeddings have been uploaded to memory!")
    # upload_data / prepare_data are not defined in this notebook; presumably they
    # come from `from NetEmbs import *` - TODO confirm.
    d = upload_data(DB_PATH, limit=None)
    d = prepare_data(d)
    print("Supported information has been uploaded to memory!")

if MODE == "RealData":
    # Modules only needed for the real-data branch.
    import extras
    import analysis
    EMBS_PATH = "model/15108_2017_versionMetaDiff_directionCOMBI_walks31_pressure30_window3/TFsteps100000batch64_emb32/"
    # NOTE(review): unpickling can execute arbitrary code - only load files you trust.
    embs = pd.read_pickle(EMBS_PATH+"cache/Embeddings.pkl")
        # //////// TODO UPLOAD your data HERE \\\\\\\\\\
#     d = analysis.analysis("14082_2017")
    d = extras.getData("15108_2017")
        # //////// END  \\\\\\\\\\
    # TODO pay attention for the split argument below!
    # `split` is switched on exactly when the loaded frame has a "Value" column.
    if "Value" in list(d):
        need_split = True
    else:
        need_split = False
    d = prepare_dataMarcel(d, split=need_split)
#     Here we drop the duplicate of GroundTruth in the DataFrame with supported info, because we have it in Embs DataFrame
    if "GroundTruth" in list(d):
        d.drop("GroundTruth", axis=1, inplace=True)

Embeddings have been uploaded to memory!
Final shape of DataFrame is  (58559, 9)
Supported information has been uploaded to memory!


In [5]:
embs.head(2)

Unnamed: 0,ID,Emb,x,y,GroundTruth,Time
0,7,"[0.1692866, 0.33486184, -0.17571802, -0.041063...",-13.495379,-33.940025,Sales 6 btw,1.055052
1,8,"[0.050937697, -0.051440712, 0.074395135, -0.13...",-37.793919,28.697119,Good delivery,1.055052


In [6]:
d.shape

(58559, 9)

# 3. Interactive visualization

### 3.2 Visualization

In [7]:
from __future__ import print_function
# Standard plotly imports
import plotly.plotly as py
import plotly.graph_objs as go
from plotly.offline import iplot, plot, init_notebook_mode
# Using plotly + cufflinks in offline mode
import cufflinks

cufflinks.go_offline(connected=True)
init_notebook_mode(connected=True)
from ipywidgets import interactive, HBox, VBox, widgets

### WordClouds function

In [8]:
from collections import Counter
from wordcloud import WordCloud
# Count most frequent FA names in the given DataFrame OR FA names with the highest amount
def findMostCommonFAs_v2(df, labels_column="label", words_column="FA_Name", amount_column="amount", sort_mode="freq", n_top=4, vis=False, folder=""):
    """Print (and optionally plot) the top FA names of every cluster in ``df``.

    For each value of ``labels_column`` the rows are grouped by
    ``(words_column, "from")``; per group the number of rows, the sum of
    ``amount_column`` and the lists of ``Debit`` / ``Credit`` values are
    collected. The ``n_top`` groups with ``from == True`` ("Left") and with
    ``from == False`` ("Right") are printed as ``(name, count, amount sum)``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``labels_column``, ``words_column``, ``amount_column``,
        ``"from"``, ``"Debit"`` and ``"Credit"``.
    labels_column : str
        Column holding the cluster label.
    words_column : str
        Column holding the names shown in the word clouds.
    amount_column : str
        Column that is summed per name.
    sort_mode : str
        ``"freq"`` ranks names by row count, ``"amount"`` by summed amount.
        Any other value leaves the groups unsorted and weights the word
        clouds by row count.
    n_top : int
        Number of names kept per side.
    vis : bool
        When True a 2x2 figure is drawn per cluster: word clouds in the top
        row and, in the bottom row, histograms of the Credit values ("Left")
        or Debit values ("Right") of every name with more than 4 entries.
    folder : str
        Currently unused.

    Raises
    ------
    KeyError
        If ``labels_column`` or ``"from"`` is not a column of ``df``.
    """
    if labels_column not in list(df):
        raise KeyError(f"Given column name {labels_column} is not presented in the given DataFrame! Only allows: {list(df)}!")
    if "from" not in list(df):
        raise KeyError("Please ensure that column 'from' is presented in your DataFrame!")
    # Position inside the per-name tuples that weights the word cloud:
    # 1 -> row count, 2 -> summed amount.
    weight_idx = 2 if sort_mode == "amount" else 1
    for name, group in df.groupby(labels_column):
        print("Current cluster label is ", name)
        gr = group.groupby([words_column, "from"])
        counts = gr.size().to_frame(name='counts')
        all_stat = counts.join(
            gr.agg({amount_column: "sum", 'Debit': list, 'Credit': list})
              .rename(columns={amount_column: 'amount_sum', 'Debit': 'Debit_list', 'Credit': 'Credit_list'})
        ).reset_index()
        if sort_mode == "freq":
            all_stat = all_stat.sort_values(['counts', words_column], ascending=False)
        elif sort_mode == "amount":
            all_stat = all_stat.sort_values(['amount_sum', words_column], ascending=False)
        # Column order after reset_index():
        # words_column, "from", counts, amount_sum, Debit_list, Credit_list
        text = {"Left": [(x[0], x[2], x[3], x[5]) for x in all_stat[all_stat["from"]==True].values[:n_top]],
                "Right": [(x[0], x[2], x[3], x[4]) for x in all_stat[all_stat["from"]==False].values[:n_top]]}
        if vis:
            fig, axes = plt.subplots(2, 2)
        # Each side owns a fixed figure column, so an empty side can no longer
        # make the other side draw over its panel.
        for col, (key, data) in enumerate(text.items()):
            print(key, "--->", [item[:3] for item in data])
            if not vis:
                continue
            # WordClouds
            axes[0, col].set_title(key, size=24)
            axes[0, col].axis("off")
            to_vis = {str(item[0]): item[weight_idx] for item in data}
            if not to_vis:
                continue
            wc = WordCloud(background_color="white", width=800, height=400, max_font_size=84, min_font_size=14, repeat=False, relative_scaling=0.8, max_words=100)
            wc.generate_from_frequencies(to_vis)
            axes[0, col].imshow(wc, interpolation="bilinear")
            # Histogram
            for item in data:
                if len(item[3]) > 4:
                    sns.distplot(item[3], label=item[0], kde=False, bins=50, ax=axes[1, col], hist_kws={"range": (0, 1.0)})
            axes[1, col].legend(frameon=False, fontsize=14)
            axes[1, col].set_xlim((0, 1.0))
        if vis:
            plt.tight_layout()
            plt.show()

### Helper functions

In [None]:
# Transform Matplotlib colormap into plotly colorscale:
import itertools
def matplotlib_to_plotly(color_map="tab10", pl_entries=10):
    """Sample a Matplotlib colormap into a plotly colorscale.

    Parameters
    ----------
    color_map : str
        Name of the Matplotlib colormap.
    pl_entries : int
        Number of equally spaced samples taken from the colormap. The step is
        ``1 / (pl_entries - 1)``, so ``pl_entries == 1`` raises
        ``ZeroDivisionError``.

    Returns
    -------
    list
        ``[position, 'rgb(r, g, b)']`` pairs with positions ``0, h, 2h, ...``.

    Notes
    -----
    Relies on the names ``matplotlib`` and ``np`` being available in the
    notebook namespace (this notebook gets them from ``%pylab inline``).
    """
    try:
        cmap = matplotlib.cm.get_cmap(color_map)
    except AttributeError:
        # Newer Matplotlib releases no longer provide cm.get_cmap.
        cmap = matplotlib.colormaps[color_map]
    h = 1.0/(pl_entries-1)
    pl_colorscale = []

    for k in range(pl_entries):
        # Plain Python ints: str() of a tuple of np.uint8 renders as
        # "np.uint8(31)" under NumPy 2, which is not a valid rgb() string.
        red, green, blue = (int(channel * 255) for channel in cmap(k*h)[:3])
        pl_colorscale.append([k*h, f'rgb({red}, {green}, {blue})'])

    return pl_colorscale

def getColors_Markers(keys, cm="tab10", n_color=10, markers = ["circle", "diamond", "square"]):
    """Assign a colour and a marker symbol to every key.

    Keys are sorted first. Colours cycle through the ``n_color`` entries of
    the seaborn palette ``cm``; the marker changes after every ``n_color``
    keys and cycles through ``markers``.

    Parameters
    ----------
    keys : iterable
        Sortable labels to style.
    cm : str
        Palette name passed to ``sns.color_palette``.
    n_color : int
        Number of distinct colours.
    markers : list of str
        Marker symbols; any length is supported. An empty list yields an
        empty marker map.

    Returns
    -------
    (dict, dict)
        ``color_map`` (key -> palette colour) and ``marker_map`` (key -> symbol).

    NOTE(review): a later cell imports ``getColors_Markers`` from
    ``NetEmbs.Vis.helpers``, which rebinds this name - confirm which version
    callers are meant to use.
    """
    keys = sorted(keys)
    palette = sns.color_palette(cm, n_color)
    color_map = {key: palette[pos % len(palette)] for pos, key in enumerate(keys)}
    if not markers:
        return color_map, {}
    # Derive the cycle from len(markers) rather than a hard-coded 3, so no key
    # is dropped when a different number of markers is supplied.
    marker_map = {key: markers[(pos // n_color) % len(markers)] for pos, key in enumerate(keys)}
    return color_map, marker_map

### Clustering here

In [9]:
embs.GroundTruth.nunique()

11

In [14]:
def v_measure(df):
    """Return the V-measure of the predicted labels against the ground truth.

    ``df`` must provide a ``GroundTruth`` column and a ``label`` column. The
    ground-truth values are replaced by integer codes (in order of first
    appearance) before both arrays are handed to
    ``sklearn.metrics.v_measure_score``.
    """
    codes = {truth: code for code, truth in enumerate(df.GroundTruth.unique())}
    from sklearn.metrics import v_measure_score
    true_codes = df.GroundTruth.apply(lambda truth: codes[truth]).values
    predicted = df.label.values
    return v_measure_score(true_codes, predicted)

In [10]:
N_CLS = 11
embs = cl_Agglomerative(embs, N_CLS)

First row of Data: 
 [ 0.16928659  0.33486184 -0.17571802 -0.04106357  0.07029159  0.16931748
 -0.17421429 -0.18707065 -0.15756008  0.13521057 -0.12681708 -0.13500977
 -0.26708183  0.09424353 -0.23384432 -0.16906002 -0.26211342 -0.16125521
  0.10504428 -0.19114669 -0.24795982 -0.03191672 -0.20394816 -0.22484238
  0.10905757  0.1725453  -0.15521425  0.21100681  0.03445403 -0.03363062
  0.13887367 -0.24079129]


In [15]:
v_measure(embs)

0.8826548482881204

In [56]:
from NetEmbs.Vis.helpers import set_font, getColors_Markers
# Label text
description = widgets.Label(
        value=''
    )
# WordClouds area
wordCloudsOutput = widgets.Output()
# Table with JournalEntries data
table_titles = ["ID", "FA_Name", "Credit", "Debit", "label"]

if MODE == "RealData":
    table_titles = ["ID", "FA_Name", "accountDesc", "Credit", "Debit", "label"]

# One alignment entry per column: the real-data table has six columns, so the
# list length follows table_titles instead of a fixed 5.
t = go.FigureWidget([go.Table(
    header=dict(values=table_titles,
                fill = dict(color='#C2D4FF'),
                align = ['left'] * len(table_titles)),
    cells=dict(values=[],
               fill = dict(color='#F5F8FF'),
               align = ['left'] * len(table_titles)))],
                    layout = go.Layout(
                            title="Journal Entries",
                            autosize=True,
                            width=1000,
                            height=400))
# Scatter plot
N_COLORS = 10
WORD_CLOUD_LABEL = "FA_Name"
# Column used for colouring and for grouping the word clouds
# (set to "GroundTruth" to colour by the true labels instead).
LEGEND_TITLE = "label"
tmp_p_see = None
# Shared state for box/lasso selection: the selection callback fires once per
# trace, so picked indexes are accumulated here until every trace has reported.
indexes = []
tr_nums = 0


def interactiveScatter(df, df_info, legend_title="label"):
    """Create a FigureWidget with the scatter plot for the given DataFrame.

    One trace is drawn per value of ``legend_title``. Every trace stores the
    index labels of its rows as ``customdata`` so that the callbacks can find
    the source rows again.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``x``, ``y``, ``ID``, ``GroundTruth`` and
        ``legend_title``.
    df_info : pandas.DataFrame
        Frame merged with the picked rows on ``"ID"``; the merge result must
        provide every column listed in the header of the module-level table ``t``.
    legend_title : str
        Column used to split the data into traces.

    Returns
    -------
    plotly.graph_objs.FigureWidget

    Side effects (via callbacks): hovering writes to ``description``; clicking
    fills ``t`` and clears ``wordCloudsOutput``; box/lasso selection fills
    ``t`` and ``wordCloudsOutput`` and uses the module-level ``indexes`` /
    ``tr_nums`` as scratch state.

    NOTE(review): the ``n_colors=`` keyword matches the ``getColors_Markers``
    imported from ``NetEmbs.Vis.helpers`` in this cell, not the notebook-local
    definition (which names it ``n_color``), and ``cmap[name][1]`` presumably
    relies on that helper's return format - TODO confirm.
    """
    cmap, mmap = getColors_Markers(keys=df[legend_title].unique(), cm="tab10", n_colors=N_COLORS, markers=["circle", "diamond", "square"])
    scatter_data = [
        go.Scatter(x=group.x, y=group.y, mode='markers', name=name,
                   text=group.apply(lambda row: f"ID={row.ID},   GroundTruth={row.GroundTruth}", axis=1),
                   customdata=group.index.to_list(),
                   marker=dict(color=cmap[name][1],
                               symbol=mmap[name]))
        for name, group in df.groupby(legend_title)
    ]
    f = go.FigureWidget(data=scatter_data,
                        layout=go.Layout(
                            title=f"t-SNE visualisation with coloring based on {legend_title}",
                            hovermode='closest',
                            autosize=True,
                            width=1000,
                            height=700))

    # customdata holds index *labels*, so rows are looked up with .loc; .iloc
    # would only be correct while df has a default 0..n-1 index.
    def filterRows(selected_ids):
        row = df.loc[selected_ids]
        chosen_bps = df_info.merge(row, on="ID")
        return chosen_bps

    def updateTable(chosen_bps):
        t.data[0].cells.values = [chosen_bps[col] for col in t.data[0].header.values]

    def showClouds(chosen_bps):
        wordCloudsOutput.clear_output()
        with wordCloudsOutput:
            findMostCommonFAs_v2(chosen_bps, LEGEND_TITLE, WORD_CLOUD_LABEL, sort_mode="freq", vis=True, n_top=4)

    def printSignature(trace, points, *args):
        # Hover: show ID and GroundTruth of the first hovered point.
        if len(points.point_inds)>0:
            ids = trace.customdata[points.point_inds[0]]
            row = df.loc[ids]
            description.value = f"ID={row.ID},   GroundTruth={row.GroundTruth}"

    def selectBP(trace, points, *args):
        # Click: list the journal entries of the first clicked point.
        if len(points.point_inds)>0:
            ids = trace.customdata[points.point_inds[0]]
            chosen_bps = filterRows([ids])
            wordCloudsOutput.clear_output()
            updateTable(chosen_bps)

    scatters = f.data
    max_traces = len(scatters)

    def selectBPs(trace, points, selector):
        # Selection: collect the picked points of every trace and refresh the
        # table and word clouds once the callback has run for all traces.
        global indexes
        global tr_nums
        if points.point_inds:
            indexes.extend([trace.customdata[cur_point] for cur_point in points.point_inds])
        tr_nums = tr_nums+1
        if tr_nums==max_traces:
            selected_data = filterRows(indexes)
            updateTable(selected_data)
            showClouds(selected_data)
            indexes = []
            tr_nums = 0

    # Hover text: ID and GroundTruth
    for scatter in scatters:
        scatter.hoverinfo = 'text'
        scatter.on_hover(printSignature)
        scatter.on_click(selectBP)
        scatter.on_selection(selectBPs)

    return f
# @interact(Coloring=['label', 'GroundTruth'])
# def update(Coloring="label"):
#     print(Coloring)
#     f_scatter = interactiveScatter(embs, Coloring)
#     return VBox([description, f_scatter])
f_scatter = interactiveScatter(embs, d, LEGEND_TITLE)
VBox([description, f_scatter, t, wordCloudsOutput])

VBox(children=(Label(value=''), FigureWidget({
    'data': [{'customdata': [26, 37, 56, ..., 9789, 9795, 9796]…

# Time-Series modeling

In [20]:
def crosscorr(data_x, data_y, lag=0):
    """Lag-N cross correlation of two series.

    Parameters
    ----------
    data_x, data_y : pandas.Series
        Signals to correlate.
    lag : int, default 0
        Number of periods ``data_x`` is shifted by before the correlation
        with ``data_y`` is computed.

    Returns
    -------
    crosscorr : float
    """
    lagged_x = data_x.shift(lag)
    return lagged_x.corr(data_y)

Our initial hypothesis is that sub-groups of business processes within a company should have some kind of cross-correlation (e.g. the goods delivery business process follows the sale business process). For a deeper investigation of that hypothesis we are going to aggregate the given Journal Entries (aka the raw input data) based on the predicted cluster label and build time series from these groups w.r.t. the transaction time.

In [33]:
# TODO helper uploader for obtain Time column. 
# Build `df_all`: one row per (ID, FA_Name, from) with the summed amount,
# enriched with the columns of `embs` and sorted chronologically.
if MODE=="SimulatedData":
    df_all = d.drop(["Signature", "Name"], axis=1)
    print(f"Shape of supported info is {df_all.shape}, shape of embs DataFrame is {embs.shape}")
    # Attach the embedding columns to every journal-entry row with the same ID.
    df_all = df_all.merge(embs, on="ID")
    print(f"After merge the shape is {df_all.shape}")
    # Amounts are summed per group; every other column keeps its first value.
    df_all = df_all.groupby(["ID", "FA_Name", "from"], as_index=False).aggregate({"amount": lambda x: np.sum(x), 
                                "Time": "first",
                              "GroundTruth": "first",
                              "label": "first",
                              "x": "first",
                              "y": "first"}) \
                        .sort_values("Time", ascending=True)
if MODE=="RealData":
    # Note: this overwrites the "Date" column of `d` in place.
    d["Date"] = pd.to_datetime(d["Date"],format='%Y-%m-%d')
#     df_all = d.groupby("ID", as_index=False).aggregate({"amount": lambda x: np.sum(x)/2.0, 
#                                "Date": "first"}).merge(embs, on="ID").sort_values("Date", ascending=True)
    # Here the aggregation runs before the merge with `embs`.
    df_all = d.groupby(["ID", "FA_Name", "from"], as_index=False)\
                .aggregate({"amount": lambda x: np.sum(x), 
                            "Date": "first",
                           "accountDesc": "first"})\
                .merge(embs, on="ID")\
                .sort_values("Date", ascending=True)
    # Index by date; the "Date" column itself is kept.
    df_all.set_index(df_all.Date, inplace=True)

Shape of supported info is (58559, 7), shape of embs DataFrame is (10217, 7)
After merge the shape is (58559, 13)


In [35]:
df_all.head(2)

Unnamed: 0,ID,FA_Name,from,amount,Time,GroundTruth,label,x,y
0,7,NoisyLeftFA_edir,True,1.354448,1.055052,Sales 6 btw,10,-13.495379,-33.940025
17,8,NoisyLeftFA_wouu,True,0.867604,1.055052,Good delivery,2,-37.793919,28.697119


## Add DateTimeIndex to simulated data

In [36]:
def addDateTimeIndex(df, start="2019-01-01", minutes_per_unit=28.8):
    """Return a copy of ``df`` indexed by calendar timestamps derived from ``Time``.

    The original ``Time`` values are kept in a new ``SimulatedTime`` column.
    Each value is scaled by ``minutes_per_unit``, truncated to whole minutes
    and added to ``start``; the result becomes the index (named ``"Time"``).
    The input frame is left untouched, so the function can be re-applied to
    the same source frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a numeric ``Time`` column.
    start : str or datetime-like, default "2019-01-01"
        Timestamp that corresponds to ``Time == 0``.
    minutes_per_unit : float, default 28.8
        Number of minutes represented by one unit of ``Time``.

    Returns
    -------
    pandas.DataFrame
    """
    out = df.copy()
    out["SimulatedTime"] = out["Time"]
    # astype("int64") truncates toward zero, like int() on a single value.
    offset_minutes = (out["Time"] * minutes_per_unit).astype("int64")
    out["Time"] = pd.Timestamp(start) + pd.to_timedelta(offset_minutes, unit="m")
    return out.set_index("Time")
if MODE=="SimulatedData":
    df_all = addDateTimeIndex(df_all)

## Find optimal scaling for simulated data

In [None]:
# def findDateTimeIndex(dfA, dfB, mult=55, on="Time", data_column="amount", agg_period="D"):
#     dfA = dfA.set_index(dfA[on].apply(lambda x: np.datetime64('2019-01-01')+np.timedelta64(int(x*mult), 'm')))
#     dfB = dfB.set_index(dfB[on].apply(lambda x: np.datetime64('2019-01-01')+np.timedelta64(int(x*mult), 'm')))
# #     return dfA, dfB
#     return crosscorr(dfA[data_column].resample(agg_period).sum(), dfB[data_column].resample(agg_period).sum(), 0)

# res = {cur_m: findDateTimeIndex(sales, collections, agg_period="D", mult=cur_m) for cur_m in np.linspace(50, 150, 200)}
# res = sorted(res.items(), key=lambda x: x[1], reverse=True)

## Get part of data with required label/GroundTruth

In [37]:
def filterData(df, query=[["Sales 21 btw", "Sales 6 btw"], ["Collections"]], on="GroundTruth"):
    """Split ``df`` into one sub-frame per entry of ``query``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter.
    query : list of lists
        Each inner list holds the values of column ``on`` that make up one
        sub-frame.
    on : str
        Column the values are matched against.

    Returns
    -------
    tuple of pandas.DataFrame
        One frame per query entry. Each frame gets a ``name`` attribute: the
        query values without the surrounding brackets, plus ``" cluster"``
        when ``on == "label"``.
    """
    result = list()
    for q in query:
        # Filter the frame that was passed in, not the notebook-global df_all.
        subset = df[df[on].isin(q)]
        subset.name = str(q)[1:-1]
        if on == "label":
            subset.name += " cluster"
        result.append(subset)
    return tuple(result)

def filterData_v3(df, 
                  query=[{"select": ["Sales 21 btw", "Sales 6 btw"], 
                              "_with": {"FA_Name": "Revenue", "from": True}}, 
                 {"select": ["Collections"], "_with": None}], 
                  on="GroundTruth"):
    """Split ``df`` into sub-frames described by ``query``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter.
    query : list of dict
        Every entry has two keys. ``"select"`` lists the values of column
        ``on`` to keep (``None`` keeps all rows). ``"_with"`` is either
        ``None`` or a ``{column: value}`` dict of additional equality filters.
    on : str
        Column that ``"select"`` is matched against.

    Returns
    -------
    pandas.DataFrame or tuple of pandas.DataFrame
        A single frame when ``query`` has exactly one entry, otherwise a tuple
        with one frame per entry. Each frame gets a ``name`` attribute built
        from ``str(select)``, one ``_<value>`` suffix per ``"_with"`` filter,
        and ``" cluster"`` when ``on == "label"``.

    Raises
    ------
    KeyError
        If a ``"_with"`` column does not exist in the frame.
    """
    result = list()
    for q in query:
        postfix = ""
        if q["select"] is None:
            cur_df = df
        else:
            cur_df = df[df[on].isin(q["select"])]
        if q["_with"] is not None:
            for key, value in q["_with"].items():
                try:
                    cur_df = cur_df[cur_df[key]==value]
                except KeyError as e:
                    # Report the offending column as a real exception.
                    raise KeyError(f"{key} is not in a columns titles!") from e
                postfix+="_"+str(value)
        cur_df.name = str(q["select"])+postfix
        if on == "label":
            cur_df.name+=" cluster"
        result.append(cur_df)
    if len(result)==1:
        return result[0]
    else:
        return tuple(result)

In [38]:
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import numpy as np
from sklearn import linear_model
def plotAmounts(DFs, title="Default signals", vis_type="line"):
    """Helper function to plot a few DataFrames in one graph.

    Parameters
    ----------
    DFs : pandas.DataFrame or sequence of pandas.DataFrame
        Frames with an ``amount`` column. A frame's ``name`` attribute, when
        present, is used as its legend entry.
    title : str
        Title of the line plot.
    vis_type : str
        ``"line"`` draws one plotly line per frame over its index.
        ``"scatter"`` (exactly two frames required) merges both frames on
        ``"Time"`` (MODE "SimulatedData") or ``"Date"`` (MODE "RealData") and
        draws a seaborn regression plot of the two amounts. Any other
        combination draws nothing.

    Raises
    ------
    ValueError
        In scatter mode when the notebook-level ``MODE`` is neither
        "SimulatedData" nor "RealData".
    """
    # A bare DataFrame is treated as a one-element list; len() of a DataFrame
    # is its row count, so it cannot be used to tell the two cases apart.
    if isinstance(DFs, pd.DataFrame):
        DFs = [DFs]
    if vis_type=="line":
        fig2 = go.Figure(
            data=[go.Scatter(x=df.index,
                             y=df.amount,
                             name=getattr(df, "name", None)) for df in DFs],
            layout=go.Layout(showlegend=True, title=go.layout.Title(text=title), hovermode='closest'))
        iplot(fig2)
    elif vis_type == "scatter" and len(DFs)==2:
        print([df.shape for df in DFs])
        if MODE=="SimulatedData":
            merge_key = "Time"
        elif MODE=="RealData":
            merge_key = "Date"
        else:
            raise ValueError(f"Unsupported MODE {MODE!r}: expected 'SimulatedData' or 'RealData'")
        sc_data = DFs[0].merge(DFs[1], on=merge_key, how="inner", suffixes=("_X", "_Y"))
        sns.regplot(x=sc_data.amount_X, y=sc_data.amount_Y)

In [59]:
df_filtered = filterData_v3(df_all, query=[{"select": ["Sales 21 btw", "Sales 6 btw"], 
                              "_with": None}, 
                                {"select": ["Collections"], "_with": None}], on="GroundTruth")
plotAmounts(df_filtered)

## Resample our TimeSeries with Daily/Weekly/Monthly frequencies

In [60]:
sales = df_filtered[0]
collections = df_filtered[1]

In [68]:
sales.ID.nunique(), collections.ID.nunique()

(1866, 2000)

In [63]:
# Daily ("D") aggregation: amounts are summed, GroundTruth/label take the most
# frequent value of the day. The legend suffix now matches the resampling
# period; the `_w` variable names are kept because later cells refer to them.
sales_w = sales.resample("D").apply({"amount": sum, "GroundTruth": pd.Series.mode, "label": pd.Series.mode})
sales_w.name = sales.name+", daily"
collections_w = collections.resample("D").apply({"amount": sum, "GroundTruth": pd.Series.mode, "label": pd.Series.mode})
collections_w.name = collections.name+", daily"

In [66]:
sales_w.merge(collections_w, on="Time")

Unnamed: 0_level_0,amount_x,GroundTruth_x,label_x,amount_y,GroundTruth_y,label_y
Time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2019-01-01,32542.969825,Sales 6 btw,6,20347.892899,Collections,0
2019-01-02,20219.804910,Sales 6 btw,6,32223.727450,Collections,0
2019-01-03,25776.716940,Sales 6 btw,6,22622.137161,Collections,0
2019-01-04,25380.637548,Sales 21 btw,6,19168.951187,Collections,0
2019-01-05,33616.854161,Sales 21 btw,10,22487.937974,Collections,4
2019-01-06,23003.640876,Sales 6 btw,6,38477.850436,Collections,0
2019-01-07,37241.122277,Sales 6 btw,6,22477.508596,Collections,4
2019-01-08,21733.195736,Sales 6 btw,6,35840.544453,Collections,0
2019-01-09,31415.473240,Sales 21 btw,6,22296.098337,Collections,0
2019-01-10,24825.511385,Sales 21 btw,6,25357.512303,Collections,0


In [42]:
plotAmounts([sales_w, collections_w], vis_type="line")

## All in One function

In [44]:
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
# Legend suffix per supported aggregation period.
legend_postfix = {"W": ", weekly", "D": ", daily", "M": ", monthly"}
def constructSignals(df_all, query=[{"select": [2], "_with": None}, {"select": [4], "_with": None}], 
                     on="label", agg_period="W", vis_type="line", lags=range(4)):
    """Aggregate two filtered signals over time, plot them and print their cross-correlations.

    Parameters
    ----------
    df_all : pandas.DataFrame
        Time-indexed frame with an ``amount`` column.
    query : list of dict
        Exactly two ``filterData_v3`` queries, one per signal.
    on : str
        Column the queries select on.
    agg_period : str
        Resampling rule. For "W", "D" and "M" a matching suffix is appended to
        the legend entries and the plot title; for any other rule a notice is
        printed and the plain names are used.
    vis_type : str
        Passed through to ``plotAmounts``.
    lags : iterable of int, default range(4)
        Lags for which the cross-correlation of the two aggregated amounts is
        printed.

    Returns
    -------
    None
        The figure is displayed and the list of correlations is printed.
    """
    agg_title = "Aggregated signals"
    sales, collections = filterData_v3(df_all, query=query, on=on)
    print("Started aggregation...")
    sales_w = sales.resample(agg_period).agg({"amount": "sum"})
    collections_w = collections.resample(agg_period).agg({"amount": "sum"})
    # Add info about the aggregation period to the legend texts
    postfix = legend_postfix.get(agg_period)
    if postfix is None:
        print(f"Could not intepret {agg_period} for adding postfix to legend text... Use the default ones.." )
        postfix = ""
    else:
        agg_title += postfix
    sales_w.name = sales.name + postfix
    collections_w.name = collections.name + postfix
    plotAmounts([sales_w, collections_w], agg_title, vis_type=vis_type)
    # Reuse the aggregated amounts instead of resampling a second time.
    print([crosscorr(sales_w.amount, collections_w.amount, lag) for lag in lags])

In [58]:
constructSignals(df_all, query=[{"select": ["Sales 6 btw", "Sales 21 btw"], 
                              "_with": None}, 
                                {"select": ["Collections"], "_with": None}], on="GroundTruth", agg_period="2D", vis_type="line")

Started aggregation...
Could not intepret 2D for adding postfix to legend text... Use the default ones..


[0.15445547186368594, 0.012824784824743859, 0.045619243695561675, 0.06263141747455367]


## Tax vs. Revenue based on the Sales , GroundTruth

In [46]:
constructSignals(df_all, query=[{"select": ["Sales 21 btw"], 
                              "_with": {"FA_Name": "Revenue"}}, 
                                {"select": ["Sales 21 btw"], "_with": {"FA_Name": "Tax"}}], on="GroundTruth", agg_period="D", vis_type="line")

Started aggregation...


[0.9999815322552272, -0.1187576345720236, -0.26509404188400837, 0.009997507823279805]


## Tax vs. Revenue based on the Sales *as-expert*-cluster

In [None]:
def check_is_sale(df):
    """Flag whether the rows of one ID match the Revenue / TradeReceivables pattern.

    ``is_Sale`` is True when "Revenue" occurs among the ``FA_Name`` values of
    the rows with ``from == True`` and "TradeReceivables" occurs among those
    with ``from == False``. Returns a Series with the first ``ID`` of ``df``
    and that flag.
    """
    from_names = df.loc[df["from"] == True, "FA_Name"].unique()
    to_names = df.loc[df["from"] == False, "FA_Name"].unique()
    is_sale = ("Revenue" in from_names) and ("TradeReceivables" in to_names)
    return pd.Series({"ID": df.ID.values[0], "is_Sale": is_sale})
# Attach the per-ID `is_Sale` flag to every row. The join is done on the "ID"
# column only: DataFrame.merge rejects `on=` combined with `left_index=True`.
expert_sales = df_all.reset_index()\
        .merge(df_all.groupby("ID", as_index=False).apply(check_is_sale), on="ID")\
        .set_index("Time", drop=True)

In [None]:
constructSignals(expert_sales, query=[{"select": [True], 
                              "_with": {"FA_Name": "Revenue"}}, 
                                {"select": [True], "_with": {"FA_Name": "Tax"}}], on="is_Sale", agg_period="D", vis_type="line")

## Tax vs. Revenue based on the Sales predicted-cluster

In [49]:
constructSignals(df_all, query=[{"select": [6, 10], 
                              "_with": {"FA_Name": "Revenue"}}, 
                                {"select": [6, 10], "_with": {"FA_Name": "Tax"}}], on="label", agg_period="2D", vis_type="line")

Started aggregation...
Could not intepret 2D for adding postfix to legend text... Use the default ones..


[0.8955234620455036, -0.010053237959088985, 0.08364857663727246, 0.1750108446037379]


## Tax vs. Revenue based on FA Names

In [51]:
constructSignals(df_all, query=[{"select": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10], 
                              "_with": {"FA_Name": "Revenue"}}, 
                                {"select": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10], "_with": {"FA_Name": "Tax"}}], on="label", agg_period="2D", vis_type="line")

Started aggregation...
Could not intepret 2D for adding postfix to legend text... Use the default ones..


[0.6264328463907124, 0.04652057080720376, 0.07302363319614152, 0.12716603584992014]


In [54]:
from plotly.offline import iplot
import plotly.figure_factory as ff
import seaborn as sns
def calculate_corr(df_all, query=[1, 2], on="label", agg_period="2D", lag=0):
    """Cross-correlation of the aggregated amounts of two groups.

    ``query`` must hold exactly two values of column ``on``. For each of them
    the matching rows of ``df_all`` are selected and their ``amount`` is
    summed per ``agg_period``; the two resulting signals are passed to
    ``crosscorr`` together with ``lag``.
    """
    first, second = query
    signal_x = df_all[df_all[on] == first].amount.resample(agg_period).sum()
    signal_y = df_all[df_all[on] == second].amount.resample(agg_period).sum()
    return crosscorr(signal_x, signal_y, lag)

def get_corr_matrix(df, on="label", agg_period="D", lag=0):
    """Pairwise cross-correlation matrix of all groups in column ``on``.

    Rows and columns follow the sorted unique values of ``df[on]``. Entry
    ``[i, j]`` is ``calculate_corr`` for the pair (value i, value j); entries
    for equal values stay 0, and a ``FloatingPointError`` raised during the
    computation is recorded as 0.0.
    """
    labels = sorted(df[on].unique())
    size = len(labels)
    corr_matrix = np.zeros((size, size))
    # np.ndindex walks the matrix row by row.
    for x_pos, y_pos in np.ndindex(size, size):
        x, y = labels[x_pos], labels[y_pos]
        if x == y:
            continue
        try:
            corr_matrix[x_pos, y_pos] = calculate_corr(df, query=[x, y], on=on, agg_period=agg_period, lag=lag)
        except FloatingPointError:
            corr_matrix[x_pos, y_pos] = 0.0
    return corr_matrix

axis_prefix = {"label": "Cluster ", "GroundTruth": ""}


def make_annotations(z, x, y, annotation_text):
    """Build one plotly annotation per cell of ``z``.

    ``z`` only defines the grid that is iterated. The annotation at row ``n``
    and column ``m`` is placed at ``(x[m], y[n])`` and shows
    ``annotation_text[n][m]``; cells whose text value equals 0.0 get an empty
    string.
    """
    def cell_label(value):
        return str(value) if value != 0.0 else ""

    return [
        go.layout.Annotation(
            text=cell_label(annotation_text[n][m]),
            x=x[m],
            y=y[n],
            xref='x1',
            yref='y1',
            font=dict(color="Black"),
            showarrow=False)
        for n, row in enumerate(z)
        for m, _ in enumerate(row)
    ]


def corrHeatmap_interactive(corr_matrix, labels, on="label"):
    """Show the lower triangle of a correlation matrix as a plotly heatmap.

    Parameters
    ----------
    corr_matrix : array-like, square
        Correlation values; the caller's array is not modified.
    labels : sequence
        Tick labels, in matrix order.
    on : str
        Name of the grouping column. It is used in the title and to look up
        an axis-label prefix in ``axis_prefix`` (no prefix for unknown names).
    """
    prefix = axis_prefix.get(on, "")
    x = [prefix + str(cl) for cl in labels]
    y = list(x)
    # Work on a float copy so that blanking the upper triangle does not leak
    # into the caller's matrix.
    corr_matrix = np.array(corr_matrix, dtype=float)
    # Strict upper triangle (the diagonal is kept). The builtin bool replaces
    # the np.bool alias, which no longer exists in current NumPy.
    upper = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
    corr_matrix[upper] = 0.0
    z_text = np.round(corr_matrix, 2)
    cs = [[0.0, 'rgb(31, 119, 180)'],  # blue
          [0.5, 'rgb(255,255,255)'],  # white
          [1, 'rgb(214, 39, 40)']]  # red

    trace = go.Heatmap(z=corr_matrix, x=x, y=y, zmid=0, zmin=-1, zmax=1, colorscale=cs, showscale=True,
                       colorbar={"thickness": 20, "len": 0.5, "outlinewidth": 0, "xpad": 25,
                                 "title": {"text": "\n \n Correlation", "side": "right"}})
    fig = go.Figure(data=[trace])
    fig.layout.title = go.layout.Title(text="Cross-correlation for the " + on)
    # Figure size grows with the number of labels (70 px per label plus margins).
    fig.layout.height = len(labels) * 70 + 200
    fig.layout.width = len(labels) * 70 + 25 + 25 + 150
    fig.layout.margin = go.layout.Margin(l=100, r=50, b=100, t=200, pad=10)
    fig.layout.annotations = make_annotations(corr_matrix, x, y, z_text)
    fig.layout.xaxis.side = "top"
    fig.layout.yaxis.automargin = True
    fig.layout.xaxis.automargin = True
    iplot(fig)


def corrHeatmap_static(corr_matrix, labels, on="label"):
    """Show a correlation matrix as a static seaborn heatmap.

    The upper triangle including the diagonal is masked out.

    Parameters
    ----------
    corr_matrix : array-like, square
        Correlation values.
    labels : sequence
        Tick labels, in matrix order.
    on : str
        Name of the grouping column. It is used in the title and to look up
        an axis-label prefix in ``axis_prefix`` (no prefix for unknown names).

    Note: ``sns.set_context`` changes the plotting context for the rest of
    the session.
    """
    sns.set_context("paper", rc={'figure.figsize': (20, 10), "font.size": 12, "axes.titlesize": 16, "axes.labelsize": 20,
                        "xtick.labelsize": 12, "ytick.labelsize": 12})
    prefix = axis_prefix.get(on, "")
    x_ticks = [prefix + str(cl) for cl in labels]
    y_ticks = list(x_ticks)
    z_text = np.round(corr_matrix, 2)
    # Mask for the upper triangle (diagonal included). The builtin bool
    # replaces the np.bool alias, which no longer exists in current NumPy.
    mask = np.triu(np.ones_like(corr_matrix, dtype=bool))

    # Set up the matplotlib figure
    f, ax = plt.subplots(figsize=(11, 9))
    sns.heatmap(corr_matrix, mask=mask, annot=z_text, cmap="RdBu_r", vmin=-1, vmax=1, center=0, xticklabels=x_ticks,
                yticklabels=y_ticks,
                square=True, linewidths=.1, cbar_kws={"shrink": .5})
    ax.set_title("Cross-correlation for the " + on)
    plt.tight_layout()
    plt.show()
    
def correlationMatrix(df, on="label", agg_period="D", lag=0, interactive=True):
    """Compute the pairwise cross-correlation matrix for column ``on`` and plot it.

    The matrix comes from ``get_corr_matrix``; the tick labels are the sorted
    unique values of ``df[on]``. With ``interactive=True`` the plotly heatmap
    is drawn and the result of ``corrHeatmap_interactive`` is returned;
    otherwise the seaborn heatmap is drawn and nothing is returned.
    """
    corr_matrix = get_corr_matrix(df, on=on, agg_period=agg_period, lag=lag)
    labels = sorted(df[on].unique())
    if not interactive:
        # Use seaborn for visualization
        corrHeatmap_static(corr_matrix, labels=labels, on=on)
        return None
    # Use plotly
    return corrHeatmap_interactive(corr_matrix, labels=labels, on=on)

In [55]:
correlationMatrix(df_all, on="label", agg_period="2D", interactive=True)
correlationMatrix(df_all, on="GroundTruth", agg_period="2D", interactive=True)

## Clusters 0 and 2 in detail

In [None]:
cl_0 = df_all[df_all["label"] == 0]
cl_0_D = cl_0.amount.resample("D").sum()
cl_2 = df_all[df_all["label"] == 2]
cl_2_D = cl_2.amount.resample("D").sum()

In [None]:
from scipy import stats
# Compare the two daily signals only on the dates both of them cover. This
# replaces the former catch-all `except` that retried with the last value of
# cl_2_D dropped whenever the lengths differed.
cl_daily = pd.concat([cl_0_D, cl_2_D], axis=1, join="inner", keys=["cl_0", "cl_2"])
print(stats.pearsonr(cl_daily["cl_0"], cl_daily["cl_2"]), stats.spearmanr(cl_daily["cl_0"], cl_daily["cl_2"]))

In [None]:
cl_0.resample("D").agg({"amount": sum}).rename(columns={"amount": "Cluster 0"}).head(2)