# 1. Required packages

For interactivity this notebook uses the _plotly_ library and _Jupyter Widgets_, which together make it possible to prototype a simple dashboard. 
To run the notebook correctly, install the following packages: 
   - [plotly](https://plot.ly/python/) + [cufflinks](https://plot.ly/ipython-notebooks/cufflinks/)
   - [ipywidgets](https://github.com/jupyter-widgets/ipywidgets)

and activate the required extensions for Jupyter. Feel free to do it manually or run the following script.

In [None]:
# %%bash
# pip install plotly
# pip install cufflinks
# pip install ipywidgets
# jupyter nbextension install --py --sys-prefix widgetsnbextension
# jupyter nbextension install --py --sys-prefix plotlywidget
# jupyter nbextension enable --py --sys-prefix widgetsnbextension
# jupyter nbextension enable --py --sys-prefix plotlywidget

To validate the correct installation and activation of the required packages please execute:

In [1]:
%%bash 
jupyter nbextension list

Known nbextensions:
  config dir: /anaconda3/envs/DL/etc/jupyter/nbconfig
    notebook section
      plotlywidget/extension [32m enabled [0m
      jupyter-js-widgets/extension [32m enabled [0m


      - Validating: [32mOK[0m
      - Validating: [32mOK[0m


In [293]:
# Widen the notebook container to 90% of the window so the 1000px-wide figures below fit.
# `display` and `HTML` are imported from the public `IPython.display` module; importing
# `display` from `IPython.core.display` is deprecated.
from IPython.display import HTML, display

display(HTML("<style>.container { width:90% !important; }</style>"))

# 2. Data I/O
We assume that the embeddings for the financial statement network under study have already been obtained. 

In [2]:
%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [3]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from NetEmbs import *

# Selects which of the two loading branches below is executed.
MODE = "SimulatedData"


if MODE == "SimulatedData":
    EMBS_PATH = "Simulation_versionMetaDiff_directionCOMBI_walks30_pressure30_window3/TFsteps100000batch64_emb32/"
    # Cached embeddings DataFrame.
    embs = pd.read_pickle(EMBS_PATH + "cache/Embeddings.pkl")
    print("Embeddings have been uploaded to memory!")
    # Supporting journal-entry information from the simulation database.
    d = prepare_data(upload_data("Simulation/FSN_Data.db", limit=None))
    print("Supported information has been uploaded to memory!")

if MODE == "RealData":
    import extras
    import analysis
    EMBS_PATH = "model/15108_2017_versionMetaDiff_directionCOMBI_walks31_pressure30_window3/TFsteps100000batch64_emb32/"
    embs = pd.read_pickle(EMBS_PATH + "cache/Embeddings.pkl")
    # TODO: load your own data here (alternative loader: analysis.analysis("14082_2017")).
    d = extras.getData("15108_2017")
    # TODO: pay attention to the `split` argument; it is enabled only when a "Value" column exists.
    need_split = "Value" in list(d)
    d = prepare_dataMarcel(d, split=need_split)
    # GroundTruth is already available in the embeddings DataFrame, so the duplicate is dropped here.
    if "GroundTruth" in list(d):
        d.drop(columns=["GroundTruth"], inplace=True)

Embeddings have been uploaded to memory!
Original shape of DataFrame is  (61158, 4)
Deleted all NaNs and Strings values from 'Value' column:  (61158, 4)
Before merging FAs columns titles are:  ['ID', 'Name', 'FA_Name', 'Value', 'Debit', 'Credit']
After merging FAs columns titles are:  ['ID', 'FA_Name', 'Credit', 'Debit', 'Name', 'Value']  shape is  (61050, 6)
After normalization shape of DataFrame is  (60740, 8)
Final shape of DataFrame is  (52175, 9)
Supported information has been uploaded to memory!


In [4]:
embs.head(2)

Unnamed: 0,ID,Emb,GroundTruth
0,8,"[-0.19695708, -0.22684997, 0.115069434, 0.1744...",Sales 21 btw
1,9,"[0.15847915, 0.0964187, -0.09496842, -0.130400...",Cost of Sales


In [5]:
d.head(2)

Unnamed: 0,ID,Signature,FA_Name,Credit,Debit,Name,Value,from,amount
0,8,"([('Revenue', 0.82629), ('Tax', 0.17371)], [('...",NoisyRightFA_ilpe,0.0,0.001441,NoisyRightFA_ilpe_6,0.678201,False,0.678201
1,8,"([('Revenue', 0.82629), ('Tax', 0.17371)], [('...",NoisyRightFA_jhzg,0.0,0.003988,NoisyRightFA_jhzg_6,1.877578,False,1.877578


# 3. Interactive visualization

### 3.1 Decrease the dim of embeddings for visualization purpose

In [6]:
import os
from sklearn.manifold import TSNE
os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'
def dim_reduction(df, n_dim=2, rand_state=1):
    """Project the vectors in ``df["Emb"]`` with t-SNE and store the result as ``x`` / ``y``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain an ``Emb`` column holding one array-like per row.
        The frame is modified in place (columns ``x`` and ``y`` are added or overwritten).
    n_dim : int, default 2
        Kept for backward compatibility; currently unused. Only the first two
        t-SNE components are stored.
    rand_state : int, default 1
        Passed to ``TSNE`` as ``random_state``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the ``x`` and ``y`` columns set.

    Raises
    ------
    KeyError
        If ``df`` has no ``Emb`` column.
    """
    if "Emb" not in list(df):
        raise KeyError("No Embs column in the given DataFrame!")
    tsne = TSNE(random_state=rand_state)
    emb_matrix = pd.DataFrame(list(map(np.ravel, df["Emb"])))
    embed_tsne = tsne.fit_transform(emb_matrix)
    # Assign the raw arrays rather than pd.Series: a Series would be aligned on index
    # labels 0..n-1 and produce NaN / misplaced values whenever df has another index.
    df["x"] = embed_tsne[:, 0]
    df["y"] = embed_tsne[:, 1]
    return df

### 3.2 Visualization

In [7]:
from __future__ import print_function
# Standard plotly imports
import plotly.plotly as py
import plotly.graph_objs as go
from plotly.offline import iplot, plot, init_notebook_mode
# Using plotly + cufflinks in offline mode
import cufflinks
cufflinks.go_offline(connected=True)
init_notebook_mode(connected=True)
from ipywidgets import interactive, HBox, VBox, widgets

### WordClouds function

In [8]:
from collections import Counter
from wordcloud import WordCloud
# Count most frequent FA names in the given DataFrame OR FA names with the highest amount
def findMostCommonFAs_v2(df, labels_column="label", words_column="FA_Name", amount_column="amount", sort_mode="freq", n_top=4, vis=False, folder=""):
    """Print (and optionally plot) the top financial-account names of every cluster.

    For each value of ``labels_column`` the rows are grouped by ``(words_column, "from")``.
    Per group the row count, the summed ``amount_column`` and the lists of ``Debit`` and
    ``Credit`` values are collected. The first ``n_top`` groups after sorting are reported
    separately for ``from == True`` ("Left") and ``from == False`` ("Right").

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``labels_column``, ``words_column``, ``amount_column``,
        ``"from"``, ``"Debit"`` and ``"Credit"``.
    labels_column : str
        Column holding the cluster label.
    words_column : str
        Column holding the names that are counted.
    amount_column : str
        Column that is summed per group.
    sort_mode : {"freq", "amount"}
        Sort by row count or by summed amount (descending). Other values leave the
        groups unsorted.
    n_top : int
        Number of groups reported per side.
    vis : bool
        When True, draw word clouds (top row) and histograms (bottom row) per cluster.
    folder : str
        Currently unused.

    Raises
    ------
    KeyError
        If ``labels_column`` or ``"from"`` is missing from ``df``.
    """
    if labels_column not in list(df):
        raise KeyError(f"Given column name {labels_column} is not presented in the given DataFrame! Only allows: {list(df)}!")
    if "from" not in list(df):
        raise KeyError("Please ensure that column 'from' is presented in your DataFrame!")
    for name, group in df.groupby(labels_column):
        print("Current cluster label is ", name)
        gr = group.groupby([words_column, "from"])
        counts = gr.size().to_frame(name='counts')
        per_group = gr.agg({amount_column: "sum", 'Debit': lambda x: list(x), 'Credit': lambda x: list(x)})\
            .rename(columns={amount_column: 'amount_sum', 'Debit': 'Debit_list', 'Credit': 'Credit_list'})
        # Column order after reset_index():
        # 0=words_column, 1="from", 2=counts, 3=amount_sum, 4=Debit_list, 5=Credit_list
        all_stat = counts.join(per_group).reset_index()
        if sort_mode == "freq":
            all_stat.sort_values(['counts', words_column], ascending=False, inplace=True)
        elif sort_mode == "amount":
            # The original referenced the undefined name `word_column` here.
            all_stat.sort_values(['amount_sum', words_column], ascending=False, inplace=True)
        # Statistics of the N_TOP groups per side: (name, count, amount_sum, value list).
        # "Left" carries the Credit list, "Right" the Debit list.
        text = {"Left": [(x[0], x[2], x[3], x[5]) for x in all_stat[all_stat["from"]==True].values[:n_top]],
                "Right": [(x[0], x[2], x[3], x[4]) for x in all_stat[all_stat["from"]==False].values[:n_top]]}
        if vis:
            i = 0
            fig, axes = plt.subplots(2,2)
        for key, data in text.items():
            if sort_mode == "freq":
                # Weight the names by how often they occur
                to_vis = [(str(item[0]), item[1]) for item in data]
            elif sort_mode == "amount":
                # Weight the names by their summed amount
                to_vis = [(str(item[0]), item[2]) for item in data]
            print(key, "--->", [item[:3] for item in data])
            if vis:
                # Word cloud
                axes[0, i].set_title(key, size=24)
                wc = WordCloud(background_color="white", width=800, height=400, max_font_size=84, min_font_size=14, repeat=False, relative_scaling=0.8, max_words=100)
                if len(to_vis)>0:
                    wc.generate_from_frequencies(dict(to_vis))
                else:
                    # Nothing to draw for this side; the column index is not advanced.
                    continue
                axes[0, i].axis("off")
                axes[0, i].imshow(wc, interpolation="bilinear")
                # Histogram, only for groups with more than four values
                for item in data:
                    if len(item[3])>4:
                        sns.distplot(item[3], label=item[0], kde=False, bins=50, ax=axes[1, i], hist_kws={"range": (0, 1.0)})
                axes[1,i].legend(frameon=False, fontsize=14)
                axes[1,i].set_xlim((0,1.0))
                i+=1
        if vis:
            plt.tight_layout()
            plt.show()

### Helper functions

In [9]:
# Transform Matplotlib colormap into plotly colorscale:
import itertools
def matplotlib_to_plotly(color_map="tab10", pl_entries=10):
    """Convert a Matplotlib colormap into a plotly colorscale.

    Parameters
    ----------
    color_map : str
        Name handed to ``matplotlib.cm.get_cmap``.
    pl_entries : int
        Number of entries sampled from the colormap at equally spaced positions in [0, 1].

    Returns
    -------
    list
        ``pl_entries`` items of the form ``[position, 'rgb(r, g, b)']``.
    """
    cmap = matplotlib.cm.get_cmap(color_map)
    # A single entry has no spacing; avoid dividing by zero and sample position 0 only.
    step = 1.0 / (pl_entries - 1) if pl_entries > 1 else 0.0
    pl_colorscale = []

    for k in range(pl_entries):
        position = k * step
        # .tolist() yields plain Python ints, so str() of the tuple reads "(31, 119, 180)"
        # on every NumPy version (NumPy >= 2 scalars would print as "np.uint8(31)").
        red, green, blue = (np.array(cmap(position)[:3]) * 255).astype(np.uint8).tolist()
        pl_colorscale.append([position, 'rgb' + str((red, green, blue))])

    return pl_colorscale

def getColors_Markers(keys, cm="tab10", n_color=10, markers = ["circle", "diamond", "square"]):
    """Assign a colour and a marker symbol to every key.

    Keys are sorted first. Colours cycle through the ``n_color`` entries returned by
    ``matplotlib_to_plotly(cm, n_color)``; the marker changes after every ``n_color``
    keys and the marker sequence repeats once all markers have been used.

    Parameters
    ----------
    keys : iterable
        Sortable keys (e.g. cluster labels).
    cm : str
        Colormap name forwarded to ``matplotlib_to_plotly``.
    n_color : int
        Number of distinct colours.
    markers : list of str
        Marker symbols; the list is not modified.

    Returns
    -------
    (dict, dict)
        ``color_map`` maps key -> colorscale entry ``[position, 'rgb(...)']`` and
        ``marker_map`` maps key -> marker symbol.
    """
    keys = sorted(keys)
    palette = matplotlib_to_plotly(cm, n_color)
    color_map = dict(zip(keys, palette * (len(keys) // n_color + 1)))
    # Each marker is repeated n_color times: [m0]*n_color + [m1]*n_color + ...
    marker_cycle = [marker for marker in markers for _ in range(n_color)]
    if marker_cycle:
        # Repeat by the real cycle length (the original assumed exactly three markers),
        # so every key receives a marker whatever len(markers) is.
        repeats = len(keys) // len(marker_cycle) + 1
        marker_map = dict(zip(keys, marker_cycle * repeats))
    else:
        marker_map = {}
    return color_map, marker_map

### Clustering here

In [10]:
N_CLS = 8
embs = dim_reduction(cl_Agglomerative(embs, N_CLS))

First row of Data: 
 [-0.19695708 -0.22684997  0.11506943  0.17442311  0.20106015 -0.13254054
  0.24126534  0.16323344  0.1949625   0.09444448  0.16678071  0.16417685
 -0.12703338  0.15944019 -0.24492544 -0.00580015 -0.07937735 -0.17278388
 -0.07006469  0.32693356 -0.23947953  0.04118197 -0.16536982  0.20513551
  0.17499836 -0.29295653  0.0757151   0.21822007  0.14713573 -0.05681241
 -0.17912321 -0.12949651]


In [12]:
# TODO before visualization one has to use t-SNE!
# Label text
# Text label; printSignature() in interactiveScatter writes the hovered point's ID/GroundTruth here
description = widgets.Label(
        value=''
    )
# WordClouds area
wordCloudsOutput = widgets.Output()
# Table with JournalEntries data
table_titles = ["ID", "FA_Name", "Credit", "Debit", "label"]

if MODE == "RealData":
    table_titles = ["ID", "FA_Name", "accountDesc", "Credit", "Debit", "label"]

# One alignment entry per column (the RealData table has six columns, not five).
table_align = ['left'] * len(table_titles)
t = go.FigureWidget([go.Table(
    header=dict(values=table_titles,
                fill = dict(color='#C2D4FF'),
                align = table_align),
    cells=dict(values=[],
               fill = dict(color='#F5F8FF'),
               align = table_align))],
                    layout = go.Layout(
                            title="Journal Entries",
                            autosize=True,
                            width=1000,
                            height=400))
# Scatter plot
N_COLORS = 10
WORD_CLOUD_LABEL = "FA_Name"
# LEGEND_TITLE = "GroundTruth"
LEGEND_TITLE = "label"
tmp_p_see = None
# Accumulators for box/lasso selection: the selection callback fires once per trace,
# so the selected indexes are collected here until every trace has reported.
indexes = []
tr_nums = 0


def interactiveScatter(df, df_info, legend_title="label"):
    """Create a FigureWidget with the scatter plot for the given DataFrame.

    One trace is drawn per value of ``legend_title``. Hovering a point writes its ID and
    GroundTruth into the module-level ``description`` label, clicking a point fills the
    module-level table ``t`` and a box/lasso selection fills the table and redraws the
    word clouds in ``wordCloudsOutput``.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``x``, ``y``, ``ID``, ``GroundTruth`` and ``legend_title``.
    df_info : pandas.DataFrame
        Supporting rows, merged with the chosen rows of ``df`` on ``"ID"``.
    legend_title : str
        Column used to split the data into traces.

    Returns
    -------
    plotly.graph_objs.FigureWidget
    """
    cmap, mmap = getColors_Markers(keys=df[legend_title].unique(), n_color=N_COLORS)
    scatter_data = list()
    for name, group in df.groupby(legend_title):
        scatter_data.append(go.Scatter(x=group.x, y=group.y, mode='markers', name=name,
                                    text = group.apply(lambda row: f"ID={row.ID},   GroundTruth={row.GroundTruth}", axis=1),
                                    # Index labels of df, used to look the rows up again in the callbacks
                                    customdata = group.index.to_list(),
                                    marker=dict(color=cmap[name][1],
                                                symbol=mmap[name])))
    f = go.FigureWidget(data=scatter_data,
                       layout = go.Layout(
                           title=f"t-SNE visualisation with coloring based on {legend_title}",
        hovermode='closest',
        autosize=True,
        width=1000,
        height=700))

    def filterRows(selected_ids):
        # customdata holds index *labels*, so rows are selected with .loc; the positional
        # .iloc used before only worked while df had a default 0..n-1 index.
        rows = df.loc[selected_ids]
        return df_info.merge(rows, on="ID")

    def updateTable(chosen_bps):
        t.data[0].cells.values = [chosen_bps[col] for col in t.data[0].header.values]

    def showClouds(chosen_bps):
        wordCloudsOutput.clear_output()
        with wordCloudsOutput:
            # Grouping uses the module-level LEGEND_TITLE, not the legend_title argument.
            findMostCommonFAs_v2(chosen_bps, LEGEND_TITLE, WORD_CLOUD_LABEL, sort_mode="freq", vis=True, n_top=4)

    def printSignature(trace, points, *args):
        if len(points.point_inds)>0:
            ids = trace.customdata[points.point_inds[0]]
            row = df.loc[ids]
            description.value = f"ID={row.ID},   GroundTruth={row.GroundTruth}"

    def selectBP(trace, points, *args):
        if len(points.point_inds)>0:
            ids = trace.customdata[points.point_inds[0]]
            updateTable(filterRows([ids]))

    scatters = f.data
    max_traces = len(scatters)

    def selectBPs(trace,points,selector):
        global indexes
        global tr_nums
        # Called once per trace for a single selection: collect the points of every
        # trace and refresh the table / word clouds after the last trace has reported.
        if points.point_inds:
            indexes.extend([trace.customdata[cur_point] for cur_point in points.point_inds])
        tr_nums = tr_nums+1
        if tr_nums==max_traces:
            selected_data = filterRows(indexes)
            updateTable(selected_data)
            showClouds(selected_data)
            indexes = []
            tr_nums = 0

    # Hover text: ID and GroundTruth
    for scatter in scatters:
        scatter.hoverinfo = 'text'
        scatter.on_hover(printSignature)
        scatter.on_click(selectBP)
        scatter.on_selection(selectBPs)

    return f
# @interact(Coloring=['label', 'GroundTruth'])
# def update(Coloring="label"):
#     print(Coloring)
#     f_scatter = interactiveScatter(embs, Coloring)
#     return VBox([description, f_scatter])
f_scatter = interactiveScatter(embs, d, LEGEND_TITLE)
VBox([description, f_scatter, t, wordCloudsOutput])

VBox(children=(Label(value=''), FigureWidget({
    'data': [{'customdata': [1, 2, 3, ..., 6492, 6501, 6508],
 …

In [None]:
# import sklearn
# sklearn.metrics.pairwise.cosine_similarity(embs[embs.ID==10].Emb.values[0].reshape(1,-1), 
#                                            embs[embs.ID==970].Emb.values[0].reshape(1, -1))

# Time-Series modeling

In [327]:
def crosscorr(data_x, data_y, lag=0):
    """Lag-N cross correlation of two series.

    Parameters
    ----------
    data_x, data_y : pandas.Series
        Series to correlate.
    lag : int, default 0
        Number of periods ``data_x`` is shifted by before the correlation is computed.

    Returns
    ----------
    crosscorr : float
    """
    shifted_x = data_x.shift(lag)
    return shifted_x.corr(data_y)

Our initial hypothesis is that sub-groups of business processes within a company should exhibit some cross-correlation (e.g. the goods-delivery business process follows the sale business process). To investigate this hypothesis further, we aggregate the given Journal Entries (i.e. the raw input data) by predicted cluster label and build time series from these groups with respect to the transaction time.

In [306]:
# TODO helper uploader for obtain Time column. 
if MODE=="SimulatedData":
    # Attach the transaction time of every journal entry to the supporting information.
    entry_times = upload_JournalEntriesTruth("Simulation/FSN_Data.db", limit=None)[["ID", "Time"]]
    df_all = d.merge(entry_times, on="ID").drop(["Signature", "Name"], axis=1)
    print(f"Shape of supported info is {df_all.shape}, shape of embs DataFrame is {embs.shape}")
    df_all = df_all.merge(embs, on="ID")
    print(f"After merge the shape is {df_all.shape}")
    # One row per (ID, FA_Name, from): amounts are summed, every other column keeps its first value.
    simulated_spec = {"amount": lambda x: np.sum(x),
                      "Time": "first",
                      "GroundTruth": "first",
                      "label": "first",
                      "x": "first",
                      "y": "first"}
    df_all = df_all.groupby(["ID", "FA_Name", "from"], as_index=False)\
                   .aggregate(simulated_spec)\
                   .sort_values("Time", ascending=True)
if MODE=="RealData":
    d["Date"] = pd.to_datetime(d["Date"],format='%Y-%m-%d')
    real_spec = {"amount": lambda x: np.sum(x),
                 "Date": "first",
                 "accountDesc": "first"}
    df_all = d.groupby(["ID", "FA_Name", "from"], as_index=False).aggregate(real_spec)
    df_all = df_all.merge(embs, on="ID").sort_values("Date", ascending=True)
    # Index by date; the Date column itself is kept.
    df_all = df_all.set_index(df_all.Date)

Shape of supported info is (52175, 8), shape of embs DataFrame is (6845, 6)
After merge the shape is (52175, 13)


In [302]:
df_all.head(2)

Unnamed: 0,ID,FA_Name,from,amount,Time,GroundTruth,label,x,y
0,8,NoisyRightFA_ilpe,False,0.678201,1.543287,Sales 21 btw,2,-13.842628,-3.367294
18,9,NoisyLeftFA_vjmc,True,1.328013,1.543287,Cost of Sales,0,62.452831,0.478696


## Add DateTimeIndex to simulated data

In [307]:
def addDateTimeIndex(df, minutes_per_unit=28.8, start='2019-01-01'):
    """Return a copy of ``df`` indexed by timestamps derived from its ``Time`` column.

    The original ``Time`` values are kept in a new ``SimulatedTime`` column. Every value
    is turned into ``start + int(value * minutes_per_unit)`` minutes and the resulting
    ``Time`` column becomes the index.

    The input frame is left untouched, so the cell that calls this helper does not
    change the frame it was given.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a numeric ``Time`` column.
    minutes_per_unit : float, default 28.8
        Minutes that correspond to one unit of ``Time``.
    start : str, default '2019-01-01'
        Date of time zero, parsed by ``numpy.datetime64``.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with a ``SimulatedTime`` column and ``Time`` as index.
    """
    out = df.copy()
    out["SimulatedTime"] = out["Time"]
    origin = np.datetime64(start)
    out["Time"] = out["Time"].apply(lambda x: origin + np.timedelta64(int(x * minutes_per_unit), 'm'))
    return out.set_index("Time")
if MODE=="SimulatedData":
    df_all = addDateTimeIndex(df_all)

## Find optimal scaling for simulated data

In [228]:
def addDateTimeIndex(dfA, dfB, mult=55, on="Time", data_column="amount", agg_period="D"):
    """Lag-0 cross correlation of two frames after scaling their time column.

    NOTE: this definition reuses the name of the one-argument ``addDateTimeIndex``
    helper defined earlier in the notebook and replaces it once this cell has run.

    Parameters
    ----------
    dfA, dfB : pandas.DataFrame
        Frames with a numeric ``on`` column and a ``data_column``.
    mult : number, default 55
        Minutes per unit of ``on``; timestamps are 2019-01-01 plus ``int(value * mult)`` minutes.
    on : str
        Column converted into the datetime index.
    data_column : str
        Column that is summed per ``agg_period``.
    agg_period : str
        Resampling rule.

    Returns
    -------
    float
        ``crosscorr`` of the two resampled sums with lag 0.
    """
    def _to_timestamp(value):
        return np.datetime64('2019-01-01')+np.timedelta64(int(value*mult), 'm')

    stamped = [frame.set_index(frame[on].apply(_to_timestamp)) for frame in (dfA, dfB)]
    sums_a, sums_b = [frame[data_column].resample(agg_period).sum() for frame in stamped]
    return crosscorr(sums_a, sums_b, 0)

# Correlate daily sales and collections for every candidate time scaling and rank the
# scalings by the resulting correlation (highest first).
sales_query = {"select": ["Sales 21 btw", "Sales 6 btw"], "_with": None}
collections_query = {"select": ["Collections"], "_with": None}
sales, collections = filterData_v3(df_all, query=[sales_query, collections_query], on="GroundTruth")
res = {cur_m: addDateTimeIndex(sales, collections, agg_period="D", mult=cur_m)
       for cur_m in np.linspace(50, 150, 200)}
res = sorted(res.items(), key=lambda x: x[1], reverse=True)

## Get part of data with required label/GroundTruth

In [221]:
def filterData(df, query=[["Sales 21 btw", "Sales 6 btw"], ["Collections"]], on="GroundTruth"):
    """Split ``df`` into one sub-frame per entry of ``query``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter; must contain the column ``on``.
    query : list of lists
        Every inner list holds the values of ``on`` that are kept for one sub-frame.
        The default is never modified.
    on : str
        Column the values are matched against.

    Returns
    -------
    tuple of pandas.DataFrame
        One frame per query entry. Each frame gets a ``name`` attribute built from the
        query values (with " cluster" appended when ``on == "label"``).
    """
    result = list()
    for q in query:
        # Filter the frame that was passed in (the original read the global df_all).
        subset = df[df[on].isin(q)]
        subset.name = str(q)[1:-1]
        if on == "label":
            subset.name += " cluster"
        result.append(subset)
    return tuple(result)

def filterData_v3(df, 
                  query=[{"select": ["Sales 21 btw", "Sales 6 btw"], 
                              "_with": {"FA_Name": "Revenue", "from": True}}, 
                 {"select": ["Collections"], "_with": None}], 
                  on="GroundTruth"):
    """Filter ``df`` once per query entry.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter.
    query : list of dict
        Every entry has the keys ``"select"`` (list of values of ``on`` to keep, or None
        to keep all rows) and ``"_with"`` (dict ``column -> required value`` applied as
        additional equality filters, or None). The default is never modified.
    on : str
        Column the ``"select"`` values are matched against.

    Returns
    -------
    pandas.DataFrame or tuple of pandas.DataFrame
        A single frame when ``query`` has exactly one entry, otherwise a tuple with one
        frame per entry. Each frame gets a ``name`` attribute: ``str(select)`` followed by
        ``_<value>`` per ``"_with"`` filter and " cluster" when ``on == "label"``.

    Raises
    ------
    KeyError
        If a ``"_with"`` column (or ``on``) is not a column of the frame.
    """
    result = list()
    for q in query:
        postfix = ""
        if q["select"] is None:
            cur_df = df
        else:
            cur_df = df[df[on].isin(q["select"])]
        if q["_with"] is not None:
            for key, value in q["_with"].items():
                try:
                    cur_df = cur_df[cur_df[key]==value]
                    postfix+="_"+str(value)
                except KeyError as e:
                    # The original raised a plain string built from an undefined name.
                    raise KeyError(f"{key} is not in a columns titles!") from e
        result.append(cur_df)
        result[-1].name=str(q["select"])+postfix
        if on == "label":
            result[-1].name+=" cluster"
    if len(result)==1:
        return result[0]
    else:
        return tuple(result)

In [66]:
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import numpy as np
from sklearn import linear_model
def plotAmounts(DFs, title="Default signals", vis_type="line"):
    """Helper function to plot one or several DataFrames in a single graph.

    Parameters
    ----------
    DFs : pandas.DataFrame or sequence of pandas.DataFrame
        Frames with an ``amount`` column. A frame's ``name`` attribute, when present,
        is used as the legend entry.
    title : str
        Figure title (line plot only).
    vis_type : {"line", "scatter"}
        ``"line"`` draws one plotly line per frame against its index.
        ``"scatter"`` needs exactly two frames; they are inner-joined on ``"Time"``
        (MODE == "SimulatedData") or ``"Date"`` (MODE == "RealData") and their amounts
        are drawn with ``seaborn.regplot``. Other combinations draw nothing.

    Raises
    ------
    ValueError
        In scatter mode when the global MODE is neither "SimulatedData" nor "RealData".
    """
    # A bare DataFrame is treated as a collection of one frame. The original handled this
    # case in a branch that passed `layout` to go.Scatter and was skipped for any frame
    # with more than one row.
    if isinstance(DFs, pd.DataFrame):
        DFs = [DFs]
    if vis_type=="line":
        traces = [go.Scatter(x=df.index,
                             y=df.amount,
                             name=getattr(df, "name", None)) for df in DFs]
        fig2 = go.Figure(data=traces,
                         layout = go.Layout(showlegend=True, title=go.layout.Title(text=title), hovermode='closest'))
        iplot(fig2)
    elif vis_type == "scatter" and len(DFs)==2:
        print([df.shape for df in DFs])
        import seaborn as sns
        if MODE=="SimulatedData":
            merge_key = "Time"
        elif MODE=="RealData":
            merge_key = "Date"
        else:
            raise ValueError(f"Unsupported MODE {MODE!r}: expected 'SimulatedData' or 'RealData'")
        sc_data = DFs[0].merge(DFs[1], on=merge_key, how="inner", suffixes=("_X", "_Y"))
        sns.regplot(x=sc_data.amount_X, y=sc_data.amount_Y)

In [223]:
plotAmounts(df_filtered)

## Resample our TimeSeries with Daily/Weekly/Monthly frequencies

In [22]:
sales = df_filtered[0]
collections = df_filtered[1]

In [23]:
# Weekly aggregation: amounts are summed, GroundTruth / label take the most frequent value.
# The rule is "W" so that the data matches the ", weekly" legend text and the `_w` names
# (the series were previously resampled with "D" while still being labelled weekly).
weekly_spec = {"amount": sum, "GroundTruth": pd.Series.mode, "label": pd.Series.mode}
sales_w = sales.resample("W").apply(weekly_spec)
sales_w.name = sales.name+", weekly"
collections_w = collections.resample("W").apply(weekly_spec)
collections_w.name = collections.name+", weekly"

In [48]:
plotAmounts([sales_w, collections_w], vis_type="line")

In [27]:
[crosscorr(sales.amount.resample("W").sum(), collections.amount.resample("W").sum(), lag) for lag in range(4)]

[0.8418523836143519,
 0.42127112457633065,
 -0.3191397249975479,
 0.011603597950486343]

## All in One function

In [218]:
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
legend_postfix = {"W": ", weekly", "D": ", daily", "M": ", monthly"}
def constructSignals(df_all, query=[{"select": [2], "_with": None}, {"select": [4], "_with": None}], 
                     on="label", agg_period="W", vis_type="line"):
    """Filter two signals, aggregate them per period, plot them and print their cross correlations.

    Parameters
    ----------
    df_all : pandas.DataFrame
        Frame handed to ``filterData_v3``; resampled afterwards, so it needs a datetime-like index.
    query : list of two dict
        Query entries in the ``filterData_v3`` format, one per signal.
    on : str
        Column the ``"select"`` values refer to.
    agg_period : str
        Resampling rule; "W", "D" and "M" additionally get a readable legend postfix.
    vis_type : str
        Forwarded to ``plotAmounts``.

    Prints the cross correlation of the two aggregated amounts for lags 0..3.
    """
    agg_title = "Aggregated signals"
    first, second = filterData_v3(df_all, query=query, on=on)
    print("Started aggregation...")
    first_agg = first.resample(agg_period).agg({"amount": sum})
    second_agg = second.resample(agg_period).agg({"amount": sum})
    # Extend the legend texts and the title with the aggregation period when it is known
    try:
        first_agg.name = first.name+legend_postfix[agg_period]
        second_agg.name = second.name+legend_postfix[agg_period]
        agg_title+=legend_postfix[agg_period]
    except KeyError:
        print(f"Could not intepret {agg_period} for adding postfix to legend text... Use the default ones.." )
        first_agg.name = first.name
        second_agg.name = second.name
    plotAmounts([first_agg, second_agg], agg_title, vis_type=vis_type)
    # The per-period sums do not depend on the lag, so they are computed once.
    first_sums = first.amount.resample(agg_period).sum()
    second_sums = second.amount.resample(agg_period).sum()
    correlations = [crosscorr(first_sums, second_sums, lag) for lag in range(4)]
    print(correlations)

In [231]:
constructSignals(df_all, query=[{"select": ["Sales 6 btw", "Sales 21 btw"], 
                              "_with": None}, 
                                {"select": ["Collections"], "_with": None}], on="GroundTruth", agg_period="D", vis_type="line")

Started aggregation...


[0.39022464045334837, 0.23658973916945286, 0.031109297135739417, 0.06962121281838715]


## Tax vs. Revenue based on the Sales , GroundTruth

In [321]:
constructSignals(df_all, query=[{"select": ["Sales 21 btw"], 
                              "_with": {"FA_Name": "Revenue"}}, 
                                {"select": ["Sales 21 btw"], "_with": {"FA_Name": "Tax"}}], on="GroundTruth", agg_period="D", vis_type="line")

Started aggregation...


[0.999985338069011, -0.11383366704061187, -0.24636933332999564, 0.01741564649699069]


## Tax vs. Revenue based on the Sales *as-expert*-cluster

In [None]:
def check_is_sale(df):
    """Flag one journal entry as a sale.

    The flag is True when "Revenue" is among the FA names of the rows with
    ``from == True`` and "TradeReceivables" is among those with ``from == False``.

    Returns a Series with the entries ``ID`` (taken from the first row) and ``is_Sale``.
    """
    from_names = df[df["from"]==True].FA_Name.unique()
    to_names = df[df["from"]==False].FA_Name.unique()
    is_sale = ("Revenue" in from_names) and ("TradeReceivables" in to_names)
    return pd.Series({"ID": df.ID.values[0], "is_Sale": is_sale})
# One is_Sale flag per journal entry (ID), attached to every row of df_all.
# The merge uses only on="ID": combining `on` with left_index=True is a conflicting
# specification that pandas rejects with a MergeError.
sale_flags = df_all.groupby("ID", as_index=False).apply(check_is_sale)
expert_sales = df_all.reset_index()\
        .merge(sale_flags, on="ID")\
        .set_index("Time", drop=True)

In [326]:
constructSignals(expert_sales, query=[{"select": [True], 
                              "_with": {"FA_Name": "Revenue"}}, 
                                {"select": [True], "_with": {"FA_Name": "Tax"}}], on="is_Sale", agg_period="D", vis_type="line")

Started aggregation...


[0.8879032294438138, 0.020334141594050052, -0.13248383971637848, 0.15188214824254614]


## Tax vs. Revenue based on the Sales predicted-cluster

In [320]:
constructSignals(df_all, query=[{"select": [1, 2], 
                              "_with": {"FA_Name": "Revenue"}}, 
                                {"select": [1, 2], "_with": {"FA_Name": "Tax"}}], on="label", agg_period="D", vis_type="line")

Started aggregation...


[0.8879032294438138, 0.020334141594050052, -0.13248383971637848, 0.15188214824254614]


In [334]:
def calculate_corr(df_all, query=[1, 2], on="label", agg_period="2D", lag=0):
    """Cross correlation between the aggregated amounts of two groups.

    ``query`` holds exactly two values of the column ``on``. The rows of each value are
    resampled with ``agg_period``, their ``amount`` is summed, and the two resulting
    series are passed to ``crosscorr`` with the given ``lag``.

    NOTE(review): each subset is resampled on its own, so for multi-day rules such as
    "2D" the bins of the two series may start on different days - confirm.
    """
    subsets = [df_all[df_all[on]==value] for value in query]
    first, second = subsets
    first_sums = first.amount.resample(agg_period).sum()
    second_sums = second.amount.resample(agg_period).sum()
    return crosscorr(first_sums, second_sums, lag)

In [335]:
calculate_corr(df_all, query=[1,2], on="label", agg_period="D")

0.06269490561744173

In [336]:
all_pairs = sorted(df_all.label.unique())

In [392]:
# Pairwise lag-0 correlation between all cluster labels; the diagonal stays 0.
# Rows/columns are addressed by the position of a label in all_pairs, so the loop also
# works when the labels are not exactly 0..n-1 (the original indexed by label value).
corr_matrix = np.zeros((len(all_pairs), len(all_pairs)))
for x_pos, x in enumerate(all_pairs):
    for y_pos, y in enumerate(all_pairs):
        if x!=y:
            corr_matrix[x_pos, y_pos] = calculate_corr(df_all, query=[x,y], on="label", agg_period="D", lag=0)

In [424]:
from plotly.offline import iplot
import plotly.figure_factory as ff
def calculate_corr(df_all, query=[1, 2], on="label", agg_period="2D", lag=0):
    """Cross correlation between the aggregated amounts of two groups.

    This repeats the ``calculate_corr`` definition from an earlier cell of the notebook.

    ``query`` holds exactly two values of the column ``on``; for each value the matching
    rows are resampled with ``agg_period`` and their ``amount`` is summed. The two sums
    are handed to ``crosscorr`` together with ``lag``.
    """
    selected = [df_all[df_all[on]==value] for value in query]
    left, right = selected
    return crosscorr(left.amount.resample(agg_period).sum(),
                     right.amount.resample(agg_period).sum(),
                     lag)

def get_corr_matrix(df, on="label", agg_period="D", lag=0):
    """Matrix of pairwise cross correlations between the groups of ``df[on]``.

    Rows and columns follow the sorted unique values of ``on``. The diagonal is left at
    zero, and a pair whose correlation raises ``FloatingPointError`` is stored as 0.0.
    Every off-diagonal cell is computed with ``calculate_corr``.
    """
    labels = sorted(df[on].unique())
    size = len(labels)
    corr_matrix = np.zeros((size, size))
    for row, first in enumerate(labels):
        for col, second in enumerate(labels):
            if first == second:
                continue
            try:
                value = calculate_corr(df, query=[first, second], on=on, agg_period=agg_period, lag=lag)
            except FloatingPointError:
                value = 0.0
            corr_matrix[row, col] = value
    return corr_matrix
# Text put in front of every axis tick, per grouping column.
axis_prefix = {"label": "Cluster ", "GroundTruth": ""}
def corrHeatmap(corr_matrix, labels, on="label"):
    """Draw an annotated heatmap of a correlation matrix.

    Parameters
    ----------
    corr_matrix : 2-D array
        Correlation values; annotations show them rounded to two decimals.
    labels : sequence
        Group identifiers used for both axes, in matrix order.
    on : str
        Name of the grouping column. It selects the tick prefix from ``axis_prefix``
        (no prefix for columns that are not listed there) and appears in the title.
    """
    # .get keeps the function usable for grouping columns without a configured prefix.
    prefix = axis_prefix.get(on, "")
    ticks = [prefix + str(cl) for cl in labels]
    z_text = np.round(corr_matrix, 2)
    # Diverging scale: blue at the minimum, white in the middle, red at the maximum
    cs = [[0.0, 'rgb(31, 119, 180)'],
          [0.5, 'rgb(255,255,255)'],
          [1, 'rgb(214, 39, 40)']]
    fig = ff.create_annotated_heatmap(corr_matrix, x=ticks, y=list(ticks), annotation_text=z_text,
                                      zmid=0, zmin=-1, zmax=1, colorscale=cs, showscale=True, font_colors=["Black", "Black"])
    fig.layout.title=go.layout.Title(text="Cross-correlation for the "+on)
    iplot(fig)

def correlationMatrix(df, on="label", agg_period="D", lag=0):
    """Compute the pairwise cross-correlation matrix of the groups in ``df[on]`` and plot it.

    The matrix comes from ``get_corr_matrix``; ``corrHeatmap`` draws it with the sorted
    unique values of ``on`` as axis labels.
    """
    matrix = get_corr_matrix(df, on=on, agg_period=agg_period, lag=lag)
    group_labels = sorted(df[on].unique())
    corrHeatmap(matrix, labels=group_labels, on=on)

In [428]:
correlationMatrix(df_all, on="GroundTruth", agg_period="2D")
correlationMatrix(df_all, on="label", agg_period="2D")