In [111]:
import pandas as pd
import numpy as np
import os
import re
import ast
import sys

In [112]:
# Relative directory that holds the notebook's input files; joined with
# FILENAME_RULES when the validation rules are read.
DATA_PATH = '../data'
# File name of a simple taxonomy (presumably a CSV export).
# NOTE(review): not read by any of the visible cells - confirm it is still needed.
FILENAME_TAXO = 'simple_taxo.csv'
# Excel workbook with the additional Solvency II validation rules.
FILENAME_RULES = '2020-01-22 Set aanvullende controleregels Solvency II_tcm46-387021.xlsx'

## Construct test Solvency II instance (put your own data here)

In [113]:
# NOTE(review): this cell reads `df_taxo`, which is only created further down in
# the notebook ("Create simple taxonomy based on instance"), so it fails on a
# fresh kernel. The `df` built here is also reassigned by the next cell.

# Map every datapoint name to the dtype recorded for it in the taxonomy
# (built in one pass instead of a row-by-row .loc loop).
taxo_datatypes = dict(zip(df_taxo["datapoint"], df_taxo["dtype"]))

# One placeholder value per distinct datapoint: 0 for int64, 0.0 for float64 and
# the literal "text" for every other dtype. The dtypes are compared with `==`
# (not used as dict keys) so both dtype objects and dtype names are accepted.
unique_dp = list(df_taxo['datapoint'].unique())
data_dp = [[
    0 if taxo_datatypes[dp] == 'int64'
    else 0.0 if taxo_datatypes[dp] == 'float64'
    else "text"
    for dp in unique_dp
]]

# Single-row test instance with one column per datapoint.
df = pd.DataFrame(columns = unique_dp, data = data_dp)

In [114]:
# Folder with the CSV exports to load. The default is the original author's local
# folder; set the XBRL_RESULTS_PATH environment variable to point somewhere else.
RESULTS_PATH = os.environ.get('XBRL_RESULTS_PATH', 'C:\\Users\\wjwil\\50_results\\xbrl\\')

# Every regular file whose name ends in 'csv', sorted by name so that the row
# order of the combined frame does not depend on the directory listing order.
files = sorted(
    f for f in os.listdir(RESULTS_PATH)
    if os.path.isfile(os.path.join(RESULTS_PATH, f)) and f[-3:] == 'csv'
)
frames = [pd.read_csv(os.path.join(RESULTS_PATH, file)) for file in files]

# Concatenate once: DataFrame.append in a loop copies the accumulated frame on
# every iteration and was removed in pandas 2.0. Columns are sorted explicitly,
# which is what appending to an initially empty frame with sort=True produced.
if frames:
    df = pd.concat(frames, sort = True).sort_index(axis = 1)
else:
    # No CSV files found: keep the empty frame the original loop would leave behind.
    df = pd.DataFrame()

In [115]:
# Number of columns (datapoints) in `df`.
len(df.columns)

1526

## Create simple taxonomy based on instance

In [119]:
# Derive a minimal taxonomy from the instance itself: one record per column of
# `df`. Column names are sliced at fixed positions, i.e. they are expected to
# look like "S.01.01.02.01,r0010,c0010" (13-character template code, then a
# 5-character row code and a 5-character column code, comma separated).
# The records are collected first and the frame is built once; the dtypes are
# paired with the columns by iteration rather than by positional indexing into
# the label-indexed `df.dtypes` Series.
taxo_records = [
    {
        "datapoint": col,
        "template": col[0:13],
        "row": col[14:19],
        "column": col[20:25],
        "dtype": dtype,
    }
    for col, dtype in zip(df.columns, df.dtypes)
]
df_taxo = pd.DataFrame(taxo_records, columns = ['datapoint', 'template', 'row', 'column', 'dtype'])
df_taxo.head(5)

Unnamed: 0,datapoint,template,row,column,dtype
0,"S.01.01.02.01,r0010,c0010",S.01.01.02.01,r0010,c0010,object
1,"S.01.01.02.01,r0030,c0010",S.01.01.02.01,r0030,c0010,object
2,"S.01.01.02.01,r0110,c0010",S.01.01.02.01,r0110,c0010,object
3,"S.01.01.02.01,r0140,c0010",S.01.01.02.01,r0140,c0010,object
4,"S.01.01.02.01,r0150,c0010",S.01.01.02.01,r0150,c0010,object


## Read DNB's Additional Validation Rules

In [101]:
# Load the validation rules workbook and prepare it in a single chain:
#  - the column names are taken from the second row of the sheet (header = 1),
#  - the rule code becomes the index,
#  - rule 'S.28.01_129' is removed (NOTE(review): the reason is not recorded
#    in this notebook - confirm it is still required),
#  - missing cells are turned into empty strings.
df_vr = (
    pd.read_excel(os.path.join(DATA_PATH, FILENAME_RULES), header = 1)
    .set_index('ControleRegelCode')
    .drop('S.28.01_129', axis = 0)
    .fillna("")
)

## Parse formulas

In [102]:
def replace_and_or(s):
    """Replace AND by & and OR by |, but not within single-quoted strings.

    The text is split into alternating unquoted and quoted segments; only the
    unquoted segments are rewritten, so literals such as 'AND' stay untouched.
    Only the stand-alone upper-case words AND / OR are replaced, which keeps
    names that merely contain those letters (e.g. FLOOR) intact.

    Parameters
    ----------
    s : str
        Expression in which double quotes were already converted to single quotes.

    Returns
    -------
    str
        The expression with the logical operators translated.
    """
    # The capturing group makes re.split keep the quoted literals: even
    # positions hold text outside quotes, odd positions hold the literals.
    # An unterminated quote never matches, so that text is treated as unquoted.
    segments = re.split(r"('[^']*')", s)
    for pos in range(0, len(segments), 2):
        unquoted = re.sub(r"\bOR\b", "|", segments[pos])
        segments[pos] = re.sub(r"\bAND\b", "&", unquoted)
    return "".join(segments)

def preprocess(s):
    """Translate an EVA2 formula into the operator syntax used by Python / pandas.

    The textual substitutions are applied strictly in the order listed, because
    later entries repair the side effects of earlier ones (doubling every "="
    also turns ">=" into ">==", which is then mapped back).
    """
    before_logical_operators = (
        ("=", "=="),
        (">==", ">="),
        ("<==", "<="),
        ("<>", "!="),
        ("< >", "!="),  # the space between < and > should be deleted in EVA2
        ('"', "'"),
    )
    after_logical_operators = (
        (" )", ")"),
        (';', ","),  # this should be corrected in EVA2
    )

    text = s
    for old, new in before_logical_operators:
        text = text.replace(old, new)
    # Quotes are normalised by now, so string literals can be recognised.
    text = replace_and_or(text)
    for old, new in after_logical_operators:
        text = text.replace(old, new)
    return text

def transform_datapoints(s, columns):
    """Transform EVA2 datapoints to Python Pandas datapoints.

    Every ``{name}`` reference in ``s`` is rewritten to ``df['name']``.

    Parameters
    ----------
    s : str
        Expression containing datapoint references in curly braces.
    columns : iterable of str
        Names of the datapoints that are available (e.g. ``df.columns``).

    Returns
    -------
    tuple (str, list, list)
        The rewritten expression, the referenced datapoints that occur in
        ``columns`` and the referenced datapoints that do not. Both lists follow
        the order of appearance and contain a datapoint once per occurrence.
    """
    # Build the lookup once; the membership test runs for every datapoint.
    known = set(columns)
    res = s
    found = []
    not_found = []
    for item in re.findall(r'{(.*?)}', s):
        res = res.replace("{"+item+"}", "df['"+item+"']")
        if item in known:
            found.append(item)
        else:
            not_found.append(item)
    return res, found, not_found

def add_brackets(s):
    """Parenthesise both operands of & and | (EVA2 is not consistent in this).

    The expression is split at its last & or | and both halves are processed
    recursively. Without a logical operator it is split at the last comparison
    character and the whitespace around that character is removed.
    TODO: should not apply if AND or OR is in string text
    """
    # & and | take priority over other operators like ==
    logical = re.search(r'(.*)([&|\|])(.*)', s)
    if logical is not None:
        left = add_brackets(logical.group(1))
        right = add_brackets(logical.group(3))
        return '(' + left + ') ' + logical.group(2).strip() + ' (' + right + ')'

    # The second group is a character class, so it matches ONE of the characters
    # > | < ! = at a time; multi-character operators are rebuilt by the recursion.
    comparison = re.search(r'(.*)([>|<|!=|<=|>=|==])(.*)', s)
    if comparison is None:
        return s.strip()
    return (
        add_brackets(comparison.group(1))
        + comparison.group(2).strip()
        + add_brackets(comparison.group(3))
    )
    
def transform_conditional_expression(g):
    """Build the pandas selections for confirmations and exceptions of a rule.

    For ``IF a THEN b`` the confirmations are the rows where a and b hold and the
    exceptions the rows where a holds but b does not. An expression without
    IF ... THEN is used as the condition itself and its negation.

    Returns the tuple (co_str, ex_str) of expression strings.
    """
    conditional = re.search(r'IF(.*)THEN(.*)', g)
    if conditional is None:
        body = add_brackets(g)
        return 'df[(' + body + ')]', 'df[~(' + body + ')]'

    premise = add_brackets(conditional.group(1))
    consequence = add_brackets(conditional.group(2))
    selection_start = 'df[(' + premise + ') & '
    co_str = selection_start + '(' + consequence + ")]"
    ex_str = selection_start + '~(' + consequence + ")]"
    return co_str, ex_str

def evaluate_strings(co_str, ex_str):
    """Evaluate Python Pandas string for confirmation and exceptions"""
    try:
        co = len(eval(co_str, {'df': df, 'MAX': np.maximum, 'MIN': np.minimum, 'SUM': np.sum}))
        ex = len(eval(ex_str, {'df': df, 'MAX': np.maximum, 'MIN': np.minimum, 'SUM': np.sum}))
        return "Correctly parsed (#co=" + str(co)+", #ex="+str(ex)+")"
    except TypeError as e:
        return "Parse error: " + co_str + ": " + str(e)
    except:
        return "Parse error: " + co_str + ": UNKNOWN ERROR"
        
def get_all_datapoints(template, dim):
    """List the complete datapoints of a template for one row or one column code.

    Looks in the module-level taxonomy ``df_taxo``. A ``dim`` containing 'r' is
    treated as a row code and is combined with every column recorded for that
    row; otherwise a ``dim`` containing 'c' is treated as a column code and is
    combined with every row recorded for that column. If the only counterpart
    found is the empty string, the datapoint is just "template,dim". Returns
    None when ``dim`` contains neither letter.
    """
    key = dim.lower()
    if 'r' in key:
        match_on, collect = 'row', 'column'
    elif 'c' in key:
        match_on, collect = 'column', 'row'
    else:
        return None

    # The taxonomy is filtered on `dim` as given; the names are built from its
    # lower-cased form.
    mask = (df_taxo['template']==template) & (df_taxo[match_on]==dim)
    counterparts = list(df_taxo[mask][collect].values)
    if counterparts == ['']:
        return [template + "," + key]
    if match_on == 'row':
        return [template + "," + key + "," + other for other in counterparts]
    return [template + "," + other + "," + key for other in counterparts]

In [108]:
def evaluate_rule(df_data, original):
    """Translate one EVA2 rule and evaluate it.

    Parameters
    ----------
    df_data : pandas.DataFrame
        Instance data; only its column names are used here, to decide which
        datapoints of the rule exist.
    original : str
        The rule formula in EVA2 syntax.

    Returns
    -------
    str
        A status message: the result of ``evaluate_strings`` when every
        datapoint exists, "Datapoints not found: [...]" when a missing datapoint
        cannot be expanded, "Correctly parsed : N rules evaluated" when all N
        expanded rules evaluate, or a "Parse error: ..." summary when at least
        one expanded rule fails.
    """
    g = preprocess(original)
    g, _, not_found = transform_datapoints(g, df_data.columns)
    if not_found == []:
        # Simple expression with complete datapoints
        co_str, ex_str = transform_conditional_expression(g)
        print(co_str)
        return evaluate_strings(co_str, ex_str)

    # A datapoint can be unknown for three reasons:
    # - template is missing
    # - column is missing
    # - row is missing
    # NOTE(review): only the FIRST unknown datapoint is expanded (the original
    # loop returned during its first iteration). A rule with several incomplete
    # datapoints therefore still contains unknown names after the expansion.
    datapoint = not_found[0]
    template = datapoint[0:13]
    dim = datapoint[14:]
    expansions = get_all_datapoints(template, dim)
    if not expansions:
        return "Datapoints not found: " + str(not_found)

    failures = []
    for item in expansions:
        expanded = g.replace(datapoint, item)
        co_str, ex_str = transform_conditional_expression(expanded)
        res = evaluate_strings(co_str, ex_str)
        if res.startswith("Parse error"):
            failures.append(res)

    # The individual results used to be thrown away, so failed expansions were
    # reported as "Correctly parsed"; report them instead.
    if failures:
        return ("Parse error: " + str(len(failures)) + " of " + str(len(expansions))
                + " expanded rules failed, first: " + failures[0])
    return "Correctly parsed : " + str(len(expansions)) + " rules evaluated"

def evaluate_rules(df_data, df_rules, df_taxo):
    """Evaluate every rule in ``df_rules`` and print one status line per rule.

    ``df_rules`` is indexed by rule code and holds the formula in its 'Formule'
    column. When a rule code occurs more than once the lookup does not yield a
    single string; the first formula is then used and the line is marked as a
    duplicate. ``df_taxo`` is accepted but not used inside this function.
    """
    for code in df_rules.index:
        print("Rule " + code + ": ", end='')
        formula = df_rules.loc[code, 'Formule']
        if isinstance(formula, str):
            outcome = evaluate_rule(df_data, formula)
        else:
            print("duplicate rule. ", end = '')
            outcome = evaluate_rule(df_data, formula.values[0])
        print(outcome)

In [109]:
# Evaluate all validation rules against the loaded instance data; one status
# line is printed per rule.
evaluate_rules(df, df_vr, df_taxo)

Rule S.01.01_111: Datapoints not found: ['S.01.01.01.01,r0580,c0010', 'S.01.01.01.01,r0590,c0010']
Rule S.01.01_112: df[(df['S.01.01.02.01,r0580,c0010']=='REPORTED') & (df['S.01.01.02.01,r0590,c0010']!='REPORTED')]
Correctly parsed (#co=0, #ex=0)
Rule S.01.01_113: df[(df['S.01.01.02.01,r0590,c0010']!='REPORTED ')]
Correctly parsed (#co=28, #ex=0)
Rule S.01.01_114: Datapoints not found: ['S.02.01.01.01,r0160,c0010', 'S.02.01.01.01,r0070,c0010', 'S.02.01.01.01,r0220,c0010', 'S.01.01.01.01,r0160,c0010']
Rule S.01.01_115: Datapoints not found: ['S.02.01.01.01,r0160,c0010', 'S.02.01.01.01,r0070,c0010', 'S.02.01.01.01,r0220,c0010', 'S.01.01.04.01,r0160,c0010']
Rule S.01.02_102: df[(df['S.01.02.01.01,r0050,c0010']=='NETHERLANDS')]
Correctly parsed (#co=0, #ex=28)
Rule S.01.02_103: Datapoints not found: ['S.01.02.04.01,r0050,c0010']
Rule S.01.02_104: df[((df['S.01.02.01.01,r0070,c0010']=='DUTCH') | (df['S.01.02.01.01,r0070,c0010']=='ENGLISH'))]
Correctly parsed (#co=0, #ex=28)
Rule S.01.02_105

Rule S.07.01_104: Datapoints not found: ['S.07.01.01.01,c0040', 'S.07.01.01.01,c0080']
Rule S.07.01_105: Datapoints not found: ['S.07.01.04.01,c0040', 'S.07.01.04.01,c0080']
Rule S.07.01_106: Datapoints not found: ['S.07.01.01.01,c0040', 'S.07.01.01.01,c0090']
Rule S.07.01_107: Datapoints not found: ['S.07.01.04.01,c0040', 'S.07.01.04.01,c0090']
Rule S.07.01_108: Datapoints not found: ['S.07.01.01.01,c0040', 'S.07.01.01.01,c0110']
Rule S.07.01_109: Datapoints not found: ['S.07.01.04.01,c0040', 'S.07.01.04.01,c0110']
Rule S.07.01_110: Datapoints not found: ['S.07.01.01.01,c0040', 'S.07.01.01.01,c0120']
Rule S.07.01_111: Datapoints not found: ['S.07.01.04.01,c0040', 'S.07.01.04.01,c0120']
Rule S.07.01_112: Datapoints not found: ['S.07.01.01.01,c0140', 'S.07.01.01.01,c0130']
Rule S.07.01_113: Datapoints not found: ['S.07.01.04.01,c0140', 'S.07.01.04.01,c0130']
Rule S.07.01_114: Datapoints not found: ['S.07.01.01.01,c0140', 'S.07.01.01.01,c0130']
Rule S.07.01_115: Datapoints not found: ['S

Rule S.24.01_113: Datapoints not found: ['S.24.01.01.06,c0310', 'S.24.01.01.06,c0330']
Rule S.24.01_114: Datapoints not found: ['S.24.01.01.06,c0330', 'S.24.01.01.06,c0340', 'S.24.01.01.06,c0350', 'S.24.01.01.06,c0360']
Rule S.24.01_115: Datapoints not found: ['S.24.01.01.07,c0380', 'S.24.01.01.07,c0370']
Rule S.24.01_117: Datapoints not found: ['S.24.01.01.07,c0400', 'S.24.01.01.07,c0410', 'S.24.01.01.07,c0420', 'S.24.01.01.07,c0430']
Rule S.24.01_118: Datapoints not found: ['S.24.01.01.08,c0450', 'S.24.01.01.08,c0440']
Rule S.24.01_119: Datapoints not found: ['S.24.01.01.08,c0450', 'S.24.01.01.08,c0470']
Rule S.24.01_120: Datapoints not found: ['S.24.01.01.08,c0470', 'S.24.01.01.08,c0480', 'S.24.01.01.08,c0490', 'S.24.01.01.08,c0500']
Rule S.24.01_121: Datapoints not found: ['S.24.01.01.09,c0520', 'S.24.01.01.09,c0510']
Rule S.24.01_122: Datapoints not found: ['S.24.01.01.09,c0520', 'S.24.01.01.09,c0540']
Rule S.24.01_123: Datapoints not found: ['S.24.01.01.09,c0540', 'S.24.01.01.09,

Rule S.27.01_241: Datapoints not found: ['S.27.01.04.10,r2300,c0740', 'S.27.01.04.10,r2300,c0710', 'S.27.01.04.10,r2300,c0720', 'S.27.01.04.10,r2300,c0730']
Rule S.27.01_242: Datapoints not found: ['S.27.01.01.11,r2400,c0760', 'S.27.01.01.09,r2200,c0610', 'S.27.01.01.10,r2300,c0710']
Rule S.27.01_243: Datapoints not found: ['S.27.01.04.11,r2400,c0760', 'S.27.01.04.09,r2200,c0610', 'S.27.01.04.10,r2300,c0710']
Rule S.27.01_244: Datapoints not found: ['S.27.01.01.11,r2400,c0780', 'S.27.01.01.09,r2200,c0640', 'S.27.01.01.10,r2300,c0740']
Rule S.27.01_245: Datapoints not found: ['S.27.01.04.11,r2400,c0780', 'S.27.01.04.09,r2200,c0640', 'S.27.01.04.10,r2300,c0740']
Rule S.27.01_246: Datapoints not found: ['S.27.01.01.11,r2410']
Rule S.27.01_247: Datapoints not found: ['S.27.01.04.11,r2410']
Rule S.27.01_248: Datapoints not found: ['S.27.01.01.12,r2500,c0810', 'S.27.01.01.12,r2500,c0790', 'S.27.01.01.12,r2500,c0800']
Rule S.27.01_249: Datapoints not found: ['S.27.01.04.12,r2500,c0810', 'S.27

Rule S.28.01_100: df[(df['S.28.01.01.01,r0010,c0010']==MAX(0, df['S.28.01.01.02,r0020,c0020']*0.047) + MAX(0, df['S.28.01.01.02,r0020,c0030'] * 0.047) + MAX(0, df['S.28.01.01.02,r0030,c0020'] * 0.131) + MAX(0, df['S.28.01.01.02,r0030,c0030'] * 0.085) + MAX(0, df['S.28.01.01.02,r0040,c0020'] * 0.107) + MAX(0, df['S.28.01.01.02,r0040,c0030'] * 0.075) + MAX(0, df['S.28.01.01.02,r0050,c0020'] * 0.085) + MAX(0, df['S.28.01.01.02,r0050,c0030'] * 0.094) + MAX(0, df['S.28.01.01.02,r0060,c0020'] * 0.075) + MAX(0, df['S.28.01.01.02,r0060,c0030'] * 0.075) + MAX(0, df['S.28.01.01.02,r0070,c0020'] * 0.103) + MAX(0, df['S.28.01.01.02,r0070,c0030'] * 0.14) + MAX(0, df['S.28.01.01.02,r0080,c0020'] * 0.094) + MAX(0, df['S.28.01.01.02,r0080,c0030'] * 0.075) + MAX(0, df['S.28.01.01.02,r0090,c0020'] * 0.103) + MAX(0, df['S.28.01.01.02,r0090,c0030'] * 0.131) + MAX(0, df['S.28.01.01.02,r0100,c0020'] *0.177) + MAX(0, df['S.28.01.01.02,r0100,c0030'] * 0.113) + MAX(0, df['S.28.01.01.02,r0110,c0020'] * 0.113) +

Rule S.37.01_108: Datapoints not found: ['S.38.01.10.01,r0010,c0010']
Rule S.37.01_109: Datapoints not found: ['S.38.01.10.01,r0020,c0010']
Rule S.37.01_111: Datapoints not found: ['S.40.01.10.01,r0010,c0010']
Rule S.37.01_112: Datapoints not found: ['S.41.01.11.01,c0010']




In [37]:
            # Datapoints in expression should be expanded with the content of Rijen and Kolommen
#             to_select_rows = df_rules.loc[row, "Rijen"].replace("(", "").replace(")", "")
#             to_select_columns = df_rules.loc[row, "Kolommen"].replace("(", "").replace(")", "")
#             if (to_select_rows != "") and not('all' in to_select_rows.lower()):
#                 to_select_rows = ["r" + r if len(r)==4 else r for r in to_select_rows.split(";")]
# #               print(to_select_rows)
#             if (to_select_columns != "") and not('all' in to_select_columns.lower()): 
#                 to_select_columns = ["c" + r if len(r)==4 else r for r in to_select_columns.split(";")]
# #               print(to_select_columns)


#                 if l is None:
#                     print("Datapoint string: " + str(datapoint))
#                 else:
#                     if len(l) == 0:
#                         print("Datapoint not found " + str(datapoint))
#                     else:
#                         expansion.append(l)
#             if expansion !=[]:
#                 print("Not yet implemented: expand possible ")
#                 for row in to_select_rows:
#                     a = datapoint[0:13] + "," + row + datapoint[13:19]
#                     if a in df_taxo['datapoint'].values:
#                         print(a + ": found")

In [110]:
# Manual check of the selection printed for rule S.01.01_113 in the output above.
# Note that the literal 'REPORTED ' ends with a space.
df[(df['S.01.01.02.01,r0590,c0010']!='REPORTED ')]

Unnamed: 0.1,"S.01.01.02.01,r0010,c0010","S.01.01.02.01,r0030,c0010","S.01.01.02.01,r0110,c0010","S.01.01.02.01,r0140,c0010","S.01.01.02.01,r0150,c0010","S.01.01.02.01,r0170,c0010","S.01.01.02.01,r0180,c0010","S.01.01.02.01,r0220,c0010","S.01.01.02.01,r0290,c0010","S.01.01.02.01,r0410,c0010",...,"S.28.02.01.06,r0560,c0150","T.99.01.01.01,*artificial key*|""mandatory"",c0050","T.99.01.01.01,*artificial key*|""mandatory"",c0060","T.99.01.01.01,*artificial key*|""mandatory"",c0070","T.99.01.01.01,*artificial key*|""mandatory"",c0080","T.99.01.01.01,*artificial key*|""mandatory"",c0090","T.99.01.01.01,*artificial key*|""mandatory"",c0100","T.99.01.01.01,*artificial key*|""mandatory"",c0110","T.99.01.01.01,*artificial key*|""mandatory"",c0120",Unnamed: 0
0,1 - Reported,6 - Exempted under Article 35 (6) to (8),6 - Exempted under Article 35 (6) to (8),7 - Not due annually as reported for Quarter 4...,1 - Reported,0 - Not reported other reason (in this case sp...,2 - Not reported as no derivative transactions,1 - Reported,6 - Exempted under Article 35 (6) to (8),1 - Reported,...,,,,,,,,,,0
0,,,,,,,,,,,...,,,,,,,,,,0
0,,,,,,,,,,,...,,,,,,,,,,0
0,,,,,,,,,,,...,,,,,,,,,,0
0,,,,,,,,,,,...,,,,,,,,,,0
0,,,,,,,,,,,...,,,,,,,,,,0
0,,,,,,,,,,,...,,,,,,,,,,0
0,,,,,,,,,,,...,,,,,,,,,,0
0,,,,,,,,,,,...,,,,,,,,,,0
0,,,,,,,,,,,...,,,,,,,,,,0
