In [1]:
import pandas as pd
import numpy as np
import os
from os.path import join
import re
import ast
import sys
import data_patterns

In [2]:
RULES_PATH = '..//solvency2-rules//'
FILENAME_RULES = '2020-01-22 Set aanvullende controleregels Solvency II_tcm46-387021.xlsx'

## Construct test Solvency 2 instance (put here your own data)

In [3]:
RESULTS_PATH = '..\\results\\'
DATA_PATH = '..\\data\\'

df = pd.DataFrame()
df.index.name = "index"
files = [f for f in os.listdir(RESULTS_PATH) if os.path.isfile(os.path.join(RESULTS_PATH, f)) if f[-6:]=='pickle']
for file in files:
    new_df = pd.read_pickle(os.path.join(RESULTS_PATH, file))
    new_df.columns = [col.upper() for col in new_df.columns]
    if list(new_df.index) == [0]: # without z-axis for now
        for col in new_df.columns:
            if col not in df.columns:
                df[col] = new_df[col]
    else:
        for col in new_df.columns:
            if col not in df.columns:
                # we only pick the first line in the z-axis column
                df.loc[0, col] = new_df.iloc[0, new_df.columns.get_loc(col)]
        df.loc[0, new_df.index.name] = new_df.index[0]
df = df.astype(object)

In [4]:
df.head()

Unnamed: 0_level_0,"E.01.01.16.01,EC0020","E.01.01.16.01,EC0030","E.01.01.16.01,EC0040","E.01.01.16.01,EC0050","E.01.01.16.01,EC0060","E.01.01.16.01,EC0010","E.02.01.16.01,EC0010","E.02.01.16.01,ER0050","E.03.01.16.01,EC0020","E.03.01.16.01,ER0030",...,"SR.27.01.01.28,R2421,C0781","T.99.01.01.01,C0050","T.99.01.01.01,C0060","T.99.01.01.01,C0070","T.99.01.01.01,C0080","T.99.01.01.01,C0090","T.99.01.01.01,C0100","T.99.01.01.01,C0110","T.99.01.01.01,C0120","T.99.01.01.01,C0010"
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,"IRAN, ISLAMIC REPUBLIC OF",s2c_CU:SCR,4594460.0,778090000.0,948777000.0,1,362888000.0,Pension entitlements,840734000.0,Home country,...,0,bpfy aq nmai jr,938006000.0,svvf vvzwwn,1999-10-10 00:00:00,80290,796320,0.6436,True,1


## Create simple taxonomy based on instance

In [5]:
df_taxo = pd.DataFrame(columns = ['datapoint', 'template', 'row', 'column', 'dtype'])
for idx, col in enumerate(df.columns):
    df_taxo.loc[idx, "datapoint"] = col.upper()
    df_taxo.loc[idx, "template"] = col[0:13].upper()
    df_taxo.loc[idx, "row"] = col[14:19].upper()
    df_taxo.loc[idx, "column"] = col[20:25].upper()
    df_taxo.loc[idx, "dtype"] = df.dtypes[idx]
df_taxo.head(5)

# for now we only use the list of templates in the instance
instance_templates = list(df_taxo.loc[:, 'template'].unique())
del df_taxo

## Read DNBs Additional Validation Rules

In [6]:
df_rules = pd.read_excel(os.path.join(RULES_PATH, FILENAME_RULES), header = 1)
df_rules = df_rules.set_index('ControleRegelCode')
df_rules = df_rules.drop('S.28.01_129', axis = 0) # double line, should be removed
df_rules = df_rules.drop('S.01.03_110', axis = 0) # double line, should be removed
df_rules.fillna("", inplace = True)


## Parse formulas

In [7]:
def datapoints2pandas(s):
    """Transform EVA2 datapoints to Python Pandas datapoints"""
    datapoints = []
    for item in re.findall(r'{(.*?)}', s):
        datapoints.append(item.upper())
        s = s.replace(item,  item.upper() )
    return s, datapoints

   

def evaluate_strings(expression):
    """Evaluate Python Pandas string for confirmation and exceptions"""

    parameters = {'min_confidence': 0,'min_support'   : 0, 'solvency' : True}
    p2 = {'name'      : 'Pattern 1',
        'expression' : expression,
         'parameters':parameters}
    return p2
    df_patterns = miner.find(p2)
    co = df_patterns.loc[0,'support']
    ex = df_patterns.loc[0,'exceptions']
    if df_patterns.loc[0,'Error message'] != '':
        return "ERROR: "+ df_patterns.loc[0,'Error message']
    return "Correctly parsed (#co=" + str(co)+", #ex="+str(ex)+")"


In [8]:
def transform2pandas(df_data, rule_original):
    rule_original, datapoints = datapoints2pandas(rule_original)
    return  rule_original, datapoints

def transform_rules(df_data, df_rules):
    for row in df_rules.index:
        rule_original = df_rules.loc[row, 'Formule']
        if not isinstance(rule_original, str):
            print("Rule " + row + ": " + "duplicate rule. ")
            rule_original = rule_original.values[0]
        else:
            rule_original, datapoints = transform2pandas(df_data, rule_original)
            print(rule_original)
            df_rules.at[row, 'datapoints'] = ''
            df_rules.at[row, 'datapoints'] = df_rules['datapoints'].astype('object')
            df_rules.at[row, 'datapoints'] = datapoints
            df_rules.at[row, 'templates'] = ''
            df_rules.at[row, 'templates'] = df_rules['templates'].astype('object')
            df_rules.at[row, 'templates'] = [datapoint[0:13].upper() for datapoint in datapoints]
            df_rules.loc[row, 'Formule_input'] = rule_original
            
def evaluate_rule(miner, expression, datapoints, substitutions, expansion_dict):
    if datapoints == []:
        for item in substitutions.keys():
            expression = expression.replace(item, substitutions[item])
        print(evaluate_strings(miner, expression))
    else:
        datapoint = datapoints.pop()
        if datapoint in expansion_dict.keys():
            for d in expansion_dict[datapoint]:
                substitutions[datapoint] = d
                evaluate_rule(miner,expression, datapoints, substitutions, expansion_dict)
        else:
            evaluate_rule(miner,expression, datapoints, substitutions, expansion_dict)

def evaluate_rules(df, df_rules):
    miner = data_patterns.PatternMiner(df)

    for idx in range(len(df_rules.index)):
        row = df_rules.index[idx]
        print(str(idx) + ": ", end='')
        rule_original = df_rules.loc[row, 'Formule_input']
        datapoints = df_rules.loc[row, 'datapoints']
        templates = df_rules.loc[row, 'templates']
        # are the templates in the rule in the instance?
        templates_not_found = []
        for template in templates:
            if template not in instance_templates:
                templates_not_found.append(template)
                
        if templates_not_found == []:
            datapoints_not_found = []
            expansion_dict = {}
            # are the datapoints in the rule in the instance?
            for datapoint in datapoints:
                if datapoint not in df.columns:
                    all_datapoints_found = False
                    new_list = []
                    if datapoint[14]=="C":
                        for col in df.columns:
                            reg = re.search(datapoint[0:14] + "R....," + datapoint[14:], col)
                            if reg:
                                new_list.append(reg.group(0))
                    if datapoint[14]=="R":
                        for col in df.columns:
                            reg = re.search(datapoint + ",C....", col)
                            if reg:
                                new_list.append(reg.group(0))
                    if new_list != []:
                        expansion_dict[datapoint] = new_list
                    else:
                        datapoints_not_found.append(datapoint)
            if datapoints_not_found == []:
                expression = rule_original
                evaluate_rule(miner,expression, datapoints, {}, expansion_dict)
            else:
                print("Datapoints not found: " +str(datapoints_not_found))
        else:
            print("Not all templates in instance: " + str(templates_not_found))

In [9]:
transform_rules(df, df_rules)

IF {S.01.01.01.01,R0580,C0010} = "REPORTED" THEN {S.01.01.01.01,R0590,C0010} <> "REPORTED"
IF {S.01.01.02.01,R0580,C0010} = "REPORTED" THEN {S.01.01.02.01,R0590,C0010} <> "REPORTED"
{S.01.01.02.01,R0590,C0010} <> "REPORTED "
IF {S.02.01.01.01,R0160,C0010} > 0.05*({S.02.01.01.01,R0070,C0010}+{S.02.01.01.01,R0220,C0010}) THEN {S.01.01.01.01,R0160,C0010} = "REPORTED"
IF {S.02.01.01.01,R0160,C0010} > 0.05*({S.02.01.01.01,R0070,C0010}+{S.02.01.01.01,R0220,C0010}) THEN {S.01.01.04.01,R0160,C0010} = "REPORTED"
{S.01.02.01.01,R0050,C0010} = "NETHERLANDS"
{S.01.02.04.01,R0050,C0010} = "NETHERLANDS" 
{S.01.02.01.01,R0070,C0010} = "DUTCH" OR {S.01.02.01.01,R0070,C0010} = "ENGLISH"
{S.01.02.04.01,R0070,C0010} = "DUTCH" OR {S.01.02.04.01,R0070,C0010} = "ENGLISH"
{S.01.02.01.01,R0100,C0010} = "REGULAR REPORTING"
{S.01.02.04.01,R0100,C0010} = "REGULAR REPORTING"
IF {S.01.02.04.01,R0160,C0010}<> "ACCOUNTING CONSOLIDATION-BASED METHOD [METHOD 1]" THEN {S.01.01.04.01,R0710,C0010} = "REPORTED"
IF {S.01.0

IF {S.15.01.04.01,C0040}<>" " THEN {S.15.01.04.01,C0100} > 0
IF {S.15.01.01.01,C0040}<>" " THEN {S.15.01.01.01,C0110} <> " "
IF {S.15.01.04.01,C0040}<>" " THEN {S.15.01.04.01,C0110} <> " "
{S.17.01.01.01,R0330} = {S.17.01.01.01,R0050} + {S.17.01.01.01,R0140} + {S.17.01.01.01,R0240}
{S.17.01.02.01,R0330} = {S.17.01.02.01,R0050} + {S.17.01.02.01,R0140} + {S.17.01.02.01,R0240}
{S.23.01.01.01,C0010} = {S.23.01.01.01,C0020} + {S.23.01.01.01,C0030} + {S.23.01.01.01,C0040}
{S.23.01.01.01,C0010} = {S.23.01.01.01,C0020} + {S.23.01.01.01,C0030} + {S.23.01.01.01,C0040} + {S.23.01.01.01,C0050}
{S.23.01.04.01,C0010} = {S.23.01.04.01,C0020} + {S.23.01.04.01,C0030} + {S.23.01.04.01,C0040} + {S.23.01.04.01,C0050}
{S.23.01.04.01,C0010} = {S.23.01.04.01,C0020} + {S.23.01.04.01,C0030} + {S.23.01.04.01,C0040} + {S.23.01.04.01,C0050}
{S.23.01.04.01,R0530,C0010} = {S.23.01.04.01,R0530,C0020} + {S.23.01.04.01,R0530,C0030} + {S.23.01.04.01,R0530,C0040}
{S.23.01.04.01,R0630,C0010} = {S.23.01.04.01,R0560,C0010}

IF ({S.26.03.04.04,R0400,C0060}>0 OR {S.26.03.04.04,R0400,C0080}>0) THEN ({S.26.03.04.03,R0040,C0010}="SIMPLIFICATIONS USED" OR {S.26.03.04.03,R0040,C0010}="SIMPLIFICATIONS NOT USED")
IF ({S.26.03.01.04,R0500,C0060}>0 OR {S.26.03.01.04,R0500,C0080}>0) THEN ({S.26.03.01.03,R0050,C0010}="SIMPLIFICATIONS USED" OR {S.26.03.01.03,R0050,C0010}="SIMPLIFICATIONS NOT USED")
IF ({S.26.03.04.04,R0500,C0060}>0 OR {S.26.03.04.04,R0500,C0080}>0) THEN ({S.26.03.04.03,R0050,C0010}="SIMPLIFICATIONS USED" OR {S.26.03.04.03,R0050,C0010}="SIMPLIFICATIONS NOT USED")
IF ({S.26.03.01.04,R0700,C0060}>0 OR {S.26.03.01.04,R0700,C0080}>0) THEN ({S.26.03.01.03,R0060,C0010}="SIMPLIFICATIONS USED" OR {S.26.03.01.03,R0060,C0010}="SIMPLIFICATIONS NOT USED")
IF ({S.26.03.04.04,R0700,C0060}>0 OR {S.26.03.04.04,R0700,C0080}>0) THEN ({S.26.03.04.03,R0060,C0010}="SIMPLIFICATIONS USED" OR {S.26.03.04.03,R0060,C0010}="SIMPLIFICATIONS NOT USED")
{S.26.03.01.04,R0800}={S.26.03.01.04,R0900}-({S.26.03.01.04,R0100} + {S.26.03.01

{S.27.01.01.18,R3120,C1110}={S.27.01.01.18,R3120,C1100}-{S.27.01.01.18,R3120,C1120}
{S.27.01.04.18,R3120,C1110}={S.27.01.04.18,R3120,C1100}-{S.27.01.04.18,R3120,C1120}
{S.27.01.01.18,R3120,C1100}=((({S.27.01.01.16,R2920,C1010}**2)+({S.27.01.01.17,R3000,C1060}**2))**0.5)
{S.27.01.04.18,R3120,C1100}=((({S.27.01.04.16,R2920,C1010}**2)+({S.27.01.04.17,R3000,C1060}**2))**0.5)
{S.27.01.01.19,R3200,C1140}={S.27.01.01.19,R3200,C1130}
{S.27.01.04.19,R3200,C1140}={S.27.01.04.19,R3200,C1130}
{S.27.01.01.19,R3210,C1140}={S.27.01.01.19,R3210,C1130}*2.5
{S.27.01.04.19,R3210,C1140}={S.27.01.04.19,R3210,C1130}*2.5
{S.27.01.01.19,R3220,C1140}={S.27.01.01.19,R3220,C1130}*0.4
{S.27.01.04.19,R3220,C1140}={S.27.01.04.19,R3220,C1130}*0.4
{S.27.01.01.19,R3230,C1140}={S.27.01.01.19,R3230,C1130}*2.5
{S.27.01.04.19,R3230,C1140}={S.27.01.04.19,R3230,C1130}*2.5
{S.27.01.01.19,R3240,C1140}={S.27.01.01.19,R3240,C1130}*2.5
{S.27.01.04.19,R3240,C1140}={S.27.01.04.19,R3240,C1130}*2.5
{S.27.01.01.19,R3260} <= 0
{S.27.0

IF {S.35.01.04.01,C0130} <> 0 AND {S.35.01.04.01,C0040}<>"DEDUCTION AND AGGREGATION METHOD [METHOD 2]" THEN {S.35.01.04.01,C0140} <> 0
IF {S.35.01.04.01,C0160} <> 0 AND {S.35.01.04.01,C0040}<>"DEDUCTION AND AGGREGATION METHOD [METHOD 2]" THEN {S.35.01.04.01,C0170} <> 0
IF {S.35.01.04.01,C0190} <> 0 AND {S.35.01.04.01,C0040}<>"DEDUCTION AND AGGREGATION METHOD [METHOD 2]" THEN {S.35.01.04.01,C0200} <> 0
IF {S.35.01.04.01,C0020}<>" " AND {S.35.01.04.01,C0040}<>"DEDUCTION AND AGGREGATION METHOD [METHOD 2]" THEN ({S.35.01.04.01,C0070} + {S.35.01.04.01,C0100} + {S.35.01.04.01,C0130} + {S.35.01.04.01,C0160} + {S.35.01.04.01,C0190} <> 0)
IF {S.36.01.01.01,C0010} <> " " THEN {S.36.01.01.01,C0020} <> " "
IF {S.36.01.01.01,C0010} <> " " THEN {S.36.01.01.01,C0050} <> " "
IF {S.36.01.01.01,C0010} <> " " THEN {S.36.01.01.01,C0100} <> " "
IF {S.36.01.01.01,C0010} <> " " THEN {S.36.01.01.01,C0110} > 0
IF {S.36.01.01.01,C0010} <> " " THEN {S.36.01.01.01,C0130} <> " "
IF {S.36.02.01.01,C0010} <> " " THE

In [10]:
patterns = evaluate_rules(df, df_rules)

0: Not all templates in instance: ['S.01.01.01.01', 'S.01.01.01.01']
1: 

TypeError: evaluate_strings() takes 1 positional argument but 2 were given

In [None]:
patterns

In [None]:
miner = data_patterns.PatternMiner(df)
parameters = {'min_confidence': 0,'min_support'   : 0, 'solvency' : True}
expression =  'IF {S.15.01.01.01,C0040}<>" " THEN {S.15.01.01.01,C0070} >0'
p2 = {'name'      : 'Pattern 1',
    'expression' : expression,
     'parameters':parameters}
p1 = {'name'      : 'Pattern 1',
    'expression' : '{S.01.01.02.01,R0590,C0010} <> "REPORTED "',
     'parameters':parameters}

df_patterns = miner.find([p2,p1])
co = df_patterns.loc[1,'support']
ex = df_patterns.loc[0,'exceptions']
df_patterns

In [None]:
df_rules