In [None]:
import pandas as pd
import numpy as np
import os
from os.path import join
import re
import ast
import sys

In [None]:
# Directory that holds the rule workbook and the workbook's file name;
# the two are joined with os.path.join when the rules are read.
RULES_PATH = '..//solvency2-rules//'
FILENAME_RULES = '2020-01-22 Set aanvullende controleregels Solvency II_tcm46-387021.xlsx'

## Construct test Solvency 2 instance (put your own data here)

In [None]:
# Directories with the pickled template results and the input data.
# os.path.join picks the right separator on every platform; the trailing ''
# keeps the trailing separator the original constants had.
RESULTS_PATH = os.path.join('..', 'results', '')
DATA_PATH = os.path.join('..', 'data', '')

# Collect the datapoints of all pickled templates in one frame with a single row (index 0).
df = pd.DataFrame()
df.index.name = "index"
# Sorted so that the "first file wins" rule for duplicate columns below does not
# depend on the arbitrary order in which os.listdir returns the files.
files = sorted(
    f for f in os.listdir(RESULTS_PATH)
    if os.path.isfile(os.path.join(RESULTS_PATH, f)) and f.endswith('pickle')
)
for file in files:
    # NOTE(review): unpickling can execute arbitrary code -- only load files you produced yourself.
    new_df = pd.read_pickle(os.path.join(RESULTS_PATH, file))
    new_df.columns = [col.upper() for col in new_df.columns]
    if list(new_df.index) == [0]: # without z-axis for now
        for col in new_df.columns:
            if col not in df.columns:
                df[col] = new_df[col]
    else:
        for col in new_df.columns:
            if col not in df.columns:
                # we only pick the first line in the z-axis column
                df.loc[0, col] = new_df.iloc[0, new_df.columns.get_loc(col)]
        # keep the index value of that first line in a column named after the index
        df.loc[0, new_df.index.name] = new_df.index[0]
# object dtype for every column, so values are kept as they were read
df = df.astype(object)

In [None]:
# Display the instance DataFrame assembled in the previous cell.
df

## Create simple taxonomy based on instance

In [None]:
# Derive a minimal taxonomy (template / row / column per datapoint) from the column
# names of the instance. The slices assume names shaped like
# 'S.01.01.01.01,R0010,C0010' -- TODO confirm this holds for every template.
# The frame is built in one go from a list of records, and dtypes are read by
# position with .iloc (plain [] on a label-indexed Series is label based).
df_taxo = pd.DataFrame(
    [
        {
            "datapoint": col.upper(),
            "template": col[0:13].upper(),
            "row": col[14:19].upper(),
            "column": col[20:25].upper(),
            "dtype": df.dtypes.iloc[idx],
        }
        for idx, col in enumerate(df.columns)
    ],
    columns=['datapoint', 'template', 'row', 'column', 'dtype'],
)

# for now we only use the list of templates in the instance
instance_templates = list(df_taxo.loc[:, 'template'].unique())
del df_taxo

## Read DNB's Additional Validation Rules

In [None]:
# Read the rule workbook (header=1: the second row holds the column names), index it
# by rule code, drop the two rule codes that occur twice and replace empty cells by ''.
df_rules = (
    pd.read_excel(os.path.join(RULES_PATH, FILENAME_RULES), header = 1)
    .set_index('ControleRegelCode')
    .drop('S.28.01_129', axis = 0)  # double line, should be removed
    .drop('S.01.03_110', axis = 0)  # double line, should be removed
    .fillna("")
)

## Parse formulas

In [None]:
def replace_and_or(s):
    """Replace OR by | and AND by &, but not within single-quoted strings.

    The text is split into code and '...' literals; only the code parts are
    rewritten, so an operator name inside a literal is left untouched and every
    code part gets both replacements.

    Parameters
    ----------
    s : str
        Rule text in which string literals are delimited by single quotes.

    Returns
    -------
    str
        The text with OR / AND replaced outside string literals.

    Notes
    -----
    The replacement is a plain substring replacement, so "OR" and "AND" are also
    replaced when they are part of a longer word outside a literal.
    An unpaired quote does not start a literal; the text around it is treated as code.
    """
    # Splitting on a capturing group keeps the literals in the result:
    # even positions hold code, odd positions hold the quoted literals.
    parts = re.split(r"('[^']*')", s)
    for pos in range(0, len(parts), 2):
        parts[pos] = parts[pos].replace("OR", "|").replace("AND", "&")
    return "".join(parts)

def preprocess(s):
    """Rewrite the operators and punctuation of an EVA2 rule to their Python form.

    The replacements are applied one after another in the order listed below;
    the order matters because every '=' is doubled first and the comparison
    operators that were doubled by accident are repaired afterwards.
    """
    before_logical_operators = (
        ("=", "=="),
        (">==", ">="),
        ("<==", "<="),
        ("<>", "!="),
        ("< >", "!="),  # the space between < and > should be deleted in EVA2
        ('"', "'"),
    )
    after_logical_operators = (
        (" )", ")"),
        (';', ","),  # this should be corrected in EVA2
    )

    text = s
    for old, new in before_logical_operators:
        text = text.replace(old, new)
    # runs after the quote normalisation above, so all literals are single-quoted here
    text = replace_and_or(text)
    for old, new in after_logical_operators:
        text = text.replace(old, new)
    return text

def datapoints2pandas(s):
    """Turn every {datapoint} reference into a pandas column lookup.

    Returns a tuple: the rewritten text, in which each {name} has become
    df['NAME'], and the list of upper-cased names in order of appearance
    (a name that occurs more than once is listed more than once).
    """
    names = re.findall(r'{(.*?)}', s)
    converted = s
    for name in names:
        converted = converted.replace("{" + name + "}", "df['" + name.upper() + "']")
    return converted, [name.upper() for name in names]

def add_brackets(s):
    """Put brackets around the operands of & and | (EVA2 is not consistent in this).

    The text is split at the last & or |; both sides are processed recursively
    and wrapped in brackets. Text without & or | is split at the last comparison
    character instead and glued back together without surrounding whitespace.
    The comparison pattern is a character class, so it matches one character at a
    time; two-character operators are put back together by the recursion.

    TODO: should not be applied if & or | occurs inside a string literal
    """
    logical = re.search(r'(.*)([&|\|])(.*)', s) # & and | take priority over comparisons like ==
    if logical is not None:
        left, operator, right = logical.groups()
        return '(' + add_brackets(left) + ') ' + operator.strip() + ' (' + add_brackets(right) + ')'

    comparison = re.search(r'(.*)([>|<|!=|<=|>=|==])(.*)', s)
    if comparison is None:
        # nothing left to split
        return s.strip()

    left, operator, right = comparison.groups()
    return add_brackets(left) + operator.strip() + add_brackets(right)
    
def expression2pandas(g):
    """Build the pandas selections for confirmations and exceptions of a rule.

    For a rule of the form IF <condition> THEN <consequence> the confirmations are
    the rows where both hold and the exceptions the rows where the condition holds
    and the consequence does not. Any other rule is one expression: rows where it
    holds are confirmations, all other rows are exceptions.

    Returns the tuple (co_str, ex_str) of Python source strings.
    """
    conditional = re.search(r'IF(.*)THEN(.*)', g)
    if conditional is None:
        expression = add_brackets(g)
        return 'df[(' + expression + ')]', 'df[~(' + expression + ')]'

    condition = add_brackets(conditional.group(1))
    consequence = add_brackets(conditional.group(2))
    selection_start = 'df[(' + condition + ') & '
    co_str = selection_start + '(' + consequence + ")]"
    ex_str = selection_start + '~(' + consequence + ")]"
    return co_str, ex_str

def evaluate_strings(df_data, co_str, ex_str):
    """Evaluate Python Pandas string for confirmation and exceptions"""
    try:
        co = len(eval(co_str, {'df': df_data, 'MAX': np.maximum, 'MIN': np.minimum, 'SUM': np.sum}))
        ex = len(eval(ex_str, {'df': df_data, 'MAX': np.maximum, 'MIN': np.minimum, 'SUM': np.sum}))
        return "Correctly parsed (#co=" + str(co)+", #ex="+str(ex)+")"
    except TypeError as e:
        return "Parse error: " + co_str + ": " + str(e)
    except:
        return "Parse error: " + co_str + ": UNKNOWN ERROR"   

In [None]:
def transform2pandas(df_data, rule_original):
    """Translate one EVA2 rule into pandas selection strings.

    Returns the tuple (co_str, ex_str, datapoints): the selections for
    confirmations and exceptions and the upper-cased datapoints used in the rule.
    df_data is not used by the translation.
    """
    python_rule, datapoints = datapoints2pandas(preprocess(rule_original))
    co_str, ex_str = expression2pandas(python_rule)
    return co_str, ex_str, datapoints

def transform_rules(df_data, df_rules):
    """Translate every rule in df_rules and store the result in df_rules itself.

    For each row whose 'Formule' cell is a string the columns 'pandas co',
    'pandas ex', 'datapoints' (list of datapoint names) and 'templates' (list with
    the first 13 characters of each datapoint) are filled. A row whose 'Formule'
    lookup does not give a single string (which is what happens when the rule code
    occurs more than once in the index) is reported and skipped.

    Parameters
    ----------
    df_data : pandas.DataFrame
        The instance; passed on to transform2pandas.
    df_rules : pandas.DataFrame
        Rules indexed by rule code, with a 'Formule' column. Modified in place.
    """
    # A cell can only hold a list when its column has dtype object, so make sure
    # both list columns exist with that dtype before any cell is written.
    for list_column in ('datapoints', 'templates'):
        if list_column not in df_rules.columns:
            df_rules[list_column] = pd.Series(index=df_rules.index, dtype=object)
        else:
            df_rules[list_column] = df_rules[list_column].astype(object)

    for row in df_rules.index:
        rule_original = df_rules.loc[row, 'Formule']
        if not isinstance(rule_original, str):
            print("Rule " + row + ": " + "duplicate rule. ")
            continue
        co_str, ex_str, datapoints = transform2pandas(df_data, rule_original)
        df_rules.loc[row, 'pandas co'] = co_str
        df_rules.loc[row, 'pandas ex'] = ex_str
        df_rules.at[row, 'datapoints'] = datapoints
        df_rules.at[row, 'templates'] = [datapoint[0:13].upper() for datapoint in datapoints]

def evaluate_rule(df, co_str, ex_str, datapoints, substitutions, expansion_dict):
    """Evaluate one rule for every combination of expanded datapoints and print the results.

    Parameters
    ----------
    df : pandas.DataFrame
        The instance the rule is evaluated on.
    co_str, ex_str : str
        Selection strings for confirmations and exceptions, in which datapoints
        appear as df['NAME'].
    datapoints : list of str
        Datapoints still to be processed. The list is not modified.
    substitutions : dict
        Expansions chosen so far (datapoint -> column name). The dict is not modified.
    expansion_dict : dict
        For every datapoint that is not a column itself, the list of columns it stands for.

    The datapoints are handled last to first. A datapoint with an expansion gives
    one branch per expanded column; when no datapoints are left the chosen
    expansions are written into both strings and the result is printed.
    """
    if not datapoints:
        for original, replacement in substitutions.items():
            # Replace the quoted name only, so that a datapoint that is the start
            # of a longer datapoint name does not change the longer name as well.
            co_str = co_str.replace("'" + original + "'", "'" + replacement + "'")
            ex_str = ex_str.replace("'" + original + "'", "'" + replacement + "'")
        print(evaluate_strings(df, co_str, ex_str))
        return

    # Slices instead of pop(): the caller's list stays intact, and every branch
    # of the loop below sees the same remaining datapoints.
    datapoint = datapoints[-1]
    remaining = datapoints[:-1]
    # A datapoint that already has a substitution on this path (it occurs twice
    # in the rule) is not expanded a second time.
    if datapoint in expansion_dict and datapoint not in substitutions:
        for expanded in expansion_dict[datapoint]:
            chosen = dict(substitutions)
            chosen[datapoint] = expanded
            evaluate_rule(df, co_str, ex_str, remaining, chosen, expansion_dict)
    else:
        evaluate_rule(df, co_str, ex_str, remaining, substitutions, expansion_dict)

def _expand_datapoint(datapoint, columns):
    """Return the parts of the column names that match a datapoint given without row or without column.

    The character at position 14 decides what is missing: "C" means the datapoint
    has no row part (a row is inserted in front of the column), "R" means it has no
    column part (a column is appended). A datapoint that is too short or has another
    character at that position gives an empty list.
    """
    if len(datapoint) <= 14:
        return []
    # The datapoint text is escaped: its dots must match dots only. The dots in
    # "R...." / "C...." are wildcards for the four characters of the code.
    if datapoint[14] == "C":
        pattern = re.escape(datapoint[0:14]) + "R....," + re.escape(datapoint[14:])
    elif datapoint[14] == "R":
        pattern = re.escape(datapoint) + ",C...."
    else:
        return []
    matches = []
    for col in columns:
        if not isinstance(col, str):
            # column labels that are not text cannot be datapoints
            continue
        found = re.search(pattern, col)
        if found:
            matches.append(found.group(0))
    return matches


def evaluate_rules(df, df_rules):
    """Evaluate all translated rules on the instance and print one result line per rule.

    Parameters
    ----------
    df : pandas.DataFrame
        The instance; its columns are the available datapoints.
    df_rules : pandas.DataFrame
        Rules with the columns 'datapoints', 'templates', 'pandas co' and
        'pandas ex' filled. It is not modified.

    Every line starts with the position of the rule. A rule is skipped with a
    message when one of its templates is not in the module-level list
    instance_templates, or when one of its datapoints is not a column of df and
    cannot be expanded to columns of df. Otherwise the rule is evaluated with
    evaluate_rule.
    """
    for idx, row in enumerate(df_rules.index):
        print(str(idx) + ": ", end='')
        datapoints = df_rules.loc[row, 'datapoints']
        templates = df_rules.loc[row, 'templates']

        # are the templates in the rule in the instance?
        templates_not_found = [template for template in templates
                               if template not in instance_templates]
        if templates_not_found:
            print("Not all templates in instance: " + str(templates_not_found))
            continue

        # are the datapoints in the rule in the instance?
        datapoints_not_found = []
        expansion_dict = {}
        for datapoint in datapoints:
            if datapoint in df.columns:
                continue
            expanded = _expand_datapoint(datapoint, df.columns)
            if expanded:
                expansion_dict[datapoint] = expanded
            else:
                datapoints_not_found.append(datapoint)
        if datapoints_not_found:
            print("Datapoints not found: " +str(datapoints_not_found))
            continue

        co_str = df_rules.loc[row, 'pandas co']
        ex_str = df_rules.loc[row, 'pandas ex']
        # hand over a copy, so the list stored in df_rules cannot be changed by the callee
        evaluate_rule(df, co_str, ex_str, list(datapoints), {}, expansion_dict)

In [None]:
# Translate all rules; the results are written into df_rules
# (columns 'pandas co', 'pandas ex', 'datapoints' and 'templates').
transform_rules(df, df_rules)

In [None]:
# Evaluate the translated rules on the instance; one result line is printed per rule.
evaluate_rules(df, df_rules)