In [56]:
import sqlalchemy
import pandas as pd
import numpy as np
import torch
import torch_geometric.transforms as T
from sentence_transformers import SentenceTransformer
from sklearn.preprocessing import FunctionTransformer
import dill as pickle
import copy
from collections import OrderedDict
import yaml
from utils import load_yaml

# Checkpoint name handed to SentenceTransformer below.
model_name = "all-MiniLM-L6-v2"

# Built once at module level; `encode_strings` reads this global.
model_string_encoder = SentenceTransformer(model_name)

def encode_strings(df):
    """Encode ``df.values`` with the module-level ``model_string_encoder``.

    Gradient tracking is switched off for the duration of the call and a
    progress bar is requested from the encoder. The return value is whatever
    ``model_string_encoder.encode`` produces.
    """
    with torch.no_grad():
        return model_string_encoder.encode(df.values, show_progress_bar=True)

# NOTE(review): user name and password are embedded in the connection URL.
# They look like shared guest credentials -- confirm they are meant to be
# public before committing or sharing this notebook.
engine = sqlalchemy.create_engine("mariadb+mariadbconnector://guest:relational@relational.fit.cvut.cz:3306/financial")

# Read every table of the `financial` database into its own DataFrame.
trans = pd.read_sql_table("trans", engine)
loan = pd.read_sql_table("loan", engine)
order = pd.read_sql_table("order", engine)
card = pd.read_sql_table("card", engine)
account = pd.read_sql_table("account", engine)
client = pd.read_sql_table("client", engine)
disp = pd.read_sql_table("disp", engine)
district = pd.read_sql_table("district", engine)

# Not referenced anywhere in the visible part of this notebook; presumably a
# per-table row cap -- TODO confirm where it is used.
MAX_SIZE_PER_TABLE = 1_000
# `auto_enc` one-hot encodes an object/int64 column only when BOTH hold:
#   unique values / rows <= THRESHOLD_RATIO_CATEGORIES
#   unique values        <= THRESHOLD_ABSOLUTE_CATEGORIES
THRESHOLD_RATIO_CATEGORIES = 0.2
THRESHOLD_ABSOLUTE_CATEGORIES = 128

In [69]:
def sin_transformer(period):
    """Build a ``FunctionTransformer`` that maps ``x`` to ``sin(x / period * 2*pi)``.

    Args:
        period: Length of one full cycle, in the same unit as the values that
            will be transformed.

    Returns:
        A ``sklearn.preprocessing.FunctionTransformer`` wrapping the sine map.
    """
    # Imported locally, as in the rest of this cell.
    from sklearn.preprocessing import FunctionTransformer

    def _sine(x):
        return np.sin(x / period * 2 * np.pi)

    return FunctionTransformer(_sine)


def cos_transformer(period):
    """Build a ``FunctionTransformer`` that maps ``x`` to ``cos(x / period * 2*pi)``.

    Args:
        period: Length of one full cycle, in the same unit as the values that
            will be transformed.

    Returns:
        A ``sklearn.preprocessing.FunctionTransformer`` wrapping the cosine map.
    """
    # Imported locally, as in the rest of this cell.
    from sklearn.preprocessing import FunctionTransformer

    def _cosine(x):
        return np.cos(x / period * 2 * np.pi)

    return FunctionTransformer(_cosine)


def proc_raw(data):
    """Pass a column through untouched, returning its underlying ``.values``."""
    raw_values = data.values
    return raw_values


def proc_objects_one_hot(data):
    """One-hot encode ``data`` via ``pd.get_dummies`` and return the indicator matrix.

    The result has one column per distinct value that ``pd.get_dummies`` finds
    in ``data``, so the width depends on the values present in this call.
    """
    indicator_frame = pd.get_dummies(data)
    return indicator_frame.values


def proc_objects_string(data):
    """Embed a text column with the ``all-MiniLM-L6-v2`` sentence encoder.

    Missing entries are replaced by the empty string before encoding. A new
    ``SentenceTransformer`` is instantiated on every call.

    Returns:
        Whatever ``SentenceTransformer.encode`` returns for the column values.
    """
    # Local imports keep the function free of module-level dependencies
    # (presumably so it can be serialised on its own -- confirm).
    from sentence_transformers import SentenceTransformer
    import torch

    encoder = SentenceTransformer('all-MiniLM-L6-v2')
    texts = data.fillna('').values

    with torch.no_grad():
        return encoder.encode(texts, show_progress_bar=True)


def postproc(data):
    """Stack per-column feature arrays side by side into one 2-D matrix.

    Args:
        data: Mutable sequence (e.g. a list) of arrays with the same number of
            rows. Each entry may be 1-D (one value per row) or 2-D
            (rows x features).

    Returns:
        ``np.hstack`` of all entries after every 1-D entry has been promoted
        to a single-column 2-D array.

    Note:
        1-D entries are replaced *inside* ``data`` by their column-vector form.
        The previous implementation did this for the last entry only, which
        made ``np.hstack`` fail whenever an earlier entry was 1-D; promoting
        all of them keeps the old side effect and removes that failure.
    """
    for position, block in enumerate(data):
        if len(block.shape) == 1:
            # (n,) -> (n, 1) so that hstack sees arrays of equal rank.
            data[position] = np.expand_dims(block, 1)
    return np.hstack(data)


def proc_datetime(data):
    """Expand a datetime column into numeric calendar features.

    Year and day-of-month are kept as plain numbers. Month, hour, minute,
    second and day-of-week are cyclical, so each is encoded as a (sin, cos)
    pair of ``value / period * 2*pi`` -- the same formula used by
    ``sin_transformer`` / ``cos_transformer``.

    Args:
        data: A pandas Series exposing the ``.dt`` accessor.

    Returns:
        Array with one row per element of ``data`` and 12 columns, in order:
        year, month_sin, month_cos, day, hour_sin, hour_cos, minute_sin,
        minute_cos, second_sin, second_cos, day_of_week_sin, day_of_week_cos.
    """

    def _cyclical(values, period):
        """Return the (sin, cos) encoding of ``values`` on a cycle of length ``period``."""
        return (np.sin(values / period * 2 * np.pi),
                np.cos(values / period * 2 * np.pi))

    feature_tmp_year = data.dt.year
    feature_tmp_day = data.dt.day

    # Cyclical features:
    feature_tmp_month_x, feature_tmp_month_y = _cyclical(data.dt.month, 12)
    # A day has 24 hours; the period was previously 60, which squeezed
    # hours 0-23 into less than half of the circle.
    feature_tmp_hour_x, feature_tmp_hour_y = _cyclical(data.dt.hour, 24)
    feature_tmp_minute_x, feature_tmp_minute_y = _cyclical(data.dt.minute, 60)
    feature_tmp_second_x, feature_tmp_second_y = _cyclical(data.dt.second, 60)
    feature_tmp_day_of_week_x, feature_tmp_day_of_week_y = _cyclical(data.dt.day_of_week, 7)

    feature_tmp = [
        feature_tmp_year,
        feature_tmp_month_x,
        feature_tmp_month_y,
        feature_tmp_day,
        feature_tmp_hour_x,
        feature_tmp_hour_y,
        feature_tmp_minute_x,
        feature_tmp_minute_y,
        feature_tmp_second_x,
        feature_tmp_second_y,
        feature_tmp_day_of_week_x,
        feature_tmp_day_of_week_y,
    ]
    # vstack gives (12, n); transpose to one row per timestamp.
    return np.vstack(feature_tmp).T

def auto_enc(df):
    """Choose a feature-processing function name for every column of ``df``.

    Rules, by column dtype:
      * ``float64``  -> ``"proc_raw"``
      * ``object``   -> ``"proc_objects_one_hot"`` if the column is
        low-cardinality, otherwise ``"proc_objects_string"``
      * ``int64``    -> ``"proc_objects_one_hot"`` if the column is
        low-cardinality, otherwise ``"proc_raw"``
      * any datetime64 dtype (any resolution, with or without time zone)
        -> ``"proc_datetime"``
      * anything else is reported on stdout and left out of the mapping.

    A column is low-cardinality when its number of distinct values is at most
    ``THRESHOLD_ABSOLUTE_CATEGORIES`` and at most ``THRESHOLD_RATIO_CATEGORIES``
    times the number of rows.

    Args:
        df: DataFrame to inspect.

    Returns:
        dict mapping ``str(column name)`` to the name of a processing function.
    """

    def _is_low_cardinality(column):
        """Apply both cardinality thresholds to one column."""
        n_unique = len(column.unique())
        # max(..., 1) keeps an empty table from raising ZeroDivisionError.
        ratio = n_unique / max(len(column), 1)
        return (ratio <= THRESHOLD_RATIO_CATEGORIES
                and n_unique <= THRESHOLD_ABSOLUTE_CATEGORIES)

    mapping = dict()
    for column_name, dtype in zip(df.columns, df.dtypes):
        key = str(column_name)
        if dtype == "float64":
            mapping[key] = "proc_raw"
        elif dtype == "object":
            if _is_low_cardinality(df[column_name]):
                mapping[key] = "proc_objects_one_hot"
            else:
                mapping[key] = "proc_objects_string"
        elif dtype == "int64":
            if _is_low_cardinality(df[column_name]):
                mapping[key] = "proc_objects_one_hot"
            else:
                mapping[key] = "proc_raw"
        elif pd.api.types.is_datetime64_any_dtype(dtype):
            # Covers datetime64[ns] as before, plus tz-aware and other
            # resolutions that an exact string comparison would miss.
            mapping[key] = "proc_datetime"
        else:
            # Name the offending column so the warning is actionable.
            print(f"WARNING: UNKNOWN COLUMN TYPE ({key}: {dtype})")

    return mapping


def auto_edgerizer(dfs):
    """Propose edges between tables whose columns hold exactly the same values.

    For every ``object`` or ``int64`` column the set of distinct values is
    collected. Two columns from two different tables yield a "strong" edge
    when those sets are equal.

    Args:
        dfs: Iterable of ``(name, DataFrame)`` pairs.

    Returns:
        List of dicts with the keys ``name``, ``from``, ``to``, ``transform``,
        ``from_col`` and ``to_col`` (in that order). Tables are visited in
        sorted name order and, within a table pair, columns in sorted order;
        ``from`` is always the table whose name sorts first.
    """
    # table name -> {column -> set of distinct values}; built once so the
    # pairwise comparison below does not rebuild sets on every iteration.
    value_sets = dict()
    for name, df in dfs:
        value_sets[str(name)] = {
            column: set(df[column].unique().tolist())
            for column, dtype in zip(df.columns, df.dtypes)
            if dtype == "object" or dtype == "int64"
        }

    strong_edges = list()
    table_names = sorted(value_sets)
    for position, from_table in enumerate(table_names):
        from_columns = sorted(value_sets[from_table])
        # Only look at tables that sort after `from_table`: each unordered
        # pair of tables is examined exactly once.
        for to_table in table_names[position + 1:]:
            to_columns = sorted(value_sets[to_table])
            for from_col in from_columns:
                from_values = value_sets[from_table][from_col]
                for to_col in to_columns:
                    # Equal sizes with a full-size intersection is set equality.
                    if from_values == value_sets[to_table][to_col]:
                        strong_edges.append({
                            "name": from_table + "_" + to_table,
                            "from": from_table,
                            "to": to_table,
                            "transform": "",
                            "from_col": str(from_col),
                            "to_col": str(to_col),
                        })
    return strong_edges

In [70]:
# Preview the per-column processor mapping that auto_enc infers for `district`.
features_district = auto_enc(district)
features_district

{'district_id': 'proc_raw',
 'A2': 'proc_objects_string',
 'A3': 'proc_objects_one_hot',
 'A4': 'proc_raw',
 'A5': 'proc_raw',
 'A6': 'proc_raw',
 'A7': 'proc_raw',
 'A8': 'proc_objects_one_hot',
 'A9': 'proc_objects_one_hot',
 'A10': 'proc_raw',
 'A11': 'proc_raw',
 'A12': 'proc_raw',
 'A13': 'proc_raw',
 'A14': 'proc_raw',
 'A15': 'proc_raw',
 'A16': 'proc_raw'}

In [71]:
def create_auto_config_from_dataframes(dfs, file="default01.yml"):
    """Derive a default pipeline configuration from tables and save it as YAML.

    The written document has two top-level keys:
      * ``nodes``: ``{table name: {"transform": auto_enc(table)}}``
      * ``edges``: the list returned by ``auto_edgerizer``

    Only the YAML file is produced; no Python script is generated.

    Args:
        dfs: Iterable of ``(name, DataFrame)`` pairs.
        file: Path of the YAML file to (over)write.

    Returns:
        None. Prints ``"YAML file saved."`` once the file has been written.
    """
    # `dfs` is walked twice below; materialise it so a one-shot iterator
    # does not silently produce an empty edge list.
    dfs = list(dfs)

    # 1. Create Node-FeatureExtraction Mapping
    mappings = {name: {"transform": auto_enc(df)} for name, df in dfs}

    # 2. Create Edge Mapping
    strong_edges = auto_edgerizer(dfs)

    final_dict = {"nodes": mappings, "edges": strong_edges}

    # 3. Write yaml file -- the context manager closes the handle even if
    #    yaml.dump raises.
    with open(file, "w") as handle:
        yaml.dump(final_dict, handle, sort_keys=False)
    print("YAML file saved.")

In [72]:
# Write the auto-generated config for all eight tables to the default path.
create_auto_config_from_dataframes(
    [
        ("trans", trans),
        ("loan", loan),
        ("order", order),
        ("card", card),
        ("account", account),
        ("client", client),
        ("disp", disp),
        ("district", district),
    ]
)

YAML file saved.


In [68]:
def run_feature_pipeline_from_yaml(file="../default.yml"):
    """Load the pipeline configuration at ``file`` with ``load_yaml`` and return it.

    At present this only loads the configuration; no feature pipeline is run.
    """
    return load_yaml(file)

#run_feature_pipeline_from_yaml()

# Evalurizer

In [None]:
# Same DataFrame schema
# 1. Read in the transformation pipelines, etc.
# 2. Create a batch
# 3. Load the model checkpoint and evaluate