**Note:**
Because the full data file contains about 100K rows, the infrastructure in this notebook is built and demonstrated on a reduced version of the central table ("mini_table").

In [1]:
!pip install jsonpath_ng

Collecting jsonpath_ng
  Downloading jsonpath_ng-1.5.3-py3-none-any.whl (29 kB)
Collecting ply (from jsonpath_ng)
  Downloading ply-3.11-py2.py3-none-any.whl (49 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/49.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m49.6/49.6 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: ply, jsonpath_ng
Successfully installed jsonpath_ng-1.5.3 ply-3.11


In [2]:
import os
import json
import numpy as np
import pandas as pd
import sqlite3
import functools as ft
import matplotlib.pyplot as plt
from abc import ABC, abstractmethod
from jsonpath_ng import parse
from enum import Enum
from datetime import datetime
%matplotlib inline


In [3]:
# Load the reduced fact table and keep a JSON copy of it on disk.
Fact_df = pd.read_excel('mini_table.xlsx')
Fact_df.to_json("mini_table.json")

# One dict per row with every value cast to str. Note that the cast turns
# missing cells into the literal string 'nan' rather than None.
json_Fact_df = Fact_df.astype(str).to_dict(orient='records')

# Preview the first records only instead of dumping the whole list.
json_Fact_df[:3]

[{'date': '2005-06-30',
  'Patient_Number': '1',
  'age': '75',
  'sex': '1',
  'weight': '59.30644875176507',
  'height': '151',
  'Smoker': '0',
  'isDead': '0',
  'deathReason': 'nan',
  'Routine_test': '1',
  'startIshpuz': '2017-10-06',
  'endIshpuz': '2017-10-11',
  'Hospitalization_duration': '5',
  'procedures': '2',
  'simulation': '5'},
 {'date': '2005-07-08',
  'Patient_Number': '1',
  'age': '75',
  'sex': '1',
  'weight': '59.97035020648912',
  'height': '151',
  'Smoker': '0',
  'isDead': '0',
  'deathReason': 'nan',
  'Routine_test': '1',
  'startIshpuz': '2017-10-06',
  'endIshpuz': '2017-10-11',
  'Hospitalization_duration': '5',
  'procedures': '2',
  'simulation': '5'},
 {'date': '2005-10-24',
  'Patient_Number': '1',
  'age': '75',
  'sex': '1',
  'weight': '64.11938956027117',
  'height': '151',
  'Smoker': '1',
  'isDead': '0',
  'deathReason': 'nan',
  'Routine_test': '1',
  'startIshpuz': '2017-10-06',
  'endIshpuz': '2017-10-11',
  'Hospitalization_duration': '

In [4]:
class Interface(ABC):
    """Read-only accessor contract for one list of records.

    Implementers expose the whole list through the ``get`` property and offer
    two lookups on top of it.
    """

    @abstractmethod
    def get_data_by_field(self, field_name):
        """Fetch the data by given field name."""

    @abstractmethod
    def get_data_by_id(self, id):
        """Fetch the data by given ID."""

    # Declared as an abstract *property*: every implementation in this notebook
    # defines ``get`` with @property and reads it without calling it.
    @property
    @abstractmethod
    def get(self):
        """Fetch all data."""

In [5]:
class TransformMask(Enum):
    """Named string-cleaning masks.

    Each value is a chain of method calls written as source text. STTM looks a
    mask up by member name, appends the value directly to a variable name and
    evaluates the resulting expression. Add further masks as new members.
    """

    # strip surrounding whitespace, then lower-case
    CLEAN_STRING = ".strip().lower()"
    # strip surrounding whitespace, then title-case
    CAPITAL_LETTER = ".strip().lower().title()"



In [6]:
class Database:
    """In-memory store of the source-to-target mapping metadata.

    ``self.db`` holds four lists of dict records under the keys "source",
    "destination", "transform" and "mapping". The constructor fills the source,
    destination and mapping lists from ``_FIELD_SPECS``; no transform is
    registered by default.
    """

    # One row per destination column:
    # (id, source field, destination field, destination type, default, destination table)
    # Rows are registered in this order, which is also the order of the records.
    # NOTE(review): "age" pairs type "int" with default "n/a"; int("n/a") would
    # raise if that default were ever applied -- confirm the intended default.
    _FIELD_SPECS = (
        (1, "Patient_Number", "Patient_Number", "str", "n/a", "fact"),
        (4, "Patient_Number", "Patient_Number_G", "str", "n/a", "General_Details"),
        (5, "Patient_Number", "Patient_Number_A", "str", "n/a", "Additional_Hos_Details"),
        (17, "Patient_Number", "Patient_Number_D", "str", "n/a", "deathReason"),
        (2, "age", "age", "int", "n/a", "General_Details"),
        (3, "date", "date", "str", 0, "fact"),
        (6, "date", "date_A", "str", 0, "Additional_Hos_Details"),
        (7, "date", "date_G", "str", 0, "General_Details"),
        (18, "date", "date_D", "str", 0, "deathReason"),
        (8, "Routine_test", "Routine_test", "int", 0, "fact"),
        (9, "Routine_test", "Routine_test_G", "int", 0, "General_Details"),
        (19, "Routine_test", "Routine_test_D", "int", 0, "deathReason"),
        (10, "sex", "sex", "int", 0, "General_Details"),
        (11, "height", "height", "float", 0, "General_Details"),
        (12, "Smoker", "Smoker", "int", 0, "General_Details"),
        (13, "weight", "weight", "float", 0, "General_Details"),
        (14, "Hospitalization_duration", "Hospitalization_duration", "int", 0, "Additional_Hos_Details"),
        (15, "procedures", "procedures", "int", 0, "Additional_Hos_Details"),
        (16, "simulation", "simulation", "int", 0, "Additional_Hos_Details"),
        (20, "deathReason", "deathReason", "str", "n/a", "deathReason"),
    )

    def __init__(self):
        self.db = {
            "source": [],
            "destination": [],
            "transform": [],
            "mapping": []
        }
        for spec_id, source_name, destination_name, destination_type, default, table in self._FIELD_SPECS:
            # Every source field is a required string; the destination mapping
            # equals the destination name; source, destination, transform and
            # mapping records all share the same id.
            self.add_source(spec_id, source_name, "str", True)
            self.add_destination(spec_id, destination_name, destination_name, destination_type, default, table)
            self.add_mapping(spec_id, spec_id, spec_id, spec_id, table)

    def add_source(self, id, name, type, is_required):
        """Register a source field; its JSONPath mapping is ``"$." + name``."""
        self.db["source"].append({
            "id": id,
            "source_field_name": name,
            "source_field_mapping": "$." + name,
            "source_field_type": type,  # use python types
            "is_required": is_required,
        })

    def add_destination(self, id, name, mapping, type, default, table):
        """Register a destination field together with its type name, default and table."""
        self.db["destination"].append({
            "id": id,
            "destination_field_name": name,
            "destination_field_mapping": mapping,
            "destination_field_type": type,
            "default_value": default,
            "destination_table": table,
        })

    def add_transformation(self, transform_mask, id=None):
        """Register a transform record for ``transform_mask``.

        Args:
            transform_mask: name of the mask to apply.
            id: id of the record. When omitted, the next free integer id is
                used (1 + the largest integer id already stored).

        Previously the record stored the builtin ``id`` function as its id
        because the method had no ``id`` parameter.
        """
        transforms = self.db["transform"]
        if id is None:
            used = [t["id"] for t in transforms if isinstance(t.get("id"), int)]
            id = max(used, default=0) + 1
        transforms.append({
            "id": id,
            "transform_mask": transform_mask,
        })

    def add_mapping(self, id, source, destination, transform, table):
        """Register a mapping that links a source id, destination id and transform id."""
        self.db["mapping"].append({
            "id": id,
            "mapping_source": source,
            "mapping_destination": destination,
            "mapping_transform": transform,
            "destination_table": table,
        })

    # Read it without calling it:
    # get_data_source_target_mapping.get(dict_key)
    @property
    def get_data_source_target_mapping(self):
        """The whole store: a dict of four record lists."""
        return self.db


In [7]:
class Source(Interface, Database):
    """Accessor over the "source" records of the mapping store."""

    def __init__(self):
        Database.__init__(self)

    @property
    def get(self):
        """Return the list of all source records."""
        return self.get_data_source_target_mapping.get("source")

    # should be implemented - inherited from Interface
    def get_data_by_field(self, field_name):
        """Return the first source record that has a key equal to ``field_name``, else None."""
        for record in self.get:
            if any(key == field_name for key in record):
                return record
        return None

    def get_data_by_id(self, id):
        """Return the source record whose "id" matches ``id``, else None.

        Ids are compared by their string form, like the other accessors in this
        notebook do, so ``1`` and ``"1"`` identify the same record.
        """
        self.id = id  # kept: the last requested id has always been stored on the instance
        wanted = str(id)
        for record in self.get:
            if str(record.get("id")) == wanted:
                return record
        return None

In [8]:
class Target(Interface, Database):
    """Accessor over the "destination" records of the mapping store."""

    def __init__(self):
        Database.__init__(self)

    @property
    def get(self):
        """Return the list of all destination records."""
        return self.get_data_source_target_mapping.get("destination")

    # should be implemented - inherited from Interface
    def get_data_by_field(self, field_name):
        """Return the first destination record that has a key equal to ``field_name``, else None."""
        return next(
            (record for record in self.get
             if any(key == field_name for key in record)),
            None,
        )

    def get_data_by_id(self, id):
        """Return the destination record whose "id" has the same string form as ``id``, else None."""
        self.id = id  # the requested id is also stored on the instance
        wanted = str(self.id)
        return next(
            (record for record in self.get if str(record.get("id")) == wanted),
            None,
        )

In [9]:
class Transform(Interface, Database):
    """Accessor over the "transform" records of the mapping store."""

    def __init__(self):
        Database.__init__(self)

    @property
    def get(self):
        """Return the list of all transform records (empty list when the key is absent)."""
        return self.get_data_source_target_mapping.get("transform", [])

    # should be implemented - inherited from Interface
    def get_data_by_field(self, field_name):
        """Return the first transform record that has a key equal to ``field_name``, else None."""
        return next(
            (record for record in self.get
             if any(key == field_name for key in record)),
            None,
        )

    def get_data_by_id(self, id):
        """Return the transform record whose "id" has the same string form as ``id``, else None."""
        self.id = id  # the requested id is also stored on the instance
        wanted = str(self.id)
        return next(
            (record for record in self.get if str(record.get("id")) == wanted),
            None,
        )

In [10]:
class Mappings(Interface, Database):
    """Accessor over the "mapping" records of the mapping store."""

    def __init__(self):
        Database.__init__(self)

    @property
    def get(self):
        """Return the list of all mapping records."""
        return self.get_data_source_target_mapping.get("mapping")

    def get_data_by_field(self, field_name):
        """Lookup by field is not provided for mappings; always returns None."""
        return None

    def get_data_by_id(self, id):
        """Return the mapping record whose "id" has the same string form as ``id``, else None."""
        self.id = id  # the requested id is also stored on the instance
        wanted = str(self.id)
        return next(
            (record for record in self.get if str(record.get("id")) == wanted),
            None,
        )

In [11]:
class JsonQuery:
    """Evaluate one JSONPath expression against one JSON-like object."""

    def __init__(self, json_path, json_data):
        """Store the JSONPath text and the data it will be evaluated against."""
        self.json_path = json_path
        self.json_data = json_data

    def get(self):
        """Return the value of the first match, or None when the path matches nothing.

        Returning None (instead of failing with IndexError on ``match[0]``)
        lets callers treat "no match" the same way as a null value.
        """
        matches = parse(self.json_path).find(self.json_data)
        if not matches:
            return None
        return matches[0].value

In [12]:
class STTM:
    """Source-to-target mapper for one input record.

    For every mapping record the mapper reads the source value from
    ``input_json``, validates its type name, converts it to the destination
    type (optionally applying a transform mask) and stores it under the
    destination field name.
    """

    # Destination type names that can be converted to, keyed by Python type name.
    _CASTS = {t.__name__: t for t in (str, float, list, int, set, dict)}

    def __init__(self, input_json):
        self.input_json = input_json
        self.mapping_instance = Mappings()
        self.source_instance = Source()
        self.destination_instance = Target()
        self.transform_instance = Transform()
        # mask name -> mask expression text
        self.look_up_mask = {i.name: i.value for i in TransformMask}
        self.json_data_transformed = {}
        self.to_table = {}

    def _get_mapping_data(self):
        """Return all mapping records."""
        return self.mapping_instance.get

    def _get_mapping_source_data(self):
        """Return all source records."""
        return self.source_instance.get

    def _convert(self, cast, source_value, default_value, transform_data):
        """Convert one source value with ``cast``; apply the transform mask if there is one."""
        # A null source value is replaced by the (converted) default.
        if source_value is None:
            return cast(default_value)

        if transform_data is None:
            return cast(source_value)

        # The mask name is validated before the value is converted.
        mask_name = transform_data.get("transform_mask")
        if mask_name not in self.look_up_mask:
            raise Exception(
                f"Specified Transform {mask_name} is not available please select from following Options :{list(self.look_up_mask.keys())}")

        converted_dtype = cast(source_value)
        # SECURITY: eval() executes the mask text. The text comes from the
        # TransformMask enum only -- never route externally supplied text here.
        # The expression only sees `converted_dtype`, not this method's locals.
        return eval("converted_dtype" + self.look_up_mask[mask_name], {}, {"converted_dtype": converted_dtype})

    def get_transformed_data(self):
        """Apply every mapping to ``input_json``.

        Returns:
            tuple: ``(json_data_transformed, to_table)`` -- destination field
            name -> converted value, and destination field name -> table name.

        Raises:
            Exception: a required source field is absent from the input, a
                non-null source value has an unexpected type name, or a
                transform names an unknown mask.
        """
        for mappings in self._get_mapping_data():
            mapping_source_data = self.source_instance.get_data_by_id(id=mappings.get("mapping_source"))
            transform_data = self.transform_instance.get_data_by_id(id=mappings.get("mapping_transform"))
            source_field_name = mapping_source_data.get("source_field_name")

            # Field absent from the incoming record: fatal only when required.
            if source_field_name not in self.input_json:
                if mapping_source_data.get("is_required"):
                    raise Exception(
                        "Alert ! Field {} is not present in JSON please FIX mappings ".format(source_field_name))
                continue

            source_data_value = JsonQuery(
                json_path=mapping_source_data.get("source_field_mapping"),
                json_data=self.input_json
            ).get()

            # A non-null value must carry the declared source type name.
            expected_type = mapping_source_data.get("source_field_type")
            actual_type = type(source_data_value).__name__
            if source_data_value is not None and expected_type != actual_type:
                _message = (
                    "Alert ! Source Field :{} Datatype has changed from {} to {} ".format(
                        source_field_name, expected_type, actual_type))
                print(_message)
                raise Exception(_message)

            destination = self.destination_instance.get_data_by_id(id=mappings.get("mapping_destination"))
            destination_field_name = destination.get("destination_field_name")
            self.to_table[destination_field_name] = mappings.get("destination_table")

            cast = self._CASTS.get(destination.get("destination_field_type"))
            if cast is None:
                # Unsupported destination type name: the table is recorded
                # above but no value is written.
                continue

            self.json_data_transformed[destination_field_name] = self._convert(
                cast, source_data_value, destination.get("default_value"), transform_data)

        return self.json_data_transformed, self.to_table

In [13]:
# Run every record through the mapper and collect the converted rows.
# `mapping` is initialised up front so the final print also works when
# json_Fact_df is empty (it used to be unbound in that case).
transformed_data = []
mapping = {}
for item in json_Fact_df:
    helper = STTM(input_json=item)
    response, mapping = helper.get_transformed_data()
    transformed_data.append(response)
    print(response)
print(mapping)


{'Patient_Number': '1', 'Patient_Number_G': '1', 'Patient_Number_A': '1', 'Patient_Number_D': '1', 'age': 75, 'date': '2005-06-30', 'date_A': '2005-06-30', 'date_G': '2005-06-30', 'date_D': '2005-06-30', 'Routine_test': 1, 'Routine_test_G': 1, 'Routine_test_D': 1, 'sex': 1, 'height': 151.0, 'Smoker': 0, 'weight': 59.30644875176507, 'Hospitalization_duration': 5, 'procedures': 2, 'simulation': 5, 'deathReason': 'nan'}
{'Patient_Number': '1', 'Patient_Number_G': '1', 'Patient_Number_A': '1', 'Patient_Number_D': '1', 'age': 75, 'date': '2005-07-08', 'date_A': '2005-07-08', 'date_G': '2005-07-08', 'date_D': '2005-07-08', 'Routine_test': 1, 'Routine_test_G': 1, 'Routine_test_D': 1, 'sex': 1, 'height': 151.0, 'Smoker': 0, 'weight': 59.97035020648912, 'Hospitalization_duration': 5, 'procedures': 2, 'simulation': 5, 'deathReason': 'nan'}
{'Patient_Number': '1', 'Patient_Number_G': '1', 'Patient_Number_A': '1', 'Patient_Number_D': '1', 'age': 75, 'date': '2005-10-24', 'date_A': '2005-10-24', 'd

In [14]:
# Destination column -> destination table, as returned by the last mapper run.
mapping

{'Patient_Number': 'fact',
 'Patient_Number_G': 'General_Details',
 'Patient_Number_A': 'Additional_Hos_Details',
 'Patient_Number_D': 'deathReason',
 'age': 'General_Details',
 'date': 'fact',
 'date_A': 'Additional_Hos_Details',
 'date_G': 'General_Details',
 'date_D': 'deathReason',
 'Routine_test': 'fact',
 'Routine_test_G': 'General_Details',
 'Routine_test_D': 'deathReason',
 'sex': 'General_Details',
 'height': 'General_Details',
 'Smoker': 'General_Details',
 'weight': 'General_Details',
 'Hospitalization_duration': 'Additional_Hos_Details',
 'procedures': 'Additional_Hos_Details',
 'simulation': 'Additional_Hos_Details',
 'deathReason': 'deathReason'}

In [15]:
# One row per transformed record; columns are the destination field names.
df = pd.DataFrame(transformed_data)
df

Unnamed: 0,Patient_Number,Patient_Number_G,Patient_Number_A,Patient_Number_D,age,date,date_A,date_G,date_D,Routine_test,Routine_test_G,Routine_test_D,sex,height,Smoker,weight,Hospitalization_duration,procedures,simulation,deathReason
0,1,1,1,1,75,2005-06-30,2005-06-30,2005-06-30,2005-06-30,1,1,1,1,151.0,0,59.306449,5,2,5,
1,1,1,1,1,75,2005-07-08,2005-07-08,2005-07-08,2005-07-08,1,1,1,1,151.0,0,59.97035,5,2,5,
2,1,1,1,1,75,2005-10-24,2005-10-24,2005-10-24,2005-10-24,1,1,1,1,151.0,1,64.11939,5,2,5,
3,1,1,1,1,75,2006-01-08,2006-01-08,2006-01-08,2006-01-08,1,1,1,1,151.0,0,68.149578,5,2,5,
4,1,1,1,1,75,2006-02-02,2006-02-02,2006-02-02,2006-02-02,1,1,1,1,151.0,0,66.977856,5,2,5,
5,1,1,1,1,75,2006-03-13,2006-03-13,2006-03-13,2006-03-13,1,1,1,1,151.0,0,68.82375,5,2,5,
6,1,1,1,1,75,2006-04-08,2006-04-08,2006-04-08,2006-04-08,1,1,1,1,151.0,0,54.203421,5,2,5,
7,1,1,1,1,75,2006-04-26,2006-04-26,2006-04-26,2006-04-26,1,1,1,1,151.0,0,70.601417,5,2,5,
8,1,1,1,1,76,2006-10-14,2006-10-14,2006-10-14,2006-10-14,1,1,1,1,151.0,1,59.668095,5,2,5,
9,1,1,1,1,76,2007-02-02,2007-02-02,2007-02-02,2007-02-02,1,1,1,1,151.0,0,68.450888,5,2,5,


**fact table**

In [16]:
# Columns the mapping assigns to the 'fact' table, limited to those present in df.
fact_columns = [
    column
    for column, table in mapping.items()
    if column in df.columns and table == 'fact'
]
fact_table = df.loc[:, fact_columns]
fact_table.to_excel('fact.xlsx', index=False)
fact_table

Unnamed: 0,Patient_Number,date,Routine_test
0,1,2005-06-30,1
1,1,2005-07-08,1
2,1,2005-10-24,1
3,1,2006-01-08,1
4,1,2006-02-02,1
5,1,2006-03-13,1
6,1,2006-04-08,1
7,1,2006-04-26,1
8,1,2006-10-14,1
9,1,2007-02-02,1


**General Details table**

In [17]:
# Columns the mapping assigns to 'General_Details', limited to those present in df.
General_Details_columns = [
    column
    for column, table in mapping.items()
    if column in df.columns and table == 'General_Details'
]
General_Details_table = df.loc[:, General_Details_columns]
General_Details_table.to_excel('General_Details_table.xlsx', index=False)
General_Details_table

Unnamed: 0,Patient_Number_G,age,date_G,Routine_test_G,sex,height,Smoker,weight
0,1,75,2005-06-30,1,1,151.0,0,59.306449
1,1,75,2005-07-08,1,1,151.0,0,59.97035
2,1,75,2005-10-24,1,1,151.0,1,64.11939
3,1,75,2006-01-08,1,1,151.0,0,68.149578
4,1,75,2006-02-02,1,1,151.0,0,66.977856
5,1,75,2006-03-13,1,1,151.0,0,68.82375
6,1,75,2006-04-08,1,1,151.0,0,54.203421
7,1,75,2006-04-26,1,1,151.0,0,70.601417
8,1,76,2006-10-14,1,1,151.0,1,59.668095
9,1,76,2007-02-02,1,1,151.0,0,68.450888


**Additional Hospitalization Details table**

In [18]:
# Columns the mapping assigns to 'Additional_Hos_Details', limited to those present in df.
Additional_Hos_Details_columns = [
    column
    for column, table in mapping.items()
    if column in df.columns and table == 'Additional_Hos_Details'
]
Additional_Hos_Details_table = df.loc[:, Additional_Hos_Details_columns]
Additional_Hos_Details_table.to_excel('Additional_Hos_Details_table.xlsx', index=False)
Additional_Hos_Details_table

Unnamed: 0,Patient_Number_A,date_A,Hospitalization_duration,procedures,simulation
0,1,2005-06-30,5,2,5
1,1,2005-07-08,5,2,5
2,1,2005-10-24,5,2,5
3,1,2006-01-08,5,2,5
4,1,2006-02-02,5,2,5
5,1,2006-03-13,5,2,5
6,1,2006-04-08,5,2,5
7,1,2006-04-26,5,2,5
8,1,2006-10-14,5,2,5
9,1,2007-02-02,5,2,5


**deathReason table**

In [19]:
# Columns the mapping assigns to 'deathReason', limited to those present in df.
deathReason_columns = [
    column
    for column, table in mapping.items()
    if column in df.columns and table == 'deathReason'
]
deathReason_table = df.loc[:, deathReason_columns]
deathReason_table.to_excel('deathReason_table.xlsx', index=False)
deathReason_table

Unnamed: 0,Patient_Number_D,date_D,Routine_test_D,deathReason
0,1,2005-06-30,1,
1,1,2005-07-08,1,
2,1,2005-10-24,1,
3,1,2006-01-08,1,
4,1,2006-02-02,1,
5,1,2006-03-13,1,
6,1,2006-04-08,1,
7,1,2006-04-26,1,
8,1,2006-10-14,1,
9,1,2007-02-02,1,
