In [1]:
import json
from hashlib import sha256
from datetime import datetime, timezone
import pandas as pd

class CONST:
    current_ind = 'current_ind'
    deleted_ind = 'deleted_ind'
    created_ts = 'created_ts'
    updated_ts = 'updated_ts'
    deleted_ts = 'deleted_ts'
    sha256_hex = 'sha256_hex'

def mark_empty_dict_to_none(data):
    if isinstance(data, dict):
        if len(data) == 0:
            return None
        return {k: mark_empty_dict_to_none(v) for k, v in data.items()}
    elif isinstance(data, list):
        return [mark_empty_dict_to_none(item) for item in data]
    else:
        return data

class Type_II:
    current_ind = 'current_ind'
    
    @staticmethod
    def convert_to_df(data, now):
        data = mark_empty_dict_to_none(data)
        for entry in data:
            entry[CONST.current_ind] = True
            entry[CONST.deleted_ind] = False
            entry[CONST.created_ts] = now
            entry[CONST.updated_ts] = None
            entry[CONST.deleted_ts] = None
            entry[CONST.sha256_hex] = sha256(json.dumps(entry).encode('utf-8')).hexdigest()

        df = pd.json_normalize(data, sep='_', max_level=0)
        df[CONST.created_ts] = pd.to_datetime(df[CONST.created_ts])
        df[CONST.updated_ts] = pd.to_datetime(df[CONST.updated_ts])
        df[CONST.deleted_ts] = pd.to_datetime(df[CONST.deleted_ts])
        
        return df
            
    def __init__(self):
        self.state = None

    def set_index(self, df):
        return df.set_index('id')

    def initiate_state(self, df):
        self.state = self.set_index(df)
        
    def merge(self, data):
        now = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S %z")
        df = Type_II.convert_to_df(data, now)
        df = self.set_index(df)
        if not df.index.is_unique:
            raise ValueError('The index of the dataframe is not unique')

        state = self.state
        df_history = state.loc[state[CONST.current_ind] == False, :]
        df_current = state.loc[state[CONST.current_ind] == True, :]
        df_current_active = df_current.loc[df_current[CONST.deleted_ind] == False, :]
        df_current_delted = df_current.loc[df_current[CONST.deleted_ind] == True, :]

        df_inserted = df.loc[~df.index.isin(df_current_active.index), :]
        df_modified = df.join(df_current_active, how='inner', rsuffix='_')
        df_modified = df_modified.loc[~(df_modified[CONST.sha256_hex] == df_modified[f'{CONST.sha256_hex}_']), df.columns]
        df_deleted = df_current_active.loc[~df_current_active.index.isin(df.index), :]

        df_current_active.loc[df_current_active.index.isin(df_modified.index), [CONST.current_ind, CONST.updated_ts]] = [False, now]
        df_current_active.loc[df_current_active.index.isin(df_deleted.index), [CONST.deleted_ind, CONST.deleted_ts]] = [True, now]
        df_current_delted.loc[df_current_delted.index.isin(df_inserted.index), [CONST.current_ind]] = [False]

        frames = [df_inserted, df_modified, df_current_active, df_current_delted, df_history]
        self.state = pd.concat([frame for frame in frames if not frame.empty])


In [2]:
def create_data_entry(id, value):
    return {'id': id, 'value': value}

pd.set_option('display.expand_frame_repr', False)
pd.set_option('display.max_columns', None)

type2 = Type_II()
type2.initiate_state(pd.DataFrame(columns=['id', 'value', 'current_ind', 'deleted_ind', 'created_ts', 'updated_ts', 'deleted_ts', 'sha256_hex']))

data = []
data.append(create_data_entry(1, 'initial'))
data.append(create_data_entry(2, 'initial'))
data.append(create_data_entry(3, 'initial'))

type2.merge(data)

print(type2.state)

data = []
data.append(create_data_entry(1, 'initial'))
data.append(create_data_entry(2, 'modified'))
data.append(create_data_entry(4, 'initial'))

type2.merge(data)

print(type2.state)

data = []
data.append(create_data_entry(1, 'initial'))
data.append(create_data_entry(2, 'modified'))
data.append(create_data_entry(4, 'initial'))

type2.merge(data)

print(type2.state)


      value  current_ind  deleted_ind                created_ts updated_ts deleted_ts                                         sha256_hex
id                                                                                                                                      
1   initial         True        False 2025-03-08 03:01:49+00:00        NaT        NaT  613016924b1a7c74d8f4d297e46733327502e9b12f0286...
2   initial         True        False 2025-03-08 03:01:49+00:00        NaT        NaT  ca2a7130d96482fae4b98234616c7dc54a9f958af87d79...
3   initial         True        False 2025-03-08 03:01:49+00:00        NaT        NaT  7560c761dc4a43d0e09f26d6c956aead2e1b70f6718d51...
       value  current_ind  deleted_ind                created_ts                 updated_ts                 deleted_ts                                         sha256_hex
id                                                                                                                                               

  df_current_active.loc[df_current_active.index.isin(df_modified.index), [CONST.current_ind, CONST.updated_ts]] = [False, now]
  df_current_active.loc[df_current_active.index.isin(df_deleted.index), [CONST.deleted_ind, CONST.deleted_ts]] = [True, now]
