In [91]:
import pandas as pd
from tqdm import tqdm
import random
from  datetime import datetime
import phonenumbers
from clean_phone import clean_phone
from clean_phone import _check_phone

pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)


def extract_columns_from_ddl(table_ddl):
    column_names = [line.split()[0] for line in table_ddl.strip().split('\n')]
    return  column_names

def generate_dummy_data_df(n):
    data = []
    for _ in range(n):
        data.append({
            "PARTY_PHONE_ID": random.randint(1, 1000),
            "PARTY_ID": random.randint(1, 1000),
            "COUNTRY_ID": f'Country_{random.randint(1, 100)}',
            "PARTY_PHONE_TYPE_ID": random.randint(1, 10),
            "PHONE_NUMBER": f'{random.randint(100000, 999999)}',
            "PHONE_NUMBER_PREFIX": f'+{random.randint(10, 99)}',
            "SRC_ID": f'SRC_{random.randint(1, 100)}',
            "SRC_SYS_ID": f'SYS_{random.randint(1, 100)}',
            "DEL_FLAG": random.randint(0, 1),
            "INSERT_DATETIME": datetime.now().date(),
            "INS_PROCESS_ID": f'Process_{random.randint(1, 100)}',
            "UPDATE_DATETIME":  pd.to_datetime('2024-01-05'),
            "UPD_PROCESS_ID": f'Process_{random.randint(101, 200)}',
            "UPD_EFF_DATE": datetime.now().date()
        })
    return pd.DataFrame(data)

def MTCH_PT_PHONE_definition(table_ddl):

    columns = extract_columns_from_ddl(table_ddl)
    return pd.DataFrame(columns=columns)

def insert_phones(filtered_MTCH_PT_PHONE):
    transformed_df = pd.DataFrame(
        {
        'PARTY_PHONE_ID': filtered_MTCH_PT_PHONE['PARTY_PHONE_ID'],
        'PARTY_ID': filtered_MTCH_PT_PHONE['PARTY_ID'],
        'COUNTRY_ID': filtered_MTCH_PT_PHONE['COUNTRY_ID'],
        'PARTY_PHONE_TYPE_ID': filtered_MTCH_PT_PHONE['PARTY_PHONE_TYPE_ID'],
        'PHONE_NUMBER': filtered_MTCH_PT_PHONE['PHONE_NUMBER'],
        'PHONE_NUMBER_PREFIX': filtered_MTCH_PT_PHONE['PHONE_NUMBER_PREFIX'],
        'SRC_ID': filtered_MTCH_PT_PHONE['SRC_ID'],
        'SRC_SYS_ID': filtered_MTCH_PT_PHONE['SRC_SYS_ID'],
        'DELETE_FLAG': filtered_MTCH_PT_PHONE['DEL_FLAG'],
        'INSERT_DATETIME': filtered_MTCH_PT_PHONE['INSERT_DATETIME'],
        'INSERT_PROCESS_ID': filtered_MTCH_PT_PHONE['INS_PROCESS_ID'],
        'UPDATE_DATETIME': filtered_MTCH_PT_PHONE['UPDATE_DATETIME'],
        'UPDATE_DATETIME_PROCESS_ID': filtered_MTCH_PT_PHONE['UPD_PROCESS_ID'],
        'UPDATE_DATETIME_EFFECTIVE_DATE': filtered_MTCH_PT_PHONE['UPD_EFF_DATE']
        }
        
    )
    return transformed_df

def validate_phone_number(number):
    try:
        parsed_number = phonenumbers.parse(str(number), None)
        return phonenumbers.is_valid_number(parsed_number)
    except phonenumbers.NumberParseException:
        return False

def unified_phone_id(df,column_for_unification,group_by_column, phone_key, result_column):
    """
        df :  pd.DataFrame() with party phone numbers
        column_for_unification :  column which contains valid phone numbers for unification
        group_by_column : primary key, based on which the data will be grouped 
        phone_key : primary key, which identifies the phone
        result_column : name of the result column, where the result values will be added
    """

    filtered_df = df[df[column_for_unification].notnull() & ~df[column_for_unification].isna()]
    grouped_phones = filtered_df.groupby(group_by_column)[phone_key].apply(list)
    df[result_column] = df[group_by_column].map(grouped_phones)
    
    return df

def impute_null_phone_numbers(df, column, value):
    for index, row in tqdm(df[df[column].isnull()].iterrows(),desc = "Null value impute is running ..."):
        df.loc[index,column] = value
    return df 


def list_to_string_transformation(df: pd.DataFrame, column: str):
    for index, row in tqdm(df.iterrows(), desc="List transformation is running ..."):
        # Check if the entry is iterable (e.g., a list), if not, skip the iteration.
        if not isinstance(row[column], list):
            #print(f"Non-iterable item at index {index}: {row['UNIFICATION_PARTY_PHONE_ID']}")
            continue
        
        resulted_string = ""
        for val in row[column]:
            # First method
            values_as_string = ["'{}'".format(int(val)) for val in row[column]]
            resulted_string = ', '.join(values_as_string)
            df.loc[index, column]
            
            # Properly concatenate the string with each value.
        df.loc[index, column] = resulted_string
    return df

ddl = """
    SORURCE_PHONE_PREFIX VARCHAR(255),
    SORURCE_PHONE_NUMBER VARCHAR(255),
    MATCHING_PHONE_NUMBER VARCHAR(255),
    MASTER_PARTY_PHONE_ID INTEGER,
    UNIFICATION_PARTY_PHONE_ID INTEGER,
    SOURCE_IDENTIFIER VARCHAR(255),
    SOURCE_SYSTEM_IDENTIFIER VARCHAR(255),
    DELETE_FLAG INTEGER,
    INSERT_DATETIME DATE,
    INSERT_PROCESS_ID VARCHAR(255),
    UPDATE_DATETIME DATE,
    UPDATE_DATETIME_PROCESS_ID VARCHAR(255),
    UPDATE_DATETIME_EFFECTIVE_DATE DATE
"""
MTCH_PT_PHONE = MTCH_PT_PHONE_definition(ddl)
PARTY_PHONE = generate_dummy_data_df(1000)
PARTY_PHONE = PARTY_PHONE.drop_duplicates()

#Type correction
MTCH_PT_PHONE['UPDATE_DATETIME'] = pd.to_datetime(MTCH_PT_PHONE['UPDATE_DATETIME'], errors='coerce')
MTCH_PT_PHONE['INSERT_DATETIME'] = pd.to_datetime(MTCH_PT_PHONE['INSERT_DATETIME'], errors='coerce')

print(MTCH_PT_PHONE.shape)
print(PARTY_PHONE.shape)


print(MTCH_PT_PHONE.shape)
last_checked_date = pd.to_datetime('2024-01-10')
filtered_MTCH_PT = PARTY_PHONE[PARTY_PHONE['UPDATE_DATETIME']<last_checked_date]
MTCH_PT_PHONE = pd.concat([MTCH_PT_PHONE, insert_phones(filtered_MTCH_PT)], ignore_index=True)
for index, row in tqdm(MTCH_PT_PHONE.iterrows(),desc = "Phone union"):
     MTCH_PT_PHONE.loc[index,'MATCHING_PHONE_NUMBER_TMP'] = row['PHONE_NUMBER_PREFIX'] + row['PHONE_NUMBER']
print(MTCH_PT_PHONE.shape)

from clean_phone import validate_phone
print("MATCHING_PHONE_NUMBER null value count ",MTCH_PT_PHONE.MATCHING_PHONE_NUMBER.isna().sum(),"\n")

#res = []
for index, row in tqdm(MTCH_PT_PHONE.iterrows(), "validate_phone method is running ..."):
    #res.append(_check_phone(row['MATCHING_PHONE_NUMBER_TMP'],False))
    phone_number_str = row['PHONE_NUMBER_PREFIX'] + row['PHONE_NUMBER']
    if validate_phone(phone_number_str):
        MTCH_PT_PHONE.loc[index, 'MATCHING_PHONE_NUMBER'] = phonenumbers.format_number(phonenumbers.parse(phone_number_str, None), phonenumbers.PhoneNumberFormat.INTERNATIONAL)   
    else:
        MTCH_PT_PHONE.loc[index, 'MATCHING_PHONE_NUMBER'] = None 

print("MATCHING_PHONE_NUMBER null value count ",MTCH_PT_PHONE.MATCHING_PHONE_NUMBER.isna().sum(),"\n")

print("UNIFICATION_PARTY_PHONE_ID null value count ",MTCH_PT_PHONE['UNIFICATION_PARTY_PHONE_ID'].isna().sum())

MTCH_PT_PHONE = unified_phone_id(df = MTCH_PT_PHONE,
                                column_for_unification = 'MATCHING_PHONE_NUMBER',
                                group_by_column = 'PARTY_ID',
                                phone_key = 'PARTY_PHONE_ID',
                                result_column =  'UNIFICATION_PARTY_PHONE_ID')

print("UNIFICATION_PARTY_PHONE_ID null value count ",MTCH_PT_PHONE['UNIFICATION_PARTY_PHONE_ID'].isna().sum())

MTCH_PT_PHONE = impute_null_phone_numbers(MTCH_PT_PHONE,'MATCHING_PHONE_NUMBER',None)
print("MATCHING_PHONE_NUMBER null value count ",MTCH_PT_PHONE.MATCHING_PHONE_NUMBER.isna().sum(),"\n")

MTCH_PT = pd.read_csv('mtch_pt.csv')
MTCH_PT['UNIFICATION_PARTY_PHONE_ID'] = None
#MTCH_PT = MTCH_PT.rename(columns={"MATCHING_MFO": "UNIFICATION_PARTY_PHONE_ID"})
#merged_df = pd.merge(MTCH_PT, MTCH_PT_PHONE[['PARTY_ID', 'UNIFICATION_PARTY_PHONE_ID']], on='PARTY_ID', how='left')

merged_df = pd.merge(MTCH_PT, 
                     MTCH_PT_PHONE[['PARTY_ID', 'UNIFICATION_PARTY_PHONE_ID']], 
                     on='PARTY_ID', 
                     how='left',
                     suffixes=('', '_from_phone'))

# Update UNIFICATION_PARTY_PHONE_ID in MTCH_PT with the values from MTCH_PT_PHONE
MTCH_PT['UNIFICATION_PARTY_PHONE_ID'] = merged_df['UNIFICATION_PARTY_PHONE_ID_from_phone']


(0, 13)
(1000, 14)
(0, 13)


Phone union: 1000it [00:00, 9989.63it/s]


(1000, 22)
MATCHING_PHONE_NUMBER null value count  1000 



validate_phone method is running ...: 1000it [00:00, 6731.97it/s]


MATCHING_PHONE_NUMBER null value count  868 

UNIFICATION_PARTY_PHONE_ID null value count  1000
UNIFICATION_PARTY_PHONE_ID null value count  771


Null value impute is running ...: 868it [00:00, 9210.93it/s]


MATCHING_PHONE_NUMBER null value count  868 



# Emails part

In [92]:
from pyisemail import is_email

email_df = pd.read_csv("generated-emails-32000.csv", header=0, names=['email'])

def extract_columns_from_ddl(table_ddl):
    column_names = [line.split()[0] for line in table_ddl.strip().split('\n')]
    return  column_names

def generate_dummy_data_email_df(n):
    data = []
    max_index = email_df.shape[0]
    i = 0
    for _ in range(n):
        if i == max_index - 2:
            i = 0
        data.append({
            "PARTY_EMAIL_ID": random.randint(1, 1000),
            "PARTY_ID": random.randint(1, 1000),
            #"COUNTRY_ID": f'Country_{random.randint(1, 100)}',
            "PARTY_EMAIL_TYPE_ID": random.randint(1, 10),
            "EMAIL": email_df["email"][i],
            #"EMAIL_NUMBER_PREFIX": f'+{random.randint(10, 99)}',
            "SRC_ID": f'SRC_{random.randint(1, 100)}',
            "SRC_SYS_ID": f'SYS_{random.randint(1, 100)}',
            "DEL_FLAG": random.randint(0, 1),
            "INSERT_DATETIME": datetime.now().date(),
            "INS_PROCESS_ID": f'Process_{random.randint(1, 100)}',
            "UPDATE_DATETIME":  pd.to_datetime('2024-01-05'),
            "UPD_PROCESS_ID": f'Process_{random.randint(101, 200)}',
            "UPD_EFF_DATE": datetime.now().date()
        })
        i += 1
    return pd.DataFrame(data)

def MTCH_PT_EMAIL_definition(table_ddl):

    columns = extract_columns_from_ddl(table_ddl)
    return pd.DataFrame(columns=columns)

def insert_emails(df : pd.DataFrame):
    transformed_df = pd.DataFrame(
        {
        'PARTY_EMAIL_ID': df['PARTY_EMAIL_ID'],
        'PARTY_ID': df['PARTY_ID'],
        #'COUNTRY_ID': df['COUNTRY_ID'],
        'PARTY_EMAIL_TYPE_ID': df['PARTY_EMAIL_TYPE_ID'],
        'SOURCE_EMAIL': df['EMAIL'],
        
        'SRC_ID': df['SRC_ID'],
        'SRC_SYS_ID': df['SRC_SYS_ID'],
        'DELETE_FLAG': df['DEL_FLAG'],
        'INSERT_DATETIME': df['INSERT_DATETIME'],
        'INSERT_PROCESS_ID': df['INS_PROCESS_ID'],
        'UPDATE_DATETIME': df['UPDATE_DATETIME'],
        'UPDATE_DATETIME_PROCESS_ID': df['UPD_PROCESS_ID'],
        'UPDATE_DATETIME_EFFECTIVE_DATE': df['UPD_EFF_DATE']
        }
        
    )
    return transformed_df


def impute_null_values(df, column, value):
    for index, row in tqdm(df[df[column].isnull()].iterrows(),desc = "Null value impute is running ..."):
        df.loc[index,column] = value
    return df 

def unify_email(df,column_for_unification,group_by_column, phone_key, result_column):
    """
        df :  pd.DataFrame() with party email data
        column_for_unification :  column which contains valid email for unification
        group_by_column :  key, based on which the data will be grouped 
        phone_key : primary key, which identifies the email
        result_column : name of the result column, where the result values will be added
    """

    filtered_df = df[df[column_for_unification].notnull() & ~df[column_for_unification].isna()]
    grouped_phones = filtered_df.groupby(group_by_column)[phone_key].apply(list)
    df[result_column] = df[group_by_column].map(grouped_phones)
    
    return df


def list_to_string_transformation(df: pd.DataFrame, column: str):
    for index, row in tqdm(df.iterrows(), desc="List transformation is running ..."):
        # Check if the entry is iterable (e.g., a list), if not, skip the iteration.
        if not isinstance(row[column], list):
            #print(f"Non-iterable item at index {index}: {row['UNIFICATION_PARTY_PHONE_ID']}")
            continue
        
        resulted_string = ""
        for val in row[column]:
            # First method
            values_as_string = ["'{}'".format(int(val)) for val in row[column]]
            resulted_string = ', '.join(values_as_string)
            df.loc[index, column]
            
            # Properly concatenate the string with each value.
        df.loc[index, column] = resulted_string
    return df

ddl = """
    SOURCE_EMAIL VARCHAR(255),
    MATCHING_EMAIL VARCHAR(255),
    MASTER_PARTY_EMAIL_ID INTEGER,
    UNIFICATION_PARTY_EMAIL_ID INTEGER,
    SOURCE_IDENTIFIER VARCHAR(255),
    SOURCE_SYSTEM_IDENTIFIER VARCHAR(255),
    DELETE_FLAG INTEGER,
    INSERT_DATETIME DATE,
    INSERT_PROCESS_ID VARCHAR(255),
    UPDATE_DATETIME DATE,
    UPDATE_DATETIME_PROCESS_ID VARCHAR(255),
    UPDATE_DATETIME_EFFECTIVE_DATE DATE
"""
MTCH_PT_EMAIL = MTCH_PT_EMAIL_definition(ddl)
PARTY_EMAIL = generate_dummy_data_email_df(1000)
PARTY_EMAIL = PARTY_EMAIL.drop_duplicates()

#Type correction
MTCH_PT_EMAIL['UPDATE_DATETIME'] = pd.to_datetime(MTCH_PT_EMAIL['UPDATE_DATETIME'], errors='coerce')
MTCH_PT_EMAIL['INSERT_DATETIME'] = pd.to_datetime(MTCH_PT_EMAIL['INSERT_DATETIME'], errors='coerce')

print(PARTY_EMAIL.shape)
print(MTCH_PT_EMAIL.shape)

## MTCH table population
 #- Insert not validated data 

print(MTCH_PT_EMAIL.shape)
last_checked_date = pd.to_datetime('2024-01-10')
filtered_MTCH_PT = PARTY_EMAIL[PARTY_EMAIL['UPDATE_DATETIME']<last_checked_date]
print(filtered_MTCH_PT.shape)
MTCH_PT_EMAIL = pd.concat([MTCH_PT_EMAIL, insert_emails(filtered_MTCH_PT)], ignore_index=True)
for index, row in tqdm(MTCH_PT_EMAIL.iterrows(),desc = "EMAIL union"):
    MTCH_PT_EMAIL.loc[index,'SOURCE_EMAIL'] = row['SOURCE_EMAIL'].lower()
print(MTCH_PT_EMAIL.shape)


print("MATCHING_EMAIL null value count ",MTCH_PT_EMAIL.MATCHING_EMAIL.isna().sum(),"\n")

for index, row in tqdm(MTCH_PT_EMAIL.iterrows(), "Email validation method is running ...",mininterval=1):
    # Assuming 'EMAIL_NUMBER' is the actual phone number column
    if is_email(row['SOURCE_EMAIL'], check_dns=False):       
        MTCH_PT_EMAIL.loc[index, 'MATCHING_EMAIL'] = row['SOURCE_EMAIL']
    else:
        MTCH_PT_EMAIL.loc[index, 'MATCHING_EMAIL'] = None

print("\n")
print("MATCHING_EMAIL null value count ",MTCH_PT_EMAIL.MATCHING_EMAIL.isna().sum())
print("\n")
print("UNIFICATION_PARTY_EMAIL_ID null value count ",MTCH_PT_EMAIL['UNIFICATION_PARTY_EMAIL_ID'].isna().sum())
print("\n")
MTCH_PT_EMAIL = unify_email(MTCH_PT_EMAIL,'MATCHING_EMAIL','PARTY_ID', 'PARTY_EMAIL_ID', 'UNIFICATION_PARTY_EMAIL_ID')
print("UNIFICATION_PARTY_EMAIL_ID null value count ",MTCH_PT_EMAIL['UNIFICATION_PARTY_EMAIL_ID'].isna().sum())
print("\n")

MTCH_PT_EMAIL = impute_null_values(MTCH_PT_EMAIL,'MATCHING_EMAIL',None)
print("MATCHING_EMAIL null value count ",MTCH_PT_EMAIL.MATCHING_EMAIL.isna().sum())

MTCH_PT['UNIFICATION_PARTY_EMAIL_ID'] = None
#MTCH_PT = MTCH_PT.rename(columns={"MATCHING_MFO": "UNIFICATION_PARTY_PHONE_ID"})
#merged_df = pd.merge(MTCH_PT, MTCH_PT_EMAIL[['PARTY_ID', 'UNIFICATION_PARTY_PHONE_ID']], on='PARTY_ID', how='left')

merged_df = pd.merge(MTCH_PT, 
                     MTCH_PT_EMAIL[['PARTY_ID', 'UNIFICATION_PARTY_EMAIL_ID']], 
                     on='PARTY_ID', 
                     how='left',
                     suffixes=('', '_from_email'))

# Update UNIFICATION_PARTY_PHONE_ID in MTCH_PT with the values from MTCH_PT_EMAIL
MTCH_PT['UNIFICATION_PARTY_EMAIL_ID'] = merged_df['UNIFICATION_PARTY_EMAIL_ID_from_email']

(1000, 12)
(0, 12)
(0, 12)
(1000, 12)


EMAIL union: 1000it [00:00, 11079.95it/s]


(1000, 17)
MATCHING_EMAIL null value count  1000 



Email validation method is running ...: 1000it [00:00, 5722.95it/s]




MATCHING_EMAIL null value count  0


UNIFICATION_PARTY_EMAIL_ID null value count  1000


UNIFICATION_PARTY_EMAIL_ID null value count  0




Null value impute is running ...: 0it [00:00, ?it/s]


MATCHING_EMAIL null value count  0


In [93]:
MCTH_TMP_PT = pd.read_csv('mcth_tmp_pt.csv')
MCTH_TMP_PT.shape


(0, 33)

In [96]:
import warnings
import pandas as pd 
import numpy as np
from datetime import datetime
# Filter out the specific deprecation warning
warnings.filterwarnings('ignore', message='The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.')
warnings.filterwarnings('ignore', message='SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead')

np.random.seed(123)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

def insertion(df):
    transformed_df = pd.DataFrame({
    'MATCHING_FAMILY_NAME_LATIN': df['SOURCE_FAMILY_NAME_LATIN'],
    'MATCHING_BIRTH_DATE': df['SOURCE_BIRTH_DATE'],
    'SOURCE_SYSTEM_IDENTIFIER': df['SOURCE_SYSTEM_IDENTIFIER'],
    'SOURCE_SYSTEM': None,  
    'PREVIOUS_CONSOLIDATED_PARTY_ID': df['PREVIOUS_CONSOLIDATED_PARTY_ID'],
    'UNIFICATION_PARTY_PHONE_ID': df['UNIFICATION_PARTY_PHONE_ID'], #TODO:
    'UNIFICATION_PARTY_EMAIL_ID': df['UNIFICATION_PARTY_EMAIL_ID'], #TODO:
    #'MATCHING_MFO_ID': None,  TODO: Delete 
    'MATCHING_PARTY_ID': df['PARTY_ID'], 
    'L1_MASTER_MATCHING_RULE': df['L1_MASTER_MATCHING_RULE'],
    'MATCHING_EMPLOYER_IDENTIFIER': None,  # TODO: Assuming NULL
    'INSERT_PROCESS_ID': df['INSERT_PROCESS_ID'],
    'MATCHING_UNIFIED_PARTY_ID': df['MATCHING_UNIFIED_PARTY_ID'],
    'MATCHING_COMPANY_ID': df['MATCHING_COMPANY_IDENTIFIER'],  
    'PREVIOUS_UNIFIED_PARTY_ID': df['PREVIOUS_UNIFIED_PARTY_ID'],
    'L1_RECORD_ROLE': df['L1_RECORD_ROLE'],
    'PARTY_ID': df['PARTY_ID'],
    'L2_MASTER_PARTY_ID': df['L2_MASTER_PARTY_ID'],
    'UPDATE_EFFECTIVE_DATE': None,  
    'MATCHING_FIRST_NAME': df['MATCHING_FIRST_NAME'],
    'INSERT_DATETIME': df['INSERT_DATETIME'],
    'WF_JOB_ID': None, 
    'MATCHING_COUNTRY_ID': df['MATCHING_COUNTRY_ID'],
    'UPDATE_DATETIME': df['UPDATE_DATETIME'],
    'L2_RECORD_ROLE': df['L2_RECORD_ROLE'],
    'MATCHING_PARTY_IDENTIFIER': df['SOURCE_PERSON_IDENTIFIER'], 
    'MATCHING_PARTY_TYPE_ID': df['MATCHING_PARTY_TYPE_ID'],
    'MATCHING_PERSON_TYPE_ID': df['MATCHING_PARTY_TYPE_ID'],
    'DELETE_FLAG': None,
    'MATCHING_CONSOLIDATED_PARTY_ID': df['MATCHING_CONSOLIDATED_PARTY_ID'],
    'L2_RECORD_MATCHING_RULE': df['L2_RECORD_MATCHING_RULE'],
    'MATCHING_FIRST_NAME_LATIN': df['MATCHING_FIRST_NAME_LATIN'],
    'UPDATE_PROCESS_ID': None,
    #'MATCHING_CI_CODE': None,  TODO: Delete 
    'MATCHING_FAMILY_NAME': df['MATCHING_FAMILY_NAME'],
    'L1_MASTER_PARTY_ID': df['L1_MASTER_PARTY_ID']
})
    return transformed_df

def insert_all(MTCH_PT,MCTH_TMP_PT):
    """ Inserts all rows to tmp table to run initial unification """
    print("###   Running insert_all   ###")
    MCTH_TMP_PT = pd.concat([MCTH_TMP_PT, insertion(MTCH_PT)], ignore_index=True)
        
    return MCTH_TMP_PT


print("MCTH_TMP_PT shape: ", MCTH_TMP_PT.shape)
print("MTCH_PT shape: ", MTCH_PT.shape)

# birth date correction 
MTCH_PT['SOURCE_BIRTH_DATE'] = MTCH_PT['SOURCE_BIRTH_DATE'].str.replace('/', '.')
MTCH_PT['MATCHING_BIRTH_DATE'] = pd.to_datetime(MTCH_PT['SOURCE_BIRTH_DATE']).dt.date #, format='%m.%d.%Y')


sample_id = MTCH_PT['PARTY_ID'].head(100).tolist() + MTCH_PT['PARTY_ID'].tail(100).tolist() +  MTCH_PT['PARTY_ID'].iloc[10200:10400].tolist()

samp_PO = MTCH_PT['PARTY_ID'].iloc[0:10676].tolist()
samp_FOP = MTCH_PT['PARTY_ID'].iloc[21352:MTCH_PT.shape[0]].tolist()

sample = MTCH_PT.loc[MTCH_PT['PARTY_ID'].isin(sample_id)]
sample_PO = MTCH_PT.loc[MTCH_PT['PARTY_ID'].isin(samp_PO)]
sample_FOP = MTCH_PT.loc[MTCH_PT['PARTY_ID'].isin(samp_FOP)]

sample.UPDATE_DATETIME = '2023-12-01'
sample_PO.MATCHING_PARTY_TYPE_ID = 'PO'
sample_PO.SOURCE_PARTY_TYPE_ID = 'PO'
sample_FOP.MATCHING_PARTY_TYPE_ID = 'FOP'
sample_FOP.SOURCE_PARTY_TYPE_ID = 'FOP'

MTCH_PT.set_index('PARTY_ID', inplace=True)
sample.set_index('PARTY_ID', inplace=True)
sample_PO.set_index('PARTY_ID', inplace=True)
sample_FOP.set_index('PARTY_ID', inplace=True)

# Update the MTCH_PT DataFrame with the modified sample DataFrame
MTCH_PT.update(sample)
MTCH_PT.update(sample_PO)
MTCH_PT.update(sample_FOP)

# Reset the index if needed
MTCH_PT.reset_index(inplace=True)
# Data Normalisation
MCTH_TMP_PT['MATCHING_FAMILY_NAME'] = MCTH_TMP_PT['MATCHING_FAMILY_NAME'].str.lower().str.strip()
#MCTH_TMP_PT['MATCHING_FAMILY_NAME_LATIN'] = MCTH_TMP_PT['MATCHING_FAMILY_NAME_LATIN'].str.lower().str.strip()
MCTH_TMP_PT['MATCHING_FIRST_NAME'] = MCTH_TMP_PT['MATCHING_FIRST_NAME'].str.lower().str.strip()
#MCTH_TMP_PT['MATCHING_FIRST_NAME_LATIN'] = MCTH_TMP_PT['MATCHING_FIRST_NAME_LATIN'].str.lower().str.strip()
MCTH_TMP_PT['MATCHING_COUNTRY_ID'] = MCTH_TMP_PT['MATCHING_COUNTRY_ID'].str.lower().str.strip()
# # Dates
MCTH_TMP_PT['MATCHING_BIRTH_DATE'] = pd.to_datetime(MCTH_TMP_PT['MATCHING_BIRTH_DATE'], errors='coerce')

MTCH_PT['UPDATE_DATETIME'] = pd.to_datetime(MTCH_PT['UPDATE_DATETIME'], errors='coerce')
#Drop Duplicates
MCTH_TMP_PT = MCTH_TMP_PT.drop_duplicates()
last_checked_date = pd.to_datetime('2023-12-05')
#filtered_MTCH_PT = MTCH_PT # Unifikace na vsech datech
filtered_MTCH_PT = MTCH_PT[MTCH_PT['UPDATE_DATETIME']<last_checked_date] # & PK != PK_TMP
transformed_df = insertion(filtered_MTCH_PT)
# Append the transformed data to the MCTH_TMP_PT DataFrame
MCTH_TMP_PT = pd.concat([MCTH_TMP_PT, transformed_df], ignore_index=True)


    

MCTH_TMP_PT shape:  (0, 33)
MTCH_PT shape:  (32029, 46)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sample.UPDATE_DATETIME = '2023-12-01'
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sample_PO.MATCHING_PARTY_TYPE_ID = 'PO'
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sample_PO.SOURCE_PARTY_TYPE_ID = 'PO'
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_ind

In [97]:
print(MTCH_PT['SOURCE_PARTY_TYPE_ID'].value_counts())

print(MCTH_TMP_PT.shape)
print(MTCH_PT.shape)

print(MCTH_TMP_PT['MATCHING_PERSON_TYPE_ID'].value_counts())
print(MCTH_TMP_PT['MATCHING_PARTY_TYPE_ID'].value_counts())

SOURCE_PARTY_TYPE_ID
FOP    10677
PO     10676
FO     10676
Name: count, dtype: int64
(400, 37)
(32029, 46)
MATCHING_PERSON_TYPE_ID
PO     300
FOP    100
Name: count, dtype: int64
MATCHING_PARTY_TYPE_ID
PO     300
FOP    100
Name: count, dtype: int64


In [98]:
#from tqdm import tqdm
num_rows = 1 
i = 1
#with tqdm(total=100) as pbar:
while num_rows>0:
    print(i)
    before_adding = last_count = MCTH_TMP_PT.shape[0]
    #Match by primary identification
    filtered_MTCH_PT = MTCH_PT[(MTCH_PT['SOURCE_PERSON_IDENTIFIER'].isin(MCTH_TMP_PT['MATCHING_PARTY_IDENTIFIER']) |
                               (~MTCH_PT['MATCHING_COMPANY_IDENTIFIER'].isna() & MTCH_PT['MATCHING_COMPANY_IDENTIFIER'].isin(MCTH_TMP_PT['MATCHING_COMPANY_ID']))) &
                               ~MTCH_PT['PARTY_ID'].isin(MCTH_TMP_PT['PARTY_ID']) ] # & PK != PK_TMP
    transformed_df = insertion(filtered_MTCH_PT)
    
    # Append the transformed data to the MCTH_TMP_PT DataFrame
    MCTH_TMP_PT = pd.concat([MCTH_TMP_PT, transformed_df], ignore_index=True)
    print("Shape after comparing PERSONAL_IDENTIFIER ",MCTH_TMP_PT.shape[0], f" - {MCTH_TMP_PT.shape[0]-last_count} Rows Were Added")
    last_count = MCTH_TMP_PT.shape[0]
          
    #########
          
    filtered_MTCH_PT = MTCH_PT[MTCH_PT['MATCHING_FAMILY_NAME'].isin(MCTH_TMP_PT['MATCHING_FAMILY_NAME'])&
                               ~MTCH_PT['PARTY_ID'].isin(MCTH_TMP_PT['PARTY_ID'])]
    transformed_df = insertion(filtered_MTCH_PT)

    # Append the transformed data to the MCTH_TMP_PT DataFrame
    MCTH_TMP_PT = pd.concat([MCTH_TMP_PT, transformed_df], ignore_index=True)

    print("Shape after comparing FAMILY_NAME ",MCTH_TMP_PT.shape[0], f" - {MCTH_TMP_PT.shape[0]-last_count} Rows Were Added")
    last_count = MCTH_TMP_PT.shape[0]

    # Phones :
    # phone_list = []
    # for index, row in MCTH_TMP_PT.iterrows():
    #     for val in row['UNIFICATION_PARTY_PHONE_ID']:
    #         phone_list.append(val)
    # phone_set = set(phone_list)
    
    # filtered_MTCH_PT = MTCH_PT[MTCH_PT['MATCHING_FAMILY_NAME'].isin(MCTH_TMP_PT['MATCHING_FAMILY_NAME'])&
    #                            ~MTCH_PT['PARTY_ID'].isin(MCTH_TMP_PT['PARTY_ID'])]

    phone_set = set()
    for ids in MTCH_PT['UNIFICATION_PARTY_PHONE_ID']:
        if isinstance(ids, (list, set, tuple)):
            phone_set.update(ids)

    filtered_MTCH_PT = MTCH_PT[
    MTCH_PT['UNIFICATION_PARTY_PHONE_ID'].apply(lambda ids: isinstance(ids, (list, set, tuple)) and any(id in phone_set for id in ids)) &
    ~MTCH_PT['PARTY_ID'].isin(MCTH_TMP_PT['PARTY_ID'])]     

    transformed_df = insertion(filtered_MTCH_PT)  

    MCTH_TMP_PT = pd.concat([MCTH_TMP_PT, transformed_df], ignore_index=True)
    print("Shape after comparing UNIFICATION_PARTY_PHONE_ID ",MCTH_TMP_PT.shape[0], f" - {MCTH_TMP_PT.shape[0]-last_count} Rows Were Added")
    last_count = MCTH_TMP_PT.shape[0]

    

    email_set = set()
    for ids in MTCH_PT['UNIFICATION_PARTY_EMAIL_ID']:
        if isinstance(ids, (list, set, tuple)):
            email_set.update(ids)

    filtered_MTCH_PT = MTCH_PT[
    MTCH_PT['UNIFICATION_PARTY_EMAIL_ID'].apply(lambda ids: isinstance(ids, (list, set, tuple)) and any(id in email_set for id in ids)) &
    ~MTCH_PT['PARTY_ID'].isin(MCTH_TMP_PT['PARTY_ID'])]     

    transformed_df = insertion(filtered_MTCH_PT)  

    MCTH_TMP_PT = pd.concat([MCTH_TMP_PT, transformed_df], ignore_index=True)
    print("Shape after comparing UNIFICATION_PARTY_EMAIL_ID ",MCTH_TMP_PT.shape[0], f" - {MCTH_TMP_PT.shape[0]-last_count} Rows Were Added")
    last_count = MCTH_TMP_PT.shape[0]
    # TODO: COMPANY_NAME, telefony, emaily,adresy
#     filtered_MTCH_PT = MTCH_PT[MTCH_PT['MATCHING_FAMILY_NAME'].isin(MCTH_TMP_PT['MATCHING_FAMILY_NAME'])&
#                                ~MTCH_PT['PARTY_ID'].isin(MCTH_TMP_PT['PARTY_ID'])]
#     transformed_df = insertion(filtered_MTCH_PT)

#     # Append the transformed data to the MCTH_TMP_PT DataFrame
#     MCTH_TMP_PT = MCTH_TMP_PT.append(transformed_df, ignore_index=True)

#     print("Shape after comparing COMPANY_NAME ",MCTH_TMP_PT.shape[0], f" - {MCTH_TMP_PT.shape[0]-last_count} Rows Were Added")
#     last_count = MCTH_TMP_PT.shape[0]
#         filtered_MTCH_PT =  MTCH_PT[
#        #MTCH_PT['MATCHING_FAMILY_NAME'].isnull() & 
#         MTCH_PT['MATCHING_FIRST_NAME'].isin(MCTH_TMP_PT['MATCHING_FIRST_NAME'] &
#                                    ~MTCH_PT['PARTY_ID'].isin(MCTH_TMP_PT['PARTY_ID']))
#         ]
    filtered_MTCH_PT = MTCH_PT[MTCH_PT['MATCHING_FIRST_NAME'].isin(MCTH_TMP_PT['MATCHING_FIRST_NAME'])&
                               ~MTCH_PT['PARTY_ID'].isin(MCTH_TMP_PT['PARTY_ID'])]
    transformed_df = insertion(filtered_MTCH_PT)

    # Append the transformed data to the MCTH_TMP_PT DataFrame
    MCTH_TMP_PT = pd.concat([MCTH_TMP_PT, transformed_df], ignore_index=True)
    print("Shape after comparing FIRST_NAME ",MCTH_TMP_PT.shape[0], f" - {MCTH_TMP_PT.shape[0]-last_count} Rows Were Added")
    last_count = MCTH_TMP_PT.shape[0]
    
    ############### UNIFIED_PARTY_ID #################
    filtered_MTCH_PT = MTCH_PT[MTCH_PT['MATCHING_CONSOLIDATED_PARTY_ID'].isin(MCTH_TMP_PT['MATCHING_CONSOLIDATED_PARTY_ID'])
                                       & (~MTCH_PT['MATCHING_CONSOLIDATED_PARTY_ID'].isna())
                                       & (~MTCH_PT['PARTY_ID'].isin(MCTH_TMP_PT['PARTY_ID']))]
    transformed_df = insertion(filtered_MTCH_PT)
    
    # Append the transformed data to the MCTH_TMP_PT DataFrame
    MCTH_TMP_PT = pd.concat([MCTH_TMP_PT, transformed_df], ignore_index=True)
    print("Shape after comparing CONSOLIDATED_PARTY_ID ",MCTH_TMP_PT.shape[0], f" - {MCTH_TMP_PT.shape[0]-last_count} Rows Were Added")
    last_count = MCTH_TMP_PT.shape[0]
    
    MTCH_PT[MTCH_PT['MATCHING_UNIFIED_PARTY_ID'].isin(MCTH_TMP_PT['MATCHING_UNIFIED_PARTY_ID']) & (~MTCH_PT['MATCHING_UNIFIED_PARTY_ID'].isna())]
    filtered_MTCH_PT = MTCH_PT[MTCH_PT['MATCHING_UNIFIED_PARTY_ID'].isin(MCTH_TMP_PT['MATCHING_UNIFIED_PARTY_ID'])
                               & (~MTCH_PT['MATCHING_UNIFIED_PARTY_ID'].isna())
                               & (~MTCH_PT['PARTY_ID'].isin(MCTH_TMP_PT['PARTY_ID']))]
    transformed_df = insertion(filtered_MTCH_PT)
    MCTH_TMP_PT = pd.concat([MCTH_TMP_PT, transformed_df], ignore_index=True)
    print("Shape after comparing UNIFIED_PARTY_ID ",MCTH_TMP_PT.shape[0], f" - {MCTH_TMP_PT.shape[0]-last_count} Rows Were Added")
    last_count = MCTH_TMP_PT.shape[0]
    
    print("Shape after comparing CONSOLIDATED_PARTY_ID ",MCTH_TMP_PT.shape[0], f" - {MCTH_TMP_PT.shape[0]-last_count} Rows Were Added")


    
    

    after_adding = MCTH_TMP_PT.shape[0]
    i = i+1
    num_rows = after_adding - before_adding
    print("############")
    print("")
    print("Number of rows added after a cycle",num_rows)
    

1
Shape after comparing PERSONAL_IDENTIFIER  400  - 0 Rows Were Added
Shape after comparing FAMILY_NAME  400  - 0 Rows Were Added
Shape after comparing UNIFICATION_PARTY_PHONE_ID  617  - 217 Rows Were Added
Shape after comparing UNIFICATION_PARTY_EMAIL_ID  1389  - 772 Rows Were Added
Shape after comparing FIRST_NAME  10603  - 9214 Rows Were Added
Shape after comparing CONSOLIDATED_PARTY_ID  10606  - 3 Rows Were Added
Shape after comparing UNIFIED_PARTY_ID  10606  - 0 Rows Were Added
Shape after comparing CONSOLIDATED_PARTY_ID  10606  - 0 Rows Were Added
############

Number of rows added after a cycle 10206
2
Shape after comparing PERSONAL_IDENTIFIER  10606  - 0 Rows Were Added
Shape after comparing FAMILY_NAME  10703  - 97 Rows Were Added
Shape after comparing UNIFICATION_PARTY_PHONE_ID  10703  - 0 Rows Were Added
Shape after comparing UNIFICATION_PARTY_EMAIL_ID  10703  - 0 Rows Were Added
Shape after comparing FIRST_NAME  11263  - 560 Rows Were Added
Shape after comparing CONSOLIDATE

In [99]:
import math 
from tqdm import tqdm

print("PREVIOUS_UNIFIED_PARTY_ID NULL COUNT: ",MCTH_TMP_PT['PREVIOUS_UNIFIED_PARTY_ID'].isna().sum())
print("PREVIOUS_CONSOLIDATED_PARTY_ID NULL COUNT ",MCTH_TMP_PT['PREVIOUS_CONSOLIDATED_PARTY_ID'].isna().sum())
print("")

for index, row in tqdm(MCTH_TMP_PT.iterrows(), desc = "Attributes filling ..."):
    if math.isnan(row['MATCHING_UNIFIED_PARTY_ID']) and  math.isnan(row['MATCHING_CONSOLIDATED_PARTY_ID']):
        MCTH_TMP_PT.loc[index, 'PREVIOUS_UNIFIED_PARTY_ID'] = MCTH_TMP_PT.loc[index, 'PARTY_ID']
        MCTH_TMP_PT.loc[index, 'PREVIOUS_CONSOLIDATED_PARTY_ID'] = MCTH_TMP_PT.loc[index, 'PARTY_ID']
    else:
        MCTH_TMP_PT.loc[index, 'PREVIOUS_UNIFIED_PARTY_ID'] = MCTH_TMP_PT.loc[index, 'MATCHING_UNIFIED_PARTY_ID']
        MCTH_TMP_PT.loc[index, 'PREVIOUS_CONSOLIDATED_PARTY_ID'] = MCTH_TMP_PT.loc[index, 'MATCHING_CONSOLIDATED_PARTY_ID']
print("")
print("PREVIOUS_UNIFIED_PARTY_ID NULL COUNT: ",MCTH_TMP_PT['PREVIOUS_UNIFIED_PARTY_ID'].isna().sum())
print("PREVIOUS_CONSOLIDATED_PARTY_ID NULL COUNT ",MCTH_TMP_PT['PREVIOUS_CONSOLIDATED_PARTY_ID'].isna().sum())

PREVIOUS_UNIFIED_PARTY_ID NULL COUNT:  11347
PREVIOUS_CONSOLIDATED_PARTY_ID NULL COUNT  11347



Attributes filling ...: 11347it [00:02, 4436.34it/s]


PREVIOUS_UNIFIED_PARTY_ID NULL COUNT:  0
PREVIOUS_CONSOLIDATED_PARTY_ID NULL COUNT  0





## Candidate groups
- MATCHING_PARTY_IDENTIFIER match
- FAMILY_NAME match
- FIRST_NAME match
- EMAIL match
- PHONE match 

In [100]:
from collections.abc import Iterable
import pandas as pd

def is_iterable(obj):
    return isinstance(obj, Iterable)

def contains_phone_id(phone_id_list, phone_id):
    # Check if phone_id_list is not NA and is iterable
    #if pd.notna(phone_id_list) and is_iterable(phone_id_list):
    if  is_iterable(phone_id_list):
        # Iterate over items if it's iterable
        return any(phone_id == item for item in phone_id_list if pd.notna(item))
    # Handle the case where phone_id_list is a single non-iterable value
    # elif pd.notna(phone_id_list):
    #     return phone_id == phone_id_list
    else:
        return False
    
def contains_email_id(list_obj, id):
    if is_iterable(list_obj):
        return any(id == item for item in list_obj if pd.notna(item))
    else:
        return False

In [114]:
from tqdm import tqdm
grouped_by_id = MCTH_TMP_PT.groupby(['MATCHING_PARTY_IDENTIFIER']) #, 'MATCHING_COMPANY_IDENTIFIER' and NOT NULL

basic_groups = {id_: group.index.tolist() for id_, group in grouped_by_id}
max = 0
max_id = 0 
for id_, indices in tqdm(basic_groups.items(), desc="Counting ..."):
    if len(indices) > max:
        max = len(indices)
        max_id = id_

print("The biggest group is: ",max_id, " with elemts count of ", max)

# MATCHING_FAMILY_NAME
expanded_groups = {}

for id_, indices in tqdm(basic_groups.items(), desc="Rozšiřování skupin podle rodinného jména"):
    expanded_group = set(indices)
    
    for idx in indices:
        family_name = MCTH_TMP_PT.loc[idx, 'MATCHING_FAMILY_NAME']
        if pd.notna(family_name):
            same_family_name_indices = MCTH_TMP_PT.index[MCTH_TMP_PT['MATCHING_FAMILY_NAME'] == family_name].tolist()
            expanded_group.update(same_family_name_indices)

    expanded_groups[id_] = list(expanded_group)
max = 0
max_id = 0 
for id_, indices in tqdm(expanded_groups.items(), desc="Counting ..."):
    if len(indices) > max:
        max = len(indices)
        max_id = id_

print("The biggest group is: ",max_id, " with elemts count of ", max)

# MATCHING_FIRST_NAME
expanded_groups_first_name = {}

for id_, indices in tqdm(expanded_groups.items(), desc="Přidávání prvků se stejným křestním jménem"):
    group_first_name = set(indices)

    for idx in indices:
        first_name = MCTH_TMP_PT.loc[idx, 'MATCHING_FIRST_NAME']
        if pd.notna(first_name):
            same_first_name_indices = MCTH_TMP_PT.index[(MCTH_TMP_PT['MATCHING_FAMILY_NAME'].isnull()) & (MCTH_TMP_PT['MATCHING_FIRST_NAME'] == first_name)].tolist()
            group_first_name.update(same_first_name_indices)

    expanded_groups_first_name[id_] = list(group_first_name)

max = 0
max_id = 0 
for id_, indices in tqdm(expanded_groups_first_name.items(), desc="Counting ..."):
    if len(indices) > max:
        max = len(indices)
        max_id = id_

print("The biggest group is: ",max_id, " with elemts count of ", max)
#TODO: EMAILS 
expanded_groups_email = {}
for id_, indices in tqdm(expanded_groups_first_name.items(), desc="Přidávání prvků se stejným ID emailu"):
    expanded_groups =set(indices)
    for idx in indices:
        idx_subset = MCTH_TMP_PT.loc[idx, 'UNIFICATION_PARTY_EMAIL_ID']

        if isinstance(idx_subset, (list, set, tuple)):
            # Pokud je idx_subset seznam, set nebo tuple
            if not idx_subset or all(pd.isna(element) for element in idx_subset):
                continue
            email_id_list = set(idx_subset)
        else:
            # Pokud je idx_subset jedna hodnota (není seznam, set, nebo tuple)
            if pd.isna(idx_subset):
                continue
            email_id_list = {idx_subset}

        for email_id in email_id_list:
            if email_id:
                same_email_id_indices = MCTH_TMP_PT.index[MCTH_TMP_PT['UNIFICATION_PARTY_EMAIL_ID'].apply(lambda x: contains_email_id(x, email_id))].tolist()
                expanded_groups.update(same_email_id_indices)
      
    expanded_groups_email[id_] = list(expanded_groups)

max = 0
max_id = 0 
for id_, indices in tqdm(expanded_groups_email.items(), desc="Counting ..."):
    if len(indices) > max:
        max = len(indices)
        max_id = id_

print("The biggest group is: ",max_id, " with elemts count of ", max)
#TODO: phones 
final_groups = {}
for id_, indices in tqdm(expanded_groups_email.items(), desc="Přidávání prvků se stejným ID telefonu"):
    final_group = set(indices)
    for idx in indices:
        idx_subset = MCTH_TMP_PT.loc[idx, 'UNIFICATION_PARTY_PHONE_ID']
        if isinstance(idx_subset, (list, set, tuple)):        
            if not idx_subset or all(pd.isna(element) for element in idx_subset):
                continue
            phone_id_list = set(idx_subset)
        else:
            if pd.isna(idx_subset):
                    continue
            phone_id_list = {idx_subset}
        
        for phone_id in phone_id_list:
            if phone_id:
                same_phone_id_indices = MCTH_TMP_PT.index[MCTH_TMP_PT['UNIFICATION_PARTY_PHONE_ID'].apply(lambda x: contains_phone_id(x, phone_id))].tolist()
                final_group.update(same_phone_id_indices)
    final_groups[id_] = list(final_group)
max = 0
max_id = 0 
for id_, indices in tqdm(final_groups.items(), desc="Counting ..."):
    if len(indices) > max:
        max = len(indices)
        max_id = id_

print("The biggest group is: ",max_id, " with elemts count of ", max)


group_id_to_indices = {group_id: indices for group_id, indices in enumerate(final_groups.values())}

Counting ...: 100%|██████████| 10726/10726 [00:00<00:00, 785969.44it/s]


The biggest group is:  (9989288.0,)  with elemts count of  101


Rozšiřování skupin podle rodinného jména: 100%|██████████| 10726/10726 [00:29<00:00, 362.70it/s]
Counting ...: 100%|██████████| 10726/10726 [00:00<00:00, 306815.87it/s]


The biggest group is:  (9989131.0,)  with elemts count of  167


Přidávání prvků se stejným křestním jménem: 100%|██████████| 10726/10726 [01:31<00:00, 116.70it/s]
Counting ...: 100%|██████████| 10726/10726 [00:00<00:00, 935459.22it/s]


The biggest group is:  (9989131.0,)  with elemts count of  167


Přidávání prvků se stejným ID emailu: 100%|██████████| 10726/10726 [00:36<00:00, 297.63it/s] 
Counting ...: 100%|██████████| 10726/10726 [00:00<00:00, 338551.70it/s]


The biggest group is:  (9989131.0,)  with elemts count of  167


Přidávání prvků se stejným ID telefonu: 100%|██████████| 10726/10726 [00:20<00:00, 525.44it/s] 
Counting ...: 100%|██████████| 10726/10726 [00:00<00:00, 830958.71it/s]

The biggest group is:  (9989131.0,)  with elemts count of  167





## Impaired data filling 
 - MATCHING_CONSOLIDATED_PARTY_ID
 - MATCHING_UNIFIED_PARTY_ID
 - L1_MASTER_PARTY_ID

In [115]:
elements_changed = 0 
for key, value in tqdm(group_id_to_indices.items(), desc="Zpracování hodnot bez par"):
    if len(value) == 1:
        elements_changed += 1
        MCTH_TMP_PT.loc[value[0],'MATCHING_CONSOLIDATED_PARTY_ID'] = MCTH_TMP_PT.loc[value[0]]['PREVIOUS_CONSOLIDATED_PARTY_ID']
        MCTH_TMP_PT.loc[value[0],'MATCHING_UNIFIED_PARTY_ID'] = MCTH_TMP_PT.loc[value[0]]['PREVIOUS_UNIFIED_PARTY_ID']
        MCTH_TMP_PT.loc[value[0],'L1_MASTER_PARTY_ID'] = MCTH_TMP_PT.loc[value[0]]['PARTY_ID']
print(elements_changed," elements were changed ")

Zpracování hodnot bez par: 100%|██████████| 10726/10726 [00:04<00:00, 2369.75it/s]

8913  elements were changed 





## Paired data NULL filling 

In [116]:
for key, value in tqdm(group_id_to_indices.items(), desc="Paired data NULL filling"):
    if len(value)  > 1:
        for v in value:
            elements_changed += 1

            MCTH_TMP_PT.loc[v,'MATCHING_CONSOLIDATED_PARTY_ID'] = MCTH_TMP_PT.loc[v]['PREVIOUS_CONSOLIDATED_PARTY_ID']
            MCTH_TMP_PT.loc[v,'MATCHING_UNIFIED_PARTY_ID'] = MCTH_TMP_PT.loc[v]['PREVIOUS_UNIFIED_PARTY_ID']
            MCTH_TMP_PT.loc[v,'L1_MASTER_PARTY_ID'] = MCTH_TMP_PT.loc[v]['PARTY_ID']

Paired data NULL filling: 100%|██████████| 10726/10726 [00:12<00:00, 855.86it/s]


## Client groups NAME_SURNAME_BIRTH_DATE

In [133]:
from tqdm import tqdm
from rapidfuzz import fuzz

# Definice funkce pro výpočet skóre podobnosti
def similarity_score(row1, row2):    
    """ Function defenition to calulate similiarity score: NAME_SURNAME_BIRTH_DATE"""
    
    name_score = fuzz.partial_ratio(f"{row1['MATCHING_FIRST_NAME']} {row1['MATCHING_FAMILY_NAME']}", 
                                    f"{row2['MATCHING_FIRST_NAME']} {row2['MATCHING_FAMILY_NAME']}") #  TODO: + PARTY_TYPE_ID
    birth_date_score = 100 if row1['MATCHING_BIRTH_DATE'] == row2['MATCHING_BIRTH_DATE'] else 0

    return (name_score + birth_date_score) / 2 

def similarity_score_list_column(row1, row2, column_name):
    """ Function defenition to calulate similiarity score for columns with list"""
    
    name_score = fuzz.partial_ratio(f"{row1['MATCHING_FIRST_NAME']} {row1['MATCHING_FAMILY_NAME']}", 
                                    f"{row2['MATCHING_FIRST_NAME']} {row2['MATCHING_FAMILY_NAME']}")
    #print(row1[column_name])
    #print(row2[column_name])
    if (isinstance(row1[column_name], (list, set, tuple)) and 
        isinstance(row2[column_name], (list, set, tuple))):
        row1_set = set(row1[column_name])
        row2_set = set(row2[column_name])
        if row1_set.intersection(row2_set):
            second_score = 100
        else:
            second_score = 0
        return (name_score + second_score) / 2
    else:
        return 0 
            
    # if pd.isna(row1[column_name]) or pd.isna(row2[column_name]):
    #     return 0 
    # else:
    #     row1_set = set(row1[column_name])
    #     row2_set = set(row2[column_name])
    #     if row1_set.intersection(row2_set):
    #         second_score = 100
    #     else:
    #         second_score = 0
    #     return (name_score + second_score) / 2
        

In [134]:


# Vytvoření skupin klientů
client_groups = {}
#client_groups_low_similarity = {}


for group_id, indices in tqdm(group_id_to_indices.items(), desc="Zpracování primárních skupin"):

    if len(indices) > 1:
        current_group = MCTH_TMP_PT.loc[indices]
        client_group = []
        #client_group_low_similarity = []
        threshold = 90
        for i in range(len(current_group)):
            for j in range(i + 1, len(current_group)):
                if similarity_score(current_group.iloc[i], current_group.iloc[j]) >= threshold or similarity_score_list_column(current_group.iloc[i], current_group.iloc[j], "UNIFICATION_PARTY_EMAIL_ID") >= threshold or similarity_score_list_column(current_group.iloc[i], current_group.iloc[j], "UNIFICATION_PARTY_PHONE_ID") >= threshold:
                    client_group.append((current_group.index[i], current_group.index[j]))
                # else:
                #     client_group_low_similarity.append((current_group.index[i], current_group.index[j]))
        #client_groups_low_similarity[group_id] = client_group_low_similarity
        client_groups[group_id] = client_group
    # else:
    #     client_groups[group_id] = [indices[0]]

# Výpis výsledků
# for group_id, pairs in client_groups.items():
#     if pairs:
#         print(f"ID skupiny: {group_id}, Páry klientů: {pairs}")


Zpracování primárních skupin: 100%|██████████| 10726/10726 [08:37<00:00, 20.74it/s] 


## Minimal key search

In [135]:

client_to_group = {}
client_to_group_unified = {}

for group_id, client_pairs in client_groups.items():
    ind_list = []
    ind_list_PO = []
    ind_list_FO = []
    ind_list_FOP = []
    if len(client_pairs) > 1:
        for client_pairs in client_groups[group_id]:
            for pair in client_pairs:
                
                if (MCTH_TMP_PT.loc[pair, 'MATCHING_PARTY_TYPE_ID'] == 'FO'):
                    ind_list_FO.append(MCTH_TMP_PT.loc[pair, 'PARTY_ID'])
                    ind_list.append(MCTH_TMP_PT.loc[pair, 'PARTY_ID'])
                if (MCTH_TMP_PT.loc[pair, 'MATCHING_PARTY_TYPE_ID'] == 'PO'):
                    ind_list_PO.append(MCTH_TMP_PT.loc[pair, 'PARTY_ID'])               
                if (MCTH_TMP_PT.loc[pair, 'MATCHING_PARTY_TYPE_ID'] == 'FOP'):
                    ind_list_FOP.append(MCTH_TMP_PT.loc[pair, 'PARTY_ID']) 
                    ind_list.append(MCTH_TMP_PT.loc[pair, 'PARTY_ID']) # ???              
        
            if len(ind_list) > 0:
                group_id = min(ind_list)
                #client_to_group[pair] = group_id
            # print(group_id)
            if len(ind_list_FO) > 0:            
                group_id_FO = min(ind_list_FO)
                #client_to_group_PO[ind_list_PO] = group_id_PO
            # print(group_id_PO)
            
            if len(ind_list_PO) > 0:
                group_id_PO = min(ind_list_PO)
            if len(ind_list_FOP) > 0:
                group_id_FOP = min(ind_list_FOP)
            # client_to_group_FO[ind_list_FO] = group_id_FO
            # print(group_id_FO)   
            
            client_to_group[pair] = group_id                     
            if (MCTH_TMP_PT.loc[pair, 'MATCHING_PARTY_TYPE_ID'] == 'FO'): #| (MCTH_TMP_PT.loc[pair, 'MATCHING_PERSON_TYPE_ID']=='FOP'):
                client_to_group_unified[pair] = group_id_FO
            if (MCTH_TMP_PT.loc[pair, 'MATCHING_PARTY_TYPE_ID'] == 'PO'): #| (MCTH_TMP_PT.loc[pair, 'MATCHING_PERSON_TYPE_ID']=='FOP'):
                client_to_group_unified[pair] = group_id_PO
            if (MCTH_TMP_PT.loc[pair, 'MATCHING_PARTY_TYPE_ID'] == 'FOP'): #| 
                client_to_group_unified[pair] = group_id_FOP
        

## Adding CONSOLIDATED, UNIFIYED ID, and ROLE to MCTH_TMP_PT

In [136]:

print(MCTH_TMP_PT['MATCHING_UNIFIED_PARTY_ID'].isna().sum())
print(MCTH_TMP_PT['PREVIOUS_CONSOLIDATED_PARTY_ID'].isna().sum())
print(MCTH_TMP_PT['MATCHING_CONSOLIDATED_PARTY_ID'].isna().sum())
print(MCTH_TMP_PT['PREVIOUS_UNIFIED_PARTY_ID'].isna().sum())
print("")

for  key, value in tqdm(client_groups.items(), desc="Zpracování pravidel"):
    MCTH_TMP_PT.loc[key,'L1_MASTER_MATCHING_RULE'] = "NAME_SURNAME"


for key, value in tqdm(client_to_group.items(), desc="Zpracování hodnot bez par"):
    MCTH_TMP_PT.loc[key,'MATCHING_CONSOLIDATED_PARTY_ID'] = value 

for key, value in tqdm(client_to_group_unified.items(), desc="Zpracování hodnot bez par"):
    if MCTH_TMP_PT.loc[key,'MATCHING_PARTY_TYPE_ID'] == 'PO':
        MCTH_TMP_PT.loc[key,'MATCHING_CONSOLIDATED_PARTY_ID'] = value 
    MCTH_TMP_PT.loc[key,'MATCHING_UNIFIED_PARTY_ID'] = value
    MCTH_TMP_PT.loc[key,'L1_MASTER_PARTY_ID'] = value

for index, row in MCTH_TMP_PT.iterrows():
    if row['PARTY_ID'] == row['MATCHING_UNIFIED_PARTY_ID']:
        MCTH_TMP_PT.loc[index,'L1_RECORD_ROLE'] = 'MASTER'        
    else:
        MCTH_TMP_PT.loc[index,'L1_RECORD_ROLE'] = 'SLAVE'

print(MCTH_TMP_PT['MATCHING_UNIFIED_PARTY_ID'].isna().sum())
print(MCTH_TMP_PT['PREVIOUS_CONSOLIDATED_PARTY_ID'].isna().sum())
print(MCTH_TMP_PT['MATCHING_CONSOLIDATED_PARTY_ID'].isna().sum())
print(MCTH_TMP_PT['PREVIOUS_UNIFIED_PARTY_ID'].isna().sum())


0
0
0
0



Zpracování pravidel: 100%|██████████| 1813/1813 [00:00<00:00, 13809.73it/s]
Zpracování hodnot bez par: 100%|██████████| 477/477 [00:00<00:00, 11082.70it/s]
Zpracování hodnot bez par: 100%|██████████| 477/477 [00:00<00:00, 4419.65it/s]


0
0
0
0
