In [36]:
from dotenv import load_dotenv
import os
from sqlalchemy import create_engine, MetaData, select
import pandas as pd


from urllib.parse import quote

# Pull connection settings from a local .env file into the process environment.
# NOTE(review): load_dotenv() does not override variables that already exist, and most
# Unix shells export USER as the OS login name - confirm UID really ends up holding the
# database user (or rename the key in .env).
load_dotenv()

# Directory holding the CSV exports. Overridable through CSV_DIR so the notebook is not
# tied to a single machine; the previous hard-coded path remains the default.
csv_dir = os.environ.get('CSV_DIR', '/home/flor/Workspace/DEP2/DataEngineerProj2/Data')
SERVER = os.environ.get('SERVER')
DATABASE = os.environ.get('DATABASE')
UID = os.environ.get('USER')
PWD = os.environ.get('PASSWORD')

# Fail early with a readable message instead of building a URL full of "None".
_missing_settings = [
    key
    for key, value in (('SERVER', SERVER), ('DATABASE', DATABASE), ('USER', UID), ('PASSWORD', PWD))
    if not value
]
if _missing_settings:
    raise RuntimeError(f"Missing database settings in environment/.env: {', '.join(_missing_settings)}")

# Percent-encode the credentials so characters such as '@', '/' or ':' in the user name
# or password cannot break the URL structure.
connection_string = (
    f"mssql+pyodbc://{quote(UID, safe='')}:{quote(PWD, safe='')}@{SERVER}/{DATABASE}"
    "?driver=ODBC+Driver+17+for+SQL+Server"
)
engine = create_engine(connection_string)

# Reflect the existing schema once; the helpers below look tables up in this metadata.
metadata = MetaData()
metadata.reflect(engine)

def remove_duplicate_ids(df, table_name, column_name, df_column_name, unique_set):
    """Drop the rows of ``df`` whose ID is already present in a database table.

    Args:
        df: DataFrame holding the candidate rows.
        table_name: Key of the reflected table in the module-level ``metadata``.
        column_name: Column of that table whose values are the existing IDs.
        df_column_name: Column of ``df`` holding the IDs to compare.
        unique_set: Candidate IDs to check against the database. Values of any
            type are accepted; they are compared by their ``str()`` form.

    Returns:
        ``df`` itself when no candidate ID is found in the table, otherwise a
        filtered DataFrame without the rows whose ID is already stored.

    Prints the shape before and after, plus the number of existing and common IDs.
    """
    print("Before", df.shape, table_name)

    with engine.connect() as connection:
        table = metadata.tables[table_name]
        # Database IDs are stringified so they can be compared with df IDs of any dtype.
        existing_ids = {str(row[0]) for row in connection.execute(select(table.c[column_name])).fetchall()}

    # Normalise the candidates the same way: a raw int/float ID would otherwise never
    # equal its stringified database counterpart and the duplicate would slip through.
    candidate_ids = {str(value) for value in unique_set}
    common_ids = candidate_ids.intersection(existing_ids)

    print("Existing IDs:", len(existing_ids))
    print("Common IDs:", len(common_ids))

    if common_ids:
        # The number reported here is the count of shared IDs, not of removed rows.
        print(f"Removing duplicate IDs from {column_name}: {len(common_ids)}") 
        df = df[~df[df_column_name].astype(str).isin(common_ids)]

    print("After", df.shape, table_name)
    return df

def get_existing_ids(table_name, column_name, df_column_name, df=None):
    """Collect the IDs of a database column as strings, optionally merged with a DataFrame's IDs.

    Args:
        table_name: Key of the reflected table in the module-level ``metadata``.
        column_name: Column of that table to read.
        df_column_name: Column of ``df`` whose values are added to the result.
        df: Optional DataFrame. It is ignored when ``None`` or when it has no
            ``df_column_name`` column.

    Returns:
        A set of strings: every value of the database column and, when
        applicable, every value of ``df[df_column_name]`` converted with ``astype(str)``.
    """
    with engine.connect() as connection:
        id_column = metadata.tables[table_name].c[column_name]
        rows = connection.execute(select(id_column)).fetchall()
        known_ids = set()
        for row in rows:
            known_ids.add(str(row[0]))

    # Nothing to merge in: return the database IDs only.
    if df is None or df_column_name not in df.columns:
        return known_ids

    known_ids.update(df[df_column_name].astype(str))
    return known_ids


In [35]:
import pandas as pd
# ACCOUNT
# Load the account export, remove the accounts already stored in the database and
# rewrite the CSV with the remaining rows.
df_ac = pd.read_csv('../Data/Account.csv')
existing_account_ids = get_existing_ids('Account', 'Account_ID', 'crm_Account_Account', df_ac)
# Every account ID seen in the CSV or the database; later sections filter their
# account foreign keys against this set.
unique_account = set(df_ac['crm_Account_Account']).union(existing_account_ids)
df_ac = remove_duplicate_ids(df_ac, 'Account', "Account_ID", 'crm_Account_Account', unique_account)
# The CSV is overwritten below, so on a second run this column is already gone;
# errors='ignore' keeps the cell re-runnable instead of raising KeyError.
df_ac = df_ac.drop(columns=['crm_Account_Hoofd_NaCe_Code'], errors='ignore')
df_ac.to_csv('../Data/Account.csv', index=False)

# AFSPRAKEN
# Keep only the appointments whose ID is not yet in the Afspraak_Alle table and
# write them back over the source CSV.
df_all = pd.read_csv("../Data/Afspraak alle.csv")
existing_afspraak_ids = get_existing_ids(
    table_name="Afspraak_Alle",
    column_name="Afspraak_ID",
    df_column_name="crm_Afspraak_ALLE_Afspraak",
    df=df_all,
)
# IDs from the CSV plus the database; reused further down to filter appointment references.
unique_afspraak = set(df_all["crm_Afspraak_ALLE_Afspraak"]) | existing_afspraak_ids
df_all = remove_duplicate_ids(
    df=df_all,
    table_name="Afspraak_Alle",
    column_name="Afspraak_ID",
    df_column_name="crm_Afspraak_ALLE_Afspraak",
    unique_set=unique_afspraak,
)
df_all.to_csv("../Data/Afspraak alle.csv", index=False)

# CAMPAGNES
# Remove campaigns already stored in the Campagne table, parse the start/end dates
# and rewrite the CSV.
df_cam = pd.read_csv("../Data/Campagne.csv")
existing_campaign_ids = get_existing_ids(
    table_name="Campagne",
    column_name="Campagne_ID",
    df_column_name="crm_Campagne_Campagne",
    df=df_cam,
)
# IDs from the CSV plus the database; reused further down to filter campaign references.
unique_campaign = set(df_cam["crm_Campagne_Campagne"]) | existing_campaign_ids
df_cam = remove_duplicate_ids(
    df=df_cam,
    table_name="Campagne",
    column_name="Campagne_ID",
    df_column_name="crm_Campagne_Campagne",
    unique_set=unique_campaign,
)
# Both date columns are parsed with pandas' default format inference.
for campaign_date_column in ("crm_Campagne_Einddatum", "crm_Campagne_Startdatum"):
    df_cam[campaign_date_column] = pd.to_datetime(df_cam[campaign_date_column])
df_cam.to_csv("../Data/Campagne.csv", index=False)

# MAILS
# Remove mailings already stored in the CDI_Mailing table and rewrite the CSV.
df_mail = pd.read_csv("../Data/CDI mailing.csv")
existing_mail_ids = get_existing_ids('CDI_Mailing', "Mailing_ID", "crm_CDI_Mailing_Mailing", df_mail)
# IDs from the CSV plus the database; reused further down to filter mailing references.
unique_mail = set(df_mail["crm_CDI_Mailing_Mailing"]).union(existing_mail_ids)
df_mail = remove_duplicate_ids(df_mail, 'CDI_Mailing', "Mailing_ID", "crm_CDI_Mailing_Mailing", unique_mail)
# Write back to the mailing file itself. The previous target was Campagne.csv, which
# overwrote the campaign data saved by the section above.
df_mail.to_csv("../Data/CDI mailing.csv", index=False)

# PERSONEN
# Remove persons already stored in the Persoon table, drop the mailing-preference and
# web-login columns, and rewrite the CSV.
df_per = pd.read_csv('../Data/Persoon.csv')
existing_persoon_ids = get_existing_ids('Persoon', "Persoon_ID", "crm_Persoon_persoon", df_per)
# IDs from the CSV plus the database; reused further down to filter person references.
unique_persoon = set(df_per["crm_Persoon_persoon"]).union(existing_persoon_ids)
df_per = remove_duplicate_ids(df_per, 'Persoon', "Persoon_ID", "crm_Persoon_persoon", unique_persoon)

# Columns that are not kept in the cleaned file, grouped by prefix.
persoon_columns_to_drop = [
    # mail region flags
    'crm_Persoon_Mail_regio_Antwerpen_Waasland',
    'crm_Persoon_Mail_regio_Brussel_Hoofdstedelijk_Gewest',
    'crm_Persoon_Mail_regio_Limburg',
    'crm_Persoon_Mail_regio_Mechelen_Kempen',
    'crm_Persoon_Mail_regio_Oost_Vlaanderen',
    'crm_Persoon_Mail_regio_Vlaams_Brabant',
    'crm_Persoon_Mail_regio_Voka_nationaal',
    'crm_Persoon_Mail_regio_West_Vlaanderen',
    # mail theme flags
    'crm_Persoon_Mail_thema_duurzaamheid',
    'crm_Persoon_Mail_thema_financieel_fiscaal',
    'crm_Persoon_Mail_thema_innovatie',
    'crm_Persoon_Mail_thema_internationaal_ondernemen',
    'crm_Persoon_Mail_thema_mobiliteit',
    'crm_Persoon_Mail_thema_omgeving',
    'crm_Persoon_Mail_thema_sales_marketing_communicatie',
    'crm_Persoon_Mail_thema_strategie_en_algemeen_management',
    'crm_Persoon_Mail_thema_talent',
    'crm_Persoon_Mail_thema_welzijn',
    # mail type flags
    'crm_Persoon_Mail_type_Bevraging',
    'crm_Persoon_Mail_type_communities_en_projecten',
    'crm_Persoon_Mail_type_netwerkevenementen',
    'crm_Persoon_Mail_type_nieuwsbrieven',
    'crm_Persoon_Mail_type_opleidingen',
    'crm_Persoon_Mail_type_persberichten_belangrijke_meldingen',
    # web login
    'crm_Persoon_Web_Login',
]
# The CSV is overwritten below, so on a second run these columns are already gone;
# errors='ignore' keeps the cell re-runnable instead of raising KeyError.
df_per = df_per.drop(columns=persoon_columns_to_drop, errors='ignore')
df_per.to_csv('../Data/Persoon.csv', index=False)

# CONTACTEN
# Remove contact records already stored in the Contactfiche table, then keep only the
# rows whose account and person are both known, and rewrite the CSV.
df_con = pd.read_csv("../Data/Contact.csv")
existing_contact_ids = get_existing_ids(
    table_name="Contactfiche",
    column_name="Contactfiche_ID",
    df_column_name="crm_Contact_Contactpersoon",
    df=df_con,
)
# IDs from the CSV plus the database; reused further down to filter contact references.
unique_contact = set(df_con["crm_Contact_Contactpersoon"]) | existing_contact_ids
df_con = remove_duplicate_ids(
    df=df_con,
    table_name="Contactfiche",
    column_name="Contactfiche_ID",
    df_column_name="crm_Contact_Contactpersoon",
    unique_set=unique_contact,
)
# Foreign-key filter: the account and the person must both be known.
contact_has_account = df_con["crm_Contact_Account"].isin(unique_account)
contact_has_person = df_con["crm_Contact_Persoon_ID"].isin(unique_persoon)
df_con = df_con[contact_has_account & contact_has_person]
df_con.to_csv("../Data/Contact.csv", index=False)

# FUNCTIES
# Remove functions already stored in the Functie table and rewrite the CSV.
df_func = pd.read_csv("../Data/Functie.csv")
# Pass df_func, the frame that actually holds crm_Functie_Functie. The person frame was
# passed here before, so the CSV's function IDs were silently left out of this set.
existing_functie_ids = get_existing_ids('Functie', "Functie_ID", "crm_Functie_Functie", df_func)
# IDs from the CSV plus the database; reused further down to filter function references.
unique_functie = set(df_func["crm_Functie_Functie"]).union(existing_functie_ids)
df_func = remove_duplicate_ids(df_func, 'Functie', "Functie_ID", "crm_Functie_Functie", unique_functie)
df_func.to_csv("../Data/Functie.csv", index=False)

# GEBRUIKERS
# Remove users already stored in the Gebruiker table and rewrite the CSV.
df_geb = pd.read_csv("../Data/Gebruikers.csv")
existing_gebruiker_ids = get_existing_ids('Gebruiker', "Gebruiker_ID", 'crm_Gebruikers_CRM_User_ID', df_geb)
# IDs from the CSV plus the database; reused further down to filter owner references.
unique_gebruiker = set(df_geb['crm_Gebruikers_CRM_User_ID']).union(existing_gebruiker_ids)
df_geb = remove_duplicate_ids(df_geb, 'Gebruiker', "Gebruiker_ID", 'crm_Gebruikers_CRM_User_ID', unique_gebruiker)
# Write back to the users file itself. The previous target was Functie.csv, which
# overwrote the function data saved by the section above.
df_geb.to_csv("../Data/Gebruikers.csv", index=False)

# SESSIE_INSCHRIJVING
# Load the session-registration links and remove the ones already stored in the
# database. The frame is filtered further and written out in a later section.
df_ses_in = pd.read_csv("../Data/Sessie inschrijving.csv")
existing_sesin_ids = get_existing_ids(
    table_name="SessieInschrijving",
    column_name="SessieInschrijving_ID",
    df_column_name="crm_SessieInschrijving_SessieInschrijving",
    df=df_ses_in,
)
unique_ses_in = set(df_ses_in["crm_SessieInschrijving_SessieInschrijving"]) | existing_sesin_ids
df_ses_in = remove_duplicate_ids(
    df=df_ses_in,
    table_name="SessieInschrijving",
    column_name="SessieInschrijving_ID",
    df_column_name="crm_SessieInschrijving_SessieInschrijving",
    unique_set=unique_ses_in,
)

# SESSIES
# Remove sessions already stored in the Sessie table, keep only those that belong to a
# known campaign, parse the start/end timestamps and rewrite the CSV.
df_ses = pd.read_csv("../Data/Sessie.csv")
existing_ses_ids = get_existing_ids(
    table_name="Sessie",
    column_name="Sessie_ID",
    df_column_name="crm_Sessie_Sessie",
    df=df_ses,
)
# IDs from the CSV plus the database; reused further down to filter session references.
unique_sessie = set(df_ses["crm_Sessie_Sessie"]) | existing_ses_ids
df_ses = remove_duplicate_ids(
    df=df_ses,
    table_name="Sessie",
    column_name="Sessie_ID",
    df_column_name="crm_Sessie_Sessie",
    unique_set=unique_sessie,
)
session_has_campaign = df_ses["crm_Sessie_Campagne"].isin(unique_campaign)
df_ses = df_ses[session_has_campaign]
# Timestamps are parsed with an explicit day-month-year format.
for session_time_column in ("crm_Sessie_Eind_Datum_Tijd", "crm_Sessie_Start_Datum_Tijd"):
    df_ses[session_time_column] = pd.to_datetime(df_ses[session_time_column], format="%d-%m-%Y %H:%M:%S")
df_ses.to_csv("../Data/Sessie.csv", index=False)

# INSCHRIJVINGEN
# Remove registrations already stored in the Inschrijving table, attach the campaign to
# each registration when the CSV does not carry it yet, keep only rows with a known
# contact and campaign, parse the registration date and rewrite the CSV.
df_insc = pd.read_csv("../Data/Inschrijving.csv")
existing_insc_ids = get_existing_ids('Inschrijving', "Inschrijving_ID", "crm_Inschrijving_Inschrijving", df_insc)
# IDs from the CSV plus the database; reused further down to filter registration references.
unique_inschrijving = set(df_insc["crm_Inschrijving_Inschrijving"]).union(existing_insc_ids)
df_insc = remove_duplicate_ids(df_insc, 'Inschrijving', "Inschrijving_ID", "crm_Inschrijving_Inschrijving", unique_inschrijving)

# The campaign columns are missing on a raw export; a CSV rewritten by an earlier run
# already has them, in which case the joins are skipped.
if 'crm_Inschrijving_Campagne' not in df_insc.columns:
    # NOTE(review): df_ses_in, df_ses and df_cam were already reduced by
    # remove_duplicate_ids above, so these inner joins also drop registrations whose
    # session or campaign is already in the database - confirm this is intended.

    # Registration -> session-registration link
    df_merged = pd.merge(df_insc, df_ses_in, how='inner',
                         left_on='crm_Inschrijving_Inschrijving',
                         right_on='crm_SessieInschrijving_Inschrijving')

    # -> session
    df_merged = pd.merge(df_merged, df_ses, how='inner',
                         left_on='crm_SessieInschrijving_Sessie',
                         right_on='crm_Sessie_Sessie')

    # -> campaign
    df_merged = pd.merge(df_merged, df_cam, how='inner',
                         left_on='crm_Sessie_Campagne',
                         right_on='crm_Campagne_Campagne')

    # rename() without inplace returns a new frame, so no slice of df_merged is mutated
    # (the in-place rename on a column selection triggered SettingWithCopyWarning).
    df_insc = df_merged[['crm_Inschrijving_Aanwezig_Afwezig',
                         'crm_Inschrijving_Bron',
                         'crm_Inschrijving_Contactfiche',
                         'crm_Inschrijving_Datum_inschrijving',
                         'crm_Inschrijving_Inschrijving',
                         'crm_Inschrijving_Facturatie_Bedrag',
                         'crm_Campagne_Campagne',
                         'crm_Campagne_Naam'
                         ]].rename(columns={'crm_Campagne_Campagne': "crm_Inschrijving_Campagne",
                                            'crm_Campagne_Naam': 'crm_Inschrijving_Campagne_Naam_'})

# Foreign-key filter: the contact and the campaign must both be known.
df_insc = df_insc[df_insc['crm_Inschrijving_Contactfiche'].isin(unique_contact)]
# .copy() detaches the filtered result so the column assignment below writes to an
# independent frame instead of a slice.
df_insc = df_insc[df_insc["crm_Inschrijving_Campagne"].isin(unique_campaign)].copy()
df_insc["crm_Inschrijving_Datum_inschrijving"] = pd.to_datetime(df_insc["crm_Inschrijving_Datum_inschrijving"])

df_insc.to_csv("../Data/Inschrijving.csv", index=False)

# SESSIE_INSCHRIJVING
# Second pass over the links loaded earlier: keep only rows that point at a known
# session and a known registration, then rewrite the CSV.
link_has_session = df_ses_in["crm_SessieInschrijving_Sessie"].isin(unique_sessie)
link_has_registration = df_ses_in["crm_SessieInschrijving_Inschrijving"].isin(unique_inschrijving)
df_ses_in = df_ses_in[link_has_session & link_has_registration]
df_ses_in.to_csv("../Data/Sessie inschrijving.csv", index=False)

# ACTIVITEITSCODES
# Remove activity codes already stored in the Activiteitscode table, drop fully
# identical rows and rewrite the CSV.
df_acti = pd.read_csv("../Data/Activiteitscode.csv")
existing_acti_ids = get_existing_ids(
    table_name="Activiteitscode",
    column_name="Activiteitscode_ID",
    df_column_name="crm_ActiviteitsCode_Activiteitscode",
    df=df_acti,
)
# IDs from the CSV plus the database; reused further down as a filter set.
unique_activiteit = set(df_acti["crm_ActiviteitsCode_Activiteitscode"]) | existing_acti_ids
df_acti = remove_duplicate_ids(
    df=df_acti,
    table_name="Activiteitscode",
    column_name="Activiteitscode_ID",
    df_column_name="crm_ActiviteitsCode_Activiteitscode",
    unique_set=unique_activiteit,
)
df_acti = df_acti.drop_duplicates(keep="first")
df_acti.to_csv("../Data/Activiteitscode.csv", index=False)

# ACCOUNT ACTIVITEITSCODE
# Remove rows whose activity code already appears in the Account_ActiviteitsCode table,
# drop rows with any missing value, keep only rows for known accounts and rewrite the CSV.
# NOTE(review): de-duplication looks at the activity code alone. If this table links
# accounts to codes, every row using a code that is already stored is removed,
# whatever its account - confirm this is intended.
df_aac = pd.read_csv("../Data/Account activiteitscode.csv")
existing_aac_ids = get_existing_ids(
    table_name="Account_ActiviteitsCode",
    column_name="Activiteitscode_ID",
    df_column_name="crm_Account_ActiviteitsCode_Activiteitscode",
    df=df_aac,
)
unique_acc_act = set(df_aac["crm_Account_ActiviteitsCode_Activiteitscode"]) | existing_aac_ids
df_aac = remove_duplicate_ids(
    df=df_aac,
    table_name="Account_ActiviteitsCode",
    column_name="Activiteitscode_ID",
    df_column_name="crm_Account_ActiviteitsCode_Activiteitscode",
    unique_set=unique_acc_act,
)
df_aac = df_aac.dropna()
link_has_account = df_aac["crm_Account_ActiviteitsCode_Account"].isin(unique_account)
df_aac = df_aac[link_has_account]
df_aac.to_csv("../Data/Account activiteitscode.csv", index=False)

# ACCOUNT FINANCIELE DATA
# Remove rows whose company ID already appears in the Account_Financiele_Data table,
# keep only rows for known accounts, keep one row per (company, financial year) and
# rewrite the CSV.
# NOTE(review): the database comparison uses the company ID alone while the in-file
# de-duplication uses (company, year); a new year for a stored company is therefore
# removed - confirm this is intended.
df_fd = pd.read_csv("../Data/Account financiële data.csv")
existing_fd_ids = get_existing_ids(
    table_name="Account_Financiele_Data",
    column_name="Account_ID",
    df_column_name="crm_FinancieleData_OndernemingID",
    df=df_fd,
)
unique_fd = set(df_fd["crm_FinancieleData_OndernemingID"]) | existing_fd_ids
df_fd = remove_duplicate_ids(
    df=df_fd,
    table_name="Account_Financiele_Data",
    column_name="Account_ID",
    df_column_name="crm_FinancieleData_OndernemingID",
    unique_set=unique_fd,
)
financials_have_account = df_fd["crm_FinancieleData_OndernemingID"].isin(unique_account)
df_fd = df_fd[financials_have_account]
financials_key = ["crm_FinancieleData_OndernemingID", "crm_FinancieleData_Boekjaar"]
df_fd = df_fd.drop_duplicates(subset=financials_key, keep="first")
df_fd.to_csv("../Data/Account financiële data.csv", index=False)

# PAGEVIEW
# Remove page views already stored in the CDI_PageView table, fill missing durations
# with the mean, drop unused columns, keep only rows with a known contact and campaign,
# parse the timestamp columns and rewrite the CSV.
# NOTE(review): the recorded output echoes this read_csv line as a warning, presumably a
# mixed-dtype DtypeWarning - consider explicit dtypes once the offending column is known.
df_pa = pd.read_csv("../Data/cdi pageviews.csv")
existing_pa_ids = get_existing_ids('CDI_PageView', "PageView_ID", "crm_CDI_PageView_Page_View", df_pa)
unique_pa = set(df_pa['crm_CDI_PageView_Page_View']).union(existing_pa_ids)
df_pa = remove_duplicate_ids(df_pa, 'CDI_PageView', "PageView_ID", "crm_CDI_PageView_Page_View", unique_pa)

# Assign the filled column back instead of calling fillna(inplace=True) on the selected
# column: the chained in-place form is deprecated and has no effect under copy-on-write.
mean_duration = df_pa["crm_CDI_PageView_Duration"].mean()
df_pa["crm_CDI_PageView_Duration"] = df_pa["crm_CDI_PageView_Duration"].fillna(mean_duration)

df_pa = df_pa.drop(columns=['crm_CDI_PageView_Anonymous_Visitor',
                            'crm_CDI_PageView_Url',
                            'crm_CDI_PageView_Web_Content'])

# Foreign-key filter: the contact and the campaign must both be known.
df_pa = df_pa[df_pa['crm_CDI_PageView_Contact'].isin(unique_contact)]
# .copy() detaches the filtered result so the assignments below write to an independent frame.
df_pa = df_pa[df_pa['crm_CDI_PageView_Campaign'].isin(unique_campaign)].copy()

# regex=False removes the literal text "(UTC)". As a regular expression the parentheses
# would form a group and only "UTC" would be removed, leaving "()" in the value.
df_pa["crm_CDI_PageView_Time"] = pd.to_datetime(df_pa["crm_CDI_PageView_Time"].str.replace('(UTC)', '', regex=False))
df_pa["crm_CDI_PageView_Viewed_On"] = pd.to_datetime(df_pa["crm_CDI_PageView_Viewed_On"], format="%d-%m-%Y %H:%M:%S")
df_pa["crm_CDI_PageView_Aangemaakt_op"] = pd.to_datetime(df_pa["crm_CDI_PageView_Aangemaakt_op"], format="%d-%m-%Y %H:%M:%S")
df_pa["crm_CDI_PageView_Gewijzigd_op"] = pd.to_datetime(df_pa["crm_CDI_PageView_Gewijzigd_op"], format="%d-%m-%Y %H:%M:%S")

df_pa.to_csv("../Data/cdi pageviews.csv", index=False)


# VISITS
# Remove visits already stored in the CDI_Visits table, drop unused columns, keep only
# rows with a known contact, mailing and campaign, parse the timestamp columns and
# rewrite the CSV.
df_vi = pd.read_csv("../Data/CDI visits.csv")
existing_vi_ids = get_existing_ids('CDI_Visits', "Visit_ID", "crm_CDI_Visit_Visit", df_vi)
unique_vi = set(df_vi['crm_CDI_Visit_Visit']).union(existing_vi_ids)
df_vi = remove_duplicate_ids(df_vi, 'CDI_Visits', "Visit_ID", "crm_CDI_Visit_Visit", unique_vi)

df_vi = df_vi.drop(columns=["crm_CDI_Visit_Browser",
                            "crm_CDI_Visit_Campagne_Code",
                            "crm_CDI_Visit_Referring_Host",
                            "crm_CDI_Visit_Bounce",
                            "crm_CDI_Visit_Adobe_Reader",
                            "crm_CDI_Visit_containssocialprofile",
                            "crm_CDI_Visit_IP_Company",
                            "crm_CDI_Visit_Entry_Page",
                            "crm_CDI_Visit_Exit_Page",
                            "crm_CDI_Visit_Referrer",
                            "crm_CDI_Visit_Total_Pages",
                            "crm_CDI_Visit_Keywords"])

# Foreign-key filter: contact, mailing and campaign must all be known.
df_vi = df_vi[df_vi["crm_CDI_Visit_Contact"].isin(unique_contact)]
df_vi = df_vi[df_vi["crm_CDI_Visit_Email_Send"].isin(unique_mail)]
# .copy() detaches the filtered result so the assignments below write to an independent frame.
df_vi = df_vi[df_vi["crm_CDI_Visit_Campaign"].isin(unique_campaign)].copy()

# regex=False removes the literal text "(UTC)". As a regular expression the parentheses
# would form a group and only "UTC" would be removed, leaving "()" in the value.
df_vi["crm_CDI_Visit_Time"] = pd.to_datetime(df_vi["crm_CDI_Visit_Time"].str.replace('(UTC)', '', regex=False))
df_vi["crm_CDI_Visit_Aangemaakt_op"] = pd.to_datetime(df_vi["crm_CDI_Visit_Aangemaakt_op"], format="%d-%m-%Y %H:%M:%S")
df_vi["crm_CDI_Visit_Gewijzigd_op"] = pd.to_datetime(df_vi["crm_CDI_Visit_Gewijzigd_op"], format="%d-%m-%Y %H:%M:%S")
df_vi["crm_CDI_Visit_Ended_On"] = pd.to_datetime(df_vi["crm_CDI_Visit_Ended_On"], format="%d-%m-%Y %H:%M:%S")
df_vi["crm_CDI_Visit_Started_On"] = pd.to_datetime(df_vi["crm_CDI_Visit_Started_On"], format="%d-%m-%Y %H:%M:%S")

df_vi.to_csv("../Data/CDI visits.csv", index=False)

# AFSPRAAK ACCOUNT GELINKT
# Remove rows whose appointment ID already appears in the Afspraak_Account_Gelinkt table,
# keep only rows that reference a known appointment and rewrite the CSV.
df_gc = pd.read_csv("../Data/Afspraak_account_gelinkt_cleaned.csv")
existing_gc_ids = get_existing_ids(
    table_name="Afspraak_Account_Gelinkt",
    column_name="Afspraak_ID",
    df_column_name="crm_Afspraak_ACCOUNT_GELINKT_Afspraak",
    df=df_gc,
)
unique_gc = set(df_gc["crm_Afspraak_ACCOUNT_GELINKT_Afspraak"]) | existing_gc_ids
df_gc = remove_duplicate_ids(
    df=df_gc,
    table_name="Afspraak_Account_Gelinkt",
    column_name="Afspraak_ID",
    df_column_name="crm_Afspraak_ACCOUNT_GELINKT_Afspraak",
    unique_set=unique_gc,
)

linked_to_known_appointment = df_gc["crm_Afspraak_ACCOUNT_GELINKT_Afspraak"].isin(unique_afspraak)
df_gc = df_gc[linked_to_known_appointment]

df_gc.to_csv("../Data/Afspraak_account_gelinkt_cleaned.csv", index=False)

# AFSPRAAK BETREFT CONTACT
# Remove rows whose appointment ID already appears in the Afspraak_Betreft_Contactfiche
# table, keep only rows with a known contact and appointment, and rewrite the CSV.
# Unlike the other sections, only the CSV's own IDs are used as the candidate set.
df_cc = pd.read_csv("../Data/Afspraak betreft contact_cleaned.csv")
unique_cc = set(df_cc["crm_Afspraak_BETREFT_CONTACTFICHE_Afspraak"])
df_cc = remove_duplicate_ids(
    df=df_cc,
    table_name="Afspraak_Betreft_Contactfiche",
    column_name="Afspraak_ID",
    df_column_name="crm_Afspraak_BETREFT_CONTACTFICHE_Afspraak",
    unique_set=unique_cc,
)

concerns_known_contact = df_cc["crm_Afspraak_BETREFT_CONTACTFICHE_Betreft_id"].isin(unique_contact)
concerns_known_appointment = df_cc["crm_Afspraak_BETREFT_CONTACTFICHE_Afspraak"].isin(unique_afspraak)
df_cc = df_cc[concerns_known_contact & concerns_known_appointment]
df_cc.to_csv("../Data/Afspraak betreft contact_cleaned.csv", index=False)

# ACTIVITEIT VEREIST CONTACT
# Remove rows whose attendee already appears in the Afspraak_Vereist_Contact table,
# keep only rows whose activity ID and attendee are in the known sets, and rewrite the CSV.
# NOTE(review): ActivityId is matched against unique_activiteit (the activity-code IDs).
# Given the table name this may have been meant to be unique_afspraak - confirm.
df_vc = pd.read_csv("../Data/Activiteit vereist contact.csv")
existing_vc_ids = get_existing_ids(
    table_name="Afspraak_Vereist_Contact",
    column_name="Contactfiche_ID",
    df_column_name="crm_ActiviteitVereistContact_ReqAttendee",
    df=df_vc,
)
unique_vc = set(df_vc["crm_ActiviteitVereistContact_ReqAttendee"]) | existing_vc_ids
df_vc = remove_duplicate_ids(
    df=df_vc,
    table_name="Afspraak_Vereist_Contact",
    column_name="Contactfiche_ID",
    df_column_name="crm_ActiviteitVereistContact_ReqAttendee",
    unique_set=unique_vc,
)

activity_is_known = df_vc["crm_ActiviteitVereistContact_ActivityId"].isin(unique_activiteit)
attendee_is_known = df_vc["crm_ActiviteitVereistContact_ReqAttendee"].isin(unique_contact)
df_vc = df_vc[activity_is_known & attendee_is_known]
df_vc.to_csv("../Data/Activiteit vereist contact.csv", index=False)

# AFSPRAAK BETREFT ACCOUNT
# Remove rows whose appointment ID already appears in the Afspraak_Betreft_Account table,
# keep only rows with a known account and appointment, and rewrite the CSV.
df_acc = pd.read_csv("../Data/Afspraak betreft account_cleaned.csv")
existing_acc_ids = get_existing_ids(
    table_name="Afspraak_Betreft_Account",
    column_name="Afspraak_ID",
    df_column_name="crm_Afspraak_BETREFT_ACCOUNT_Afspraak",
    df=df_acc,
)
unique_acc = set(df_acc["crm_Afspraak_BETREFT_ACCOUNT_Afspraak"]) | existing_acc_ids
df_acc = remove_duplicate_ids(
    df=df_acc,
    table_name="Afspraak_Betreft_Account",
    column_name="Afspraak_ID",
    df_column_name="crm_Afspraak_BETREFT_ACCOUNT_Afspraak",
    unique_set=unique_acc,
)

concerns_known_account = df_acc["crm_Afspraak_BETREFT_ACCOUNT_Betreft_id"].isin(unique_account)
is_known_appointment = df_acc["crm_Afspraak_BETREFT_ACCOUNT_Afspraak"].isin(unique_afspraak)
df_acc = df_acc[concerns_known_account & is_known_appointment]
df_acc.to_csv("../Data/Afspraak betreft account_cleaned.csv", index=False)

# CONTACT FUNCTIE
# Remove rows whose contact person already appears in the ContactFunctie table, keep
# only rows with a known function and person, and rewrite the CSV.
df_cf = pd.read_csv("../Data/Contact functie.csv")
existing_cf_ids = get_existing_ids(
    table_name="ContactFunctie",
    column_name="Persoon_ID",
    df_column_name="crm_ContactFunctie_Contactpersoon",
    df=df_cf,
)
unique_cf = set(df_cf["crm_ContactFunctie_Contactpersoon"]) | existing_cf_ids
df_cf = remove_duplicate_ids(
    df=df_cf,
    table_name="ContactFunctie",
    column_name="Persoon_ID",
    df_column_name="crm_ContactFunctie_Contactpersoon",
    unique_set=unique_cf,
)

function_is_known = df_cf["crm_ContactFunctie_Functie"].isin(unique_functie)
person_is_known = df_cf["crm_ContactFunctie_Contactpersoon"].isin(unique_persoon)
df_cf = df_cf[function_is_known & person_is_known]
df_cf.to_csv("../Data/Contact functie.csv", index=False)

# SENT EMAIL
# Remove click rows whose Sent_Email value already appears in the CDI_Sent_Email_Clicks
# table, keep only rows with a known contact and mailing, and rewrite the CSV.
df_sent = pd.read_csv("../Data/CDI sent email clicks.csv")
existing_sent_ids = get_existing_ids(
    table_name="CDI_Sent_Email_Clicks",
    column_name="Sent_Email",
    df_column_name="crm_CDI_SentEmail_kliks_Sent_Email",
    df=df_sent,
)
unique_sent = set(df_sent["crm_CDI_SentEmail_kliks_Sent_Email"]) | existing_sent_ids
df_sent = remove_duplicate_ids(
    df=df_sent,
    table_name="CDI_Sent_Email_Clicks",
    column_name="Sent_Email",
    df_column_name="crm_CDI_SentEmail_kliks_Sent_Email",
    unique_set=unique_sent,
)

click_has_contact = df_sent["crm_CDI_SentEmail_kliks_Contact"].isin(unique_contact)
click_has_mailing = df_sent["crm_CDI_SentEmail_kliks_E_mail_versturen"].isin(unique_mail)
df_sent = df_sent[click_has_contact & click_has_mailing]
df_sent.to_csv("../Data/CDI sent email clicks.csv", index=False)

# INFO EN KLACHTEN
# Remove requests already stored in the Info_en_Klachten table, keep only rows with a
# known owner and account, and rewrite the CSV.
df_info = pd.read_csv("../Data/Info en klachten.csv")
existing_info_ids = get_existing_ids(
    table_name="Info_en_Klachten",
    column_name="Aanvraag",
    df_column_name="crm_Info_en_Klachten_Aanvraag",
    df=df_info,
)
unique_info = set(df_info["crm_Info_en_Klachten_Aanvraag"]) | existing_info_ids
df_info = remove_duplicate_ids(
    df=df_info,
    table_name="Info_en_Klachten",
    column_name="Aanvraag",
    df_column_name="crm_Info_en_Klachten_Aanvraag",
    unique_set=unique_info,
)

owner_is_known = df_info["crm_Info_en_Klachten_Eigenaar"].isin(unique_gebruiker)
account_is_known = df_info["crm_Info_en_Klachten_Account"].isin(unique_account)
df_info = df_info[owner_is_known & account_is_known]
df_info.to_csv("../Data/Info en klachten.csv", index=False)

# LIDMAATSCHAP
# Remove memberships already stored in the Lidmaatschap table, parse the start and
# cancellation dates, keep only rows for known accounts and rewrite the CSV.
df_lid = pd.read_csv("../Data/Lidmaatschap.csv")
# Look up the membership table and column. The Info_en_Klachten table/column were used
# here before, so unrelated request IDs were mixed into the membership ID set.
existing_lid_ids = get_existing_ids('Lidmaatschap', "Lidmaatschap_ID", "crm_Lidmaatschap_Lidmaatschap", df_lid)
unique_lid = set(df_lid['crm_Lidmaatschap_Lidmaatschap']).union(existing_lid_ids)
df_lid = remove_duplicate_ids(df_lid, 'Lidmaatschap', "Lidmaatschap_ID", "crm_Lidmaatschap_Lidmaatschap", unique_lid)

# Dates are parsed with an explicit day-month-year format.
df_lid["crm_Lidmaatschap_Datum_Opzeg"] = pd.to_datetime(df_lid["crm_Lidmaatschap_Datum_Opzeg"], format="%d-%m-%Y")
df_lid["crm_Lidmaatschap_Startdatum"] = pd.to_datetime(df_lid["crm_Lidmaatschap_Startdatum"], format="%d-%m-%Y")
# Foreign-key filter: the member company must be a known account.
df_lid = df_lid[df_lid["crm_Lidmaatschap_Onderneming"].isin(unique_account)]

df_lid.to_csv("../Data/Lidmaatschap.csv", index=False)

# WEB CONTENT
# Remove web-content rows already stored in the CDI_Web_Content table, parse the
# created/modified timestamps, drop fully identical rows and rewrite the CSV.
df_web = pd.read_csv("../Data/CDI web content.csv")
existing_web_ids = get_existing_ids(
    table_name="CDI_Web_Content",
    column_name="WebContent_ID",
    df_column_name="crm_CDI_WebContent_Web_Content",
    df=df_web,
)
unique_web = set(df_web["crm_CDI_WebContent_Web_Content"]) | existing_web_ids
df_web = remove_duplicate_ids(
    df=df_web,
    table_name="CDI_Web_Content",
    column_name="WebContent_ID",
    df_column_name="crm_CDI_WebContent_Web_Content",
    unique_set=unique_web,
)

# Created_On relies on format inference; Modified_On uses an explicit day-month-year format.
created_on = pd.to_datetime(df_web["crm_CDI_WebContent_Created_On"])
df_web["crm_CDI_WebContent_Created_On"] = created_on
modified_on = pd.to_datetime(df_web["crm_CDI_WebContent_Modified_On"], format="%d-%m-%Y %H:%M:%S")
df_web["crm_CDI_WebContent_Modified_On"] = modified_on
df_web = df_web.drop_duplicates(keep="first")

df_web.to_csv("../Data/CDI web content.csv", index=False)


Before (41452, 17) Account
Existing IDs: 290506
Common IDs: 290506
Removing duplicate IDs from Account_ID: 290506
After (0, 17) Account
Before (8265, 1) Afspraak_Alle
Existing IDs: 8265
Common IDs: 8265
Removing duplicate IDs from Afspraak_ID: 8265
After (0, 1) Afspraak_Alle
Before (311, 11) Campagne
Existing IDs: 468
Common IDs: 468
Removing duplicate IDs from Campagne_ID: 468
After (0, 11) Campagne
Before (57, 4) CDI_Mailing
Existing IDs: 917
Common IDs: 917
Removing duplicate IDs from Mailing_ID: 917
After (0, 4) CDI_Mailing
Before (7617, 29) Persoon
Existing IDs: 583614
Common IDs: 583614
Removing duplicate IDs from Persoon_ID: 583614
After (0, 29) Persoon
Before (64107, 6) Contactfiche
Existing IDs: 1173107
Common IDs: 1173107
Removing duplicate IDs from Contactfiche_ID: 1173107
After (2562, 6) Contactfiche
Before (71, 2) Functie
Existing IDs: 146
Common IDs: 146
Removing duplicate IDs from Functie_ID: 146
After (71, 2) Functie
Before (146, 2) Gebruiker
Existing IDs: 146
Common ID

  df_insc["crm_Inschrijving_Datum_inschrijving"] = pd.to_datetime(df_insc["crm_Inschrijving_Datum_inschrijving"])


Before (40, 3) Activiteitscode
Existing IDs: 40
Common IDs: 40
Removing duplicate IDs from Activiteitscode_ID: 40
After (0, 3) Activiteitscode
Before (14692, 3) Account_ActiviteitsCode
Existing IDs: 39
Common IDs: 39
Removing duplicate IDs from Activiteitscode_ID: 39
After (0, 3) Account_ActiviteitsCode
Before (0, 6) Account_Financiele_Data
Existing IDs: 180320
Common IDs: 180320
Removing duplicate IDs from Account_ID: 180320
After (0, 6) Account_Financiele_Data


  df_pa = pd.read_csv("../Data/cdi pageviews.csv")


Before (184191, 21) CDI_PageView
Existing IDs: 1369
Common IDs: 1369
Removing duplicate IDs from PageView_ID: 1369
After (182822, 21) CDI_PageView
Before (62265, 35) CDI_Visits
Existing IDs: 7536
Common IDs: 7536
Removing duplicate IDs from Visit_ID: 7536
After (61969, 35) CDI_Visits
Before (184, 7) Afspraak_Account_Gelinkt
Existing IDs: 2936
Common IDs: 2936
Removing duplicate IDs from Afspraak_ID: 2936
After (174, 7) Afspraak_Account_Gelinkt
Before (204, 7) Afspraak_Betreft_Contactfiche
Existing IDs: 2552
Common IDs: 10
Removing duplicate IDs from Afspraak_ID: 10
After (194, 7) Afspraak_Betreft_Contactfiche
Before (1877191, 2) Afspraak_Vereist_Contact
Existing IDs: 0
Common IDs: 0
After (1877191, 2) Afspraak_Vereist_Contact
Before (204, 7) Afspraak_Betreft_Account
Existing IDs: 4876
Common IDs: 4876
Removing duplicate IDs from Afspraak_ID: 4876
After (201, 7) Afspraak_Betreft_Account
Before (0, 2) ContactFunctie
Existing IDs: 0
Common IDs: 0
After (0, 2) ContactFunctie
Before (41885,