In [69]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import re

### Account cleanen

In [70]:
# Load the account table.
df_account = pd.read_csv('../data_clean/Account_fixed.csv')

# Keep only accounts whose status reason is neither 'Inactief' nor 'Stopzetting'
# and whose address province is Oost-Vlaanderen. The explicit copy makes the
# column assignments below operate on an independent frame instead of a slice.
df_account = df_account[
    ~df_account['account_reden_van_status'].isin(['Inactief', 'Stopzetting'])
    & (df_account['account_adres_provincie'] == 'Oost-Vlaanderen')
].copy()

# Build one lower-cased address string from place + geographic sub-region,
# remove parenthesised tokens made of lowercase letters, dots and hyphens,
# and collapse double spaces.
df_account['account_adres'] = (
    df_account['account_adres_plaats'].str.lower()
    + ' '
    + df_account['account_adres_geografische_subregio'].str.lower()
).str.replace(r'\([a-z.-]+\)', '', regex=True).str.replace('  ', ' ', regex=False)

# Combine company type, company nature and primary activity into one string.
df_account['account_onderneming'] = (
    df_account['account_ondernemingstype'] + ', '
    + df_account['account_ondernemingsaard'] + ', '
    + df_account['account_primaire_activiteit']
)

# Remove 'unknown' placeholders and the separators they leave behind.
# An emptied middle field leaves ', , ', which is collapsed to ONE separator;
# replacing it with '' (as before) glued the neighbouring fields together
# ("a, , c" -> "ac").
df_account['account_onderneming'] = (
    df_account['account_onderneming']
    .str.replace('unknown', '', regex=False)
    .str.replace(', , ', ', ', regex=False)
    .str.strip()
    .str.lower()
    .str.replace(r',$|^,', '', regex=True)   # leading / trailing separator
    .str.replace('&', '', regex=False)
    .str.replace('-', '', regex=False)
    .str.strip()                             # space left after a removed leading comma
)

# Drop the columns that are no longer needed after filtering and combining.
df_account = df_account.drop(columns=[
    'account_industriezone_naam_', 'account_oprichtingsdatum', 'account_reden_van_status',
    'account_status', 'account_voka_nr_', 'account_is_voka_entiteit', 'account_adres_geografische_regio',
    'account_adres_geografische_subregio', 'account_adres_plaats', 'account_adres_postcode',
    'account_adres_provincie', 'account_adres_land', 'account_ondernemingstype',
    'account_ondernemingsaard', 'account_primaire_activiteit',
])

### Contact cleanen

In [71]:
# Load the contacts, keep those whose status is not 'Inactief', normalise the
# free-text job title (lower-case, punctuation removed, double spaces collapsed,
# trimmed) and drop the two columns that are not used afterwards.
df_contact = (
    pd.read_csv('../data_clean/Contact_fixed.csv')
    .loc[lambda frame: frame['contact_status'] != 'Inactief']
    .assign(
        contact_functietitel=lambda frame: (
            frame['contact_functietitel']
            .str.lower()
            .str.replace(r'[^\w\s]', '', regex=True)
            .str.replace('  ', ' ')
            .str.strip()
        )
    )
    .drop(columns=['contact_status', 'contact_voka_medewerker'])
)

### Mergen van Account en Contact zodat alleen de contacten van Oost-Vlaanderen overblijven

We gebruiken een inner join; anders blijven er 11000 accounts zonder contactpersoon over.

In [72]:
# Inner-join accounts with their contacts on the account id and discard the
# contact-side key column, which duplicates account_account_id after the join.
df_account_contact = pd.merge(
    df_account,
    df_contact,
    how='inner',
    left_on='account_account_id',
    right_on='contact_account',
).drop(columns=['contact_account'])

In [73]:
def print_nunique(df, acc_col='account_account_id', cont_col='contact_contactpersoon_id'):
    """Print the number of distinct account ids and contact ids in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.
    acc_col : str
        Name of the column holding the account ids.
    cont_col : str
        Name of the column holding the contact ids.

    A column that is absent from ``df`` is reported as ``'niet gevonden'``
    instead of raising. Nothing is returned; the counts are printed.
    """
    def _nunique_or_placeholder(column):
        # Only a missing column is tolerated; any other error (or a keyboard
        # interrupt) must surface instead of being swallowed by a bare except.
        try:
            return df[column].nunique()
        except KeyError:
            return 'niet gevonden'

    con_un = _nunique_or_placeholder(cont_col)
    acc_un = _nunique_or_placeholder(acc_col)

    print(f'#account: {acc_un}, #contact: {con_un}')

In [74]:
# Print how many distinct account and contact ids remain after the account/contact merge.
print_nunique(df_account_contact)

#account: 30716, #contact: 67299


##### Persoon cleanen

In [75]:
# df_persoon = pd.read_csv('../data_clean/Persoon_fixed.csv')

# df_persoon = df_persoon[df_persoon['persoon_mail_regio_oost_vlaanderen'] == 1]
# df_persoon = df_persoon[df_persoon['persoon_reden_van_status'] == 'Actief']

# for col in df_persoon.columns:
#     if col.__contains__('persoon_mail_regio'):
#         df_persoon.drop(col, axis=1, inplace=True)

# df_persoon.drop(['persoon_persoonsnr_', 'persoon_web_login', 'persoon_reden_van_status'], axis=1, inplace=True)

# df_persoon.shape

### Account_contact mergen met persoon

De Persoon-tabel wordt niet meer gebruikt: er zijn maar 4374 non-null waarden.

In [76]:
# df_account_contact_persoon = df_account_contact.merge(df_persoon, left_on='contact_persoon_id', right_on='persoon_persoon_id', how='left')
# df_account_contact_persoon.drop(['contact_persoon_id'], axis=1, inplace=True)
# print(df_account_contact_persoon.shape)
# df_account_contact_persoon.info()
# The person table is not merged in, so its foreign key on the contact is dropped.
df_account_contact = df_account_contact.drop(columns=['contact_persoon_id'])

### Account_contact mergen met contactfunctie en functie

In [77]:
# Load the contact-to-function link table and the function lookup table.
df_contact_functie = pd.read_csv('../data_clean/Contact_functie_fixed.csv')
df_functie = pd.read_csv('../data_clean/Functie_fixed.csv')

# Inner-join the link table onto the contacts and drop its copy of the contact id.
df_account_contact = pd.merge(
    df_account_contact,
    df_contact_functie,
    how='inner',
    left_on='contact_contactpersoon_id',
    right_on='contactfunctie_contactpersoon',
).drop(columns=['contactfunctie_contactpersoon'])

Merge met functie

In [78]:
# Resolve each function id to its row in the function table (inner join) and
# drop both id columns once the join is done.
df_account_contact = pd.merge(
    df_account_contact,
    df_functie,
    how='inner',
    left_on='contactfunctie_functie',
    right_on='functie_functie_id',
).drop(columns=['contactfunctie_functie', 'functie_functie_id'])
df_account_contact.shape

(90938, 6)

In [79]:
# Print the distinct account/contact counts after joining the function tables.
print_nunique(df_account_contact)

#account: 30707, #contact: 67225


##### Functie naam kan verschillend zijn voor hetzelfde contact, dus die mergen we

In [80]:
# Custom aggregation used to merge the 'functie_naam' values of one group.
def merge_functie_naam(series):
    """Join the function names of one group into a single ``', '``-separated string.

    Missing values are skipped and the remaining values are converted to ``str``,
    so a group containing NaN no longer makes ``str.join`` raise ``TypeError``.

    Parameters
    ----------
    series : pandas.Series or other list-like
        The values to combine.

    Returns
    -------
    str
        The joined names; an empty string when nothing is left to join.
    """
    return ', '.join(pd.Series(series).dropna().astype(str))

# Collapse to one row per (account, contact): the function names are merged with
# merge_functie_naam, every other column keeps its first value. 'functie_naam'
# stays the first key of the spec so the resulting column order is unchanged.
functie_agg = {'functie_naam': merge_functie_naam}
for col in df_account_contact.columns:
    functie_agg.setdefault(col, 'first')

df_account_contact = (
    df_account_contact
    .groupby(['account_account_id', 'contact_contactpersoon_id'], as_index=False)
    .agg(functie_agg)
)

# Normalise the merged names: lower-case, punctuation removed, double spaces
# collapsed, surrounding whitespace trimmed.
df_account_contact['functie_naam'] = (
    df_account_contact['functie_naam']
    .str.lower()
    .str.replace(r'[^\w\s]', '', regex=True)
    .str.replace('  ', ' ')
    .str.strip()
)

df_account_contact.shape

(67225, 6)

In [81]:
# Print the distinct account/contact counts after collapsing to one row per contact.
print_nunique(df_account_contact)

#account: 30707, #contact: 67225


### Account_contact mergen met account_activiteitscode en activiteitscode

In [82]:
# Load the account-to-activity-code link table and the activity-code lookup table.
df_account_activiteitscode = pd.read_csv('../data_clean/Account_activiteitscode_fixed.csv')
df_activiteitscode = pd.read_csv('../data_clean/Activiteitscode_fixed.csv')

# Resolve every linked activity code to its lookup row (inner join).
df_activiteit = pd.merge(
    df_account_activiteitscode,
    df_activiteitscode,
    how='inner',
    left_on='account_activiteitscode_activiteitscode',
    right_on='activiteitscode_activiteitscode_id',
)

# Keep only activity codes with status 'Actief', then drop the id/status columns.
df_activiteit = df_activiteit[df_activiteit['activiteitscode_status'] == 'Actief'].drop(
    columns=[
        'account_activiteitscode_activiteitscode',
        'activiteitscode_activiteitscode_id',
        'activiteitscode_status',
        'account_activiteitscode_inf_account_inf_activiteitscodeid',
    ]
)

activiteitscode naam combineren waar account niet uniek is

In [83]:
# Where account_activiteitscode_account is not unique, the activity-code names
# of that account are combined with this aggregation.
def merge_activiteitscode_naam(series):
    """Join the activity-code names of one group into a ``', '``-separated string.

    Missing values are skipped and the remaining values are converted to ``str``,
    so a group containing NaN no longer makes ``str.join`` raise ``TypeError``.

    Parameters
    ----------
    series : pandas.Series or other list-like
        The values to combine.

    Returns
    -------
    str
        The joined names; an empty string when nothing is left to join.
    """
    return ', '.join(pd.Series(series).dropna().astype(str))

# One row per account: activity-code names are merged, every other column keeps
# its first value. 'activiteitscode_naam' stays the first key of the spec so the
# resulting column order is unchanged.
activiteit_agg = {'activiteitscode_naam': merge_activiteitscode_naam}
for col in df_activiteit.columns:
    activiteit_agg.setdefault(col, 'first')

df_activiteit = (
    df_activiteit
    .groupby(['account_activiteitscode_account'], as_index=False)
    .agg(activiteit_agg)
)

# Drop the connectors ' en ' / ' & ' and hyphens, collapse double spaces,
# lower-case and trim the merged names.
df_activiteit['activiteitscode_naam'] = (
    df_activiteit['activiteitscode_naam']
    .str.replace(' en ', ' ')
    .str.replace(' & ', ' ')
    .str.replace('-', '')
    .str.replace('  ', ' ')
    .str.lower()
    .str.strip()
)

print(df_activiteit.shape)

(13408, 2)


mergen met account_contact

In [84]:
# Left-join the per-account activity names so accounts without an activity code
# are kept (with NaN), then drop the activity table's copy of the account id.
df_account_contact = pd.merge(
    df_account_contact,
    df_activiteit,
    how='left',
    left_on='account_account_id',
    right_on='account_activiteitscode_account',
).drop(columns=['account_activiteitscode_account'])
df_account_contact.shape

(67225, 7)

In [85]:
# Print the distinct account/contact counts after the left join with the activity codes.
print_nunique(df_account_contact)

#account: 30707, #contact: 67225


##### Maar 16160 van de 67225 activiteitscodes zijn non-null

In [86]:
# Show the dtype and non-null count of every column of the merged frame.
df_account_contact.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 67225 entries, 0 to 67224
Data columns (total 7 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   functie_naam               67225 non-null  object
 1   account_account_id         67225 non-null  object
 2   account_adres              67225 non-null  object
 3   account_onderneming        67225 non-null  object
 4   contact_contactpersoon_id  67225 non-null  object
 5   contact_functietitel       67225 non-null  object
 6   activiteitscode_naam       16160 non-null  object
dtypes: object(7)
memory usage: 3.6+ MB


### Account financiele data mergen met account_contact

In [87]:
# Load the financial records, drop the unused columns and force the added-value
# column to text so it can be cleaned with string operations after the merge.
df_account_financieel = (
    pd.read_csv('../data_clean/Account_financiële_data_fixed.csv', sep=',')
    .drop(columns=['financieledata_gewijzigd_op', 'financieledata_fte', 'financieledata_aantal_maanden'])
    .astype({'financieledata_toegevoegde_waarde': str})
)

# Left-join the financial records onto every account/contact row.
df_account_contact_finance = pd.merge(
    df_account_contact,
    df_account_financieel,
    how='left',
    left_on='account_account_id',
    right_on='financieledata_ondernemingid',
).drop(columns=['financieledata_ondernemingid'])

# Turn financieledata_toegevoegde_waarde into a number: decimal comma -> dot,
# the 'unknown' placeholder -> 0.
df_account_contact_finance['financieledata_toegevoegde_waarde'] = (
    df_account_contact_finance['financieledata_toegevoegde_waarde']
    .str.replace(',', '.')
    .str.replace('unknown', '0')
    .astype(float)
)

# Per (account, contact): sum the added value and reduce the book years to their
# span (e.g. 2007 - 2022 => 15 years); every other column keeps its first value.
finance_agg = {
    'financieledata_toegevoegde_waarde': 'sum',
    'financieledata_boekjaar': lambda years: years.max() - years.min(),
}
for col in df_account_contact_finance.columns:
    finance_agg.setdefault(col, 'first')

df_account_contact_finance = (
    df_account_contact_finance
    .groupby(['account_account_id', 'contact_contactpersoon_id'], as_index=False)
    .agg(finance_agg)
    .rename(columns={'financieledata_boekjaar': 'financieledata_boekjaar_range'})
)

##### Alle niet meer benodigde dataframes verwijderen

In [88]:
# Free memory: remove the intermediate frames that later cells no longer use.
# Every name is removed independently, so one frame that is already gone no
# longer stops the remaining ones from being deleted, and no unrelated error
# is hidden behind a bare `except`.
_obsolete_frames = [
    'df_account',
    'df_contact',
    'df_contact_functie',
    'df_functie',
    'df_account_financieel',
    'df_account_contact',
]
if not all(_frame_name in globals() for _frame_name in _obsolete_frames):
    print('already deleted or non-existent')
for _frame_name in _obsolete_frames:
    globals().pop(_frame_name, None)

### Afspraken cleanen en mergen

In [89]:
# Column names shared by the three appointment exports once their own
# account/"betreft" id column has been dropped.
afspraak_columns = ['afspraak_afspraak_id', 'afspraak_thema', 'afspraak_subthema',
                    'afspraak_onderwerp', 'afspraak_eindtijd', 'afspraak_keyphrases']

# Read the csv files and drop the column each export does not share with the others.
df_afspraak_acc_gelinkt = pd.read_csv('../data_clean/Afspraak_account_gelinkt_cleaned_fixed.csv', sep=',')
df_afspraak_acc_gelinkt = df_afspraak_acc_gelinkt.drop(columns=['afspraak_account_gelinkt_account'])

df_afspraak_betreft_acc = pd.read_csv('../data_clean/Afspraak_betreft_account_cleaned_fixed.csv', sep=',')
df_afspraak_betreft_acc = df_afspraak_betreft_acc.drop(columns=['afspraak_betreft_account_betreft_id'])

df_afspraak_betreft_contact = pd.read_csv('../data_clean/Afspraak_betreft_contact_cleaned_fixed.csv', sep=',')
df_afspraak_betreft_contact = df_afspraak_betreft_contact.drop(columns=['afspraak_betreft_contactfiche_betreft_id'])

df_afspraak_alle = pd.read_csv('../data_clean/Afspraak_alle_fixed.csv', sep=',')

# Give the three frames identical column names and add afspraak_betreft
# (concerns an account = 1, concerns a contact = 0).
df_afspraak_acc_gelinkt.columns = afspraak_columns
df_afspraak_betreft_acc.columns = afspraak_columns
df_afspraak_betreft_contact.columns = afspraak_columns

df_afspraak_betreft_acc["afspraak_betreft"] = 1
df_afspraak_acc_gelinkt["afspraak_betreft"] = 1
df_afspraak_betreft_contact["afspraak_betreft"] = 0

# Unique appointment ids of the account-linked export; NaN ids are left out
# because they can never equal an id in the combined frame.
acc_gelinkt_id_list = df_afspraak_acc_gelinkt['afspraak_afspraak_id'].dropna().unique()

# Stack the three exports and remove exact duplicates.
df_afspraken = pd.concat(
    [df_afspraak_betreft_acc, df_afspraak_acc_gelinkt, df_afspraak_betreft_contact],
    ignore_index=True,
).drop_duplicates()

# Keep only appointments that also occur in Afspraak_alle.
df_afspraken = df_afspraken.merge(
    df_afspraak_alle, left_on='afspraak_afspraak_id', right_on='afspraak_alle_afspraak_id', how='inner')

# String cleanup. Parentheses are removed as literal characters; the previous
# patterns '\(' and '\)' were invalid escape sequences in non-raw strings.
# NOTE(review): afspraak_thema is overwritten with the cleaned *subthema*, so the
# original thema values are discarded here - confirm this is intended.
df_afspraken['afspraak_thema'] = (
    df_afspraken['afspraak_subthema']
    .str.replace('(', '', regex=False)
    .str.replace(')', '', regex=False)
)
df_afspraken['afspraak_onderwerp'] = df_afspraken['afspraak_onderwerp'].str.lower().astype(str)

# Drop the columns that are no longer needed and the duplicates this exposes.
df_afspraken = df_afspraken.drop(
    columns=['afspraak_alle_afspraak_id', 'afspraak_eindtijd', 'afspraak_subthema']
).drop_duplicates()

# Appointment ids that still occur on more than one row get afspraak_betreft = 2.
# One vectorised mask replaces the per-id Python loop; NaN ids are excluded
# because the loop's equality test never matched them either.
appointment_ids = df_afspraken['afspraak_afspraak_id']
df_afspraken.loc[appointment_ids.duplicated(keep=False) & appointment_ids.notna(), 'afspraak_betreft'] = 2
df_afspraken = df_afspraken.drop_duplicates()

# afspraak_account_gelinkt = 1 when the id occurs in acc_gelinkt_id_list, else 0.
# Built in one step so the column always exists (also when no id matches) and no
# chained `fillna(..., inplace=True)` is needed; float dtype as before.
df_afspraken['afspraak_account_gelinkt'] = (
    df_afspraken['afspraak_afspraak_id'].isin(acc_gelinkt_id_list).astype(float)
)

# Merge with Activiteit_vereist_contact and drop the appointment id columns.
df_activiteit_vereist_contact = pd.read_csv('../data_clean/Activiteit_vereist_contact_fixed.csv', sep=',')
df_afspraken_total = df_afspraken.merge(
    df_activiteit_vereist_contact,
    left_on='afspraak_afspraak_id',
    right_on='activiteitvereistcontact_activityid_id',
    how='inner',
)
df_afspraken_total = df_afspraken_total.drop(
    columns=['activiteitvereistcontact_activityid_id', 'afspraak_afspraak_id'])

Alle niet benodigde dataframes verwijderen

In [90]:
# Free memory: remove the appointment frames that later cells no longer use.
# Every name is removed independently, so one frame that is already gone no
# longer stops the remaining ones from being deleted, and no unrelated error
# is hidden behind a bare `except`.
_obsolete_frames = [
    'df_afspraak_acc_gelinkt',
    'df_afspraak_betreft_acc',
    'df_afspraak_betreft_contact',
    'df_afspraak_alle',
    'df_afspraken',
    'df_activiteit_vereist_contact',
]
if not all(_frame_name in globals() for _frame_name in _obsolete_frames):
    print('already deleted or non-existent')
for _frame_name in _obsolete_frames:
    globals().pop(_frame_name, None)

### Account en Afspraken mergen

In [91]:
# Left-join the appointments onto every account/contact row; contacts without an
# appointment keep NaN in the appointment columns.
acc_afs_merged = df_account_contact_finance.merge(
    df_afspraken_total,
    left_on=['contact_contactpersoon_id'],
    right_on=['activiteitvereistcontact_reqattendee'],
    how='left',
)
acc_afs_merged = acc_afs_merged.drop(columns=['activiteitvereistcontact_reqattendee'])

# Replace NaN values: 0 for the book-year range, -1 for the appointment flags and
# 'unknown' for the text columns. A single DataFrame-level fillna is used because
# `df[col].fillna(..., inplace=True)` acts on a temporary column object: pandas
# warns about it and, with Copy-on-Write enabled, the frame is left unchanged.
acc_afs_merged = acc_afs_merged.fillna({
    'financieledata_boekjaar_range': 0,
    'afspraak_betreft': -1,
    'afspraak_account_gelinkt': -1,
    'activiteitscode_naam': 'unknown',
    'afspraak_thema': 'unknown',
    'afspraak_onderwerp': 'unknown',
    'afspraak_keyphrases': 'unknown',
})

# Clean the text columns.
acc_afs_merged['afspraak_thema'] = (
    acc_afs_merged['afspraak_thema']
    .str.lower()
    .str.replace(r'[^\w\s]', '', regex=True)
    .str.replace('  ', ' ')
    .str.strip()
)

# Remove the 'ov' prefix variants, turn hyphens into spaces and strip punctuation.
acc_afs_merged['afspraak_onderwerp'] = (
    acc_afs_merged['afspraak_onderwerp']
    .str.replace('ov-', '')
    .str.replace('ov -', '')
    .str.replace('ov ', '')
    .str.replace('-', ' ')
    .str.replace(r'[^\w\s]', '', regex=True)
    .str.replace('  ', ' ')
    .str.strip()
)

acc_afs_merged['afspraak_keyphrases'] = (
    acc_afs_merged['afspraak_keyphrases']
    .str.lower()
    .str.replace(r'[^\w\s]', '', regex=True)
    .str.replace('  ', ' ')
    .str.strip()
)

# Combine 'afspraak_thema', 'afspraak_onderwerp' and 'afspraak_keyphrases' into one column.
acc_afs_merged['afspraak_keyphrases'] = (
    acc_afs_merged['afspraak_thema'].astype(str).str.lower() + ', '
    + acc_afs_merged['afspraak_onderwerp'].astype(str) + ', '
    + acc_afs_merged['afspraak_keyphrases'].astype(str)
)

acc_afs_merged = acc_afs_merged.drop(columns=['afspraak_thema', 'afspraak_onderwerp']).drop_duplicates()

# Join all text columns of account and contact into 'account_keyphrases'.
# Column-wise string concatenation yields the same ', '-separated text as a
# row-wise apply(', '.join) without calling Python once per row.
columns_to_process = ['account_onderneming', 'account_adres', 'activiteitscode_naam', 'contact_functietitel', 'functie_naam']

acc_afs_merged['account_keyphrases'] = acc_afs_merged[columns_to_process[0]].astype(str)
for column in columns_to_process[1:]:
    acc_afs_merged['account_keyphrases'] += ', ' + acc_afs_merged[column].astype(str)

acc_afs_merged = acc_afs_merged.drop(columns=columns_to_process)

print_nunique(acc_afs_merged)

#account: 30707, #contact: 67225


Alle niet benodigde dataframes verwijderen

In [92]:
# Free memory: remove the two frames that were combined into acc_afs_merged.
# Every name is removed independently, so one frame that is already gone no
# longer stops the other from being deleted, and no unrelated error is hidden
# behind a bare `except`.
_obsolete_frames = ['df_account_contact_finance', 'df_afspraken_total']
if not all(_frame_name in globals() for _frame_name in _obsolete_frames):
    print('already deleted or non-existent')
for _frame_name in _obsolete_frames:
    globals().pop(_frame_name, None)

### Campagne cleanen

In [93]:
df_campagne = pd.read_csv('../data_clean/Campagne_fixed.csv', sep=',')

# Clean the campaign name: remove the 'OV-'/'ov-' prefix, turn hyphens into
# spaces, strip punctuation, collapse double spaces, trim and lower-case.
df_campagne['campagne_naam'] = (
    df_campagne['campagne_naam']
    .str.replace('OV-', '')
    .str.replace('ov-', '')
    .str.replace('-', ' ')
    .str.replace(r'[^\w\s]', '', regex=True)
    .str.replace('  ', ' ')
    .str.strip()
    .str.lower()
    .str.replace('  ', ' ')
)

# Combine campaign kind, campaign type and the cleaned name into one key-phrase column.
df_campagne['campagne_keyphrases'] = (
    df_campagne['campagne_soort_campagne'].str.lower() + ', '
    + df_campagne['campagne_type_campagne'].str.lower() + ', '
    + df_campagne['campagne_naam']
)

# Drop the columns that are not used afterwards.
df_campagne = df_campagne.drop(columns=[
    'campagne_einddatum', 'campagne_startdatum', 'campagne_campagne_nr',
    'campagne_naam_in_email', 'campagne_reden_van_status', 'campagne_status',
    'campagne_url_voka_be', 'campagne_soort_campagne', 'campagne_type_campagne',
    'campagne_naam',
])

### Campagne, Sessie, Sessie_inschrijving en Inschrijving mergen

In [94]:
# Read the session, registration and session-registration tables and drop the
# columns that are not used afterwards.
df_sessie = pd.read_csv('../data_clean/Sessie_fixed.csv', sep=',').drop(
    columns=['sessie_eind_datum_tijd', 'sessie_sessie_nr_', 'sessie_start_datum_tijd'])
df_inschrijving = pd.read_csv('../data_clean/Inschrijving_fixed.csv', sep=',').drop(
    columns=['inschrijving_datum_inschrijving', 'inschrijving_campagne_naam_'])
df_sessie_inschrijving = pd.read_csv('../data_clean/Sessie_inschrijving_fixed.csv', sep=',')

# Session-registration -> session -> registration (inner joins), dropping the
# join keys after each step and de-duplicating at the end.
df_sessie_inschrijving_merge = pd.merge(
    df_sessie_inschrijving, df_sessie, how='inner',
    left_on='sessieinschrijving_sessie', right_on='sessie_sessie_id',
).drop(columns=['sessieinschrijving_sessie'])

df_sessie_inschrijving_merge = pd.merge(
    df_sessie_inschrijving_merge, df_inschrijving, how='inner',
    left_on='sessieinschrijving_inschrijving', right_on='inschrijving_inschrijving_id',
).drop(columns=[
    'sessieinschrijving_inschrijving', 'inschrijving_inschrijving_id',
    'sessie_sessie_id', 'sessieinschrijving_sessieinschrijving_id',
]).drop_duplicates()

# Join the result with the campaigns (inner join).
df_camp_inschrijving_merge = pd.merge(
    df_sessie_inschrijving_merge, df_campagne, how='inner',
    left_on='inschrijving_campagne', right_on='campagne_campagne_id',
).drop(columns=['inschrijving_campagne']).drop_duplicates()

# Clean the text columns: lower-case, hyphens to spaces, punctuation removed,
# double spaces collapsed, trimmed.
for text_col in ('sessie_activiteitstype', 'sessie_thema_naam_'):
    df_camp_inschrijving_merge[text_col] = (
        df_camp_inschrijving_merge[text_col]
        .str.lower()
        .str.replace('-', ' ')
        .str.replace(r'[^\w\s]', '', regex=True)
        .str.replace('  ', ' ')
        .str.strip()
    )

# The product additionally loses its 'OV-' prefix and '-LLT' marker before that cleaning.
df_camp_inschrijving_merge['sessie_product'] = (
    df_camp_inschrijving_merge['sessie_product']
    .str.replace('OV-', '')
    .str.replace('-LLT', '')
    .str.lower()
    .str.replace('-', ' ')
    .str.replace(r'[^\w\s]', '', regex=True)
    .str.replace('  ', ' ')
    .str.strip()
)

### Account, Afspraak en Campagne mergen 

--> We kunnen ervoor kiezen om alleen de accounts en contacten te behouden waarvoor een campagne aanwezig is, of om ook de rijen zonder campagne te behouden.

--> Het droppen gebeurt hieronder bij `# drop rows where campagne_campagne_id is NaN`.

--> Als we ervoor kiezen om niet te droppen, dan moeten we de regels met num_cols en cat_cols uit commentaar halen.

In [95]:
# Left-join the campaign/session registrations onto every account-contact-appointment
# row and drop the registration's contact key and the session's campaign key.
df_acc_afs_camp_merge = pd.merge(
    acc_afs_merged,
    df_camp_inschrijving_merge,
    how='left',
    left_on='contact_contactpersoon_id',
    right_on='inschrijving_contactfiche',
).drop(columns=['inschrijving_contactfiche', 'sessie_campagne'])

# drop rows where campagne_campagne_id is NaN
df_acc_afs_camp_merge = df_acc_afs_camp_merge.dropna(subset=['campagne_campagne_id'])

# num_cols = df_acc_afs_camp_merge.select_dtypes(include=['float64', 'int64']).columns
# cat_cols = df_acc_afs_camp_merge.select_dtypes(include=['object']).columns
# # fill NaN values with -1 for numeric columns
# df_acc_afs_camp_merge[num_cols] = df_acc_afs_camp_merge[num_cols].fillna(-1)
# # fill NaN values with 'unknown' for categorical columns
# df_acc_afs_camp_merge[cat_cols] = df_acc_afs_camp_merge[cat_cols].fillna('unknown')

# Encode attendance as an integer: 'Aanwezig' -> 1, 'Afwezig' -> 0, 'unknown' -> -1.
df_acc_afs_camp_merge['inschrijving_aanwezig_afwezig'] = (
    df_acc_afs_camp_merge['inschrijving_aanwezig_afwezig']
    .astype(str)
    .str.replace('Aanwezig', '1')
    .str.replace('Afwezig', '0')
    .str.replace('unknown', '-1')
    .astype(int)
)

# Invoiced amount as a float: decimal comma -> dot, 'unknown' -> -1.
df_acc_afs_camp_merge['inschrijving_facturatie_bedrag'] = (
    df_acc_afs_camp_merge['inschrijving_facturatie_bedrag']
    .astype(str)
    .str.replace(',', '.')
    .str.replace('unknown', '-1')
    .astype(float)
)

# Encode the registration source as an integer: 'Website' -> 1, 'Email' -> 0, 'unknown' -> -1.
df_acc_afs_camp_merge['inschrijving_bron'] = (
    df_acc_afs_camp_merge['inschrijving_bron']
    .astype(str)
    .str.replace('unknown', '-1')
    .str.replace('Website', '1')
    .str.replace('Email', '0')
    .astype(int)
)

# Combine the three session text columns into one key-phrase column, then drop them.
df_acc_afs_camp_merge['sessie_keyphrases'] = (
    df_acc_afs_camp_merge['sessie_activiteitstype'] + ', '
    + df_acc_afs_camp_merge['sessie_thema_naam_'] + ', '
    + df_acc_afs_camp_merge['sessie_product']
)
df_acc_afs_camp_merge = df_acc_afs_camp_merge.drop(
    columns=['sessie_activiteitstype', 'sessie_thema_naam_', 'sessie_product'])

print_nunique(df_acc_afs_camp_merge)

#account: 5035, #contact: 11767


verwijder onnodige dataframes

In [96]:
# Free memory: remove the campaign/session frames that later cells no longer use.
# Every name is removed independently, so one frame that is already gone no
# longer stops the remaining ones from being deleted, and no unrelated error
# is hidden behind a bare `except`.
_obsolete_frames = [
    'acc_afs_merged',
    'df_campagne',
    'df_sessie',
    'df_inschrijving',
    'df_sessie_inschrijving',
    'df_sessie_inschrijving_merge',
    'df_camp_inschrijving_merge',
]
if not all(_frame_name in globals() for _frame_name in _obsolete_frames):
    print('already deleted or non-existent')
for _frame_name in _obsolete_frames:
    globals().pop(_frame_name, None)

### CDI Pageview, CDI Visit, CDI Mailing en CDI Sentemailclick cleanen en mergen

##### Visit en Pageview mergen

In [97]:
df_pageview = pd.read_csv('../data_clean/CDI_pageviews_fixed.csv', sep=',')
df_visit = pd.read_csv('../data_clean/CDI_visits_fixed.csv', sep=',')

# Left-join the page views onto the visits, drop every column that is not used
# afterwards and remove the duplicates this leaves behind.
df_visit_pageview = df_visit.merge(df_pageview, left_on='visit_visit_id', right_on='visit', how='left')
df_visit_pageview = df_visit_pageview.drop(columns=[
    'visit', 'visit_ip_postcode', 'visit_aangemaakt_op', 'visit_adobe_reader', 'visit_campagne_code',
    'visit_contact_naam_', 'visit_containssocialprofile', 'visit_ended_on', 'visit_ip_address',
    'visit_ip_organization', 'visit_keywords', 'visit_ip_longitude', 'visit_ip_latitude', 'visit_referrer',
    'visit_score', 'visit_started_on', 'visit_ip_status', 'visit_time', 'visit_visit_id', 'visit_gewijzigd_op',
    'browser', 'campaign', 'contact', 'duration', 'operatingsystem',
    'pageview_id', 'referrertype', 'time', 'pagetitle', 'type', 'url',
    'viewedon', 'visitorkey', 'webcontent', 'aangemaaktop', 'gewijzigddoor',
    'gewijzigdop', 'status', 'redenvanstatus',
]).drop_duplicates()

df_visit_pageview['visit_ip_company'] = (
    df_visit_pageview['visit_ip_company']
    .str.lower()
    .str.replace(r'[^\w\s]', '', regex=True)
    .str.replace('  ', ' ', regex=False)
    .str.strip()
)

# Entry and exit page: keep the part before '?', strip the site prefix and turn
# the remaining path into space-separated words. The dots of the host name are
# escaped so they only match a literal '.'.
for page_col in ('visit_entry_page', 'visit_exit_page'):
    df_visit_pageview[page_col] = (
        df_visit_pageview[page_col]
        .str.split('?').str[0]
        .str.replace(r'https://www\.voka\.be/', '', regex=True)
        .str.replace('/', ' ', regex=False)
        .str.replace('-', ' ', regex=False)
        .str.replace(r'[^\w\s]', '', regex=True)
        .str.replace('  ', ' ', regex=False)
        .str.strip()
    )

# Referring host: remove 'www.' and the common TLD suffixes as LITERAL text.
# regex=False is explicit because the default of `regex` differs between pandas
# versions; read as a regex, the '.' in '.com' would match any character.
df_visit_pageview['visit_referring_host'] = (
    df_visit_pageview['visit_referring_host']
    .str.replace('www.', '', regex=False)
    .str.replace('.com', '', regex=False)
    .str.replace('.be', '', regex=False)
    .str.replace('.net', '', regex=False)
    .str.replace(r'[^\w\s]', ' ', regex=True)
    .str.replace('  ', ' ', regex=False)
    .str.strip()
)

df_visit_pageview['visit_referrer_type'] = (
    df_visit_pageview['visit_referrer_type'].str.replace('E-', '', regex=False).str.lower()
)

# Combine the text columns into one key-phrase column.
df_visit_pageview['visit_keyphrases'] = (
    df_visit_pageview['visit_browser'].str.lower() + ', '
    + df_visit_pageview['visit_operating_system'].str.lower() + ', '
    + df_visit_pageview['visit_ip_stad'].str.lower() + ', '
    + df_visit_pageview['visit_ip_land'].str.lower() + ', '
    + df_visit_pageview['visit_ip_company'] + ', '
    + df_visit_pageview['visit_entry_page'] + ', '
    + df_visit_pageview['visit_exit_page'] + ', '
    + df_visit_pageview['visit_referring_host'] + ', '
    + df_visit_pageview['visit_referrer_type']
)

# Drop the text columns that were folded into visit_keyphrases.
df_visit_pageview = df_visit_pageview.drop(columns=[
    'visit_browser', 'visit_operating_system', 'visit_ip_stad', 'visit_ip_land', 'visit_ip_company',
    'visit_entry_page', 'visit_exit_page', 'visit_referring_host', 'visit_referrer_type',
])

# Adjust dtypes: 'Ja' -> 1, 'Nee' -> 0, 'unknown' -> -1.
# NOTE(review): the target column is spelled 'visit_bouce', so a NEW column is
# created and the original 'visit_bounce' keeps its raw text values. The name is
# left as is because the final merge cell reads 'visit_bouce'; renaming it needs
# a coordinated change there and alters the exported column set.
df_visit_pageview['visit_bouce'] = (
    df_visit_pageview['visit_bounce']
    .astype(str)
    .str.replace('Ja', '1', regex=False)
    .str.replace('Nee', '0', regex=False)
    .str.replace('unknown', '-1', regex=False)
    .astype(int)
)

df_visit_pageview['visit_duration'] = df_visit_pageview['visit_duration'].astype(int)

df_visit_pageview['visit_first_visit'] = (
    df_visit_pageview['visit_first_visit']
    .str.replace('Ja', '1', regex=False)
    .str.replace('Nee', '0', regex=False)
    .str.replace('unknown', '-1', regex=False)
    .astype(int)
)

# Whole-value replacement (Series.replace), not a substring replacement.
df_visit_pageview['visit_total_pages'] = (
    df_visit_pageview['visit_total_pages'].replace('unknown', '-1.0').astype(float)
)

##### Mailing en sentemailclick mergen

In [98]:
# Read the mailings and the e-mail clicks, drop the unused columns and remove
# the duplicates this leaves behind.
df_mailing = (
    pd.read_csv('../data_clean/CDI_mailing_fixed.csv', sep=',')
    .drop(columns=['mailing_sent_on'])
    .drop_duplicates()
)
df_click = (
    pd.read_csv('../data_clean/CDI_sent_email_clicks_fixed.csv', sep=',')
    .drop(columns=['sentemail_kliks_contact', 'sentemail_kliks_sent_email_id'])
    .drop_duplicates()
)

# Inner-join the clicks onto their mailing and drop the click-side key.
df_mailing_merged = pd.merge(
    df_mailing,
    df_click,
    how='inner',
    left_on='mailing_mailing_id',
    right_on='sentemail_kliks_e_mail_versturen',
).drop(columns=['sentemail_kliks_e_mail_versturen'])

# Sum sentemail_kliks_clicks per mailing; every other column keeps its first value.
# The click column stays the first key of the spec so the column order is unchanged.
mailing_agg = {'sentemail_kliks_clicks': 'sum'}
for col in df_mailing_merged.columns:
    mailing_agg.setdefault(col, 'first')

df_mailing_merged = (
    df_mailing_merged
    .groupby(['mailing_mailing_id', 'mailing_name', 'mailing_onderwerp'], as_index=False)
    .agg(mailing_agg)
)

##### pageview_visit en mailing sentemailclick mergen

In [99]:
# Left-join the aggregated mailings onto the visits; visits without a matching
# mailing keep NaN in the mailing columns.
df_pageview_visit_mailing = df_visit_pageview.merge(
    df_mailing_merged, left_on='visit_email_send', right_on='mailing_mailing_id', how='left')
df_pageview_visit_mailing = df_pageview_visit_mailing.drop(
    columns=['visit_email_send', 'mailing_mailing_id', 'visit_campaign'])

# Clean the text columns: remove the 'OV' prefix variants, turn hyphens and
# punctuation into spaces, collapse double spaces, trim and lower-case.
df_pageview_visit_mailing['mailing_name'] = (
    df_pageview_visit_mailing['mailing_name']
    .str.replace('OV-', '')
    .str.replace('OV ', '')
    .str.replace('OV -', '')
    .str.replace('-', ' ')
    .str.replace(r'[^\w\s]', ' ', regex=True)
    .str.replace('  ', ' ')
    .str.strip()
    .str.lower()
    .str.replace('  ', ' ')
)

df_pageview_visit_mailing['mailing_onderwerp'] = (
    df_pageview_visit_mailing['mailing_onderwerp']
    .str.replace('-', ' ')
    .str.replace(r'[^\w\s]', ' ', regex=True)
    .str.replace('  ', ' ')
    .str.strip()
    .str.lower()
    .str.replace('  ', ' ')
)

# Combine the two text columns into one key-phrase column.
df_pageview_visit_mailing['mailing_keyphrases'] = (
    df_pageview_visit_mailing['mailing_name'] + ', '
    + df_pageview_visit_mailing['mailing_onderwerp']
)

# Click count as an integer. 'unknown' counts as 0 (as before); a visit without
# a matching mailing has NaN after the left join and is counted as 0 as well,
# because `astype(int)` raises on NaN.
df_pageview_visit_mailing['sentemail_kliks_clicks'] = (
    pd.to_numeric(df_pageview_visit_mailing['sentemail_kliks_clicks'].replace('unknown', '0'))
    .fillna(0)
    .astype(int)
)

# Drop the text columns that were folded into mailing_keyphrases.
df_pageview_visit_mailing = df_pageview_visit_mailing.drop(columns=['mailing_name', 'mailing_onderwerp'])

### Total merge

In [110]:
# Left-join the visit/mailing data onto every account-contact-campaign row.
df_merged_total = df_acc_afs_camp_merge.merge(
    df_pageview_visit_mailing, left_on='contact_contactpersoon_id', right_on='visit_contact', how='left')
df_merged_total = df_merged_total.drop(columns=['visit_contact', 'campagne_campagne_id'])

# Select the numeric and the categorical columns.
num_cols = df_merged_total.select_dtypes(include=['float64', 'int64']).columns
cat_cols = df_merged_total.select_dtypes(include=['object']).columns

# Fill NaN with -1 in numeric columns and with 'unknown' in categorical columns.
df_merged_total[num_cols] = df_merged_total[num_cols].fillna(-1)
df_merged_total[cat_cols] = df_merged_total[cat_cols].fillna('unknown')

# Make the encoded bounce flag an integer again after the merge.
# NOTE(review): 'visit_bouce' is the (misspelled) encoded column created in the
# visit cell; the correctly spelled 'visit_bounce' still holds the raw text and
# is exported as well - confirm which of the two downstream code expects.
df_merged_total['visit_bouce'] = df_merged_total['visit_bouce'].replace('unknown', -1).astype(int)

# Shrink the integer columns to the smallest integer dtype that can hold their
# values. A blanket cast to int8 would silently wrap any value outside
# [-128, 127]; columns that fit still end up as int8.
int_cols = df_merged_total.select_dtypes(include=['int64', 'int32']).columns
for int_col in int_cols:
    df_merged_total[int_col] = pd.to_numeric(df_merged_total[int_col], downcast='integer')

# Print the number of distinct accounts and contacts, then drop the id columns.
print_nunique(df_merged_total)
df_merged_total = df_merged_total.drop(columns=['contact_contactpersoon_id', 'account_account_id'])

# Regex character class covering four emoji code-point ranges.
regex_emoticons = r'[\U0001F600-\U0001F64F\U0001F300-\U0001F5FF\U0001F680-\U0001F6FF\U0001F1E0-\U0001F1FF]'

# Clean the key-phrase columns: remove the 'unknown' placeholder, collapse the
# ', ,' it leaves behind, remove emoji, lower-case and trim.
columns_to_clean = ['afspraak_keyphrases', 'account_keyphrases', 'campagne_keyphrases', 'sessie_keyphrases', 'visit_keyphrases', 'mailing_keyphrases']
for column in columns_to_clean:
    df_merged_total[column] = (
        df_merged_total[column]
        .str.replace('unknown', '', regex=False)
        .str.replace(', ,', ',', regex=False)
        .str.replace(regex_emoticons, '', regex=True)
        .str.lower()
        .str.strip()
    )

# Remove 'node <number>' fragments from the visit key phrases.
df_merged_total['visit_keyphrases'] = (
    df_merged_total['visit_keyphrases']
    .str.replace(r'node [0-9]+', '', regex=True)
    .str.replace('  ', ' ', regex=False)
    .str.strip()
)

# Save to csv; to_csv overwrites an existing file, so no prior removal is needed.
df_merged_total.to_csv('../data_clean/merged_total.csv', index=False)

#account: 5035, #contact: 11767
