In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

### Account cleanen

In [4]:
df_account = pd.read_csv('../data_clean/Account_fixed.csv')

# alleen de actieve accounts
df_account = df_account[df_account['account_reden_van_status'] != 'Inactief']
df_account = df_account[df_account['account_reden_van_status'] != 'Stopzetting']

# onnodige kolommen verwijderen
df_account.drop(['account_industriezone_naam_',
           'account_oprichtingsdatum',
           'account_reden_van_status',
           'account_status',
           'account_voka_nr_',
           'account_is_voka_entiteit',
], axis=1, inplace=True)

# alleen de accounts uit Oost-Vlaanderen
df_account = df_account[df_account['account_adres_provincie'] == 'Oost-Vlaanderen']

# account adres samenvoegen
plaats = df_account['account_adres_plaats'].str.lower()
postcode = df_account['account_adres_postcode'].astype(str)
subregio = df_account['account_adres_geografische_subregio'].str.lower()
provincie = df_account['account_adres_provincie'].str.lower()
land = df_account['account_adres_land'].str.lower()

df_account['account_adres'] = plaats + ' ' + postcode + ' ' + subregio + ' ' + provincie + ' ' + land

df_account.drop(['account_adres_geografische_regio', 
           'account_adres_geografische_subregio', 
           'account_adres_plaats', 'account_adres_postcode', 
           'account_adres_provincie', 'account_adres_land'], axis=1, inplace=True)

# account ondernemingstype samenvoegen
df_account['account_onderneming'] = df_account['account_ondernemingstype'] + ', ' \
                            + df_account['account_ondernemingsaard'] + ', ' \
                            + df_account['account_primaire_activiteit'] + ' ' \

df_account.drop(['account_ondernemingstype', 
           'account_ondernemingsaard', 
           'account_primaire_activiteit'], axis=1, inplace=True)

# account ondernemingstype strings opschonen
df_account['account_onderneming'] = df_account['account_onderneming'].str.replace('unknown', '')
df_account['account_onderneming'] = df_account['account_onderneming'].str.strip()
df_account['account_onderneming'] = df_account['account_onderneming'].str.lower()

df_account.shape

(41859, 3)

### Contact cleanen

In [5]:
df_contact = pd.read_csv('../data_clean/Contact_fixed.csv')

df_contact = df_contact[df_contact['contact_status'] != 'Inactief']
df_contact.drop(['contact_status', 'contact_voka_medewerker'], axis=1, inplace=True)

df_contact.shape

(393514, 4)

### Mergen van Account en Contact zodat alleen de contacten van Oost-Vlaanderen overblijven

Inner join, anders 11000 accounts zonder contactpersoon

In [6]:
df_account_contact = df_account.merge(df_contact, left_on='account_account_id', right_on='contact_account', how='inner')
df_account_contact.drop(['contact_account'], axis=1, inplace=True)
df_account_contact.shape

(67299, 6)

In [7]:
def print_nunique(df, acc_col='account_account_id', cont_col='contact_contactpersoon_id'):
    try:
        con_un = df[cont_col].nunique()
    except:
        con_un = 'niet gevonden'
    
    try:
        acc_un = df[acc_col].nunique()
    except:
        acc_un = 'niet gevonden'

    print(f'#account: {acc_un}, #contact: {con_un}')

In [8]:
print_nunique(df_account_contact)

#account: 30716, #contact: 67299


##### Persoon cleanen

In [197]:
# df_persoon = pd.read_csv('../data_clean/Persoon_fixed.csv')

# df_persoon = df_persoon[df_persoon['persoon_mail_regio_oost_vlaanderen'] == 1]
# df_persoon = df_persoon[df_persoon['persoon_reden_van_status'] == 'Actief']

# for col in df_persoon.columns:
#     if col.__contains__('persoon_mail_regio'):
#         df_persoon.drop(col, axis=1, inplace=True)

# df_persoon.drop(['persoon_persoonsnr_', 'persoon_web_login', 'persoon_reden_van_status'], axis=1, inplace=True)

# df_persoon.shape

### Account_contact mergen met persoon

In [198]:
# df_account_contact_persoon = df_account_contact.merge(df_persoon, left_on='contact_persoon_id', right_on='persoon_persoon_id', how='left')
# df_account_contact_persoon.drop(['contact_persoon_id'], axis=1, inplace=True)
# df_account_contact_persoon.shape

In [199]:
# df_account_contact_persoon.info()

##### Persoon niet meer gebruiken -> maar 4374 non-null

In [9]:
df_account_contact.drop(['contact_persoon_id'], axis=1, inplace=True)

### Account_contact mergen met contactfunctie en functie

In [10]:
df_contact_functie = pd.read_csv('../data_clean/Contact_functie_fixed.csv')
df_contact_functie.columns

Index(['contactfunctie_contactpersoon', 'contactfunctie_functie'], dtype='object')

In [11]:
df_functie = pd.read_csv('../data_clean/Functie_fixed.csv')
df_functie.columns

Index(['functie_functie_id', 'functie_naam'], dtype='object')

Merge contact functie

In [12]:
df_account_contact = df_account_contact.merge(df_contact_functie, left_on='contact_contactpersoon_id', right_on='contactfunctie_contactpersoon', how='inner')
df_account_contact.drop(['contactfunctie_contactpersoon'], axis=1, inplace=True)
df_account_contact.shape

(90938, 6)

Merge met functie

In [13]:
df_account_contact = df_account_contact.merge(df_functie, left_on='contactfunctie_functie', right_on='functie_functie_id', how='inner')
df_account_contact.drop(['contactfunctie_functie', 'functie_functie_id'], axis=1, inplace=True)
df_account_contact.shape

(90938, 6)

In [14]:
print_nunique(df_account_contact)

#account: 30707, #contact: 67225


##### Functie naam kan verschillend zijn voor hetzelfde contact, dus die mergen we

In [15]:
# Define a custom aggregation function to merge 'functie_naam'
def merge_functie_naam(series):
    return ', '.join(series)

# Group by 'account_account_id' and 'contact_contactpersoon_id' and merge 'functie_naam'
df_account_contact = df_account_contact.groupby(
    ['account_account_id', 'contact_contactpersoon_id'], 
    as_index=False).agg({'functie_naam': merge_functie_naam, 
                         **{col: 'first' for col in df_account_contact.columns if col != 'functie_naam'}})

df_account_contact.shape

(67225, 6)

In [16]:
print_nunique(df_account_contact)

#account: 30707, #contact: 67225


### Account_contact mergen met account_activiteitscode en activiteitscode

In [17]:
df_account_activiteitscode = pd.read_csv('../data_clean/Account_activiteitscode_fixed.csv')
print(df_account_activiteitscode.shape)
df_account_activiteitscode.columns

(14673, 3)


Index(['account_activiteitscode_account',
       'account_activiteitscode_activiteitscode',
       'account_activiteitscode_inf_account_inf_activiteitscodeid'],
      dtype='object')

In [18]:
df_activiteitscode = pd.read_csv('../data_clean/Activiteitscode_fixed.csv')
print(df_activiteitscode.shape)
df_activiteitscode.columns

(40, 3)


Index(['activiteitscode_naam', 'activiteitscode_activiteitscode_id',
       'activiteitscode_status'],
      dtype='object')

account_activiteitscode mergen met activiteitscode

In [19]:
df_activiteit = df_account_activiteitscode.merge(df_activiteitscode, left_on='account_activiteitscode_activiteitscode', right_on='activiteitscode_activiteitscode_id', how='inner')
df_activiteit = df_activiteit[df_activiteit['activiteitscode_status'] == 'Actief']
df_activiteit.drop(['account_activiteitscode_activiteitscode', 'activiteitscode_activiteitscode_id'
                    , 'activiteitscode_status', 'account_activiteitscode_inf_account_inf_activiteitscodeid'], axis=1, inplace=True)
df_activiteit.shape

(14665, 2)

activiteitscode naam combineren waar account niet uniek is

In [20]:
# where account_activiteitscode_account is not unique, combine the activiteitscode_naam
def merge_activiteitscode_naam(series):
    return ', '.join(series)

df_activiteit = df_activiteit.groupby(
    ['account_activiteitscode_account'], 
    as_index=False).agg({'activiteitscode_naam': merge_activiteitscode_naam, 
                         **{col: 'first' for col in df_activiteit.columns if col != 'activiteitscode_naam'}})

print(df_activiteit.shape)

df_activiteit.head()

(13408, 2)


Unnamed: 0,activiteitscode_naam,account_activiteitscode_account
0,Overige industrie & diensten,00002DAC-0A69-E111-B43A-00505680000A
1,Consultancy,00068436-F919-E211-9DAA-005056B06EB4
2,Technologische industrie & diensten,0009A6E6-2369-E111-B43A-00505680000A
3,Voeding,000D39CF-BE68-E111-B43A-00505680000A
4,Overige industrie & diensten,0016CAE8-BD68-E111-B43A-00505680000A


mergen met account_contact

In [21]:
df_account_contact = df_account_contact.merge(df_activiteit, left_on='account_account_id'
                                              , right_on='account_activiteitscode_account', how='left')
df_account_contact.drop(['account_activiteitscode_account'], axis=1, inplace=True)
df_account_contact.shape

(67225, 7)

In [22]:
print_nunique(df_account_contact)

#account: 30707, #contact: 67225


#####  Maar 16160 vd 67225 activiteitscodes zijn non-null

In [23]:
df_account_contact.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 67225 entries, 0 to 67224
Data columns (total 7 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   functie_naam               67225 non-null  object
 1   account_account_id         67225 non-null  object
 2   account_adres              67225 non-null  object
 3   account_onderneming        67225 non-null  object
 4   contact_contactpersoon_id  67225 non-null  object
 5   contact_functietitel       67225 non-null  object
 6   activiteitscode_naam       16160 non-null  object
dtypes: object(7)
memory usage: 3.6+ MB


### Account financiele data mergen met account_contact

In [24]:
df_account_financieel = pd.read_csv('../data_clean/Account_financiële_data_fixed.csv', sep=',')
df_account_financieel.drop(['financieledata_gewijzigd_op', 'financieledata_fte', 'financieledata_aantal_maanden'], axis=1, inplace=True)
df_account_financieel['financieledata_toegevoegde_waarde'] = df_account_financieel['financieledata_toegevoegde_waarde'].astype(str)

print(df_account_financieel.shape)
df_account_financieel.columns

(2030180, 3)


Index(['financieledata_ondernemingid', 'financieledata_boekjaar',
       'financieledata_toegevoegde_waarde'],
      dtype='object')

In [26]:
df_account_contact_finance = df_account_contact.merge(df_account_financieel, left_on='account_account_id', right_on='financieledata_ondernemingid', how='left')
df_account_contact_finance.drop(['financieledata_ondernemingid'], axis=1, inplace=True)
df_account_contact_finance.shape

(687078, 9)

In [27]:
# turn financieledata_toegevoegde_waarde into a numeric value
df_account_contact_finance['financieledata_toegevoegde_waarde'] = df_account_contact_finance['financieledata_toegevoegde_waarde'].str.replace(',', '.')
df_account_contact_finance['financieledata_toegevoegde_waarde'] = df_account_contact_finance['financieledata_toegevoegde_waarde'].str.replace('unknown', '0')
df_account_contact_finance['financieledata_toegevoegde_waarde'] = df_account_contact_finance['financieledata_toegevoegde_waarde'].astype(float)

##### De financieel toegevoegde waarde optellen per account en de boekjaren hun range gebruiken (vb 2007 - 2022 => 15 jaar)

In [28]:
# group by 'account_account_id' and 'contact_contactpersoon_id' and merge 'financieledata_toegevoegde_waarde' by sum and 'financieledata_boekjaar' by max - min
df_account_contact_finance = df_account_contact_finance.groupby(
    ['account_account_id', 'contact_contactpersoon_id'], 
    as_index=False).agg({'financieledata_toegevoegde_waarde': 'sum', 
                         'financieledata_boekjaar': lambda x: x.max() - x.min(), 
                         **{col: 'first' for col in df_account_contact_finance.columns if col not in ['financieledata_toegevoegde_waarde', 'financieledata_boekjaar']}})

df_account_contact_finance.rename(columns={'financieledata_boekjaar': 'financieledata_boekjaar_range'}, inplace=True)

In [29]:
df_account_contact_finance.shape

(67225, 9)

In [30]:
print_nunique(df_account_contact_finance)

#account: 30707, #contact: 67225


##### Alle niet meer benodigde dataframes verwijderen

In [32]:
try:
    del df_account
    del df_contact
    del df_contact_functie
    del df_functie
    del df_account_financieel
    del df_account_contact
except:
    print('already deleted or non-existent')

already deleted or non-existent


### Afspraken cleanen en mergen

In [33]:
df_afspraak_acc_gelinkt = pd.read_csv('../data_clean/Afspraak_account_gelinkt_cleaned_fixed.csv', sep=',')
print(df_afspraak_acc_gelinkt.shape)
print(list(df_afspraak_acc_gelinkt.columns))
df_afspraak_acc_gelinkt.drop(['afspraak_account_gelinkt_account'], axis=1, inplace=True) 

(2934, 7)
['afspraak_account_gelinkt_afspraak_id', 'afspraak_account_gelinkt_thema', 'afspraak_account_gelinkt_subthema', 'afspraak_account_gelinkt_onderwerp', 'afspraak_account_gelinkt_eindtijd', 'afspraak_account_gelinkt_account', 'afspraak_account_gelinkt_keyphrases']


In [34]:
df_afspraak_betreft_acc = pd.read_csv('../data_clean/Afspraak_betreft_account_cleaned_fixed.csv', sep=',')
print(df_afspraak_betreft_acc.shape)
print(list(df_afspraak_betreft_acc.columns))
df_afspraak_betreft_acc.drop(['afspraak_betreft_account_betreft_id'], axis=1, inplace=True)

(4876, 7)
['afspraak_betreft_account_afspraak_id', 'afspraak_betreft_account_thema', 'afspraak_betreft_account_subthema', 'afspraak_betreft_account_onderwerp', 'afspraak_betreft_account_betreft_id', 'afspraak_betreft_account_eindtijd', 'afspraak_betreft_account_keyphrases']


In [35]:
df_afspraak_betreft_contact = pd.read_csv('../data_clean/Afspraak_betreft_contact_cleaned_fixed.csv', sep=',')
print(df_afspraak_betreft_contact.shape)
print(list(df_afspraak_betreft_contact.columns))
df_afspraak_betreft_contact.drop(['afspraak_betreft_contactfiche_betreft_id'], axis=1, inplace=True)

(2552, 7)
['afspraak_betreft_contactfiche_afspraak_id', 'afspraak_betreft_contactfiche_thema', 'afspraak_betreft_contactfiche_subthema', 'afspraak_betreft_contactfiche_onderwerp', 'afspraak_betreft_contactfiche_betreft_id', 'afspraak_betreft_contactfiche_eindtijd', 'afspraak_betreft_contactfiche_keyphrases']


##### Kolomnamen hernoemen en afspraakt_betreft toevoegen (betreft account = 1, betreft contact = 0)

In [36]:
df_afspraak_acc_gelinkt.columns = ['afspraak_afspraak_id', 'afspraak_thema', 'afspraak_subthema', 'afspraak_onderwerp', 'afspraak_eindtijd', 'afspraak_keyphrases']
df_afspraak_betreft_acc.columns = ['afspraak_afspraak_id', 'afspraak_thema', 'afspraak_subthema', 'afspraak_onderwerp', 'afspraak_eindtijd', 'afspraak_keyphrases']
df_afspraak_betreft_contact.columns = ['afspraak_afspraak_id', 'afspraak_thema', 'afspraak_subthema', 'afspraak_onderwerp', 'afspraak_eindtijd', 'afspraak_keyphrases']

df_afspraak_betreft_acc["afspraak_betreft"] = 1
df_afspraak_acc_gelinkt["afspraak_betreft"] = 1
df_afspraak_betreft_contact["afspraak_betreft"] = 0

In [37]:
acc_gelinkt_id_list = df_afspraak_acc_gelinkt['afspraak_afspraak_id'].unique()
acc_gelinkt_id_list.shape

(2934,)

##### Afspraken mergen en al cleanen

In [38]:
df_afspraken = pd.concat([df_afspraak_betreft_acc, df_afspraak_acc_gelinkt, df_afspraak_betreft_contact], ignore_index=True)
df_afspraken.drop_duplicates(inplace=True)
print(df_afspraken.shape)
print(list(df_afspraken.columns))

(9685, 7)
['afspraak_afspraak_id', 'afspraak_thema', 'afspraak_subthema', 'afspraak_onderwerp', 'afspraak_eindtijd', 'afspraak_keyphrases', 'afspraak_betreft']


In [39]:
df_afspraak_alle = pd.read_csv('../data_clean/Afspraak_alle_fixed.csv', sep=',')
print(df_afspraak_alle.shape)
print(list(df_afspraak_alle.columns))
df_afspraak_alle['afspraak_alle_afspraak_id'].nunique()

(8265, 1)
['afspraak_alle_afspraak_id']


8265

In [40]:
df_afspraken = df_afspraken.merge(df_afspraak_alle, left_on='afspraak_afspraak_id', right_on='afspraak_alle_afspraak_id', how='inner')

df_afspraken['afspraak_thema'] = df_afspraken['afspraak_subthema'].str.replace('\(', '', regex=True)
df_afspraken['afspraak_thema'] = df_afspraken['afspraak_thema'].str.replace('\)', '', regex=True)

df_afspraken['afspraak_onderwerp'] = df_afspraken['afspraak_onderwerp'].str.lower().str.capitalize()
df_afspraken['afspraak_onderwerp'] = df_afspraken['afspraak_onderwerp'].str.replace(' ', '-')
df_afspraken['afspraak_onderwerp'] = df_afspraken['afspraak_onderwerp'].astype(str)

df_afspraken.drop(['afspraak_alle_afspraak_id', 'afspraak_eindtijd', 'afspraak_subthema'], axis=1, inplace=True)

df_afspraken.drop_duplicates(inplace=True)

print(df_afspraken.shape)
print(df_afspraken.columns)

(9678, 5)
Index(['afspraak_afspraak_id', 'afspraak_thema', 'afspraak_onderwerp',
       'afspraak_keyphrases', 'afspraak_betreft'],
      dtype='object')


In [41]:
df_afspraken['afspraak_afspraak_id'].nunique()

7624

##### Waar afspraak_betreft 0 en 1 is maken we er 2 van

In [42]:
# show rows where afspraak_afspraak_id is not unique
list_to_change_afspraak_betreft_to_2 = df_afspraken[df_afspraken['afspraak_afspraak_id'].duplicated(keep=False)]['afspraak_afspraak_id'].unique()

# for every afspraak_afspraak_id that is not unique, change afspraak_betreft to 2
for afspraak_id in list_to_change_afspraak_betreft_to_2:
    df_afspraken.loc[df_afspraken['afspraak_afspraak_id'] == afspraak_id, 'afspraak_betreft'] = 2

df_afspraken.drop_duplicates(inplace=True)
df_afspraken.shape

(7624, 5)

##### Acc_gelinkt = 1 anders 0 -> adhv acc_gelinkt_id_list

In [43]:
for afspraak_id in acc_gelinkt_id_list:
    df_afspraken.loc[df_afspraken['afspraak_afspraak_id'] == afspraak_id, 'afspraak_account_gelinkt'] = 1

df_afspraken['afspraak_account_gelinkt'].fillna(0, inplace=True)

##### Nu nog mergen met activiteit_vereist_contact

In [44]:
df_activiteit_vereist_contact = pd.read_csv('../data_clean/Activiteit_vereist_contact_fixed.csv', sep=',')
print(df_activiteit_vereist_contact.shape)
print(list(df_activiteit_vereist_contact.columns))
df_activiteit_vereist_contact['activiteitvereistcontact_reqattendee'].nunique()

(4635, 2)
['activiteitvereistcontact_activityid_id', 'activiteitvereistcontact_reqattendee']


3042

In [45]:
df_afspraken_total = df_afspraken.merge(df_activiteit_vereist_contact, left_on='afspraak_afspraak_id', right_on='activiteitvereistcontact_activityid_id', how='inner')
df_afspraken_total.drop(['activiteitvereistcontact_activityid_id', 'afspraak_afspraak_id'], axis=1, inplace=True)
df_afspraken_total.shape

(4545, 6)

In [46]:
df_afspraken_total.shape

(4545, 6)

Alle niet benodigde dataframes verwijderen

In [47]:
try:
    del df_afspraak_acc_gelinkt
    del df_afspraak_betreft_acc
    del df_afspraak_betreft_contact
    del df_afspraak_alle
    del df_afspraken
    del df_activiteit_vereist_contact
except:
    print('already deleted or non-existent')

### Account en Afspraken mergen

In [48]:
acc_afs_merged = df_account_contact_finance.merge(df_afspraken_total, left_on=['contact_contactpersoon_id'], right_on=['activiteitvereistcontact_reqattendee'], how='left')
acc_afs_merged.drop(['activiteitvereistcontact_reqattendee'], axis=1, inplace=True)
acc_afs_merged.shape

(68618, 14)

In [49]:
print_nunique(acc_afs_merged)

#account: 30707, #contact: 67225


NaN opvullen

In [50]:
acc_afs_merged['financieledata_boekjaar_range'].fillna(0, inplace=True)
acc_afs_merged['afspraak_betreft'].fillna(-1, inplace=True)
acc_afs_merged['afspraak_account_gelinkt'].fillna(-1, inplace=True)

acc_afs_merged['activiteitscode_naam'].fillna('unknown', inplace=True)
acc_afs_merged['afspraak_thema'].fillna('unknown', inplace=True)
acc_afs_merged['afspraak_onderwerp'].fillna('unknown', inplace=True)
acc_afs_merged['afspraak_keyphrases'].fillna('unknown', inplace=True)

##### Strings cleanen

In [51]:
acc_afs_merged['afspraak_onderwerp'] = acc_afs_merged['afspraak_onderwerp'].str.replace('Ov-', '')
acc_afs_merged['afspraak_onderwerp'] = acc_afs_merged['afspraak_onderwerp'].str.lower().str.replace('-', ' ')
acc_afs_merged['afspraak_onderwerp'] = acc_afs_merged['afspraak_onderwerp'].str.replace('   ', ' ')
acc_afs_merged['afspraak_onderwerp'] = acc_afs_merged['afspraak_onderwerp'].str.replace('&', '')
acc_afs_merged['afspraak_onderwerp'] = acc_afs_merged['afspraak_onderwerp'].str.strip()

acc_afs_merged['afspraak_thema'] = acc_afs_merged['afspraak_thema'].str.lower()

Alle string columns van Afspraak samenvoegen

In [52]:
# combine 'afspraak_thema', 'afspraak_onderwerp', 'afspraak_keyphrases' into one column
acc_afs_merged['afspraak_thema'] = acc_afs_merged['afspraak_thema'].astype(str)
acc_afs_merged['afspraak_onderwerp'] = acc_afs_merged['afspraak_onderwerp'].astype(str)
acc_afs_merged['afspraak_keyphrases'] = acc_afs_merged['afspraak_keyphrases'].astype(str)

replacement_dict = {'\(': '', '\)': '', '\'': '', '\"': '', '\[': '', '\]': ''}

acc_afs_merged['afspraak_keyphrases'] = acc_afs_merged['afspraak_keyphrases'].replace(replacement_dict, regex=True)

acc_afs_merged['afspraak_keyphrases'] = acc_afs_merged['afspraak_thema'] + ', ' + acc_afs_merged['afspraak_onderwerp'] + ', ' + acc_afs_merged['afspraak_keyphrases']
acc_afs_merged['afspraak_keyphrases'] = acc_afs_merged['afspraak_keyphrases'].str.replace('unknown, ', '')
acc_afs_merged['afspraak_keyphrases'] = acc_afs_merged['afspraak_keyphrases'].str.replace(', unknown', '')
acc_afs_merged.drop(['afspraak_thema', 'afspraak_onderwerp'], axis=1, inplace=True)

acc_afs_merged.drop_duplicates(inplace=True)
acc_afs_merged.shape

(68610, 12)

Alle string columns van Account en contact samenvoegen

In [None]:
columns_to_process = ['account_onderneming', 'account_adres', 'activiteitscode_naam', 'contact_functietitel', 'functie_naam']

for column in columns_to_process:
    acc_afs_merged[column] = acc_afs_merged[column].astype(str)

acc_afs_merged['account_keyphrases'] = acc_afs_merged[columns_to_process].apply(lambda x: ', '.join(x), axis=1)
acc_afs_merged['account_keyphrases'] = acc_afs_merged['account_keyphrases'].str.replace('unknown, |, unknown|unknown|   |  |/', ' ').str.lower().str.strip()
acc_afs_merged.drop(columns_to_process, axis=1, inplace=True)

acc_afs_merged.shape

print_nunique(acc_afs_merged)

Alle niet benodigde dataframes verwijderen

In [63]:
try:
    del df_account_contact_finance
    del df_afspraken_total
except:
    print('already deleted or non-existent')

### Campagne cleanen en mergen

In [70]:
df_campagne = pd.read_csv('../data_clean/Campagne_fixed.csv', sep=',')
df_campagne.drop(['campagne_einddatum', 'campagne_startdatum', 'campagne_campagne_nr', 
                  'campagne_naam_in_email', 'campagne_reden_van_status', 'campagne_status',
                  'campagne_url_voka_be'], axis=1, inplace=True)

df_campagne['campagne_keyphrases'] = df_campagne['campagne_soort_campagne'] + ', ' + df_campagne['campagne_type_campagne'] + ', ' \
            + df_campagne['campagne_naam']

df_campagne['campagne_keyphrases'] = df_campagne['campagne_keyphrases'].str.replace('unknown, ', '')
df_campagne['campagne_keyphrases'] = df_campagne['campagne_keyphrases'].str.replace(', unknown', '')

df_campagne.drop(['campagne_soort_campagne', 'campagne_type_campagne', 'campagne_naam'], axis=1, inplace=True)

df_campagne['campagne_keyphrases'] = df_campagne['campagne_keyphrases'].str.lower().str.replace('OV-', '')
df_campagne['campagne_keyphrases'] = df_campagne['campagne_keyphrases'].str.replace('OV-NW-', '')
df_campagne['campagne_keyphrases'] = df_campagne['campagne_keyphrases'].str.replace('OV-IO-', '')
df_campagne['campagne_keyphrases'] = df_campagne['campagne_keyphrases'].str.replace('ov-io-', '')
df_campagne['campagne_keyphrases'] = df_campagne['campagne_keyphrases'].str.replace('ov-nw-', '')
df_campagne['campagne_keyphrases'] = df_campagne['campagne_keyphrases'].str.replace('ov-', '')

print(df_campagne.shape)
print(df_campagne.columns)

(4092, 2)
Index(['campagne_campagne_id', 'campagne_keyphrases'], dtype='object')


### Sessie cleanen

In [255]:
df_sessie = pd.read_csv('../data_clean/Sessie_fixed.csv', sep=',')
df_sessie.drop(['sessie_eind_datum_tijd', 'sessie_sessie_nr_', 'sessie_start_datum_tijd'], axis=1, inplace=True)
print(df_sessie.shape)
print(df_sessie.columns)

(9403, 5)
Index(['sessie_activiteitstype', 'sessie_campagne', 'sessie_product',
       'sessie_sessie_id', 'sessie_thema_naam_'],
      dtype='object')


### Inschrijving cleanen

In [256]:
df_inschrijving = pd.read_csv('../data_clean/Inschrijving_fixed.csv', sep=',')
df_inschrijving.drop(['inschrijving_datum_inschrijving', 'inschrijving_campagne_naam_'], axis=1, inplace=True)
print(df_inschrijving.shape)
print(df_inschrijving.columns)

(49175, 6)
Index(['inschrijving_aanwezig_afwezig', 'inschrijving_bron',
       'inschrijving_contactfiche', 'inschrijving_inschrijving_id',
       'inschrijving_facturatie_bedrag', 'inschrijving_campagne'],
      dtype='object')


In [257]:
print_nunique(df_inschrijving, cont_col='inschrijving_contactfiche')

#account: niet gevonden, #contact: 18222


### Sessie_inschrijving cleanen

In [258]:
df_sessie_inschrijving = pd.read_csv('../data_clean/Sessie_inschrijving_fixed.csv', sep=',')
print(df_sessie_inschrijving.shape)
print(df_sessie_inschrijving.columns)

(88309, 3)
Index(['sessieinschrijving_sessieinschrijving_id', 'sessieinschrijving_sessie',
       'sessieinschrijving_inschrijving'],
      dtype='object')


In [259]:
df_sessie_inschrijving['sessieinschrijving_inschrijving'].nunique()

47312

### Sessie, Sessie_inschrijving en Inschrijving mergen

In [260]:
df_sessie_inschrijving_merge = df_sessie_inschrijving.merge(df_sessie, left_on='sessieinschrijving_sessie', right_on='sessie_sessie_id', how='inner')
df_sessie_inschrijving_merge.drop(['sessieinschrijving_sessie'], axis=1, inplace=True)
print(df_sessie_inschrijving_merge.shape)

(88309, 7)


In [261]:
df_sessie_inschrijving_merge = df_sessie_inschrijving_merge.merge(df_inschrijving, left_on='sessieinschrijving_inschrijving', right_on='inschrijving_inschrijving_id', how='inner')
df_sessie_inschrijving_merge.drop(['sessieinschrijving_inschrijving', 'inschrijving_inschrijving_id', 'sessie_sessie_id', 'sessieinschrijving_sessieinschrijving_id'], axis=1, inplace=True)
df_sessie_inschrijving_merge.drop_duplicates(inplace=True)
print(df_sessie_inschrijving_merge.shape)
print(df_sessie_inschrijving_merge.columns)

(53222, 9)
Index(['sessie_activiteitstype', 'sessie_campagne', 'sessie_product',
       'sessie_thema_naam_', 'inschrijving_aanwezig_afwezig',
       'inschrijving_bron', 'inschrijving_contactfiche',
       'inschrijving_facturatie_bedrag', 'inschrijving_campagne'],
      dtype='object')


### Campagne, Sessie, Sessie_inschrijving en Inschrijving mergen

In [262]:
df_camp_inschrijving_merge = df_sessie_inschrijving_merge.merge(df_campagne, left_on='inschrijving_campagne', right_on='campagne_campagne_id', how='inner')
df_camp_inschrijving_merge.drop(['inschrijving_campagne'], axis=1, inplace=True)
df_camp_inschrijving_merge.drop_duplicates(inplace=True)
print(df_camp_inschrijving_merge.shape)
print(df_camp_inschrijving_merge.columns)

(53222, 10)
Index(['sessie_activiteitstype', 'sessie_campagne', 'sessie_product',
       'sessie_thema_naam_', 'inschrijving_aanwezig_afwezig',
       'inschrijving_bron', 'inschrijving_contactfiche',
       'inschrijving_facturatie_bedrag', 'campagne_campagne_id',
       'campagne_keyphrases'],
      dtype='object')


In [263]:
df_camp_inschrijving_merge.head()

Unnamed: 0,sessie_activiteitstype,sessie_campagne,sessie_product,sessie_thema_naam_,inschrijving_aanwezig_afwezig,inschrijving_bron,inschrijving_contactfiche,inschrijving_facturatie_bedrag,campagne_campagne_id,campagne_keyphrases
0,Voka Politica,A57DE97C-460B-E811-80EF-001DD8B72B61,OV-Netwerkactiviteit-Regionaal,Netwerking,Aanwezig,Email,D41C3FC7-A96D-E111-B43A-00505680000A,0,A57DE97C-460B-E811-80EF-001DD8B72B61,"offline, netwerkevenement, ov - voka politica ..."
1,Voka Politica,A57DE97C-460B-E811-80EF-001DD8B72B61,OV-Netwerkactiviteit-Regionaal,Netwerking,Aanwezig,Email,83502ED6-C567-E711-80E8-001DD8B72B61,0,A57DE97C-460B-E811-80EF-001DD8B72B61,"offline, netwerkevenement, ov - voka politica ..."
2,Voka Politica,A57DE97C-460B-E811-80EF-001DD8B72B61,OV-Netwerkactiviteit-Regionaal,Netwerking,Aanwezig,Email,E2628A18-3836-E711-80E6-001DD8B72B61,0,A57DE97C-460B-E811-80EF-001DD8B72B61,"offline, netwerkevenement, ov - voka politica ..."
3,Voka Politica,A57DE97C-460B-E811-80EF-001DD8B72B61,OV-Netwerkactiviteit-Regionaal,Netwerking,Aanwezig,Email,07C159CA-428F-E211-A980-005056B06EB4,0,A57DE97C-460B-E811-80EF-001DD8B72B61,"offline, netwerkevenement, ov - voka politica ..."
4,Voka Politica,A57DE97C-460B-E811-80EF-001DD8B72B61,OV-Netwerkactiviteit-Regionaal,Netwerking,Aanwezig,Email,F71D6860-428F-E211-A980-005056B06EB4,0,A57DE97C-460B-E811-80EF-001DD8B72B61,"offline, netwerkevenement, ov - voka politica ..."


In [264]:
df_camp_inschrijving_merge['inschrijving_contactfiche'].nunique()

17881

### Account, Afspraak en Campagne mergen 
--> kunnen kiezen om alleen accounts en contacten te houden waarbij campagne aanwezig is of we kunnen kiezen om dit te droppen
--> dit gebeurt hieronder bij # drop rows where campagne_id is null ...

In [265]:
df_acc_afs_camp_merge = acc_afs_merged.merge(df_camp_inschrijving_merge, left_on='contact_contactpersoon_id'
                                             , right_on='inschrijving_contactfiche', how='left')

df_acc_afs_camp_merge.drop(['inschrijving_contactfiche', 'sessie_campagne'], axis=1, inplace=True)

# drop rows where campagne_campagne_id is NaN
df_acc_afs_camp_merge.dropna(subset=['campagne_campagne_id'], inplace=True)

# num_cols = df_acc_afs_camp_merge.select_dtypes(include=['float64', 'int64']).columns
# cat_cols = df_acc_afs_camp_merge.select_dtypes(include=['object']).columns
# # fill NaN values with -1 for numeric columns
# df_acc_afs_camp_merge[num_cols] = df_acc_afs_camp_merge[num_cols].fillna(-1)
# # fill NaN values with 'unknown' for categorical columns
# df_acc_afs_camp_merge[cat_cols] = df_acc_afs_camp_merge[cat_cols].fillna('unknown')

# rows samenvoegen
df_acc_afs_camp_merge['inschrijving_aanwezig_afwezig'] = df_acc_afs_camp_merge['inschrijving_aanwezig_afwezig'].astype(str)
df_acc_afs_camp_merge['inschrijving_aanwezig_afwezig'] = df_acc_afs_camp_merge['inschrijving_aanwezig_afwezig'].str.replace('Aanwezig', '1')
df_acc_afs_camp_merge['inschrijving_aanwezig_afwezig'] = df_acc_afs_camp_merge['inschrijving_aanwezig_afwezig'].str.replace('Afwezig', '0')
df_acc_afs_camp_merge['inschrijving_aanwezig_afwezig'] = df_acc_afs_camp_merge['inschrijving_aanwezig_afwezig'].str.replace('unknown', '-1')
df_acc_afs_camp_merge['inschrijving_aanwezig_afwezig'] = df_acc_afs_camp_merge['inschrijving_aanwezig_afwezig'].astype(int)

df_acc_afs_camp_merge['inschrijving_facturatie_bedrag'] = df_acc_afs_camp_merge['inschrijving_facturatie_bedrag'].astype(str)
df_acc_afs_camp_merge['inschrijving_facturatie_bedrag'] = df_acc_afs_camp_merge['inschrijving_facturatie_bedrag'].str.replace(',', '.')
df_acc_afs_camp_merge['inschrijving_facturatie_bedrag'] = df_acc_afs_camp_merge['inschrijving_facturatie_bedrag'].str.replace('unknown', '-1')
df_acc_afs_camp_merge['inschrijving_facturatie_bedrag'] = df_acc_afs_camp_merge['inschrijving_facturatie_bedrag'].astype(float)

df_acc_afs_camp_merge['inschrijving_bron'] = df_acc_afs_camp_merge['inschrijving_bron'].astype(str)
df_acc_afs_camp_merge['inschrijving_bron'] = df_acc_afs_camp_merge['inschrijving_bron'].str.replace('unknown', '-1')
df_acc_afs_camp_merge['inschrijving_bron'] = df_acc_afs_camp_merge['inschrijving_bron'].str.replace('Website', '1')
df_acc_afs_camp_merge['inschrijving_bron'] = df_acc_afs_camp_merge['inschrijving_bron'].str.replace('Email', '0')
df_acc_afs_camp_merge['inschrijving_bron'] = df_acc_afs_camp_merge['inschrijving_bron'].astype(int)

df_acc_afs_camp_merge['sessie_keyphrases'] = df_acc_afs_camp_merge['sessie_activiteitstype'] + ', ' \
            + df_acc_afs_camp_merge['sessie_thema_naam_'] + ', ' \
            + df_acc_afs_camp_merge['sessie_product']
df_acc_afs_camp_merge['sessie_keyphrases'] = df_acc_afs_camp_merge['sessie_keyphrases'].str.replace('unknown, ', '')
df_acc_afs_camp_merge['sessie_keyphrases'] = df_acc_afs_camp_merge['sessie_keyphrases'].str.replace(', unknown', '')

df_acc_afs_camp_merge.drop(['sessie_activiteitstype', 'sessie_thema_naam_', 'sessie_product'], axis=1, inplace=True)

df_acc_afs_camp_merge['sessie_keyphrases'] = df_acc_afs_camp_merge['sessie_keyphrases'].str.lower().str.replace('OV-', '')
df_acc_afs_camp_merge['sessie_keyphrases'] = df_acc_afs_camp_merge['sessie_keyphrases'].str.replace('ov-', '').str.strip()

print(df_acc_afs_camp_merge.shape)
print(df_acc_afs_camp_merge.columns)

(53943, 14)
Index(['financieledata_toegevoegde_waarde', 'financieledata_boekjaar_range',
       'account_account_id', 'contact_contactpersoon_id',
       'afspraak_keyphrases', 'afspraak_betreft', 'afspraak_account_gelinkt',
       'account_keyphrases', 'inschrijving_aanwezig_afwezig',
       'inschrijving_bron', 'inschrijving_facturatie_bedrag',
       'campagne_campagne_id', 'campagne_keyphrases', 'sessie_keyphrases'],
      dtype='object')


In [266]:
print_nunique(df_acc_afs_camp_merge)

#account: 5035, #contact: 11767


verwijder onnodige dataframes

In [267]:
try:
    del acc_afs_merged
    del df_campagne
    del df_sessie
    del df_inschrijving
    del df_sessie_inschrijving
    del df_sessie_inschrijving_merge
    del df_camp_inschrijving_merge
except:
    print('already deleted or non-existent')

### CDI Pageview, CDI Visit, CDI Mailing en CDI Sentemailclick cleanen en mergen

In [268]:
df_pageview = pd.read_csv('../data_clean/CDI_pageviews_fixed.csv', sep=',')
print(df_pageview.shape)
print(df_pageview.columns)

(1651, 20)
Index(['browser', 'campaign', 'contact', 'duration', 'operatingsystem',
       'pageview_id', 'referrertype', 'time', 'pagetitle', 'type', 'url',
       'viewedon', 'visit', 'visitorkey', 'webcontent', 'aangemaaktop',
       'gewijzigddoor', 'gewijzigdop', 'status', 'redenvanstatus'],
      dtype='object')


In [269]:
df_visit = pd.read_csv('../data_clean/CDI_visits_fixed.csv', sep=',')
print(df_visit.shape)
print(sorted(list(df_visit.columns)))

(24444, 35)
['visit_aangemaakt_op', 'visit_adobe_reader', 'visit_bounce', 'visit_browser', 'visit_campagne_code', 'visit_campaign', 'visit_contact', 'visit_contact_naam_', 'visit_containssocialprofile', 'visit_duration', 'visit_email_send', 'visit_ended_on', 'visit_entry_page', 'visit_exit_page', 'visit_first_visit', 'visit_gewijzigd_op', 'visit_ip_address', 'visit_ip_company', 'visit_ip_land', 'visit_ip_latitude', 'visit_ip_longitude', 'visit_ip_organization', 'visit_ip_postcode', 'visit_ip_stad', 'visit_ip_status', 'visit_keywords', 'visit_operating_system', 'visit_referrer', 'visit_referrer_type', 'visit_referring_host', 'visit_score', 'visit_started_on', 'visit_time', 'visit_total_pages', 'visit_visit_id']


##### Visit en Pageview mergen

In [270]:
df_visit_pageview = df_visit.merge(df_pageview, left_on='visit_visit_id', right_on='visit', how='left')
df_visit_pageview.drop(['visit', 'visit_ip_postcode', 'visit_aangemaakt_op', 'visit_adobe_reader', 'visit_campagne_code',
                        'visit_contact_naam_', 'visit_containssocialprofile', 'visit_ended_on', 'visit_ip_address',
                        'visit_ip_organization', 'visit_keywords', 'visit_ip_longitude', 'visit_ip_latitude', 'visit_referrer', 
                        'visit_score', 'visit_started_on', 'visit_ip_status', 'visit_time', 'visit_visit_id', 'visit_gewijzigd_op',
                        'browser', 'campaign', 'contact', 'duration', 'operatingsystem',
                        'pageview_id', 'referrertype', 'time', 'pagetitle', 'type', 'url',
                        'viewedon', 'visitorkey', 'webcontent', 'aangemaaktop', 'gewijzigddoor',
                        'gewijzigdop', 'status', 'redenvanstatus'], axis=1, inplace=True)

df_visit_pageview.drop_duplicates(inplace=True)

# from visit_entry_page and visit_exit_page, extract the page name until the ?
df_visit_pageview['visit_entry_page'] = df_visit_pageview['visit_entry_page'].str.split('?').str[0]
df_visit_pageview['visit_exit_page'] = df_visit_pageview['visit_exit_page'].str.split('?').str[0]

df_visit_pageview['visit_keyphrases'] = df_visit_pageview['visit_browser'].str.lower() + ', ' \
            + df_visit_pageview['visit_operating_system'].str.lower() + ', ' \
            + df_visit_pageview['visit_ip_stad'] + ', ' \
            + df_visit_pageview['visit_ip_land'] + ', ' + df_visit_pageview['visit_ip_company'] + ', ' \
            + df_visit_pageview['visit_entry_page'] + ', ' \
            + df_visit_pageview['visit_exit_page'] + ', '  \
            + df_visit_pageview['visit_referring_host'].astype(str) \
            + ', ' + df_visit_pageview['visit_referrer_type'].astype(str)

df_visit_pageview.drop(['visit_browser', 'visit_operating_system', 'visit_ip_stad', 'visit_ip_land', 'visit_ip_company', 
                        'visit_entry_page', 'visit_exit_page', 'visit_referring_host', 'visit_referrer_type'], axis=1, inplace=True)

df_visit_pageview['visit_keyphrases'] = df_visit_pageview['visit_keyphrases'].str.replace('unknown, ', '')
df_visit_pageview['visit_keyphrases'] = df_visit_pageview['visit_keyphrases'].str.replace(', unknown', '')
df_visit_pageview['visit_keyphrases'] = df_visit_pageview['visit_keyphrases'].str.replace('unknown', '')
df_visit_pageview['visit_keyphrases'] = df_visit_pageview['visit_keyphrases'].str.lower()
df_visit_pageview['visit_keyphrases'] = df_visit_pageview['visit_keyphrases'].str.replace(r'https://www\.([^/]+)/', r'\1/', regex=True)
df_visit_pageview['visit_keyphrases'] = df_visit_pageview['visit_keyphrases'].str.replace(r'https://www\.[^/]+/([^/]+)/', r'\1/', regex=True)
df_visit_pageview['visit_keyphrases'] = df_visit_pageview['visit_keyphrases'].str.replace(r'https://www\.[^/]+/[^/]+/([^/]+)', r'\1', regex=True)
df_visit_pageview['visit_keyphrases'] = df_visit_pageview['visit_keyphrases'].str.replace('.be/', ' ')
df_visit_pageview['visit_keyphrases'] = df_visit_pageview['visit_keyphrases'].str.replace('/', ' ')
df_visit_pageview['visit_keyphrases'] = df_visit_pageview['visit_keyphrases'].str.replace('-', ' ')
df_visit_pageview['visit_keyphrases'] = df_visit_pageview['visit_keyphrases'].str.replace('e mail', 'mail')

df_visit_pageview['visit_bouce'] = df_visit_pageview['visit_bounce'].astype(str)
df_visit_pageview['visit_bounce'] = df_visit_pageview['visit_bounce'].str.replace('Ja', '1')
df_visit_pageview['visit_bounce'] = df_visit_pageview['visit_bounce'].str.replace('Nee', '0')
df_visit_pageview['visit_bouce'] = df_visit_pageview['visit_bounce'].str.replace('unknown', '-1')
df_visit_pageview['visit_bouce'] = df_visit_pageview['visit_bounce'].astype(int)

df_visit_pageview['visit_duration'] = df_visit_pageview['visit_duration'].astype(int)

df_visit_pageview['visit_first_visit'] = df_visit_pageview['visit_first_visit'].str.replace('Ja', '1')
df_visit_pageview['visit_first_visit'] = df_visit_pageview['visit_first_visit'].str.replace('Nee', '0')
df_visit_pageview['visit_first_visit'] = df_visit_pageview['visit_first_visit'].str.replace('unknown', '-1')
df_visit_pageview['visit_first_visit'] = df_visit_pageview['visit_first_visit'].astype(int)

df_visit_pageview['visit_total_pages'] = df_visit_pageview['visit_total_pages'].replace('unknown', '-1.0')
df_visit_pageview['visit_total_pages'] = df_visit_pageview['visit_total_pages'].astype(float)

print(df_visit_pageview.shape)
print(df_visit_pageview.columns)

(23654, 9)
Index(['visit_bounce', 'visit_campaign', 'visit_contact', 'visit_duration',
       'visit_email_send', 'visit_first_visit', 'visit_total_pages',
       'visit_keyphrases', 'visit_bouce'],
      dtype='object')


In [271]:
df_visit_pageview['visit_keyphrases'].unique()

array(['firefox, windows, gentbrugge, belgium, belgacom sa, voka activiteiten de wissel inspiratiesessies rond bedrijfsoverdracht en overname, voka activiteiten de wissel inspiratiesessies rond bedrijfsoverdracht en overname, mail',
       'safari, mac, brecht, belgium, telenet, voka activiteiten voka netwerkfeest met viering voka legende, voka activiteiten voka netwerkfeest met viering voka legende, mail',
       'safari, mac, hove, belgium, telenet, voka bigrefresh, voka bigrefresh, mail',
       ...,
       'safari, mac, hoboken, belgium, telenet, voka activiteiten infosessie insidertrip marokko met daniel termont, voka activiteiten insidertrip marokko, mail',
       'safari, mac, wervik, belgium, belgacom sa, voka activiteiten infosessie insidertrip marokko met daniel termont, voka activiteiten insidertrip marokko, mail',
       'safari, mac, ghent, belgium, telenet, voka activiteiten week internationaal ondernemen zwitserland, voka wio, mail'],
      dtype=object)

##### Mailing en sentemailclick mergen

In [272]:
df_mailing = pd.read_csv('../data_clean/CDI_mailing_fixed.csv', sep=',')
df_mailing.drop(['mailing_sent_on'], axis=1, inplace=True)
df_mailing.drop_duplicates(inplace=True)
print(df_mailing.shape)
print(df_mailing.columns)

(860, 3)
Index(['mailing_mailing_id', 'mailing_name', 'mailing_onderwerp'], dtype='object')


In [273]:
df_click = pd.read_csv('../data_clean/CDI_sent_email_clicks_fixed.csv', sep=',')
df_click.drop(['sentemail_kliks_contact', 'sentemail_kliks_sent_email_id'], axis=1, inplace=True)
df_click.drop_duplicates(inplace=True)
print(df_click.shape)
print(df_click.columns)

(4807, 2)
Index(['sentemail_kliks_clicks', 'sentemail_kliks_e_mail_versturen'], dtype='object')


In [274]:
df_mailing_merged = df_mailing.merge(df_click, left_on='mailing_mailing_id', right_on='sentemail_kliks_e_mail_versturen', how='inner')
df_mailing_merged.drop(['sentemail_kliks_e_mail_versturen'], axis=1, inplace=True)
print(df_mailing_merged.shape)
print(df_mailing_merged.columns)

(4807, 4)
Index(['mailing_mailing_id', 'mailing_name', 'mailing_onderwerp',
       'sentemail_kliks_clicks'],
      dtype='object')


In [275]:
df_mailing_merged = df_mailing_merged.groupby(['mailing_mailing_id', 'mailing_name', 'mailing_onderwerp'], 
                          as_index=False).agg({'sentemail_kliks_clicks': 'sum', 
                                                **{col: 'first' for col in df_mailing_merged.columns if col not in ['sentemail_kliks_clicks']}})

In [276]:
df_mailing_merged.shape

(762, 4)

##### pageview_visit en mailing sentemailclick mergen

In [277]:
df_pageview_visit_mailing = df_visit_pageview.merge(df_mailing_merged, left_on='visit_email_send', right_on='mailing_mailing_id', how='left')
df_pageview_visit_mailing.drop(['visit_email_send', 'mailing_mailing_id', 'visit_campaign'], axis=1, inplace=True)

df_pageview_visit_mailing['mailing_keyphrases'] = df_pageview_visit_mailing['mailing_name'] + ', ' + df_pageview_visit_mailing['mailing_onderwerp']
df_pageview_visit_mailing['mailing_keyphrases'] = df_pageview_visit_mailing['mailing_keyphrases'].str.replace('unknown, ', '')
df_pageview_visit_mailing['mailing_keyphrases'] = df_pageview_visit_mailing['mailing_keyphrases'].str.replace(', unknown', '')
df_pageview_visit_mailing['mailing_keyphrases'] = df_pageview_visit_mailing['mailing_keyphrases'].str.replace('unknown', '')
df_pageview_visit_mailing['mailing_keyphrases'] = df_pageview_visit_mailing['mailing_keyphrases'].str.replace('OV-', '')
df_pageview_visit_mailing['mailing_keyphrases'] = df_pageview_visit_mailing['mailing_keyphrases'].str.replace('ov-', '')
df_pageview_visit_mailing['mailing_keyphrases'] = df_pageview_visit_mailing['mailing_keyphrases'].str.replace('OV -', '')
df_pageview_visit_mailing['mailing_keyphrases'] = df_pageview_visit_mailing['mailing_keyphrases'].str.replace('ov -', '')
df_pageview_visit_mailing['mailing_keyphrases'] = df_pageview_visit_mailing['mailing_keyphrases'].str.replace('OV ', '')
df_pageview_visit_mailing['mailing_keyphrases'] = df_pageview_visit_mailing['mailing_keyphrases'].str.replace('\xa0', '')

teams_dict = {
'JO': 'jong ondernemen ',
'DO': 'duurzaam ondernemen ',
'IN': 'innovatie digitalisering ',
'IO': 'internationaal ondernemen ',
'AO': 'arbeidsmarkt ',
'EX': 'expert ',
'GR': 'Groei ',
'BB': 'Belangenbehartiging ',
'CO': 'communicatie ',
'NW': 'netwerking ',
'HA': 'Haven ',
'MA': 'match '
}

df_pageview_visit_mailing['mailing_keyphrases'] = df_pageview_visit_mailing['mailing_keyphrases'].str.replace(r'([A-Z]+)-', lambda x: teams_dict.get(x.group(1), x.group(1)), regex=True)
df_pageview_visit_mailing['mailing_keyphrases'] = df_pageview_visit_mailing['mailing_keyphrases'].str.replace('|', '')
df_pageview_visit_mailing['mailing_keyphrases'] = df_pageview_visit_mailing['mailing_keyphrases'].str.replace(':', '')
df_pageview_visit_mailing['mailing_keyphrases'] = df_pageview_visit_mailing['mailing_keyphrases'].str.replace(r'\d{6,8}|\d+\s\d+\s\d+', '', regex=True)
df_pageview_visit_mailing['mailing_keyphrases'] = df_pageview_visit_mailing['mailing_keyphrases'].str.replace(r'\'s', '', regex=True)
df_pageview_visit_mailing['mailing_keyphrases'] = df_pageview_visit_mailing['mailing_keyphrases'].str.replace('  ', ' ')
df_pageview_visit_mailing['mailing_keyphrases'] = df_pageview_visit_mailing['mailing_keyphrases'].str.replace('-', ' ').str.strip()
df_pageview_visit_mailing['mailing_keyphrases'] = df_pageview_visit_mailing['mailing_keyphrases'].str.lower().str.strip()

df_pageview_visit_mailing['sentemail_kliks_clicks'] = df_pageview_visit_mailing['sentemail_kliks_clicks'].replace('unknown', '0')
df_pageview_visit_mailing['sentemail_kliks_clicks'] = df_pageview_visit_mailing['sentemail_kliks_clicks'].astype(int)

df_pageview_visit_mailing.drop(['mailing_name', 'mailing_onderwerp'], axis=1, inplace=True)

print(df_pageview_visit_mailing.shape)
print(df_pageview_visit_mailing.columns)

(23654, 9)
Index(['visit_bounce', 'visit_contact', 'visit_duration', 'visit_first_visit',
       'visit_total_pages', 'visit_keyphrases', 'visit_bouce',
       'sentemail_kliks_clicks', 'mailing_keyphrases'],
      dtype='object')


In [278]:
df_pageview_visit_mailing.head()

Unnamed: 0,visit_bounce,visit_contact,visit_duration,visit_first_visit,visit_total_pages,visit_keyphrases,visit_bouce,sentemail_kliks_clicks,mailing_keyphrases
0,1,7DCF89E7-D469-E111-B43A-00505680000A,0,0,1.0,"firefox, windows, gentbrugge, belgium, belgaco...",1,6,"groei overdracht en overname, vragen rond over..."
1,1,6913126E-B56A-E111-B43A-00505680000A,0,0,1.0,"safari, mac, brecht, belgium, telenet, voka ac...",1,1254,"nieuwsbrief , nieuws uit je regio, rechtstreek..."
2,1,E14E454F-3B6F-E111-B43A-00505680000A,0,0,1.0,"safari, mac, hove, belgium, telenet, voka bigr...",1,15,"nieuwsbrief, bedrijfsnieuws uit je regio, rech..."
3,0,7DCF89E7-D469-E111-B43A-00505680000A,103,0,2.0,"firefox, windows, gentbrugge, belgium, belgaco...",0,67,netwerking nieuws uit aalst_openingsweek voka ...
4,1,D4E2762D-8E10-EC11-8123-001DD8B72B61,0,1,1.0,"android, linux, amay, belgium, telenet operati...",1,54,"nieuwsbrief , nieuws uit je regio, rechtstreek..."


### Total merge

In [279]:
df_merged_total = df_acc_afs_camp_merge.merge(df_pageview_visit_mailing, left_on='contact_contactpersoon_id', right_on='visit_contact', how='left')
df_merged_total.drop(['visit_contact', 'campagne_campagne_id'], axis=1, inplace=True)

num_cols = df_merged_total.select_dtypes(include=['float64', 'int64']).columns
cat_cols = df_merged_total.select_dtypes(include=['object']).columns

# fill NaN values with -1 for numeric columns
df_merged_total[num_cols] = df_merged_total[num_cols].fillna(-1)
# fill NaN values with 'unknown' for categorical columns
df_merged_total[cat_cols] = df_merged_total[cat_cols].fillna('unknown')

# visit_bounce blijft object voor een of ander manier dus hier veranderen
df_merged_total['visit_bouce'] = df_merged_total['visit_bouce'].replace('unknown', -1)
df_merged_total['visit_bouce'] = df_merged_total['visit_bouce'].astype(int)

# verander alle int64 en int32 naar int8
int_cols = df_merged_total.select_dtypes(include=['int64', 'int32']).columns
df_merged_total[int_cols] = df_merged_total[int_cols].astype('int8')

df_merged_total.shape

(250666, 21)

In [280]:
print_nunique(df_merged_total)
df_merged_total.drop(['contact_contactpersoon_id', 'account_account_id'], axis=1, inplace=True)

#account: 5035, #contact: 11767


In [281]:
df_merged_total.head()

Unnamed: 0,financieledata_toegevoegde_waarde,financieledata_boekjaar_range,afspraak_keyphrases,afspraak_betreft,afspraak_account_gelinkt,account_keyphrases,inschrijving_aanwezig_afwezig,inschrijving_bron,inschrijving_facturatie_bedrag,campagne_keyphrases,sessie_keyphrases,visit_bounce,visit_duration,visit_first_visit,visit_total_pages,visit_keyphrases,visit_bouce,sentemail_kliks_clicks,mailing_keyphrases
0,413465.0,5.0,unknown,-1.0,-1.0,"familiebedrijf, diensten, vastgoed, melle 9090...",1,1,50.0,"offline, netwerkevenement, ov - kick-off commu...","ma events, netwerking, netwerkactiviteit-project",unknown,-1.0,-1.0,-1.0,unknown,-1,-1.0,unknown
1,494107.0,9.0,unknown,-1.0,-1.0,"bedrijf, diensten, consultancy, geraardsbergen...",1,1,0.0,"offline, opleiding, corona round tables - sales","gr werking, marketing & sales, opleidingen-llt",unknown,-1.0,-1.0,-1.0,unknown,-1,-1.0,unknown
2,1687010.57,3.0,"retentie lidmaatschap, ledenbezoek 2023, indu...",2.0,1.0,"bedrijf, diensten, milieu, meilegem 9630 ouden...",1,1,0.0,"offline, netwerkevenement, nieuwjaarsreceptie ...","nw voka connect gent, netwerking, netwerkactiv...",1,0.0,0.0,1.0,"chrome, windows, ghent, belgium, pro, voka nod...",1,52.0,"jong ondernemen stamgasten 2023, uitnodiging s..."
3,1687010.57,3.0,"retentie lidmaatschap, ledenbezoek 2023, indu...",2.0,1.0,"bedrijf, diensten, milieu, meilegem 9630 ouden...",1,1,0.0,"offline, netwerkevenement, nieuwjaarsreceptie ...","nw voka connect gent, netwerking, netwerkactiv...",1,0.0,0.0,1.0,"chrome, windows, ghent, belgium, telenet, voka...",1,18.0,netwerking nieuwjaarsreceptie vlaamse ardennen...
4,1687010.57,3.0,"retentie lidmaatschap, ledenbezoek 2023, indu...",2.0,1.0,"bedrijf, diensten, milieu, meilegem 9630 ouden...",1,1,0.0,"offline, netwerkevenement, nieuwjaarsreceptie ...","nw voka connect gent, netwerking, netwerkactiv...",1,0.0,0.0,1.0,"chrome, windows, ghent, belgium, pro, voka act...",1,136.0,"nieuwsbrief , nieuws van voka oost vlaanderen,..."


In [282]:
df_merged_total.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 250666 entries, 0 to 250665
Data columns (total 19 columns):
 #   Column                             Non-Null Count   Dtype  
---  ------                             --------------   -----  
 0   financieledata_toegevoegde_waarde  250666 non-null  float64
 1   financieledata_boekjaar_range      250666 non-null  float64
 2   afspraak_keyphrases                250666 non-null  object 
 3   afspraak_betreft                   250666 non-null  float64
 4   afspraak_account_gelinkt           250666 non-null  float64
 5   account_keyphrases                 250666 non-null  object 
 6   inschrijving_aanwezig_afwezig      250666 non-null  int8   
 7   inschrijving_bron                  250666 non-null  int8   
 8   inschrijving_facturatie_bedrag     250666 non-null  float64
 9   campagne_keyphrases                250666 non-null  object 
 10  sessie_keyphrases                  250666 non-null  object 
 11  visit_bounce                       2506

In [283]:
if os.path.exists('../data_clean/merged_total.csv'):
    os.remove('../data_clean/merged_total.csv')
df_merged_total.to_csv('../data_clean/merged_total.csv', index=False)