In [3]:
import pandas as pd
import numpy as np
import os

### Notes:

##### marketing_pressure wordt berekend adhv deze kolommen : 

```Python
[
    persoon_mail_thema_, persoon_mail_type_, persoon_marketingcommunicatie, visit_first_visit, 
    visit_total_pages, sentemail_kliks_clicks, inschrijving_bron
]
```

- Hierbij wordt sentemail_kliks_clicks eerst omgezet naar een frequentie (# clicks / # emails)

- Bij inschrijving bron is email = 1, website 0 en anders -1

- Persoon_marketingcommunicatie wordt omgezet naar een numerieke kolom met 1 voor Flexibel en 0 voor strict en -1 voor geen waarde

##### keyphrases worden berekend adhv deze kolommen : 

```Python
[
    account_adres, account_onderneming, contact_functietitel, functie_naam, activiteitscode_naam, afspraak_thema, afspraak_onderwerp, afspraak_keyphrases, afspraak_betreft, campagne_naam, campagne_type_campagne, campagne_soort_campagne, mailing_name, mailing_onderwerp
]
```


### Account en contact merge

In [301]:
df_account = pd.read_csv('../data_clean/Account_fixed.csv')

# alleen de actieve accounts
df_account = df_account[df_account['account_reden_van_status'] != 'Inactief']
df_account = df_account[df_account['account_reden_van_status'] != 'Stopzetting']

# alleen de accounts uit Oost-Vlaanderen
df_account = df_account[df_account['account_adres_provincie'] == 'Oost-Vlaanderen']

# account adres samenvoegen
df_account['account_adres'] = df_account['account_adres_plaats'].str.lower() + ' ' \
            + df_account['account_adres_geografische_subregio'].str.lower()
df_account['account_adres'] = df_account['account_adres'].str.replace(r'\([a-z.-]+\)', '', regex=True).str.replace('  ', ' ')\

# account ondernemingstype samenvoegen
df_account['account_onderneming'] = df_account['account_ondernemingstype'] + ', ' \
                            + df_account['account_ondernemingsaard'] + ', ' \
                            + df_account['account_primaire_activiteit'] + ' ' \

df_account['account_onderneming'] = df_account['account_onderneming'].str.replace('unknown', '').str.replace(', , ', '') \
                                                            .str.strip().str.lower() \
                                                            .str.replace(r',$|^,', '', regex=True) \
                                                            .str.replace('&', '').str.replace('-', '') \

# drop kolommen
df_account.drop(['account_industriezone_naam_', 'account_oprichtingsdatum', 'account_reden_van_status',
           'account_status', 'account_voka_nr_', 'account_is_voka_entiteit','account_adres_geografische_regio', 
           'account_adres_geografische_subregio', 'account_adres_plaats', 'account_adres_postcode', 
           'account_adres_provincie', 'account_adres_land', 'account_ondernemingstype', 
           'account_ondernemingsaard', 'account_primaire_activiteit'], axis=1, inplace=True)

##############################################################################################################
#### CONTACT ####
##############################################################################################################

df_contact = pd.read_csv('../data_clean/Contact_fixed.csv')
df_contact = df_contact[df_contact['contact_status'] != 'Inactief']
df_contact['contact_functietitel'] = df_contact['contact_functietitel'].str.lower().str.replace(r'[^\w\s]', '', regex=True) \
                                                        .str.replace('  ', ' ').str.strip()
df_contact.drop(['contact_status', 'contact_voka_medewerker'], axis=1, inplace=True)

##############################################################################################################
#### CONTACT ACCOUNT MERGE ####
##############################################################################################################

df_account_contact = df_account.merge(df_contact, left_on='account_account_id', right_on='contact_account', how='inner')
df_account_contact.drop(['contact_account'], axis=1, inplace=True)

### Persoon merge

In [302]:
df_persoon = pd.read_csv('../data_clean/Persoon_fixed.csv')

df_persoon = df_persoon[df_persoon['persoon_reden_van_status'] == 'Actief']

for col in df_persoon.columns:
    if col.__contains__('persoon_mail_regio'):
        df_persoon.drop(col, axis=1, inplace=True)

df_persoon.drop(['persoon_persoonsnr_', 'persoon_web_login', 'persoon_reden_van_status'], axis=1, inplace=True)

##############################################################################################################
#### ACCOUNT CONTACT PERSOON MERGE ####
##############################################################################################################

df_account_contact_persoon = df_account_contact.merge(df_persoon, left_on='contact_persoon_id', right_on='persoon_persoon_id', how='left')
df_account_contact_persoon.drop(['contact_persoon_id', 'persoon_persoon_id'], axis=1, inplace=True)

# NaN waarden opvullen
for col in df_account_contact_persoon.columns:
    if col.__contains__('persoon_mail_type') or col.__contains__('persoon_mail_thema'):
        df_account_contact_persoon[col] = df_account_contact_persoon[col].fillna(0)

df_account_contact_persoon['persoon_marketingcommunicatie'] = df_account_contact_persoon['persoon_marketingcommunicatie'].fillna('-1')
df_account_contact_persoon['persoon_marketingcommunicatie'] = df_account_contact_persoon['persoon_marketingcommunicatie'] \
                                                            .str.replace('Strikt', '0').str.replace('Flexibel', '1') \
                                                            .str.replace('Uitgeschreven', '-1').str.replace('unknown', '-1').astype(int)

marketing_pressure_cols = [col for col in df_account_contact_persoon.columns if col.__contains__('persoon_mail_type') 
                           or col.__contains__('persoon_mail_thema') or col.__contains__('persoon_marketingcommunicatie')]

marketing_pressure_cols = [col for col in df_account_contact_persoon.columns if col.__contains__('persoon_mail_type') 
                           or col.__contains__('persoon_mail_thema') or col.__contains__('persoon_marketingcommunicatie')]

df_account_contact_persoon['marketing_pressure'] = df_account_contact_persoon[marketing_pressure_cols].sum(axis=1)
df_account_contact_persoon['marketing_pressure'] = df_account_contact_persoon['marketing_pressure'].astype(int)
df_account_contact_persoon.drop(marketing_pressure_cols, axis=1, inplace=True)

### Functie

In [303]:
df_contact_functie = pd.read_csv('../data_clean/Contact_functie_fixed.csv')
df_functie = pd.read_csv('../data_clean/Functie_fixed.csv')

df_account_contact_persoon = df_account_contact_persoon.merge(df_contact_functie, left_on='contact_contactpersoon_id'
                                              , right_on='contactfunctie_contactpersoon', how='inner')
df_account_contact_persoon.drop(['contactfunctie_contactpersoon'], axis=1, inplace=True)


df_account_contact_persoon = df_account_contact_persoon.merge(df_functie, left_on='contactfunctie_functie'
                                                              , right_on='functie_functie_id', how='inner')
df_account_contact_persoon.drop(['contactfunctie_functie', 'functie_functie_id'], axis=1, inplace=True)

# Define a custom aggregation function to merge 'functie_naam'
def merge_functie_naam(series):
    return ', '.join(series)

# Group by 'account_account_id' and 'contact_contactpersoon_id' and merge 'functie_naam'
df_account_contact_persoon = df_account_contact_persoon.groupby(
    ['account_account_id', 'contact_contactpersoon_id'], 
    as_index=False).agg({'functie_naam': merge_functie_naam, 
                         **{col: 'first' for col in df_account_contact_persoon.columns if col != 'functie_naam'}})

df_account_contact_persoon['functie_naam'] = df_account_contact_persoon['functie_naam'].str.lower() \
        .str.replace(r'[^\w\s]', '', regex=True).str.replace('  ', ' ').str.strip()

### Activiteitscode

In [304]:
df_account_activiteitscode = pd.read_csv('../data_clean/Account_activiteitscode_fixed.csv')
df_activiteitscode = pd.read_csv('../data_clean/Activiteitscode_fixed.csv')

df_activiteit = df_account_activiteitscode.merge(df_activiteitscode, left_on='account_activiteitscode_activiteitscode', right_on='activiteitscode_activiteitscode_id', how='inner')

df_activiteit.drop(['account_activiteitscode_activiteitscode', 'activiteitscode_activiteitscode_id'
                    , 'activiteitscode_status', 'account_activiteitscode_inf_account_inf_activiteitscodeid'], axis=1, inplace=True)

# where account_activiteitscode_account is not unique, combine the activiteitscode_naam
def merge_activiteitscode_naam(series):
    return ', '.join(series)

df_activiteit = df_activiteit.groupby(
    ['account_activiteitscode_account'], 
    as_index=False).agg({'activiteitscode_naam': merge_activiteitscode_naam, 
                         **{col: 'first' for col in df_activiteit.columns if col != 'activiteitscode_naam'}})

df_activiteit['activiteitscode_naam'] = df_activiteit['activiteitscode_naam'].str.replace(' en ', ' ').str.replace(' & ', ' ') \
                                                            .str.replace('-', '').str.replace('  ', ' ').str.lower().str.strip()

df_account_contact_persoon = df_account_contact_persoon.merge(df_activiteit, left_on='account_account_id'
                                              , right_on='account_activiteitscode_account', how='left')
df_account_contact_persoon.drop(['account_activiteitscode_account'], axis=1, inplace=True)
df_account_contact_persoon.shape

(67225, 8)

### Financieel

In [305]:
df_account_financieel = pd.read_csv('../data_clean/Account_financiële_data_fixed.csv', sep=',')
df_account_financieel.drop(['financieledata_gewijzigd_op', 'financieledata_fte', 'financieledata_aantal_maanden'], axis=1, inplace=True)
df_account_financieel['financieledata_toegevoegde_waarde'] = df_account_financieel['financieledata_toegevoegde_waarde'].astype(str)

df_account_contact_persoon_finance = df_account_contact_persoon.merge(df_account_financieel, left_on='account_account_id', right_on='financieledata_ondernemingid', how='left')
df_account_contact_persoon_finance.drop(['financieledata_ondernemingid'], axis=1, inplace=True)

# turn financieledata_toegevoegde_waarde into a numeric value
df_account_contact_persoon_finance['financieledata_toegevoegde_waarde'] = df_account_contact_persoon_finance['financieledata_toegevoegde_waarde'] \
                                                                    .str.replace(',', '.') \
                                                                    .str.replace('unknown', '0') \
                                                                    .astype(float)

# De financieel toegevoegde waarde optellen per account en de boekjaren hun range gebruiken (vb 2007 - 2022 => 15 jaar)
df_account_contact_persoon_finance = df_account_contact_persoon_finance.groupby(
    ['account_account_id', 'contact_contactpersoon_id'], 
    as_index=False).agg({'financieledata_toegevoegde_waarde': 'sum', 
                         'financieledata_boekjaar': lambda x: x.max() - x.min(), 
                         **{col: 'first' for col in df_account_contact_persoon_finance.columns if col not in ['financieledata_toegevoegde_waarde', 'financieledata_boekjaar']}})

df_account_contact_persoon_finance.rename(columns={'financieledata_boekjaar': 'boekjaar_range', 'financieledata_toegevoegde_waarde': 'toegevoegde_waarde'}, inplace=True)

df_account_contact_persoon_finance['avg_waarde_jaar'] = df_account_contact_persoon_finance['toegevoegde_waarde'] / df_account_contact_persoon_finance['boekjaar_range']
df_account_contact_persoon_finance['avg_waarde_jaar'] = df_account_contact_persoon_finance['avg_waarde_jaar'].round(2)

### Afspraken

In [306]:
# Csv files inlezen
df_afspraak_acc_gelinkt = pd.read_csv('../data_clean/Afspraak_account_gelinkt_cleaned_fixed.csv', sep=',')
df_afspraak_acc_gelinkt.drop(['afspraak_account_gelinkt_account'], axis=1, inplace=True)

df_afspraak_betreft_acc = pd.read_csv('../data_clean/Afspraak_betreft_account_cleaned_fixed.csv', sep=',')
df_afspraak_betreft_acc.drop(['afspraak_betreft_account_betreft_id'], axis=1, inplace=True)

df_afspraak_betreft_contact = pd.read_csv('../data_clean/Afspraak_betreft_contact_cleaned_fixed.csv', sep=',')
df_afspraak_betreft_contact.drop(['afspraak_betreft_contactfiche_betreft_id'], axis=1, inplace=True)

df_afspraak_alle = pd.read_csv('../data_clean/Afspraak_alle_fixed.csv', sep=',')

# Kolomnamen hernoemen en afspraakt_betreft toevoegen (betreft account = 1, betreft contact = 0)
df_afspraak_acc_gelinkt.columns = ['afspraak_afspraak_id', 'afspraak_thema', 'afspraak_subthema', 'afspraak_onderwerp', 'afspraak_eindtijd', 'afspraak_keyphrases']
df_afspraak_betreft_acc.columns = ['afspraak_afspraak_id', 'afspraak_thema', 'afspraak_subthema', 'afspraak_onderwerp', 'afspraak_eindtijd', 'afspraak_keyphrases']
df_afspraak_betreft_contact.columns = ['afspraak_afspraak_id', 'afspraak_thema', 'afspraak_subthema', 'afspraak_onderwerp', 'afspraak_eindtijd', 'afspraak_keyphrases']

df_afspraak_betreft_acc["afspraak_betreft"] = 'account'
df_afspraak_acc_gelinkt["afspraak_betreft"] = 'account'
df_afspraak_betreft_contact["afspraak_betreft"] = 'contact'

# Get the unique afspraak_afspraak_id's from afspraak_acc_gelinkt
acc_gelinkt_id_list = df_afspraak_acc_gelinkt['afspraak_afspraak_id'].unique()

# Concatenate the dataframes
df_afspraken = pd.concat([df_afspraak_betreft_acc, df_afspraak_acc_gelinkt, df_afspraak_betreft_contact], ignore_index=True)
df_afspraken.drop_duplicates(inplace=True)

# Mergen met Afspraak_alle
df_afspraken = df_afspraken.merge(df_afspraak_alle, left_on='afspraak_afspraak_id', right_on='afspraak_alle_afspraak_id', how='inner')

# String cleanup
df_afspraken['afspraak_thema'] = df_afspraken['afspraak_subthema'].str.replace('\(', '', regex=True)
df_afspraken['afspraak_thema'] = df_afspraken['afspraak_thema'].str.replace('\)', '', regex=True)
df_afspraken['afspraak_onderwerp'] = df_afspraken['afspraak_onderwerp'].str.lower().astype(str)

# Drop kolommen
df_afspraken.drop(['afspraak_alle_afspraak_id', 'afspraak_eindtijd', 'afspraak_subthema'], axis=1, inplace=True)
df_afspraken.drop_duplicates(inplace=True)

# show rows where afspraak_afspraak_id is not unique
list_to_change_afspraak_betreft_to_2 = df_afspraken[df_afspraken['afspraak_afspraak_id'].duplicated(keep=False)]['afspraak_afspraak_id'].unique()

# for every afspraak_afspraak_id that is not unique, change afspraak_betreft to 2
for afspraak_id in list_to_change_afspraak_betreft_to_2:
    df_afspraken.loc[df_afspraken['afspraak_afspraak_id'] == afspraak_id, 'afspraak_betreft'] = 2

df_afspraken.drop_duplicates(inplace=True)

for afspraak_id in acc_gelinkt_id_list:
    df_afspraken.loc[df_afspraken['afspraak_afspraak_id'] == afspraak_id, 'afspraak_account_gelinkt'] = 1

df_afspraken['afspraak_account_gelinkt'].fillna(0, inplace=True)

# Mergen met Activiteit_vereist_contact
df_activiteit_vereist_contact = pd.read_csv('../data_clean/Activiteit_vereist_contact_fixed.csv', sep=',')
df_afspraken_total = df_afspraken.merge(df_activiteit_vereist_contact, left_on='afspraak_afspraak_id', right_on='activiteitvereistcontact_activityid_id', how='inner')
df_afspraken_total.drop(['activiteitvereistcontact_activityid_id', 'afspraak_afspraak_id'], axis=1, inplace=True)

##############################################################################################################
#### ACCOUNT CONTACT PERSOON ACTIVITEIT MERGE ####
##############################################################################################################

merged_total = df_account_contact_persoon_finance.merge(df_afspraken_total, left_on=['contact_contactpersoon_id'], right_on=['activiteitvereistcontact_reqattendee'], how='left')
merged_total.drop(['activiteitvereistcontact_reqattendee'], axis=1, inplace=True)

# NaN values vervangen
merged_total['boekjaar_range'].fillna(0, inplace=True)
merged_total['afspraak_betreft'].fillna(-1, inplace=True)
merged_total['afspraak_account_gelinkt'].fillna(-1, inplace=True)

merged_total['activiteitscode_naam'].fillna('unknown', inplace=True)
merged_total['afspraak_thema'].fillna('unknown', inplace=True)
merged_total['afspraak_onderwerp'].fillna('unknown', inplace=True)
merged_total['afspraak_keyphrases'].fillna('unknown', inplace=True)

# Strings cleanen
merged_total['afspraak_thema'] = merged_total['afspraak_thema'].str.lower() \
        .str.replace(r'[^\w\s]', '', regex=True).str.replace('  ', ' ').str.strip()

merged_total['afspraak_onderwerp'] = merged_total['afspraak_onderwerp'].str.replace('ov-', '').str.replace('ov -', '') \
        .str.replace('ov ', '').str.replace('-', ' ').str.replace(r'[^\w\s]', '', regex=True).str.replace('  ', ' ').str.strip()

merged_total['afspraak_keyphrases'] = merged_total['afspraak_keyphrases'].str.lower().str.replace(r'[^\w\s]', '', regex=True) \
                                                                            .str.replace('  ', ' ').str.strip()

merged_total.drop_duplicates(inplace=True)
merged_total.shape

(68610, 16)

In [307]:
# RAM geheugen vrijmaken -> verwijder niet gebruikte dataframes
try:
    del df_account
    del df_contact
    del df_account_contact
    del df_persoon
    del df_account_contact_persoon
    del df_contact_functie
    del df_functie
    del df_account_activiteitscode
    del df_activiteitscode
    del df_activiteit
    del df_account_contact_persoon_finance
    del df_afspraak_acc_gelinkt
    del df_afspraak_betreft_acc
    del df_afspraak_betreft_contact
    del df_afspraak_alle
    del df_afspraken
    del df_activiteit_vereist_contact
except:
    print('Dataframes already deleted')

### Campagne en Inschrijving

In [308]:
df_campagne = pd.read_csv('../data_clean/Campagne_fixed.csv', sep=',')
df_inschrijving = pd.read_csv('../data_clean/Inschrijving_fixed.csv', sep=',')
df_inschrijving.drop(['inschrijving_datum_inschrijving', 'inschrijving_campagne_naam_'], axis=1, inplace=True)

# campagne naam cleanen
df_campagne['campagne_naam'] = df_campagne['campagne_naam'].str.replace('OV-', '').str.replace('ov-', '') \
                                                            .str.replace('-', ' ').str.replace(r'[^\w\s]', '', regex=True) \
                                                            .str.replace('  ', ' ').str.strip().str.lower().str.replace('  ', ' ')

# Drop kolommen
df_campagne.drop(['campagne_einddatum', 'campagne_startdatum', 'campagne_campagne_nr', 
                  'campagne_naam_in_email', 'campagne_reden_van_status', 'campagne_status',
                  'campagne_url_voka_be'
                  ], axis=1, inplace=True)

# Merge campagne en inschrijving
df_campagne_inschrijving = df_campagne.merge(df_inschrijving, left_on='campagne_campagne_id', right_on='inschrijving_campagne', how='inner')
df_campagne_inschrijving.drop(['inschrijving_campagne', 'inschrijving_inschrijving_id'], axis=1, inplace=True)
df_campagne_inschrijving.drop_duplicates(inplace=True)
df_campagne_inschrijving.shape

(48427, 8)

In [309]:
merged_total2 = merged_total.merge(df_campagne_inschrijving, left_on='contact_contactpersoon_id'
                                             , right_on='inschrijving_contactfiche', how='inner')

merged_total2.drop(['inschrijving_contactfiche'], axis=1, inplace=True)

# drop rows where campagne_campagne_id is NaN
merged_total2.dropna(subset=['campagne_campagne_id'], inplace=True)

# rows samenvoegen
merged_total2['inschrijving_aanwezig_afwezig'] \
    = merged_total2['inschrijving_aanwezig_afwezig'].astype(str) \
                                                            .str.replace('Aanwezig', '1') \
                                                            .str.replace('Afwezig', '0') \
                                                            .str.replace('unknown', '-1') \
                                                            .astype(int)

merged_total2['inschrijving_facturatie_bedrag'] \
      = merged_total2['inschrijving_facturatie_bedrag'].astype(str) \
                                                               .str.replace(',', '.') \
                                                               .str.replace('unknown', '-1') \
                                                               .astype(float)

merged_total2['inschrijving_bron'] \
    = merged_total2['inschrijving_bron'].astype(str) \
                                                .str.replace('unknown', '-1') \
                                                .str.replace('Website', '0') \
                                                .str.replace('Email', '1') \
                                                .astype(int)

merged_total2.shape

(48123, 23)

In [310]:
# RAM geheugen vrijmaken -> verwijder niet gebruikte dataframes
try:
    del df_campagne
    del df_inschrijving
    del df_campagne_inschrijving
    del merged_total
except:
    print('Dataframes already deleted')

### Visits, Mailing, Sent_mail_clicks

In [311]:
df_visit = pd.read_csv('../data_clean/CDI_visits_fixed.csv', sep=',')

df_visit.drop(
    ['visit_ip_postcode', 'visit_aangemaakt_op', 'visit_adobe_reader', 'visit_campagne_code',
    'visit_contact_naam_', 'visit_containssocialprofile', 'visit_ended_on', 'visit_ip_address',
    'visit_ip_organization', 'visit_keywords', 'visit_ip_longitude', 'visit_ip_latitude', 'visit_referrer', 
    'visit_score', 'visit_started_on', 'visit_ip_status', 'visit_time', 'visit_visit_id', 'visit_gewijzigd_op',
    'visit_entry_page', 'visit_exit_page', 'visit_ip_company', 'visit_referring_host', 'visit_referrer_type', 
    'visit_bounce', 'visit_duration', 'visit_browser', 'visit_ip_stad', 'visit_ip_land', 'visit_operating_system',
   ], axis=1, inplace=True)

df_visit.drop_duplicates(inplace=True)

df_visit['visit_first_visit'] = df_visit['visit_first_visit'] \
      .str.replace('Ja', '1').str.replace('Nee', '0') \
      .str.replace('unknown', '-1').astype(int)

df_visit['visit_total_pages'] = df_visit['visit_total_pages']\
      .replace('unknown', '-1.0').astype(float)

##############################################################################################################

# Csv files inlezen
df_mailing = pd.read_csv('../data_clean/CDI_mailing_fixed.csv', sep=',')
df_mailing.drop(['mailing_sent_on'], axis=1, inplace=True)
df_mailing.drop_duplicates(inplace=True)
df_click = pd.read_csv('../data_clean/CDI_sent_email_clicks_fixed.csv', sep=',')
df_click.drop(['sentemail_kliks_contact', 'sentemail_kliks_sent_email_id'], axis=1, inplace=True)
df_click.drop_duplicates(inplace=True)

# Merge en drop kolommen
df_mailing_merged = df_mailing.merge(df_click, left_on='mailing_mailing_id', right_on='sentemail_kliks_e_mail_versturen', how='inner')
df_mailing_merged.drop(['sentemail_kliks_e_mail_versturen'], axis=1, inplace=True)

# Voeg een nieuwe kolom toe voor het aantal mails per groep
df_mailing_merged['aantal_mails'] = df_mailing_merged.groupby(
    ['mailing_mailing_id'])['mailing_mailing_id'].transform('count')

# Sentmail_kliks_clicks optellen per mailing
df_mailing_merged = df_mailing_merged.groupby(['mailing_mailing_id', 'mailing_name', 'mailing_onderwerp'], 
                          as_index=False).agg({'sentemail_kliks_clicks': 'sum', 
                                                **{col: 'first' for col in df_mailing_merged.columns if col not in ['sentemail_kliks_clicks']}})

##############################################################################################################

df_visit_mailing = df_visit.merge(df_mailing_merged, left_on='visit_email_send', right_on='mailing_mailing_id', how='left')
df_visit_mailing.drop(['visit_email_send', 'mailing_mailing_id', 'visit_campaign'], axis=1, inplace=True)

# String kolommen cleanen
df_visit_mailing['mailing_name'] = df_visit_mailing['mailing_name'].str.replace('OV-', '') \
                                            .str.replace('OV ', '').str.replace('OV -', '') \
                                            .str.replace('-', ' ').str.replace(r'[^\w\s]', ' ', regex=True) \
                                            .str.replace('  ', ' ').str.strip().str.lower().str.replace('  ', ' ')

df_visit_mailing['mailing_onderwerp'] = df_visit_mailing['mailing_onderwerp'] \
                                            .str.replace('-', ' ').str.replace(r'[^\w\s]', ' ', regex=True) \
                                            .str.replace('  ', ' ').str.strip().str.lower().str.replace('  ', ' ')

df_visit_mailing['sentemail_kliks_clicks'] \
    = df_visit_mailing['sentemail_kliks_clicks'].replace('unknown', '0').astype(int)

### Final merge

In [312]:
final_merge = merged_total2.merge(df_visit_mailing, left_on='contact_contactpersoon_id', right_on='visit_contact', how='left')
final_merge.drop(['visit_contact', 'account_account_id'], axis=1, inplace=True)

# Numeric en categorical kolommen selecteren
num_cols = final_merge.select_dtypes(include=['float64', 'int64']).columns
cat_cols = final_merge.select_dtypes(include=['object']).columns

# fill NaN values with -1 for numeric columns
final_merge[num_cols] = final_merge[num_cols].fillna(-1)
# fill NaN values with 'unknown' for categorical columns
final_merge[cat_cols] = final_merge[cat_cols].fillna('unknown')

# verander alle int64 en int32 naar int8
int_cols = final_merge.select_dtypes(include=['int64', 'int32']).columns
final_merge[int_cols] = final_merge[int_cols].astype('int8')

# mail_click_freq berekenen
final_merge['mail_click_freq'] = final_merge.apply(
    lambda row: 0 if row['aantal_mails'] == -1 else round(row['sentemail_kliks_clicks'] / row['aantal_mails']), axis=1)
final_merge.drop(['sentemail_kliks_clicks', 'aantal_mails'], axis=1, inplace=True)

# marketing_pressure berekenen
final_merge['marketing_pressure'] = final_merge['marketing_pressure'] + final_merge['mail_click_freq'] 
+ final_merge['visit_total_pages'] + final_merge['visit_first_visit'] + final_merge['inschrijving_bron']

final_merge.drop(['mail_click_freq', 'visit_total_pages', 'visit_first_visit', 'inschrijving_bron'], axis=1, inplace=True)

# toegevoegde waarde en boekjaar range droppen want we hebben al avg_waarde_jaar -> betere representatie
final_merge.drop(['toegevoegde_waarde', 'boekjaar_range'], axis=1, inplace=True)

# keyphrases kolommen mergen
cols_for_keyphrase = ['functie_naam', 'contact_functietitel', 'account_adres', 'account_onderneming'
                      , 'activiteitscode_naam', 'afspraak_thema', 'afspraak_onderwerp', 'afspraak_keyphrases'
                      , 'afspraak_betreft', 'campagne_naam', 'campagne_type_campagne', 'campagne_soort_campagne'
                      , 'mailing_name', 'mailing_onderwerp']

for col in cols_for_keyphrase:
    final_merge[col] = final_merge[col].astype(str).str.split().str.join(', ')

final_merge['keyphrases'] = final_merge[cols_for_keyphrase].apply(lambda x: ', '.join(x), axis=1)
final_merge.drop(cols_for_keyphrase, axis=1, inplace=True)

# keyphrases cleanen
final_merge['keyphrases'] = final_merge['keyphrases'].str.replace(', ,', ',').str.replace(r'(\s{2},\s{2}),*+', '') \
    .str.replace('  ', ' ').str.replace(r'[^\w\s]', '', regex=True) \
    .str.replace('  ', ' ').str.strip().str.lower()

final_merge.drop_duplicates(inplace=True, ignore_index=True)
final_merge.shape

(174436, 8)

### Tokenizing en stopword removal

In [313]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.snowball import SnowballStemmer

nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\buyse\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\buyse\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [314]:
def remove_stopwords(text):
    stop_words_nl = set(stopwords.words('dutch'))
    
    word_tokens = word_tokenize(text, language='dutch')

    result = [x for x in word_tokens if x not in stop_words_nl]

    seperator = ', '
    return seperator.join(result)


def team_name_change(text):
    teams_dict = {
        'jo': ' jong ondernemen ',
        'do': ' duurzaam ondernemen ',
        'in': ' innovatie digitalisering ',
        'io': ' internationaal ondernemen ',
        'ao': ' arbeidsmarkt ',
        'ex': ' expert ',
        'gr': ' groei ',
        'bb': ' belangenbehartiging ',
        'co': ' communicatie ',
        'nw': ' netwerking ',
        'ha': ' haven ',
        'ma': ' match '
    }
    word_tokens = word_tokenize(text, language='dutch')
    # apply dict to list
    result = [teams_dict.get(word, word) for word in word_tokens]
    # join list to string
    cleaned_list = ', '.join(result)
    # tokenize string
    tokenize_list = word_tokenize(cleaned_list, language='dutch')
    # remove comma
    tokenize_list_no_comma = [x for x in tokenize_list if x != ',']
    # join list to string and remove duplicates from list
    return ', '.join(list(set(tokenize_list_no_comma)))


def stemmer(text):
    stemmer = SnowballStemmer(language='dutch')
    stem_sentence=[]
    for word in text.split(','):
        stem_sentence.append(stemmer.stem(word))
    stem_sentence= ', '.join(stem_sentence)
    return stem_sentence


def clean_text(df=final_merge, col='keyphrases'):

    df_copy = df.copy()

    for row in range(len(df_copy)):
        name_change = team_name_change(df_copy[col][row])
        no_stopwords = remove_stopwords(name_change)
        tokenize_list = word_tokenize(no_stopwords, language='dutch')
        tokenize_list = [x for x in tokenize_list if x != ',']
        df_copy.at[row, col] = ', '.join(list(set(tokenize_list)))
        stemmer_list= stemmer(df_copy[col][row])
        df_copy.at[row, col] = stemmer_list

    return df_copy

In [315]:
final_merge_clean = clean_text()

In [353]:
final_merge_clean['keyphrases'] = final_merge_clean['keyphrases'].str.replace('voka', ' ') \
    .str.replace('ov', '').str.replace('unknown', '').str.replace(r'\b\w{1,3}\b', '', regex=True).str.replace(r'\d+', '', regex=True) \
    .str.replace(r'(\s{2},\s{2}),*+', '', regex=True).str.replace(' ', '').str.replace(r'^,+|,+$', '', regex=True) \
    .str.replace(r',,+', ',', regex=True)

In [356]:
final_merge_clean.to_csv('../data_clean/final_merge_clean.csv', index=False)

### Embedding

In [4]:
import openai
from dotenv import load_dotenv

In [9]:
data = pd.read_csv('../data_clean/final_merge_clean.csv', sep=',')
data.shape

(174436, 8)

In [5]:
load_dotenv("../.env")
openai.api_key = os.getenv("OPENAI_API_KEY")

embedding_model = "text-embedding-ada-002"

In [6]:
def get_embedding(text):
    response = openai.Embedding.create(
    input=text,
    model=embedding_model
    )   
    return response['data'][0]['embedding']

Zou 11.6 uur duren volgens de berekening ...

In [None]:
# unique_col = data['keyphrases'].unique().tolist()

# dict_temp = {}
# for i in unique_col:
#     dict_temp[i] = get_embedding(i)

# df_embeddings = pd.DataFrame.from_dict(dict_temp, orient='index')