In [342]:
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import numpy as np
import os
from dotenv import load_dotenv
from sqlalchemy import create_engine

In [343]:
ENV_URL = os.path.join(os.getcwd(), '../.env')
load_dotenv(ENV_URL)

DWH_NAME = os.environ.get('DWH_NAME')
SERVER_NAME = os.environ.get('SERVER_NAME')
DB_USER = os.environ.get('DB_USER')
DB_PASSWORD = os.environ.get('DB_PASSWORD')

URL = f'mssql+pymssql://{DB_USER}:{DB_PASSWORD}@{SERVER_NAME}/{DWH_NAME}'
URL_LOCAL = f'mssql+pyodbc://{SERVER_NAME}/{DWH_NAME}?trusted_connection=yes&driver=ODBC+Driver+17 for SQL Server'

engine = create_engine(URL_LOCAL)
conn = engine.connect()

In [344]:
def create_query(table_name, columns, condition=None):

    query = f"SELECT "

    for i, column in enumerate(columns):
        if i == 0:
            query += f"[{column}]"
        else:
            query += f", [{column}]"
    
    query += f" FROM [{DWH_NAME}].[dbo].[{table_name}]"
    
    if condition:
        query += f" WHERE {condition}"

    return query

### Eerst gaan we alle datafiles die we nodig hebben mergen tot 1 dataframe

#### 1. Alles van Account mergen

In [345]:
acc_cols = ['accountID', 'subregio', 'ondernemingstype', 'ondernemingsaard', 'activiteitNaam']
# account conditie
acc_condition = "accountStatus = 1 AND provincie = 'Oost-Vlaanderen'"
# create query
acc_query = create_query('DimAccount', acc_cols, acc_condition)
# read sql
df_account = pd.read_sql(acc_query, conn)
df_account.shape

(3211, 5)

In [346]:
list(df_account.columns)

['accountID',
 'subregio',
 'ondernemingstype',
 'ondernemingsaard',
 'activiteitNaam']

##### Accounts merge met Contactfiches

In [347]:
contact_cols = ['contactID', 'accountID', 'functietitel', 
 'persoon_mail_thema_duurzaamheid', 'persoon_mail_thema_financieel_fiscaal', 'persoon_mail_thema_innovatie',
 'persoon_mail_thema_internationaal_ondernemen', 'persoon_mail_thema_mobiliteit', 'persoon_mail_thema_omgeving',
 'persoon_mail_thema_sales_marketing_communicatie', 'persoon_mail_thema_strategie_en_algemeen_management',
 'persoon_mail_thema_talent', 'persoon_mail_thema_welzijn', 'persoon_mail_type_bevraging', 'persoon_mail_type_communities_en_projecten',
 'persoon_mail_type_netwerkevenementen', 'persoon_mail_type_nieuwsbrieven', 'persoon_mail_type_opleidingen',
 'persoon_mail_type_persberichten_belangrijke_meldingen', 'persoon_marketingcommunicatie',]

contact_condition = "contactStatus = 'Actief'"
contact_query = create_query('DimContact', contact_cols, contact_condition)
df_contact = pd.read_sql(contact_query, conn)

df_contact['functietitel'] = df_contact['functietitel'].str.lower()
df_contact['persoon_marketingcommunicatie'] = df_contact['persoon_marketingcommunicatie'].fillna('-1')
df_contact['persoon_marketingcommunicatie'] = df_contact['persoon_marketingcommunicatie'] \
                                                            .str.replace('Strikt', '0').str.replace('Flexibel', '1') \
                                                            .str.replace('Uitgeschreven', '-1').str.replace('unknown', '-1').astype(int)

df_contact.shape

(70413, 20)

In [348]:
list(df_contact.columns)

['contactID',
 'accountID',
 'functietitel',
 'persoon_mail_thema_duurzaamheid',
 'persoon_mail_thema_financieel_fiscaal',
 'persoon_mail_thema_innovatie',
 'persoon_mail_thema_internationaal_ondernemen',
 'persoon_mail_thema_mobiliteit',
 'persoon_mail_thema_omgeving',
 'persoon_mail_thema_sales_marketing_communicatie',
 'persoon_mail_thema_strategie_en_algemeen_management',
 'persoon_mail_thema_talent',
 'persoon_mail_thema_welzijn',
 'persoon_mail_type_bevraging',
 'persoon_mail_type_communities_en_projecten',
 'persoon_mail_type_netwerkevenementen',
 'persoon_mail_type_nieuwsbrieven',
 'persoon_mail_type_opleidingen',
 'persoon_mail_type_persberichten_belangrijke_meldingen',
 'persoon_marketingcommunicatie']

In [349]:
accounts_merged = pd.merge(df_contact, df_account, on='accountID', how='inner')
accounts_merged.shape

(16656, 24)

#### 2. Alles van Afspraken mergen

In [350]:
grouping_categories = {
    'Lidmaatschap': ['Lidmaatschap'],
    'Welt': ['Welt', 'Welt 2.0', 'Welt 2.0-2023'],
    'Plato & Bryo': ['Plato', 'Bryo'],
    'Internationaal Ondernemen': ['Internationaal Ondernemen', 'Internationaal Ondernemen - voor Info en Advies'],
    'Technologie en Innovatie': ['Digitalisering, IT & Technologie', 'Innovatie', 'Veiligheid & Preventie'],
    'Groeien en Netwerking': ['Groeien', 'Netwerking', 'Communicatie', 'Starten'],
    'Duurzaamheid': ['Duurzaam Ondernemen', 'Mobiliteit'],
    'Familiebedrijfsmanagement': ['Familiebedrijven', 'Opvolging en Overname'],
    'Arbeidsmarkt': ['Arbeidsmarkt', 'Opleidingen'],
    'Bedrijfsbeheer': ['Algemeen Management', 'Bestuurlijke organisaties', 'Human Resources', 'Ruimtelijke ordening en Infrastructuur'],
    'Financieel': ['Financieel', 'Marketing & Sales', 'Aankoop'],
    'Logistiek en Transport': ['Logistiek en Transport', 'Haven']
}

def map_thema(thema):
    for category, themes in grouping_categories.items():
        if thema in themes:
            return category
    return thema

In [351]:
afspraak_cols = ['thema', 'contactID']

afspraak_condition = "contactID is not null"
afspraak_query = create_query('DimAfspraak', afspraak_cols, afspraak_condition)
df_afspraak1 = pd.read_sql(afspraak_query, conn)

df_afspraak1.drop_duplicates(inplace=True)

df_afspraak1['thema'] = df_afspraak1['thema'].apply(map_thema)
df_afspraak1 = df_afspraak1.groupby('contactID')['thema'].value_counts().unstack(fill_value=0)

df_afspraak1.shape

(1839, 12)

In [352]:
afspraak_cols = ['thema', 'accountID']

afspraak_condition = "accountID is not null"
afspraak_query = create_query('DimAfspraak', afspraak_cols, afspraak_condition)
df_afspraak2 = pd.read_sql(afspraak_query, conn)

df_afspraak2.drop_duplicates(inplace=True)

df_afspraak2['thema'] = df_afspraak2['thema'].apply(map_thema)
df_afspraak2 = df_afspraak2.groupby('accountID')['thema'].value_counts().unstack(fill_value=0)

df_afspraak2.shape

(2770, 13)

##### Alle afspraken mergen

In [353]:
accounts_merged = accounts_merged.merge(df_afspraak1, on=['contactID'], how='left')
accounts_merged = accounts_merged.merge(df_afspraak2, on=['accountID'], how='left')

columns_to_merge = ['Arbeidsmarkt', 'Bedrijfsbeheer', 'Duurzaamheid', 'Familiebedrijfsmanagement',
                    'Financieel', 'Groeien en Netwerking', 'Internationaal Ondernemen',
                    'Lidmaatschap', 'Logistiek en Transport', 'Plato & Bryo',
                    'Technologie en Innovatie', 'Welt']

for column in columns_to_merge:
    accounts_merged[column] = accounts_merged[f'{column}_x'].combine_first(accounts_merged[f'{column}_y'])

accounts_merged = accounts_merged.drop(columns=[f'{column}_x' for column in columns_to_merge] + [f'{column}_y' for column in columns_to_merge]).fillna(0)
accounts_merged.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16656 entries, 0 to 16655
Data columns (total 37 columns):
 #   Column                                                 Non-Null Count  Dtype  
---  ------                                                 --------------  -----  
 0   contactID                                              16656 non-null  object 
 1   accountID                                              16656 non-null  object 
 2   functietitel                                           16656 non-null  object 
 3   persoon_mail_thema_duurzaamheid                        16656 non-null  int64  
 4   persoon_mail_thema_financieel_fiscaal                  16656 non-null  int64  
 5   persoon_mail_thema_innovatie                           16656 non-null  int64  
 6   persoon_mail_thema_internationaal_ondernemen           16656 non-null  int64  
 7   persoon_mail_thema_mobiliteit                          16656 non-null  int64  
 8   persoon_mail_thema_omgeving                   

#### 3. Alles van Campagnes mergen

In [354]:
campagne_cols = ['campagneID', 'campagneType', 'campagneSoort']

campagne_query = create_query('DimCampagne', campagne_cols)
df_campagne = pd.read_sql(campagne_query, conn)

df_campagne.shape

(468, 3)

In [355]:
df_campagne['campagneID'].nunique()

468

In [356]:
sessie_cols = ['campaignID', 'themaNaam']

sessie_query = create_query('DimSessie', sessie_cols)
df_sessie = pd.read_sql(sessie_query, conn)

df_sessie.shape

(4365, 2)

In [357]:
df_sessie['campaignID'].nunique()

144

In [358]:
sessie_themes_grouped = {
  'Bryo': ['Bryo'],
  'Algemeen': ['Familiebedrijven','Opvolging en Overname','Algemeen Management','Human Resources','Algemeen Management - Intern','Bestuurlijke organisaties'],
  'Onderwijs': ['Opleidingen','Persoonlijke vaardigheden','Onderwijs'],
  'Logistiek': ['Logistiek en Transport','Haven','Supply Chain','Retail'],
  'Welt': ['Welt', 'Welt 2.0', 'Welt 2.0-2023'],
  'Ondernemen': ['Starten', 'Internationaal Ondernemen', 'Jong Voka', 'Groeien', 'Stille Kampioenen', 'Samen doorgaan', 'Strategie'],
  'Duurzaamheid': ['Energie', 'Duurzaam Ondernemen', 'Milieu', 'Mobiliteit'],
  'Lidmaatschap': ['Lidmaatschap'],
  'Innovatie en Technologie': ['Innovatie', 'Digitalisering, IT & Technologie'],
  'Netwerking': ['Netwerking'],
  'Economie': ['Arbeidsmarkt', 'Economie', 'Fiscaal', 'Financieel', 'Marketing & Sales', 'Jobkanaal'],
  'Juridisch': ['Bedrijfsjuridisch', 'Juridisch'],
  'Veiligheid en communicatie': ['Communicatie', 'Veiligheid & Preventie', 'Welzijn en gezondheidszorg'],
  'Andere': ['Plato', 'Ruimtelijke ordening en Infrastructuur', 'Regeringsvorming', 'Industrie', 'Aankoop', 'Privé&Vrije tijd', 'Aantrekkelijke regio', 'Coronavirus', 'unknown']
}

def map_thema(thema):
    for category, themes in sessie_themes_grouped.items():
        if thema in themes:
            return category
    return thema

df_sessie['themaNaam'] = df_sessie['themaNaam'].apply(map_thema)

In [359]:
df_sessie.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4365 entries, 0 to 4364
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   campaignID  4365 non-null   object
 1   themaNaam   4365 non-null   object
dtypes: object(2)
memory usage: 68.3+ KB


In [360]:
df_sessie = (df_sessie.assign(themaNaam_list=df_sessie['themaNaam'].str.split(', '))
               .explode('themaNaam_list')
               .drop_duplicates()
               .groupby('campaignID')['themaNaam_list']
               .agg(lambda x: list(set(x)))
               .reset_index()
               .sort_values(by='themaNaam_list', key=lambda x: x.str.len(), ascending=False))
df_sessie

Unnamed: 0,campaignID,themaNaam_list
53,62A0889B-5BAB-EC11-983F-002248848BF8,"[Algemeen, Economie, Ondernemen]"
140,F7C1D664-A26A-ED11-9561-6045BD8956C9,"[Algemeen, Economie, Innovatie en Technologie]"
17,210CE445-B4A3-ED11-AAD1-6045BD895D85,"[Economie, Innovatie en Technologie]"
119,CEDF71EE-F18F-EC11-B400-000D3A25AC23,"[Algemeen, Ondernemen]"
0,025A1F39-01A6-ED11-AAD1-6045BD895CDC,[Algemeen]
...,...,...
42,472E86D6-A16A-ED11-9561-6045BD8956C9,[Algemeen]
41,466301FF-FCEE-ED11-8849-6045BD974EB2,[Ondernemen]
40,463895E5-7792-ED11-AAD1-6045BD895D85,[Ondernemen]
39,4476B575-D361-EE11-8DF0-6045BD895554,[Welt]


In [361]:
unique_categories = set(category for row in df_sessie['themaNaam_list'] for category in row)

for category in unique_categories:
    df_sessie[category] = df_sessie['themaNaam_list'].apply(lambda x: int(category in x))

df_sessie.drop('themaNaam_list', axis=1, inplace=True)

In [362]:
campagnes_merged = pd.merge(df_campagne, df_sessie, left_on='campagneID', right_on='campaignID', how='left').fillna(0)
campagnes_merged.drop(['campaignID'], axis=1, inplace=True)
campagnes_merged.drop_duplicates(inplace=True)
campagnes_merged.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 468 entries, 0 to 467
Data columns (total 13 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   campagneID                468 non-null    object 
 1   campagneType              468 non-null    object 
 2   campagneSoort             468 non-null    object 
 3   Economie                  468 non-null    float64
 4   Logistiek                 468 non-null    float64
 5   Netwerking                468 non-null    float64
 6   Welt                      468 non-null    float64
 7   Duurzaamheid              468 non-null    float64
 8   Ondernemen                468 non-null    float64
 9   Innovatie en Technologie  468 non-null    float64
 10  Bryo                      468 non-null    float64
 11  Algemeen                  468 non-null    float64
 12  Andere                    468 non-null    float64
dtypes: float64(10), object(3)
memory usage: 47.7+ KB


#### CDI

In [363]:
visit_cols = ['contactID', 'visit_first_visit', 'visit_total_pages', 'mailing_onderwerp', 'mailing_name', 'mailSent_clicks', 'mailSent', 'campaignID']

visit_query = create_query('DimVisit', visit_cols)
df_visit = pd.read_sql(visit_query, conn)

df_visit.drop_duplicates(inplace=True)

df_visit['visit_first_visit'] = df_visit['visit_first_visit'] \
      .str.replace('Ja', '0').str.replace('Nee', '1') \
      .str.replace('unknown', '-1').astype(int)

df_visit['visit_total_pages'] = df_visit['visit_total_pages']\
      .replace('unknown', '-1.0').astype(float)

df_visit['aantal_mails'] = df_visit.groupby(
    ['contactID'])['mailSent'].transform('nunique')

df_visit['clicks_total'] = df_visit.groupby(
    ['contactID'])['mailSent_clicks'].transform('sum')

df_visit['visit_total_pages'] = df_visit.groupby(
    ['contactID'])['visit_total_pages'].transform('sum').astype(int)

df_visit['visit_first_visit'] = df_visit.groupby(
    ['contactID'])['visit_first_visit'].transform('sum').astype(int)

df_visit['mail_click_freq'] = np.round(df_visit['clicks_total'] / df_visit['aantal_mails'], 0)
df_visit['mail_click_freq'] = df_visit['mail_click_freq'].fillna(-1).astype(int)

df_visit.drop(['mailSent', 'mailSent_clicks', 'clicks_total', 'aantal_mails'], axis=1, inplace=True)
df_visit.drop_duplicates(inplace=True)

int_cols = df_visit.select_dtypes(include=['int64', 'int32']).columns
df_visit[int_cols] = df_visit[int_cols].astype('int8')

df_visit.reset_index(inplace=True)
df_visit.drop_duplicates(inplace=True)
df_visit.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7831 entries, 0 to 7830
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   index              7831 non-null   int64 
 1   contactID          7831 non-null   object
 2   visit_first_visit  7831 non-null   int8  
 3   visit_total_pages  7831 non-null   int8  
 4   mailing_onderwerp  7831 non-null   object
 5   mailing_name       7831 non-null   object
 6   campaignID         7831 non-null   object
 7   mail_click_freq    7831 non-null   int8  
dtypes: int64(1), int8(3), object(4)
memory usage: 329.0+ KB


#### 5. Cleaning voor de grote merge

In [364]:
# Drop duplicates
accounts_merged = accounts_merged.drop_duplicates(keep='first')
campagnes_merged = campagnes_merged.drop_duplicates(keep='first')

In [365]:
accounts_merged.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16656 entries, 0 to 16655
Data columns (total 37 columns):
 #   Column                                                 Non-Null Count  Dtype  
---  ------                                                 --------------  -----  
 0   contactID                                              16656 non-null  object 
 1   accountID                                              16656 non-null  object 
 2   functietitel                                           16656 non-null  object 
 3   persoon_mail_thema_duurzaamheid                        16656 non-null  int64  
 4   persoon_mail_thema_financieel_fiscaal                  16656 non-null  int64  
 5   persoon_mail_thema_innovatie                           16656 non-null  int64  
 6   persoon_mail_thema_internationaal_ondernemen           16656 non-null  int64  
 7   persoon_mail_thema_mobiliteit                          16656 non-null  int64  
 8   persoon_mail_thema_omgeving                   

In [366]:
campagnes_merged.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 468 entries, 0 to 467
Data columns (total 13 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   campagneID                468 non-null    object 
 1   campagneType              468 non-null    object 
 2   campagneSoort             468 non-null    object 
 3   Economie                  468 non-null    float64
 4   Logistiek                 468 non-null    float64
 5   Netwerking                468 non-null    float64
 6   Welt                      468 non-null    float64
 7   Duurzaamheid              468 non-null    float64
 8   Ondernemen                468 non-null    float64
 9   Innovatie en Technologie  468 non-null    float64
 10  Bryo                      468 non-null    float64
 11  Algemeen                  468 non-null    float64
 12  Andere                    468 non-null    float64
dtypes: float64(10), object(3)
memory usage: 47.7+ KB


#### 6. Alles mergen

In [367]:
inschrijving_cols = ['campagneID', 'contactID', 'facturatieBedrag']

inschrijving_query = create_query('FactInschrijving', inschrijving_cols)
df_inschrijving = pd.read_sql(inschrijving_query, conn)

df_inschrijving.shape

(8730, 3)

In [None]:
int_cols = accounts_merged.select_dtypes(include=['int64', 'int32']).columns
accounts_merged[int_cols] = accounts_merged[int_cols].apply(lambda x: np.int8(x))

int_cols = campagnes_merged.select_dtypes(include=['int64', 'int32']).columns
campagnes_merged[int_cols] = campagnes_merged[int_cols].apply(lambda x: np.int8(x))

In [None]:
pq.write_table(pa.Table.from_pandas(accounts_merged), 'account_merged.parquet')
pq.write_table(pa.Table.from_pandas(campagnes_merged), 'campagnes_merged.parquet')

# Read DataFrames back from Parquet files
account_merged_parquet = pd.read_parquet('account_merged.parquet')
campagnes_merged_parquet = pd.read_parquet('campagnes_merged.parquet')

In [None]:
merged_total = pd.merge(account_merged_parquet.assign(key=1), campagnes_merged_parquet.assign(key=1), on='key').drop('key', axis=1)
merged_total.shape

In [None]:
# RAM vrijmaken
try:
    del accounts_merged
    del campagnes_merged
    del account_merged_parquet
    del campagnes_merged_parquet
except:
    print("Data is niet meer beschikbaar / is al verwijderd")

In [None]:
def calc_marketing_pressure(df):
    marketing_pressure_cols = []
    marketing_pressure_cols.append('bron')
    marketing_pressure_cols.append('visit_first_visit')
    marketing_pressure_cols.append('visit_total_pages')
    marketing_pressure_cols.append('mail_click_freq')

    df['marketing_pressure'] = df[marketing_pressure_cols].sum(axis=1)
    df['marketing_pressure'] = df['marketing_pressure'].astype(int)
    df.drop(marketing_pressure_cols, axis=1, inplace=True)

    return df

In [None]:
merged_total["ingeschreven"] = 0
merged_total["marketing_pressure"] = -1


for index, row in merged_total.iterrows():
    condition1 = (df_inschrijving['contactID'] == row['contactID']) & (df_inschrijving['campagneID'] == row['campagneID'])
    if condition1.any():
        merged_total.at[index, 'ingeschreven'] = 1

    # Check if the condition is met
    condition2 = (df_visit['contactID'] == row['contactID']) & (df_visit['campagneID'] == row['campagneID'])
    if condition2.any():
        # get the index
        visit_index = condition2.first_valid_index()

        # calculate the marketing pressure
        df_visit_row = calc_marketing_pressure(df_visit.loc[[visit_index]].copy())

        # add the marketing pressure to the merged_total 
        merged_total.at[index, 'marketing_pressure'] = df_visit_row['marketing_pressure'].values[0]

merged_total['ingeschreven'] = merged_total['ingeschreven'].apply(lambda x: np.int8(x))

ingeschreven
0    19754
1        1
Name: count, dtype: int64

In [None]:
merged_total = merged_total.drop(['contactID', 'campagneID'], axis=1)

In [None]:
merged_total = merged_total.drop_duplicates(keep='first')

#### 7. Alle samples nemen met inschrijving == 1 en 200k samples nemen met inschrijving == 0

In [None]:
df_ingeschreven = merged_total[merged_total['ingeschreven'] == 1]
df_niet_ingeschreven = merged_total[merged_total['ingeschreven'] == 0].sample(n=(250000 - len(df_ingeschreven)), random_state=42)

df = pd.concat([df_ingeschreven, df_niet_ingeschreven])
df['ingeschreven'].value_counts()

In [None]:
# RAM vrijmaken
try:
    del df_ingeschreven
    del df_niet_ingeschreven
    del merged_total
except:
    print("Data is niet meer beschikbaar / is al verwijderd")

#### 8. Verdere data cleaning en One/Multi hot encoding

In [None]:
from sklearn.preprocessing import OneHotEncoder
import category_encoders as ce

oneHot = OneHotEncoder(sparse=False)

In [None]:
# Subregio
subregio_1hot = oneHot.fit_transform(df[['subregio']])



In [None]:
# Ondernemingstype
be_ondernemingstype = ce.BinaryEncoder(cols=['ondernemingstype'])
encoded_ondernemingstype = be_ondernemingstype.fit_transform(df['ondernemingstype'])
df = df.join(encoded_ondernemingstype, rsuffix='_ondernemingstype')

In [None]:
# Primaire activiteit
be_primaire_activiteit = ce.BinaryEncoder(cols=['activiteitNaam'])
encoded_primaire_activiteit = be_primaire_activiteit.fit_transform(df['activiteitNaam'])
df = df.join(encoded_primaire_activiteit, rsuffix='_activiteitNaam')

In [None]:
# Ondernemingsaard
diensten_column = []
productie_column = []

for label in df["ondernemingsaard"]:
    if label == "Diensten":
        diensten_column.append(1)
        productie_column.append(0)
    elif label == "Productie":
        diensten_column.append(0)
        productie_column.append(1)
    elif label == "Productie & Diensten":
        diensten_column.append(1)
        productie_column.append(1)
    elif label == "unknown":
        diensten_column.append(0)
        productie_column.append(0)

ondernemingsaard_multihot = pd.DataFrame({"Diensten": diensten_column, "Productie": productie_column})
df = df.join(ondernemingsaard_multihot, rsuffix='_ondernemingsaard')

In [None]:
# Functie_naam
def transform_role(role):
    if "Verantwoordelijke" in role:
        return "Verantwoordelijke"
    elif "Medewerker" in role:
        return "Medewerker"
    else:
        return role

df['functietitel'] = df['functietitel'].apply(transform_role)
functie_1hot = oneHot.fit_transform(df[['functietitel']])



In [None]:
# Campagne soort
online_column = []
offline_column = []

for label in df["campagneSoort"]:
    if label == "Offline":
        online_column.append(0)
        offline_column.append(1)
    elif label == "Online":
        online_column.append(1)
        offline_column.append(0)
    elif label == "On en Offline":
        online_column.append(1)
        offline_column.append(1)
    else:
        online_column.append(0)
        offline_column.append(0)

campagne_soort_multihot = pd.DataFrame({"Online": online_column, "Offline": offline_column})
df = df.join(campagne_soort_multihot, rsuffix='_campagneSoort')

In [None]:
# Campagne type
campagne_type_1hot = oneHot.fit_transform(merged_total[['campagneType']])



In [None]:
df = df.drop(['subregio', 'ondernemingstype', 'activiteitNaam', 'ondernemingsaard', 'functieTitel', 'campagneSoort', 'campagneType'], axis=1)

In [None]:
# Al de one hot encodings samenvoegen
df = df.join(pd.DataFrame(subregio_1hot), rsuffix='_subregio')
df = df.join(pd.DataFrame(functie_1hot), rsuffix='_functie')
df = df.join(pd.DataFrame(campagne_type_1hot), rsuffix='_campagne_type')

In [None]:
df = df.fillna(0)

In [None]:
# int columns
int_cols = df.select_dtypes(include=['int64', 'int32']).columns
df[int_cols] = df[int_cols].apply(lambda x: np.int8(x))

In [None]:
# RAM vrijmaken
try:
    del ondernemingsaard_multihot
    del functie_1hot
    del campagne_soort_multihot
    del campagne_type_1hot
except:
    print("Data is niet meer beschikbaar / is al verwijderd")


#### 9. Dataset opslaan

In [None]:
# Save to csv
df.to_csv('../data_clean/AA_Supervised_dataset.csv', index=False)