In [1]:
import os

import numpy as np
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
from dotenv import load_dotenv
from sklearn.preprocessing import OneHotEncoder
from sqlalchemy import create_engine

In [2]:
# Read the .env file located one directory above the current working directory.
ENV_URL = os.path.join(os.getcwd(), '../.env')
load_dotenv(ENV_URL)

# Connection settings taken from the environment; each one is None when unset.
(DWH_NAME, SERVER_NAME, DB_USER, DB_PASSWORD, SERVER_NAME_REMOTE) = (
    os.getenv(setting)
    for setting in ('DWH_NAME', 'SERVER_NAME', 'DB_USER', 'DB_PASSWORD', 'SERVER_NAME_REMOTE')
)

def connect_db(local=False):
    """Open a SQLAlchemy connection to the data warehouse.

    Parameters
    ----------
    local : bool, default False
        True  -> pyodbc with a trusted connection to SERVER_NAME.
        False -> pymssql with DB_USER / DB_PASSWORD to SERVER_NAME_REMOTE on port 1438.

    Returns
    -------
    The object returned by ``create_engine(url).connect()``.

    Raises
    ------
    RuntimeError
        When a setting required for the chosen mode is None, which would
        otherwise be formatted into the URL as the literal text "None".
    """
    from urllib.parse import quote

    if local:
        required = {'SERVER_NAME': SERVER_NAME, 'DWH_NAME': DWH_NAME}
    else:
        required = {
            'DB_USER': DB_USER,
            'DB_PASSWORD': DB_PASSWORD,
            'SERVER_NAME_REMOTE': SERVER_NAME_REMOTE,
            'DWH_NAME': DWH_NAME,
        }
    missing = [name for name, value in required.items() if value is None]
    if missing:
        raise RuntimeError(f"Missing database settings: {', '.join(missing)}")

    if local:
        url = f'mssql+pyodbc://{SERVER_NAME}/{DWH_NAME}?trusted_connection=yes&driver=ODBC+Driver+17 for SQL Server'
    else:
        # Percent-encode the credentials so characters such as '@', ':' or '/'
        # cannot be mistaken for URL delimiters.
        # NOTE(review): assumes the values in .env are stored un-encoded - confirm.
        user = quote(DB_USER, safe='')
        password = quote(DB_PASSWORD, safe='')
        url = f'mssql+pymssql://{user}:{password}@{SERVER_NAME_REMOTE}:1438/{DWH_NAME}'

    return create_engine(url).connect()

In [3]:
def create_query(table_name, columns, condition=None, database=None):
    """Build a ``SELECT`` statement for a table in the ``dbo`` schema.

    Parameters
    ----------
    table_name : str
        Table to read; emitted as a bracket-quoted identifier.
    columns : iterable
        Column names to select, in order. Must contain at least one name.
    condition : str, optional
        Raw SQL placed after ``WHERE``. It is inserted verbatim, so it must
        come from trusted code only.
    database : str, optional
        Database name. Defaults to the module-level ``DWH_NAME``.

    Returns
    -------
    str
        The SQL text.

    Raises
    ------
    ValueError
        If ``columns`` is empty (the result would be ``SELECT  FROM ...``).
    """
    def quote_identifier(identifier):
        # Inside [...] a closing bracket is escaped by doubling it.
        return "[" + str(identifier).replace("]", "]]") + "]"

    columns = list(columns)
    if not columns:
        raise ValueError("create_query needs at least one column name")

    if database is None:
        database = DWH_NAME

    select_list = ", ".join(quote_identifier(column) for column in columns)
    query = f"SELECT {select_list} FROM {quote_identifier(database)}.[dbo].{quote_identifier(table_name)}"

    if condition:
        query += f" WHERE {condition}"

    return query

In [4]:
# Shared connection for every query below; local defaults to False (remote server).
conn = connect_db()

In [5]:
# Load the existing supervised dataset; its contactID column is used below to
# find contacts that are not yet part of it.
df_trained = pd.read_csv(filepath_or_buffer='../data_clean/AA_Supervised_dataset.csv')
df_trained.shape

(300000, 56)

In [6]:
# Read the three registration columns from FactInschrijving.
inschrijving_cols = ['campagneID', 'contactID', 'facturatieBedrag']
inschrijving_query = create_query('FactInschrijving', inschrijving_cols)
df_inschrijving = pd.read_sql(sql=inschrijving_query, con=conn)

df_inschrijving.shape

(78790, 3)

In [7]:
# Keep only the registrations whose contactID does not occur in df_trained.
already_trained = df_inschrijving['contactID'].isin(df_trained['contactID'])
df_inschrijving_not_trained = df_inschrijving.loc[~already_trained]
df_inschrijving_not_trained.shape

(52810, 3)

In [8]:
# Distinct contact ids among the not-yet-trained registrations.
new_contactids = pd.unique(df_inschrijving_not_trained['contactID'])
new_contactids

array(['0019C15A-6481-E611-80DE-001DD8B72B61',
       '00223C8E-467F-E311-BBFD-005056B06EB4',
       '00231824-53EA-ED11-8849-6045BD895420', ...,
       'FFF68536-5DE0-E111-8A53-984BE17C2819',
       'FFFAE2B6-11D5-EC11-A7B5-000D3ABD1F85',
       'FFFEA9CB-ED93-EC11-B400-000D3A2B10EB'], dtype=object)

In [9]:
# Tuple form of the ids; later cells use it to build SQL filters.
contactids = (*new_contactids,)
contactids

('0019C15A-6481-E611-80DE-001DD8B72B61',
 '00223C8E-467F-E311-BBFD-005056B06EB4',
 '00231824-53EA-ED11-8849-6045BD895420',
 '0025D44A-C19F-E311-B1AE-005056B06EC4',
 '00426A48-F851-EC11-8C62-000D3ABFC672',
 '0069C9F7-B76A-E111-B43A-00505680000A',
 '0081703F-9EBD-ED11-83FF-6045BD895CDC',
 '008AA6D6-E00A-E611-96DE-005056B06EB4',
 '00B91BB6-7E6B-E111-B43A-00505680000A',
 '00C18BC8-5B76-E511-895A-005056B06EC4',
 '00E23D3F-1B6F-E111-B43A-00505680000A',
 '01111FED-F76B-E111-B43A-00505680000A',
 '0111633F-0264-ED11-9561-6045BD8952CE',
 '012429CC-47B7-E911-8104-001DD8B72B62',
 '012B7EE9-ADFA-E611-80E4-001DD8B72B62',
 '012DB8C3-A2D9-E711-80EE-001DD8B72B61',
 '014A271E-A66B-E111-B43A-00505680000A',
 '01536C51-37E4-ED11-A7C7-6045BD895FE3',
 '015DF720-6AEE-E911-8106-001DD8B72B62',
 '0173226A-3DB5-E911-8104-001DD8B72B61',
 '01B45481-0877-E911-80FE-001DD8B72B62',
 '01B760C4-21E7-E811-80FA-001DD8B72B62',
 '01C11E80-FFBB-E411-9B05-005056B06EB4',
 '01CA1242-A85C-EB11-811A-001DD8B72B62',
 '01DEFDD3-2FC2-

In [10]:
contact_cols = ['contactID', 'accountID']
# Build the IN list explicitly: formatting a Python tuple produces "('x',)" for
# a single id and "()" for none, and neither is valid T-SQL.
if len(contactids) > 0:
    quoted_ids = ", ".join("'" + str(contact_id).replace("'", "''") + "'" for contact_id in contactids)
    contact_condition = f"contactID IN ({quoted_ids})"
else:
    contact_condition = "1 = 0"  # no ids: select nothing instead of sending broken SQL
contact_query = create_query('DimContact', contact_cols, contact_condition)
df_contact = pd.read_sql(contact_query, conn)
# Load accounts
acc_cols = ['accountID', 'subregio', 'ondernemingstype', 'ondernemingsaard', 'activiteitNaam']
acc_query = create_query('DimAccount', acc_cols)
df_account = pd.read_sql(acc_query, conn)
# Merge contacts and accounts; the inner join drops contacts without a matching
# account, and remaining missing values become the label 'unknown'.
accounts_merged = df_contact.merge(df_account, on='accountID', how='inner').fillna('unknown')
accounts_merged.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9754 entries, 0 to 9753
Data columns (total 6 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   contactID         9754 non-null   object
 1   accountID         9754 non-null   object
 2   subregio          9754 non-null   object
 3   ondernemingstype  9754 non-null   object
 4   ondernemingsaard  9754 non-null   object
 5   activiteitNaam    9754 non-null   object
dtypes: object(6)
memory usage: 457.3+ KB


In [11]:
def map_thema(thema):
    """Return the appointment category whose theme list contains ``thema``.

    Themes that occur in no list of ``grouping_categories`` are returned unchanged.
    """
    matching_categories = (
        category
        for category, themes in grouping_categories.items()
        if thema in themes
    )
    return next(matching_categories, thema)


# Appointment themes grouped into broader categories (category -> themes).
grouping_categories = {
    'afspraak_Lidmaatschap': ['Lidmaatschap'],
    'afspraak_Welt': ['Welt', 'Welt 2.0', 'Welt 2.0-2023'],
    'afspraak_Plato & Bryo': ['Plato', 'Bryo'],
    'afspraak_Internationaal Ondernemen': [
        'Internationaal Ondernemen',
        'Internationaal Ondernemen - voor Info en Advies',
    ],
    'afspraak_Technologie en Innovatie': [
        'Digitalisering, IT & Technologie',
        'Innovatie',
        'Veiligheid & Preventie',
    ],
    'afspraak_Groeien en Netwerking': ['Groeien', 'Netwerking', 'Communicatie', 'Starten'],
    'afspraak_Duurzaamheid': ['Duurzaam Ondernemen', 'Mobiliteit'],
    'afspraak_Familiebedrijfsmanagement': ['Familiebedrijven', 'Opvolging en Overname'],
    'afspraak_Arbeidsmarkt': ['Arbeidsmarkt', 'Opleidingen'],
    'afspraak_Bedrijfsbeheer': [
        'Algemeen Management',
        'Bestuurlijke organisaties',
        'Human Resources',
        'Ruimtelijke ordening en Infrastructuur',
    ],
    'afspraak_Financieel': ['Financieel', 'Marketing & Sales', 'Aankoop'],
    'afspraak_Logistiek en Transport': ['Logistiek en Transport', 'Haven'],
}
# Category columns combined below. Derived from grouping_categories so the two
# cannot drift apart; sorted() gives the same alphabetical order as the list
# that used to be written out by hand.
columns_to_merge = sorted(grouping_categories)


def _count_afspraak_themes(key_column):
    """Pivot DimAfspraak into one row per ``key_column`` value with a count per mapped theme.

    Duplicate (thema, key) rows are removed before the themes are mapped to
    their categories, so a count is the number of distinct themes of that
    category. Every name in ``columns_to_merge`` is guaranteed to be a column
    of the result (filled with 0 when the data has none), so the later
    ``_x`` / ``_y`` lookups cannot raise KeyError.
    """
    afspraak_query = create_query('DimAfspraak', ['thema', key_column], f"{key_column} is not null")
    afspraken = pd.read_sql(afspraak_query, conn).drop_duplicates()
    afspraken = afspraken.assign(thema=afspraken['thema'].apply(map_thema))
    counts = afspraken.groupby(key_column)['thema'].value_counts().unstack(fill_value=0)
    absent = [column for column in columns_to_merge if column not in counts.columns]
    if absent:
        counts = counts.reindex(columns=[*counts.columns, *absent], fill_value=0)
    return counts


df_afspraak1 = _count_afspraak_themes('contactID')
df_afspraak2 = _count_afspraak_themes('accountID').astype(np.int8)

# Overlapping category columns get the default suffixes: _x (contact) / _y (account).
accounts_merged = accounts_merged.merge(df_afspraak1, on=['contactID'], how='left')
accounts_merged = accounts_merged.merge(df_afspraak2, on=['accountID'], how='left')

# Prefer the contact-level count and fall back to the account-level count.
for column in columns_to_merge:
    accounts_merged[column] = accounts_merged[f'{column}_x'].combine_first(accounts_merged[f'{column}_y'])

suffixed_columns = [f'{column}{suffix}' for suffix in ('_x', '_y') for column in columns_to_merge]
accounts_merged = accounts_merged.drop(columns=suffixed_columns).fillna(0)
accounts_merged.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9754 entries, 0 to 9753
Data columns (total 19 columns):
 #   Column                              Non-Null Count  Dtype  
---  ------                              --------------  -----  
 0   contactID                           9754 non-null   object 
 1   accountID                           9754 non-null   object 
 2   subregio                            9754 non-null   object 
 3   ondernemingstype                    9754 non-null   object 
 4   ondernemingsaard                    9754 non-null   object 
 5   activiteitNaam                      9754 non-null   object 
 6   unknown                             9754 non-null   float64
 7   afspraak_Arbeidsmarkt               9754 non-null   float64
 8   afspraak_Bedrijfsbeheer             9754 non-null   float64
 9   afspraak_Duurzaamheid               9754 non-null   float64
 10  afspraak_Familiebedrijfsmanagement  9754 non-null   float64
 11  afspraak_Financieel                 9754 no

In [12]:
# Campaign attributes from DimCampagne.
campagne_cols = ['campagneID', 'campagneNaam', 'campagneType', 'campagneSoort']
campagne_query = create_query('DimCampagne', campagne_cols)
df_campagne = pd.read_sql(sql=campagne_query, con=conn)

# Session themes per campaign from DimSessie (note the column is spelled campaignID here).
sessie_cols = ['campaignID', 'themaNaam']
sessie_query = create_query('DimSessie', sessie_cols)
df_sessie = pd.read_sql(sql=sessie_query, con=conn)
    
# Session themes grouped into broader categories (category -> themes).
sessie_themes_grouped = {
    'sessie_bryo': ['Bryo'],
    'sessie_algemeen': [
        'Familiebedrijven', 'Opvolging en Overname', 'Algemeen Management',
        'Human Resources', 'Algemeen Management - Intern', 'Bestuurlijke organisaties',
    ],
    'sessie_onderwijs': ['Opleidingen', 'Persoonlijke vaardigheden', 'Onderwijs'],
    'sessie_logistiek': ['Logistiek en Transport', 'Haven', 'Supply Chain', 'Retail'],
    'sessie_welt': ['Welt', 'Welt 2.0', 'Welt 2.0-2023'],
    'sessie_ondernemen': [
        'Starten', 'Internationaal Ondernemen', 'Jong Voka', 'Groeien',
        'Stille Kampioenen', 'Samen doorgaan', 'Strategie',
    ],
    'sessie_duurzaamheid': ['Energie', 'Duurzaam Ondernemen', 'Milieu', 'Mobiliteit'],
    'sessie_lidmaatschap': ['Lidmaatschap'],
    'sessie_innovatie en Technologie': ['Innovatie', 'Digitalisering, IT & Technologie'],
    'sessie_netwerking': ['Netwerking'],
    'sessie_economie': [
        'Arbeidsmarkt', 'Economie', 'Fiscaal', 'Financieel', 'Marketing & Sales', 'Jobkanaal',
    ],
    'sessie_juridisch': ['Bedrijfsjuridisch', 'Juridisch'],
    'sessie_veiligheid en communicatie': [
        'Communicatie', 'Veiligheid & Preventie', 'Welzijn en gezondheidszorg',
    ],
    'sessie_andere': [
        'Plato', 'Ruimtelijke ordening en Infrastructuur', 'Regeringsvorming', 'Industrie',
        'Aankoop', 'Privé&Vrije tijd', 'Aantrekkelijke regio', 'Coronavirus', 'unknown',
    ],
}


# NOTE: this rebinds the name map_thema that the appointment cell also defines;
# from here on it resolves themes against sessie_themes_grouped.
def map_thema(thema):
    """Return the session category whose theme list contains ``thema``.

    Themes that occur in no list of ``sessie_themes_grouped`` are returned unchanged.
    """
    matching_categories = (
        category
        for category, themes in sessie_themes_grouped.items()
        if thema in themes
    )
    return next(matching_categories, thema)

# Map each raw theme to its session category first; the split on ', ' below
# then only affects values that were not mapped.
sessie_mapped = df_sessie.assign(themaNaam=df_sessie['themaNaam'].apply(map_thema))

# One row per campaign holding the list of distinct categories of its sessions.
df_sessie = (
    sessie_mapped
    .assign(themaNaam_list=sessie_mapped['themaNaam'].str.split(', '))
    .explode('themaNaam_list')
    .drop_duplicates()
    .groupby('campaignID')['themaNaam_list']
    .agg(lambda themes: list(set(themes)))
    .reset_index()
)

unique_categories = set(category for row in df_sessie['themaNaam_list'] for category in row)

# Iterate in sorted order: iterating the set directly makes the column order
# depend on string hashing, so it could differ from one kernel to the next.
for category in sorted(unique_categories, key=str):
    df_sessie[category] = df_sessie['themaNaam_list'].apply(
        lambda themes, category=category: np.int8(category in themes)
    )

df_sessie = df_sessie.drop(columns='themaNaam_list')

# Left join keeps campaigns without sessions; their indicator columns (and any
# other missing value) become 0.
campagnes_merged = (
    pd.merge(df_campagne, df_sessie, left_on='campagneID', right_on='campaignID', how='left')
    .fillna(0)
    .drop_duplicates(keep='first')
    .drop(columns=['campaignID'])
)

In [13]:
visit_cols = ['contactID', 'visit_first_visit', 'visit_total_pages', 'mailSent_clicks', 'mailSent', 'campaignID']
# One code path for any number of ids: IN ('a') is valid for a single id, and an
# empty tuple selects nothing instead of raising IndexError on contactids[0].
if len(contactids) > 0:
    quoted_ids = ", ".join("'" + str(contact_id).replace("'", "''") + "'" for contact_id in contactids)
    visit_condition = f"contactID IN ({quoted_ids})"
else:
    visit_condition = "1 = 0"
visit_query = create_query('DimVisit', visit_cols, visit_condition)
df_visit = pd.read_sql(visit_query, conn).drop_duplicates()

# Encode the text flags as numbers: 'Ja' -> 0, 'Nee' -> 1, 'unknown' -> -1.
df_visit['visit_first_visit'] = df_visit['visit_first_visit'] \
  .str.replace('Ja', '0').str.replace('Nee', '1') \
  .str.replace('unknown', '-1').astype(int)

df_visit['visit_total_pages'] = df_visit['visit_total_pages']\
      .replace('unknown', '-1.0').astype(float)

# Per-contact aggregates, broadcast back onto every row of that contact.
visits_by_contact = df_visit.groupby(['contactID'])
df_visit['aantal_mails'] = visits_by_contact['mailSent'].transform('nunique')
df_visit['clicks_total'] = visits_by_contact['mailSent_clicks'].transform('sum')
df_visit['visit_total_pages'] = visits_by_contact['visit_total_pages'].transform('sum').astype(int)
df_visit['visit_first_visit'] = visits_by_contact['visit_first_visit'].transform('sum').astype(int)

# Average clicks per distinct mail, rounded; -1 where the ratio is undefined (NaN).
df_visit['mail_click_freq'] = np.round(df_visit['clicks_total'] / df_visit['aantal_mails'], 0)
df_visit['mail_click_freq'] = df_visit['mail_click_freq'].fillna(-1).astype(int)

df_visit = df_visit.drop(['mailSent', 'mailSent_clicks', 'clicks_total', 'aantal_mails'], axis=1)
df_visit = df_visit.drop_duplicates()

# NOTE(review): int8 holds -128..127 and astype wraps silently; confirm the
# per-contact sums above cannot exceed that range.
int_cols = df_visit.select_dtypes(include=['int64', 'int32']).columns
df_visit[int_cols] = df_visit[int_cols].astype('int8')

# De-duplicate after the cast, then renumber the rows. drop=True keeps the old
# index from being added as an 'index' column, which previously also made the
# final drop_duplicates a no-op because every row then looked unique.
df_visit = df_visit.drop_duplicates().reset_index(drop=True)

In [14]:
# accountID was only needed as a join key.
accounts_merged = accounts_merged.drop(columns='accountID')

In [15]:
# Remove exact duplicate rows (the first occurrence is kept by default).
accounts_merged = accounts_merged.drop_duplicates()
campagnes_merged = campagnes_merged.drop_duplicates()

In [16]:
# Store every numeric column of both frames as int8.
for frame in (accounts_merged, campagnes_merged):
    numeric_cols = frame.select_dtypes(include=np.number).columns
    frame[numeric_cols] = frame[numeric_cols].astype('int8')

In [17]:
# Read FactInschrijving again (same query as the earlier registration load).
inschrijving_cols = ['campagneID', 'contactID', 'facturatieBedrag']
inschrijving_query = create_query('FactInschrijving', inschrijving_cols)
df_inschrijving = pd.read_sql(sql=inschrijving_query, con=conn)

df_inschrijving.shape

(78790, 3)

In [18]:
# Convert any remaining int32/int64 columns of both frames to int8.
for frame in (accounts_merged, campagnes_merged):
    int_cols = frame.select_dtypes(include=['int64', 'int32']).columns
    frame[int_cols] = frame[int_cols].apply(np.int8)

In [19]:
# Cartesian product: every contact row paired with every campaign row.
merged_total = accounts_merged.merge(campagnes_merged, how='cross')
merged_total.shape

(40001154, 36)

In [20]:
# Free memory: both inputs of the cross join are no longer needed.
# Each name is handled on its own so that a missing first frame does not skip
# the second, and only the "name is gone" case is caught (a bare except would
# also swallow KeyboardInterrupt and real errors).
for _frame_name in ('accounts_merged', 'campagnes_merged'):
    try:
        del globals()[_frame_name]
    except KeyError:
        print("Data is niet meer beschikbaar / is al verwijderd")

In [21]:
# Remove exact duplicate contact/campaign rows (first occurrence kept by default).
merged_total = merged_total.drop_duplicates()

In [22]:
def calc_marketing_pressure(row):
    """Return, as an int, the sum of the row's visit_first_visit, visit_total_pages and mail_click_freq."""
    pressure_total = row.loc[['visit_first_visit', 'visit_total_pages', 'mail_click_freq']].sum()
    return int(pressure_total)

In [23]:
# Label: 1 when the contact is registered for the campaign, otherwise 0.
# The key pairs are de-duplicated first. A pair that occurs several times in
# FactInschrijving would otherwise multiply the matching row in this left
# merge and put identical positive rows into the dataset.
registered_pairs = (
    df_inschrijving[['contactID', 'campagneID']]
    .drop_duplicates()
    .assign(ingeschreven=1)
)
merged_total = merged_total.merge(
    registered_pairs,
    on=['contactID', 'campagneID'],
    how='left',
    validate='many_to_one',  # fail loudly if the right-hand keys are not unique
)
# Vectorised cast; pairs without a registration are NaN after the merge -> 0.
merged_total['ingeschreven'] = merged_total['ingeschreven'].fillna(0).astype('int8')

In [24]:
# Free RAM: the registrations frame is no longer needed.
# Only NameError (the frame is already gone) is expected here; a bare except
# would also hide unrelated errors and KeyboardInterrupt.
try:
    del df_inschrijving
except NameError:
    print("Data is niet meer beschikbaar / is al verwijderd")

In [25]:
# One marketing_pressure value per visit row (see calc_marketing_pressure).
df_visit['marketing_pressure'] = df_visit.apply(calc_marketing_pressure, axis=1)
# Attach it per contact/campaign pair; the key is campaignID in df_visit and
# campagneID in merged_total, so the merge leaves an extra campaignID column.
merged_total = pd.merge(merged_total, df_visit[['contactID', 'campaignID', 'marketing_pressure']],
                        left_on=['contactID', 'campagneID'], right_on=['contactID', 'campaignID'], how='left')
# Pairs without visit data get -1. astype casts the whole column at once rather
# than calling a Python lambda for every row of the cross-joined frame.
merged_total['marketing_pressure'] = merged_total['marketing_pressure'].fillna(-1).astype('int8')

In [26]:
# Drop the duplicate key column that the df_visit merge added.
merged_total = merged_total.drop(columns='campaignID')

In [27]:
# Class balance of the label before down-sampling.
merged_total['ingeschreven'].value_counts()

ingeschreven
0    39974059
1       52810
Name: count, dtype: int64

In [28]:
# Free RAM: the visit frame is no longer needed.
# Only NameError (the frame is already gone) is expected here; a bare except
# would also hide unrelated errors and KeyboardInterrupt.
try:
    del df_visit
except NameError:
    print("Data is niet meer beschikbaar / is al verwijderd")

In [29]:
# Balance the classes: keep every registered row and draw an equally sized
# random sample (fixed seed) from the non-registered rows.
is_registered = merged_total['ingeschreven'] == 1
df_ingeschreven = merged_total[is_registered]
df_niet_ingeschreven = (
    merged_total[merged_total['ingeschreven'] == 0]
    .sample(n=len(df_ingeschreven), random_state=42)
)

df = pd.concat([df_ingeschreven, df_niet_ingeschreven])
df['ingeschreven'].value_counts()

ingeschreven
1    52810
0    52810
Name: count, dtype: int64

In [30]:
# Column overview of the balanced sample.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 105620 entries, 1646 to 7969338
Data columns (total 38 columns):
 #   Column                              Non-Null Count   Dtype 
---  ------                              --------------   ----- 
 0   contactID                           105620 non-null  object
 1   subregio                            105620 non-null  object
 2   ondernemingstype                    105620 non-null  object
 3   ondernemingsaard                    105620 non-null  object
 4   activiteitNaam                      105620 non-null  object
 5   unknown                             105620 non-null  int8  
 6   afspraak_Arbeidsmarkt               105620 non-null  int8  
 7   afspraak_Bedrijfsbeheer             105620 non-null  int8  
 8   afspraak_Duurzaamheid               105620 non-null  int8  
 9   afspraak_Familiebedrijfsmanagement  105620 non-null  int8  
 10  afspraak_Financieel                 105620 non-null  int8  
 11  afspraak_Groeien en Netwerking      1056

In [32]:
# Persist the balanced sample as CSV (without the index).
df.to_csv(path_or_buf='../data_clean/AA_Vault_dataset.csv', index=False)

In [41]:
# Reload the saved sample; this also gives df a fresh 0..n-1 index.
df = pd.read_csv(filepath_or_buffer='../data_clean/AA_Vault_dataset.csv')

In [42]:
# Fixed category orders: the position in each list is the one-hot column index.
subregio_cat = ['Gent', 'Leiestreek-Meetjesland', 'Dendermonde', 'Aalst', 'Oudenaarde', 'Waasland']
campagnetype_cat = ['Project', 'Projectgebonden', 'Campagne', 'Opleiding', 'Netwerkevenement', 'Infosessie']


def _dense_one_hot_encoder(categories):
    """Return a OneHotEncoder with dense output for one column with a fixed category order.

    scikit-learn renamed the ``sparse`` argument to ``sparse_output`` and later
    removed the old name; an unknown keyword raises TypeError at construction,
    so the new spelling is tried first and the old one is the fallback.
    """
    try:
        return OneHotEncoder(sparse_output=False, categories=[categories])
    except TypeError:
        return OneHotEncoder(sparse=False, categories=[categories])


oneHot_subregio = _dense_one_hot_encoder(subregio_cat)
oneHot_campagnetype = _dense_one_hot_encoder(campagnetype_cat)

# Subregio
subregio_1hot = oneHot_subregio.fit_transform(df[['subregio']])



In [43]:
# Ondernemingstype: binary-encode the category position into 4 bit columns.
ondernemingstype_labels = [
    'unknown', 'Beroepsorganisatie', 'Vakbonden', 'Eenmanszaak', 'Multinational',
    'Sociale organisatie', 'Werkgeversorganisaties', 'Pers/Media', 'Overheid',
    'Onderwijs', 'Social Profit', 'Vrije beroepen', 'Familiebedrijf', 'Bedrijf',
]
ONDERNEMINGSTYPE_BITS = 4
assert len(ondernemingstype_labels) <= 2 ** ONDERNEMINGSTYPE_BITS, "not enough bits for all categories"

# Same structure as before: one dict per category with its zero-padded binary code.
ondernemingstype_categories = [
    {'categorie': label, 'binary': format(position, f'0{ONDERNEMINGSTYPE_BITS}b')}
    for position, label in enumerate(ondernemingstype_labels)
]
category_to_binary = {categorie['categorie']: categorie['binary'] for categorie in ondernemingstype_categories}

# Labels outside the list fall back to 'unknown'. Previously they mapped to NaN
# and the bit extraction crashed on int('n').
# The codes live in their own Series, so df['ondernemingstype'] keeps its labels
# and this cell gives the same result when it is run again.
is_known = df['ondernemingstype'].isin(list(category_to_binary))
ondernemingstype_codes = df['ondernemingstype'].where(is_known, 'unknown').map(category_to_binary)

for i in range(1, ONDERNEMINGSTYPE_BITS + 1):
    df[f'ondernemingstype_{i}'] = ondernemingstype_codes.str[i - 1].astype('int64')

In [44]:
# Primary activity: binary-encode the category position into 6 bit columns.
activiteitNaam_labels = [
    'unknown',
    'Farmacie',
    'Diamant, edelstenen, juwelen',
    'Havengerelateerd',
    'Media',
    'Overheid',
    'Verenigingen en maatschappelijke organisaties',
    'Onderwijs',
    'Milieu',
    'Vrije beroepen',
    'Agrarische & bio-industrie',
    'Hout- en meubelindustrie',
    'Accountancy & boekhouding',
    'Vastgoed',
    'Verzekering',
    'Financiële diensten',
    'Grafische industrie en diensten',
    'Automobiel- en Tweewielerindustrie',
    'Textiel, kleding en confectie',
    'Horeca & toerisme',
    'Technologische industrie & diensten',
    'Zorg',
    'Detailhandel',
    'Groothandel',
    'Bouw',
    'Energie',
    'Consultancy',
    'Papier & karton',
    'Human capital',
    'Chemie, petrochemie',
    'Distributie, logistiek en transport',
    'Telecom & IT',
    'Ijzer en staal',
    'Voeding',
    'Overige industrie & diensten',
]
ACTIVITEITNAAM_BITS = 6
assert len(activiteitNaam_labels) <= 2 ** ACTIVITEITNAAM_BITS, "not enough bits for all categories"

# Same structure as before: one dict per category with its zero-padded binary code.
activiteitNaam_categories = [
    {'categorie': label, 'binary': format(position, f'0{ACTIVITEITNAAM_BITS}b')}
    for position, label in enumerate(activiteitNaam_labels)
]
category_to_binary = {categorie['categorie']: categorie['binary'] for categorie in activiteitNaam_categories}

# Every label outside the list (this includes 'Luchthavengerelateerd') is
# treated as 'unknown'. The membership test is one vectorised isin instead of
# rebuilding the category list for every row.
# The codes live in their own Series, so df['activiteitNaam'] keeps its labels;
# running the cell again no longer turns every row into 'unknown'.
is_known = df['activiteitNaam'].isin(list(category_to_binary))
activiteitNaam_codes = df['activiteitNaam'].where(is_known, 'unknown').map(category_to_binary)

for i in range(1, ACTIVITEITNAAM_BITS + 1):
    df[f'activiteitNaam_{i}'] = activiteitNaam_codes.str[i - 1].astype('int64')

In [45]:
# Ondernemingsaard -> two flags (Diensten, Productie); any other label gives (0, 0).
ondernemingsaard_flags = {
    "Productie & Diensten": (1, 1),
    "Diensten": (1, 0),
    "Productie": (0, 1),
}
ondernemingsaard_pairs = [ondernemingsaard_flags.get(label, (0, 0)) for label in df["ondernemingsaard"]]
diensten_column = [diensten for diensten, _ in ondernemingsaard_pairs]
productie_column = [productie for _, productie in ondernemingsaard_pairs]
print(len(productie_column))
ondernemingsaard_multihot = pd.DataFrame({"Diensten": diensten_column, "Productie": productie_column})
df = df.join(ondernemingsaard_multihot, rsuffix='_ondernemingsaard')

# Campagne soort -> two flags (Online, Offline).
# NOTE(review): any other label is encoded as Offline=1, unlike ondernemingsaard
# where the fallback is all zeros - confirm this default is intended.
campagne_soort_flags = {
    "On en Offline": (1, 1),
    "Offline": (0, 1),
    "Online": (1, 0),
}
campagne_soort_pairs = [campagne_soort_flags.get(label, (0, 1)) for label in df["campagneSoort"]]
online_column = [online for online, _ in campagne_soort_pairs]
offline_column = [offline for _, offline in campagne_soort_pairs]
print(len(online_column))
campagne_soort_multihot = pd.DataFrame({"Online": online_column, "Offline": offline_column})

# Campagne type
campagne_type_1hot = oneHot_campagnetype.fit_transform(df[['campagneType']])

# The raw categorical columns (and the 'unknown' count column) are no longer needed.
encoded_source_columns = ['subregio', 'ondernemingstype', 'activiteitNaam', 'ondernemingsaard', 'campagneSoort', 'campagneType', 'unknown']
df = df.drop(columns=encoded_source_columns)

# Append all encodings; the joins align on the row index. Both one-hot frames
# have the integer column names 0..5, so the second join needs a suffix.
df = df.join(pd.DataFrame(subregio_1hot), rsuffix='_subregio')
df = df.join(pd.DataFrame(campagne_type_1hot), rsuffix='_campagne_type')
df = df.join(campagne_soort_multihot, rsuffix='_campagneSoort')

105620
105620




In [50]:
# Column overview after encoding.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 105620 entries, 0 to 105619
Data columns (total 57 columns):
 #   Column                              Non-Null Count   Dtype 
---  ------                              --------------   ----- 
 0   contactID                           105620 non-null  object
 1   campagneID                          105620 non-null  object
 2   campagneNaam                        105620 non-null  object
 3   afspraak_Arbeidsmarkt               105620 non-null  int8  
 4   afspraak_Bedrijfsbeheer             105620 non-null  int8  
 5   afspraak_Duurzaamheid               105620 non-null  int8  
 6   afspraak_Familiebedrijfsmanagement  105620 non-null  int8  
 7   afspraak_Financieel                 105620 non-null  int8  
 8   afspraak_Groeien en Netwerking      105620 non-null  int8  
 9   afspraak_Internationaal Ondernemen  105620 non-null  int8  
 10  afspraak_Lidmaatschap               105620 non-null  int8  
 11  afspraak_Logistiek en Transport     105

In [47]:
# Final column order of the dataset; selecting with this list also raises a
# KeyError if any expected column is missing.
custom_order = [
    # identifiers
    'contactID', 'campagneID', 'campagneNaam',
    # appointment counts
    'afspraak_Arbeidsmarkt', 'afspraak_Bedrijfsbeheer', 'afspraak_Duurzaamheid',
    'afspraak_Familiebedrijfsmanagement', 'afspraak_Financieel', 'afspraak_Groeien en Netwerking',
    'afspraak_Internationaal Ondernemen', 'afspraak_Lidmaatschap', 'afspraak_Logistiek en Transport',
    'afspraak_Plato & Bryo', 'afspraak_Technologie en Innovatie', 'afspraak_Welt',
    # session indicators
    'sessie_ondernemen', 'sessie_logistiek', 'sessie_onderwijs', 'sessie_duurzaamheid',
    'sessie_welt', 'sessie_lidmaatschap', 'sessie_innovatie en Technologie', 'sessie_netwerking',
    'sessie_algemeen', 'sessie_juridisch', 'sessie_bryo', 'sessie_economie',
    'sessie_veiligheid en communicatie', 'sessie_andere',
    'marketing_pressure',
    # binary-encoded ondernemingstype / activiteitNaam
    'ondernemingstype_1', 'ondernemingstype_2', 'ondernemingstype_3', 'ondernemingstype_4',
    'activiteitNaam_1', 'activiteitNaam_2', 'activiteitNaam_3',
    'activiteitNaam_4', 'activiteitNaam_5', 'activiteitNaam_6',
    # ondernemingsaard flags
    'Diensten', 'Productie',
    # subregio one-hot
    '0', '1', '2', '3', '4', '5',
    # campagneType one-hot
    '0_campagne_type', '1_campagne_type', '2_campagne_type',
    '3_campagne_type', '4_campagne_type', '5_campagne_type',
    # campagneSoort flags
    'Online', 'Offline',
    # label
    'ingeschreven',
]
df = df.loc[:, custom_order]
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 105620 entries, 0 to 105619
Data columns (total 57 columns):
 #   Column                              Non-Null Count   Dtype  
---  ------                              --------------   -----  
 0   contactID                           105620 non-null  object 
 1   campagneID                          105620 non-null  object 
 2   campagneNaam                        105620 non-null  object 
 3   afspraak_Arbeidsmarkt               105620 non-null  int64  
 4   afspraak_Bedrijfsbeheer             105620 non-null  int64  
 5   afspraak_Duurzaamheid               105620 non-null  int64  
 6   afspraak_Familiebedrijfsmanagement  105620 non-null  int64  
 7   afspraak_Financieel                 105620 non-null  int64  
 8   afspraak_Groeien en Netwerking      105620 non-null  int64  
 9   afspraak_Internationaal Ondernemen  105620 non-null  int64  
 10  afspraak_Lidmaatschap               105620 non-null  int64  
 11  afspraak_Logistiek en Tran

In [48]:
# Store every int/float column as int8 (despite its name, int_cols also holds the float columns).
int_cols = df.select_dtypes(include=['int64', 'int32', 'float32', 'float64']).columns
df[int_cols] = df[int_cols].apply(np.int8)

In [49]:
# Save the final dataset as CSV (without the index).
df.to_csv(path_or_buf='../data_clean/AA_Supervised_dataset_2.csv', index=False)