In [71]:
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import numpy as np
import os
from dotenv import load_dotenv
from sqlalchemy import create_engine

In [72]:
ENV_URL = os.path.join(os.getcwd(), '../.env')
load_dotenv(ENV_URL)

DWH_NAME = os.environ.get('DWH_NAME')
SERVER_NAME = os.environ.get('SERVER_NAME')
DB_USER = os.environ.get('DB_USER')
DB_PASSWORD = os.environ.get('DB_PASSWORD')

URL = f'mssql+pymssql://{DB_USER}:{DB_PASSWORD}@{SERVER_NAME}/{DWH_NAME}'
URL_LOCAL = f'mssql+pyodbc://{SERVER_NAME}/{DWH_NAME}?trusted_connection=yes&driver=ODBC+Driver+17 for SQL Server'

engine = create_engine(URL_LOCAL)
conn = engine.connect()

In [73]:
def create_query(table_name, columns, condition=None):

    query = f"SELECT "

    for i, column in enumerate(columns):
        if i == 0:
            query += f"[{column}]"
        else:
            query += f", [{column}]"
    
    query += f" FROM [{DWH_NAME}].[dbo].[{table_name}]"
    
    if condition:
        query += f" WHERE {condition}"

    return query

### Eerst gaan we alle datafiles die we nodig hebben mergen tot 1 dataframe

#### 1. Alles van Account mergen

In [74]:
acc_cols = ['accountID', 'subregio', 'ondernemingstype', 'ondernemingsaard', 'activiteitNaam']
# account conditie
acc_condition = "accountStatus = 1 AND provincie = 'Oost-Vlaanderen'"
# create query
acc_query = create_query('DimAccount', acc_cols, acc_condition)
# read sql
df_account = pd.read_sql(acc_query, conn)
df_account.shape

(3211, 5)

In [75]:
list(df_account.columns)

['accountID',
 'subregio',
 'ondernemingstype',
 'ondernemingsaard',
 'activiteitNaam']

##### Accounts merge met Contactfiches

In [76]:
contact_cols = ['contactID', 'accountID']

contact_condition = "contactStatus = 'Actief'"
contact_query = create_query('DimContact', contact_cols, contact_condition)
df_contact = pd.read_sql(contact_query, conn)

df_contact.shape

(70114, 2)

In [77]:
accounts_merged = pd.merge(df_contact, df_account, on='accountID', how='inner')
accounts_merged.shape

(16537, 6)

#### 2. Alles van Afspraken mergen

In [78]:
grouping_categories = {
    'afspraak_Lidmaatschap': ['Lidmaatschap'],
    'afspraak_Welt': ['Welt', 'Welt 2.0', 'Welt 2.0-2023'],
    'afspraak_Plato & Bryo': ['Plato', 'Bryo'],
    'afspraak_Internationaal Ondernemen': ['Internationaal Ondernemen', 'Internationaal Ondernemen - voor Info en Advies'],
    'afspraak_Technologie en Innovatie': ['Digitalisering, IT & Technologie', 'Innovatie', 'Veiligheid & Preventie'],
    'afspraak_Groeien en Netwerking': ['Groeien', 'Netwerking', 'Communicatie', 'Starten'],
    'afspraak_Duurzaamheid': ['Duurzaam Ondernemen', 'Mobiliteit'],
    'afspraak_Familiebedrijfsmanagement': ['Familiebedrijven', 'Opvolging en Overname'],
    'afspraak_Arbeidsmarkt': ['Arbeidsmarkt', 'Opleidingen'],
    'afspraak_Bedrijfsbeheer': ['Algemeen Management', 'Bestuurlijke organisaties', 'Human Resources', 'Ruimtelijke ordening en Infrastructuur'],
    'afspraak_Financieel': ['Financieel', 'Marketing & Sales', 'Aankoop'],
    'afspraak_Logistiek en Transport': ['Logistiek en Transport', 'Haven']
}

def map_thema(thema):
    for category, themes in grouping_categories.items():
        if thema in themes:
            return category
    return thema

In [79]:
afspraak_cols = ['thema', 'contactID']

afspraak_condition = "contactID is not null"
afspraak_query = create_query('DimAfspraak', afspraak_cols, afspraak_condition)
df_afspraak1 = pd.read_sql(afspraak_query, conn)

df_afspraak1.drop_duplicates(inplace=True)

df_afspraak1['thema'] = df_afspraak1['thema'].apply(map_thema)
df_afspraak1 = df_afspraak1.groupby('contactID')['thema'].value_counts().unstack(fill_value=0)

df_afspraak1.shape

(1839, 12)

In [80]:
afspraak_cols = ['thema', 'accountID']

afspraak_condition = "accountID is not null"
afspraak_query = create_query('DimAfspraak', afspraak_cols, afspraak_condition)
df_afspraak2 = pd.read_sql(afspraak_query, conn)

df_afspraak2.drop_duplicates(inplace=True)

df_afspraak2['thema'] = df_afspraak2['thema'].apply(map_thema)
df_afspraak2 = df_afspraak2.groupby('accountID')['thema'].value_counts().unstack(fill_value=0).apply(lambda x: np.int8(x))

df_afspraak2.shape

(2770, 13)

##### Alle afspraken mergen

In [81]:
accounts_merged = accounts_merged.merge(df_afspraak1, on=['contactID'], how='left')
accounts_merged = accounts_merged.merge(df_afspraak2, on=['accountID'], how='left')

columns_to_merge = ['afspraak_Arbeidsmarkt', 'afspraak_Bedrijfsbeheer', 'afspraak_Duurzaamheid', 'afspraak_Familiebedrijfsmanagement',
                    'afspraak_Financieel', 'afspraak_Groeien en Netwerking', 'afspraak_Internationaal Ondernemen',
                    'afspraak_Lidmaatschap', 'afspraak_Logistiek en Transport', 'afspraak_Plato & Bryo',
                    'afspraak_Technologie en Innovatie', 'afspraak_Welt']

for column in columns_to_merge:
    accounts_merged[column] = accounts_merged[f'{column}_x'].combine_first(accounts_merged[f'{column}_y'])

accounts_merged = accounts_merged.drop(columns=[f'{column}_x' for column in columns_to_merge] + [f'{column}_y' for column in columns_to_merge]).fillna(0)
accounts_merged.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16537 entries, 0 to 16536
Data columns (total 19 columns):
 #   Column                              Non-Null Count  Dtype  
---  ------                              --------------  -----  
 0   contactID                           16537 non-null  object 
 1   accountID                           16537 non-null  object 
 2   subregio                            16537 non-null  object 
 3   ondernemingstype                    16537 non-null  object 
 4   ondernemingsaard                    16537 non-null  object 
 5   activiteitNaam                      16537 non-null  object 
 6   unknown                             16537 non-null  float64
 7   afspraak_Arbeidsmarkt               16537 non-null  float64
 8   afspraak_Bedrijfsbeheer             16537 non-null  float64
 9   afspraak_Duurzaamheid               16537 non-null  float64
 10  afspraak_Familiebedrijfsmanagement  16537 non-null  float64
 11  afspraak_Financieel                 16537

#### 3. Alles van Campagnes mergen

In [82]:
campagne_cols = ['campagneID', 'campagneType', 'campagneSoort']

campagne_query = create_query('DimCampagne', campagne_cols)
df_campagne = pd.read_sql(campagne_query, conn)

df_campagne.shape

(4101, 3)

In [83]:
sessie_cols = ['campaignID', 'themaNaam']

sessie_query = create_query('DimSessie', sessie_cols)
df_sessie = pd.read_sql(sessie_query, conn)

df_sessie.shape

(78790, 2)

In [84]:
sessie_themes_grouped = {
  'sessie_bryo': ['Bryo'],
  'sessie_algemeen': ['Familiebedrijven','Opvolging en Overname','Algemeen Management','Human Resources','Algemeen Management - Intern','Bestuurlijke organisaties'],
  'sessie_onderwijs': ['Opleidingen','Persoonlijke vaardigheden','Onderwijs'],
  'sessie_logistiek': ['Logistiek en Transport','Haven','Supply Chain','Retail'],
  'sessie_welt': ['Welt', 'Welt 2.0', 'Welt 2.0-2023'],
  'sessie_ondernemen': ['Starten', 'Internationaal Ondernemen', 'Jong Voka', 'Groeien', 'Stille Kampioenen', 'Samen doorgaan', 'Strategie'],
  'sessie_duurzaamheid': ['Energie', 'Duurzaam Ondernemen', 'Milieu', 'Mobiliteit'],
  'sessie_lidmaatschap': ['Lidmaatschap'],
  'sessie_innovatie en Technologie': ['Innovatie', 'Digitalisering, IT & Technologie'],
  'sessie_netwerking': ['Netwerking'],
  'sessie_economie': ['Arbeidsmarkt', 'Economie', 'Fiscaal', 'Financieel', 'Marketing & Sales', 'Jobkanaal'],
  'sessie_juridisch': ['Bedrijfsjuridisch', 'Juridisch'],
  'sessie_veiligheid en communicatie': ['Communicatie', 'Veiligheid & Preventie', 'Welzijn en gezondheidszorg'],
  'sessie_andere': ['Plato', 'Ruimtelijke ordening en Infrastructuur', 'Regeringsvorming', 'Industrie', 'Aankoop', 'Privé&Vrije tijd', 'Aantrekkelijke regio', 'Coronavirus', 'unknown']
}

def map_thema(thema):
    for category, themes in sessie_themes_grouped.items():
        if thema in themes:
            return category
    return thema

df_sessie['themaNaam'] = df_sessie['themaNaam'].apply(map_thema)

In [85]:
df_sessie.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 78790 entries, 0 to 78789
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   campaignID  78790 non-null  object
 1   themaNaam   78790 non-null  object
dtypes: object(2)
memory usage: 1.2+ MB


In [86]:
df_sessie = (df_sessie.assign(themaNaam_list=df_sessie['themaNaam'].str.split(', '))
               .explode('themaNaam_list')
               .drop_duplicates()
               .groupby('campaignID')['themaNaam_list']
               .agg(lambda x: list(set(x)))
               .reset_index()
               .sort_values(by='themaNaam_list', key=lambda x: x.str.len(), ascending=False))
df_sessie

Unnamed: 0,campaignID,themaNaam_list
495,416A235F-F209-E911-80FA-001DD8B72B62,"[sessie_ondernemen, sessie_duurzaamheid, sessi..."
1596,CFF2BA02-E99C-EB11-811E-001DD8B72B62,"[sessie_ondernemen, sessie_duurzaamheid, sessi..."
1243,A4658A7D-3629-E911-80FB-001DD8B72B62,"[sessie_ondernemen, sessie_duurzaamheid, sessi..."
748,61FE6D67-F088-E811-80F3-001DD8B72B61,"[sessie_ondernemen, sessie_duurzaamheid, sessi..."
1509,C6D72260-E451-EC11-8C62-000D3ABFCF4A,"[sessie_ondernemen, sessie_innovatie en Techno..."
...,...,...
654,55D5F840-22D7-EA11-8114-001DD8B72B62,[sessie_economie]
653,55CC339F-1977-E911-80FE-001DD8B72B62,[sessie_ondernemen]
652,558B3A03-23A2-ED11-AAD1-6045BD895B5A,[sessie_netwerking]
651,5532580C-870C-EC11-8123-001DD8B72B61,[sessie_netwerking]


In [87]:
unique_categories = set(category for row in df_sessie['themaNaam_list'] for category in row)

for category in unique_categories:
    df_sessie[category] = df_sessie['themaNaam_list'].apply(lambda x: np.int8(category in x))

df_sessie.drop('themaNaam_list', axis=1, inplace=True)

In [88]:
campagnes_merged = pd.merge(df_campagne, df_sessie, left_on='campagneID', right_on='campaignID', how='left').fillna(0)
campagnes_merged.drop(['campaignID'], axis=1, inplace=True)
campagnes_merged.drop_duplicates(inplace=True)
campagnes_merged.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4101 entries, 0 to 4100
Data columns (total 17 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   campagneID                         4101 non-null   object 
 1   campagneType                       4101 non-null   object 
 2   campagneSoort                      4101 non-null   object 
 3   sessie_ondernemen                  4101 non-null   float64
 4   sessie_logistiek                   4101 non-null   float64
 5   sessie_onderwijs                   4101 non-null   float64
 6   sessie_duurzaamheid                4101 non-null   float64
 7   sessie_welt                        4101 non-null   float64
 8   sessie_lidmaatschap                4101 non-null   float64
 9   sessie_innovatie en Technologie    4101 non-null   float64
 10  sessie_netwerking                  4101 non-null   float64
 11  sessie_algemeen                    4101 non-null   float

#### CDI

In [89]:
visit_cols = ['contactID', 'visit_first_visit', 'visit_total_pages', 'mailSent_clicks', 'mailSent', 'campaignID']

visit_query = create_query('DimVisit', visit_cols)
df_visit = pd.read_sql(visit_query, conn)

df_visit.drop_duplicates(inplace=True)

df_visit['visit_first_visit'] = df_visit['visit_first_visit'] \
      .str.replace('Ja', '0').str.replace('Nee', '1') \
      .str.replace('unknown', '-1').astype(int)

df_visit['visit_total_pages'] = df_visit['visit_total_pages']\
      .replace('unknown', '-1.0').astype(float)

df_visit['aantal_mails'] = df_visit.groupby(
    ['contactID'])['mailSent'].transform('nunique')

df_visit['clicks_total'] = df_visit.groupby(
    ['contactID'])['mailSent_clicks'].transform('sum')

df_visit['visit_total_pages'] = df_visit.groupby(
    ['contactID'])['visit_total_pages'].transform('sum').astype(int)

df_visit['visit_first_visit'] = df_visit.groupby(
    ['contactID'])['visit_first_visit'].transform('sum').astype(int)

df_visit['mail_click_freq'] = np.round(df_visit['clicks_total'] / df_visit['aantal_mails'], 0)
df_visit['mail_click_freq'] = df_visit['mail_click_freq'].fillna(-1).astype(int)

df_visit.drop(['mailSent', 'mailSent_clicks', 'clicks_total', 'aantal_mails'], axis=1, inplace=True)
df_visit.drop_duplicates(inplace=True)

int_cols = df_visit.select_dtypes(include=['int64', 'int32']).columns
df_visit[int_cols] = df_visit[int_cols].astype('int8')

df_visit.reset_index(inplace=True)
df_visit.drop_duplicates(inplace=True)
df_visit.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15631 entries, 0 to 15630
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   index              15631 non-null  int64 
 1   contactID          15631 non-null  object
 2   visit_first_visit  15631 non-null  int8  
 3   visit_total_pages  15631 non-null  int8  
 4   campaignID         15631 non-null  object
 5   mail_click_freq    15631 non-null  int8  
dtypes: int64(1), int8(3), object(2)
memory usage: 412.3+ KB


#### 5. Cleaning voor de grote merge

In [90]:
accounts_merged = accounts_merged.drop(['accountID'], axis=1)

In [91]:
# Drop duplicates
accounts_merged = accounts_merged.drop_duplicates(keep='first')
campagnes_merged = campagnes_merged.drop_duplicates(keep='first')

In [92]:
numeric_cols = accounts_merged.select_dtypes(include=np.number).columns
accounts_merged[numeric_cols] = accounts_merged[numeric_cols].astype('int8')

numeric_cols = campagnes_merged.select_dtypes(include=np.number).columns
campagnes_merged[numeric_cols] = campagnes_merged[numeric_cols].astype('int8')

In [93]:
accounts_merged.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16537 entries, 0 to 16536
Data columns (total 18 columns):
 #   Column                              Non-Null Count  Dtype 
---  ------                              --------------  ----- 
 0   contactID                           16537 non-null  object
 1   subregio                            16537 non-null  object
 2   ondernemingstype                    16537 non-null  object
 3   ondernemingsaard                    16537 non-null  object
 4   activiteitNaam                      16537 non-null  object
 5   unknown                             16537 non-null  int8  
 6   afspraak_Arbeidsmarkt               16537 non-null  int8  
 7   afspraak_Bedrijfsbeheer             16537 non-null  int8  
 8   afspraak_Duurzaamheid               16537 non-null  int8  
 9   afspraak_Familiebedrijfsmanagement  16537 non-null  int8  
 10  afspraak_Financieel                 16537 non-null  int8  
 11  afspraak_Groeien en Netwerking      16537 non-null  in

In [94]:
accounts_merged['activiteitNaam'].value_counts()

activiteitNaam
Overige industrie & diensten                     6434
Voeding                                          1939
Ijzer en staal                                   1595
Chemie, petrochemie                              1427
Distributie, logistiek en transport              1055
Telecom & IT                                      905
Papier & karton                                   510
Human capital                                     472
Energie                                           364
Zorg                                              271
Consultancy                                       259
Technologische industrie & diensten               178
Groothandel                                       174
Bouw                                              144
Automobiel- en Tweewielerindustrie                132
Financiële diensten                                94
Detailhandel                                       86
Textiel, kleding en confectie                      76
Grafische ind

In [95]:
campagnes_merged.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4101 entries, 0 to 4100
Data columns (total 17 columns):
 #   Column                             Non-Null Count  Dtype 
---  ------                             --------------  ----- 
 0   campagneID                         4101 non-null   object
 1   campagneType                       4101 non-null   object
 2   campagneSoort                      4101 non-null   object
 3   sessie_ondernemen                  4101 non-null   int8  
 4   sessie_logistiek                   4101 non-null   int8  
 5   sessie_onderwijs                   4101 non-null   int8  
 6   sessie_duurzaamheid                4101 non-null   int8  
 7   sessie_welt                        4101 non-null   int8  
 8   sessie_lidmaatschap                4101 non-null   int8  
 9   sessie_innovatie en Technologie    4101 non-null   int8  
 10  sessie_netwerking                  4101 non-null   int8  
 11  sessie_algemeen                    4101 non-null   int8  
 12  sessie

#### 6. Alles mergen

In [96]:
inschrijving_cols = ['campagneID', 'contactID', 'facturatieBedrag']

inschrijving_query = create_query('FactInschrijving', inschrijving_cols)
df_inschrijving = pd.read_sql(inschrijving_query, conn)

df_inschrijving.shape

(78790, 3)

In [97]:
int_cols = accounts_merged.select_dtypes(include=['int64', 'int32']).columns
accounts_merged[int_cols] = accounts_merged[int_cols].apply(lambda x: np.int8(x))

int_cols = campagnes_merged.select_dtypes(include=['int64', 'int32']).columns
campagnes_merged[int_cols] = campagnes_merged[int_cols].apply(lambda x: np.int8(x))

In [98]:
pq.write_table(pa.Table.from_pandas(accounts_merged), 'account_merged.parquet')
pq.write_table(pa.Table.from_pandas(campagnes_merged), 'campagnes_merged.parquet')

# Read DataFrames back from Parquet files
account_merged_parquet = pd.read_parquet('account_merged.parquet')
campagnes_merged_parquet = pd.read_parquet('campagnes_merged.parquet')

In [99]:
merged_total = pd.merge(account_merged_parquet.assign(key=1), campagnes_merged_parquet.assign(key=1), on='key').drop('key', axis=1)
merged_total.shape

(67818237, 35)

In [100]:
# RAM vrijmaken
try:
    del accounts_merged
    del campagnes_merged
    del account_merged_parquet
    del campagnes_merged_parquet
except:
    print("Data is niet meer beschikbaar / is al verwijderd")

In [101]:
merged_total = merged_total.drop_duplicates(keep='first')

In [102]:
def calc_marketing_pressure(row):
    marketing_pressure_cols = ['visit_first_visit', 'visit_total_pages', 'mail_click_freq']
    return int(row[marketing_pressure_cols].sum())

In [103]:
df_inschrijving['ingeschreven'] = 1
merged_total = merged_total.merge(df_inschrijving[['contactID', 'campagneID', 'ingeschreven']], on=['contactID', 'campagneID'], how='left')
merged_total['ingeschreven'] = merged_total['ingeschreven'].fillna(0).apply(lambda x: np.int8(x))

In [104]:
# RAM vrijmaken
try:
    del df_inschrijving
except:
    print("Data is niet meer beschikbaar / is al verwijderd")

In [105]:
df_visit['marketing_pressure'] = df_visit.apply(calc_marketing_pressure, axis=1)
merged_total = pd.merge(merged_total, df_visit[['contactID', 'campaignID', 'marketing_pressure']], 
                        left_on=['contactID', 'campagneID'], right_on=['contactID', 'campaignID'], how='left')
merged_total['marketing_pressure'] = merged_total['marketing_pressure'].fillna(-1).apply(lambda x: np.int8(x))

In [106]:
merged_total = merged_total.drop(['contactID', 'campagneID', 'campaignID'], axis=1)

In [107]:
merged_total.shape

(67830255, 35)

In [108]:
merged_total['ingeschreven'].value_counts()

ingeschreven
0    67804274
1       25981
Name: count, dtype: int64

In [109]:
# RAM vrijmaken
try:
    del df_visit
except:
    print("Data is niet meer beschikbaar / is al verwijderd")

#### 7. Alle samples nemen met inschrijving == 1 en 300k samples nemen met inschrijving == 0

In [110]:
df_ingeschreven = merged_total[merged_total['ingeschreven'] == 1]
df_niet_ingeschreven = merged_total[merged_total['ingeschreven'] == 0].sample(n=(300000 - len(df_ingeschreven)), random_state=42)

df = pd.concat([df_ingeschreven, df_niet_ingeschreven])
df['ingeschreven'].value_counts()

ingeschreven
0    274019
1     25981
Name: count, dtype: int64

In [111]:
df_vault = df.copy()

In [112]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 300000 entries, 7183 to 52768101
Data columns (total 35 columns):
 #   Column                              Non-Null Count   Dtype 
---  ------                              --------------   ----- 
 0   subregio                            300000 non-null  object
 1   ondernemingstype                    300000 non-null  object
 2   ondernemingsaard                    300000 non-null  object
 3   activiteitNaam                      300000 non-null  object
 4   unknown                             300000 non-null  int8  
 5   afspraak_Arbeidsmarkt               300000 non-null  int8  
 6   afspraak_Bedrijfsbeheer             300000 non-null  int8  
 7   afspraak_Duurzaamheid               300000 non-null  int8  
 8   afspraak_Familiebedrijfsmanagement  300000 non-null  int8  
 9   afspraak_Financieel                 300000 non-null  int8  
 10  afspraak_Groeien en Netwerking      300000 non-null  int8  
 11  afspraak_Internationaal Ondernemen  300

In [113]:
# RAM vrijmaken
try:
    del df_ingeschreven
    del merged_total
    del df_niet_ingeschreven
except:
    print("Data is niet meer beschikbaar / is al verwijderd")

#### 8. Verdere data cleaning en One/Multi hot encoding

In [114]:
# import pandas as pd

# df = pd.read_csv('../data_clean/AA_Supervised_dataset.csv')
# df.shape

In [115]:
from sklearn.preprocessing import OneHotEncoder


subregio_cat = ['Gent', 'Leiestreek-Meetjesland', 'Dendermonde', 'Aalst', 'Oudenaarde', 'Waasland']
campagnetype_cat = ['Project', 'Projectgebonden', 'Campagne', 'Opleiding', 'Netwerkevenement', 'Infosessie']

oneHot_subregio = OneHotEncoder(sparse=False, categories=[subregio_cat])
oneHot_campagnetype = OneHotEncoder(sparse=False, categories=[campagnetype_cat])

In [116]:
# Subregio
subregio_1hot = oneHot_subregio.fit_transform(df[['subregio']])



In [117]:
# Ondernemingstype
ondernemingstype_categories = [
 {'categorie': 'unknown', 'binary': None},
 {'categorie': 'Beroepsorganisatie', 'binary': None},
 {'categorie': 'Vakbonden', 'binary': None},
 {'categorie': 'Eenmanszaak', 'binary': None},
 {'categorie': 'Multinational', 'binary': None},
 {'categorie': 'Sociale organisatie', 'binary': None},
 {'categorie': 'Werkgeversorganisaties', 'binary': None},
 {'categorie': 'Pers/Media', 'binary': None},
 {'categorie': 'Overheid', 'binary': None},
 {'categorie': 'Onderwijs', 'binary': None},
 {'categorie': 'Social Profit', 'binary': None},
 {'categorie': 'Vrije beroepen', 'binary': None},
 {'categorie': 'Familiebedrijf', 'binary': None},
 {'categorie': 'Bedrijf', 'binary': None}
]

for i, categorie in enumerate(ondernemingstype_categories):
  categorie['binary'] = str(bin(i)[2:].zfill(4))

category_to_binary = {categorie['categorie']: categorie['binary'] for categorie in ondernemingstype_categories}
df['ondernemingstype'] = df['ondernemingstype'].map(category_to_binary)

for i in range(1, 5):
    df[f'ondernemingstype_{i}'] = df['ondernemingstype'].apply(lambda x: int(str(x)[i-1]))

In [118]:
# Primaire activiteit
activiteitNaam_categories = [
  {'categorie': 'Luchthavengerelateerd', 'binary': None},
  {'categorie': 'Farmacie', 'binary': None},
  {'categorie': 'Diamant, edelstenen, juwelen', 'binary': None},
  {'categorie': 'Havengerelateerd', 'binary': None},
  {'categorie': 'Media', 'binary': None},
  {'categorie': 'Overheid', 'binary': None},
  {'categorie': 'Verenigingen en maatschappelijke organisaties', 'binary': None},
  {'categorie': 'Onderwijs', 'binary': None},
  {'categorie': 'Milieu', 'binary': None},
  {'categorie': 'Vrije beroepen', 'binary': None},
  {'categorie': 'Agrarische & bio-industrie', 'binary': None},
  {'categorie': 'Hout- en meubelindustrie', 'binary': None},
  {'categorie': 'Accountancy & boekhouding', 'binary': None},
  {'categorie': 'Vastgoed', 'binary': None},
  {'categorie': 'Verzekering', 'binary': None},
  {'categorie': 'Financiële diensten', 'binary': None},
  {'categorie': 'Grafische industrie en diensten', 'binary': None},
  {'categorie': 'Automobiel- en Tweewielerindustrie', 'binary': None},
  {'categorie': 'Textiel, kleding en confectie', 'binary': None},
  {'categorie': 'Horeca & toerisme', 'binary': None},
  {'categorie': 'Technologische industrie & diensten', 'binary': None},
  {'categorie': 'Zorg', 'binary': None},
  {'categorie': 'Detailhandel', 'binary': None},
  {'categorie': 'Groothandel', 'binary': None},
  {'categorie': 'Bouw', 'binary': None},
  {'categorie': 'Energie', 'binary': None},
  {'categorie': 'Consultancy', 'binary': None},
  {'categorie': 'Papier & karton', 'binary': None},
  {'categorie': 'Human capital', 'binary': None},
  {'categorie': 'Chemie, petrochemie', 'binary': None},
  {'categorie': 'Distributie, logistiek en transport', 'binary': None},
  {'categorie': 'Telecom & IT', 'binary': None},
  {'categorie': 'Ijzer en staal', 'binary': None},
  {'categorie': 'Voeding', 'binary': None},
  {'categorie': 'Overige industrie & diensten', 'binary': None}
]

for i, categorie in enumerate(activiteitNaam_categories):
    categorie['binary'] = str(bin(i)[2:].zfill(6))

category_to_binary = {categorie['categorie']: categorie['binary'] for categorie in activiteitNaam_categories}
df['activiteitNaam'] = df['activiteitNaam'].map(category_to_binary)

for i in range(1, 7):
    df[f'activiteitNaam_{i}'] = df['activiteitNaam'].apply(lambda x: int(str(x)[i-1]))

In [119]:
# Ondernemingsaard
diensten_column = []
productie_column = []

for label in df["ondernemingsaard"]:
    if label == "Productie & Diensten":
        diensten_column.append(1)
        productie_column.append(1)
    elif label == "Diensten":
        diensten_column.append(1)
        productie_column.append(0)
    elif label == "Productie":
        diensten_column.append(0)
        productie_column.append(1)
    else:
        diensten_column.append(0)
        productie_column.append(0)

ondernemingsaard_multihot = pd.DataFrame({"Diensten": diensten_column, "Productie": productie_column})
df = df.join(ondernemingsaard_multihot, rsuffix='_ondernemingsaard')

In [120]:
# Campagne soort
online_column = []
offline_column = []

for label in df["campagneSoort"]:
    if label == "On en Offline":
        online_column.append(1)
        offline_column.append(1)
    elif label == "Offline":
        online_column.append(0)
        offline_column.append(1)
    elif label == "Online":
        online_column.append(1)
        offline_column.append(0)
    else:
        online_column.append(0)
        offline_column.append(1)

campagne_soort_multihot = pd.DataFrame({"Online": online_column, "Offline": offline_column})

In [121]:
# Campagne type
campagne_type_1hot = oneHot_campagnetype.fit_transform(df[['campagneType']])



In [122]:
df = df.drop(['subregio', 'ondernemingstype', 'activiteitNaam', 'ondernemingsaard', 'campagneSoort', 'campagneType', 'unknown'], axis=1)

In [123]:
# Al de one hot encodings samenvoegen
df = df.join(pd.DataFrame(subregio_1hot), rsuffix='_subregio')
df = df.join(pd.DataFrame(campagne_type_1hot), rsuffix='_campagne_type')
df = df.join(campagne_soort_multihot, rsuffix='_campagneSoort')

In [124]:
# RAM vrijmaken
try:
    del ondernemingsaard_multihot
    del campagne_soort_multihot
    del campagne_type_1hot
except:
    print("Data is niet meer beschikbaar / is al verwijderd")


In [127]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 300000 entries, 7183 to 52768101
Data columns (total 54 columns):
 #   Column                              Non-Null Count   Dtype
---  ------                              --------------   -----
 0   afspraak_Arbeidsmarkt               300000 non-null  int8 
 1   afspraak_Bedrijfsbeheer             300000 non-null  int8 
 2   afspraak_Duurzaamheid               300000 non-null  int8 
 3   afspraak_Familiebedrijfsmanagement  300000 non-null  int8 
 4   afspraak_Financieel                 300000 non-null  int8 
 5   afspraak_Groeien en Netwerking      300000 non-null  int8 
 6   afspraak_Internationaal Ondernemen  300000 non-null  int8 
 7   afspraak_Lidmaatschap               300000 non-null  int8 
 8   afspraak_Logistiek en Transport     300000 non-null  int8 
 9   afspraak_Plato & Bryo               300000 non-null  int8 
 10  afspraak_Technologie en Innovatie   300000 non-null  int8 
 11  afspraak_Welt                       300000 non-null 

In [126]:
int_cols = df.select_dtypes(include=['int64', 'int32', 'float32', 'float64']).columns
df[int_cols] = df[int_cols].apply(lambda x: np.int8(x))

  arr = np.asarray(values, dtype=dtype)


#### 9. Dataset opslaan

In [67]:
# Save to csv
df.to_csv('../data_clean/AA_Supervised_dataset.csv', index=False)