In [143]:
import os
import numpy as np
import pandas as pd
from dotenv import load_dotenv
from sqlalchemy import create_engine
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

### Create connection to DWH

In [269]:
# Load connection settings from the .env file one level above the working directory.
ENV_URL = os.path.join(os.getcwd(), '../.env')
load_dotenv(ENV_URL)

# Read the settings after load_dotenv so values from the .env file are visible.
DWH_NAME, SERVER_NAME, DB_USER, DB_PASSWORD = (
    os.getenv(key) for key in ('DWH_NAME', 'SERVER_NAME', 'DB_USER', 'DB_PASSWORD')
)

# Two connection strings: user/password login through pymssql, and a trusted
# connection through pyodbc (no credentials in the URL).
# NOTE(review): credentials are interpolated unescaped; a password containing
# '@' or '/' would produce a malformed URL.
URL = f'mssql+pymssql://{DB_USER}:{DB_PASSWORD}@{SERVER_NAME}/{DWH_NAME}'
URL_LOCAL = f'mssql+pyodbc://{SERVER_NAME}/{DWH_NAME}?trusted_connection=yes&driver=ODBC+Driver+17 for SQL Server'

# Only URL_LOCAL is used here; the connection stays open for the cells below.
engine = create_engine(URL_LOCAL)
conn = engine.connect()

### Create a dataframe with all the data from the DWH

In [270]:
def create_query(table_name, columns, condition=None):
    """Build a ``SELECT`` statement for a table in the ``dbo`` schema of the DWH.

    Parameters
    ----------
    table_name : str
        Table name; it is bracket-quoted and prefixed with ``[DWH_NAME].[dbo]``.
    columns : iterable of str
        Column names to select, emitted in the given order.
    condition : str, optional
        Raw SQL appended after ``WHERE``. It is inserted verbatim, so it must
        never be built from untrusted input.

    Returns
    -------
    str
        The assembled query text.

    Raises
    ------
    ValueError
        If ``columns`` is empty, because ``SELECT  FROM ...`` is not valid SQL.
    """
    def quote(identifier):
        # Inside [...] a closing bracket is escaped by doubling it.
        return '[' + str(identifier).replace(']', ']]') + ']'

    columns = list(columns)
    if not columns:
        # Fail here with a clear message instead of sending malformed SQL to the server.
        raise ValueError('columns must contain at least one column name')

    select_list = ', '.join(quote(column) for column in columns)
    query = f"SELECT {select_list} FROM {quote(DWH_NAME)}.[dbo].{quote(table_name)}"

    if condition:
        query += f" WHERE {condition}"

    return query

#### Account data

In [272]:
acc_cols = ['accountID', 'plaats', 'ondernemingstype', 'ondernemingsaard', 'activiteitNaam']
# Account filter
acc_condition = "accountStatus = 1 AND provincie = 'Oost-Vlaanderen'"
# Build the query and read the accounts
acc_query = create_query('DimAccount', acc_cols, acc_condition)
df_account = pd.read_sql(acc_query, conn)

# Clean the place name: remove parenthesised lowercase suffixes, collapse double spaces.
# NOTE(review): 'plaats' is dropped further down without being used anywhere.
df_account['plaats'] = (
    df_account['plaats']
    .str.replace(r'\([a-z.-]+\)', '', regex=True)
    .str.replace('  ', ' ')
)

# Combine the three company descriptors into one text column
df_account['onderneming'] = (
    df_account['ondernemingstype'] + ' '
    + df_account['ondernemingsaard'] + ' '
    + df_account['activiteitNaam']
)

# Remove placeholder values and separators, then normalise case and edge commas
df_account['onderneming'] = (
    df_account['onderneming']
    .str.replace('unknown', '')
    .str.replace(', , ', '')
    .str.strip()
    .str.lower()
    .str.replace(r',$|^,', '', regex=True)
    .str.replace('&', '')
    .str.replace('-', '')
)

# Drop the columns that were only needed as inputs
df_account = df_account.drop(
    columns=['plaats', 'ondernemingstype', 'ondernemingsaard', 'activiteitNaam']
)

df_account.shape

(3206, 2)

#### Contact data

In [273]:
contact_cols = [
    'contactID',
    'accountID',
    'functietitel',
    'persoon_mail_thema_duurzaamheid',
    'persoon_mail_thema_financieel_fiscaal',
    'persoon_mail_thema_innovatie',
    'persoon_mail_thema_internationaal_ondernemen',
    'persoon_mail_thema_mobiliteit',
    'persoon_mail_thema_omgeving',
    'persoon_mail_thema_sales_marketing_communicatie',
    'persoon_mail_thema_strategie_en_algemeen_management',
    'persoon_mail_thema_talent',
    'persoon_mail_thema_welzijn',
    'persoon_mail_type_bevraging',
    'persoon_mail_type_communities_en_projecten',
    'persoon_mail_type_netwerkevenementen',
    'persoon_mail_type_nieuwsbrieven',
    'persoon_mail_type_opleidingen',
    'persoon_mail_type_persberichten_belangrijke_meldingen',
    'persoon_marketingcommunicatie',
]

# Only active contacts are loaded.
contact_condition = "contactStatus = 'Actief'"
contact_query = create_query('DimContact', contact_cols, contact_condition)
df_contact = pd.read_sql(contact_query, conn)

# Recode the marketing-communication preference to integers:
# missing / 'Uitgeschreven' / 'unknown' -> -1, 'Strikt' -> 0, 'Flexibel' -> 1.
df_contact['persoon_marketingcommunicatie'] = (
    df_contact['persoon_marketingcommunicatie']
    .fillna('-1')
    .str.replace('Strikt', '0')
    .str.replace('Flexibel', '1')
    .str.replace('Uitgeschreven', '-1')
    .str.replace('unknown', '-1')
    .astype(int)
)

df_contact.shape

(396800, 20)

### Merge account and contact data

In [293]:
# Inner join on accountID: only contacts whose account passed the account filter remain.
df = df_contact.merge(df_account, how='inner', on='accountID')
df.shape

(16635, 21)

### Afspraken data

In [294]:
afspraak_cols = ['subthema', 'onderwerp', 'keyphrases', 'contactID']

# Appointments that are linked to a contact.
afspraak_condition = "contactID is not null"
afspraak_query = create_query('DimAfspraak', afspraak_cols, afspraak_condition)
df_afspraak = pd.read_sql(afspraak_query, conn)

# Theme: lowercase and strip every non-word / non-space character. This also
# removes the parentheses, so the separate parenthesis passes (which used
# invalid escape sequences in non-raw strings) are not needed.
df_afspraak['thema'] = (
    df_afspraak['subthema']
    .str.lower()
    .str.replace(r'[^\w\s]', '', regex=True)
    .str.replace('  ', ' ', regex=False)
    .str.strip()
)

# Topic: missing values become '' instead of being cast to the literal text
# 'nan' / 'None', which would otherwise leak into the combined topic text later.
df_afspraak['onderwerp'] = (
    df_afspraak['onderwerp']
    .str.lower()
    .fillna('')
    .str.replace('ov-', '', regex=False)
    .str.replace('ov -', '', regex=False)
    .str.replace('ov ', '', regex=False)
    .str.replace('-', ' ', regex=False)
    .str.replace(r'[^\w\s]', '', regex=True)
    .str.replace('  ', ' ', regex=False)
    .str.strip()
)

df_afspraak['keyphrases'] = (
    df_afspraak['keyphrases']
    .str.lower()
    .str.replace(r'[^\w\s]', '', regex=True)
    .str.replace('  ', ' ', regex=False)
    .str.strip()
)

# 'subthema' has been replaced by the cleaned 'thema' column.
df_afspraak = df_afspraak.drop(columns=['subthema']).drop_duplicates()

df_afspraak.shape

(2547, 4)

In [295]:
afspraak_cols = ['subthema', 'onderwerp', 'keyphrases', 'accountID']

# Appointments that are linked to an account.
afspraak_condition = "accountID is not null"
afspraak_query = create_query('DimAfspraak', afspraak_cols, afspraak_condition)
df_afspraak1 = pd.read_sql(afspraak_query, conn)

# Theme: lowercase and strip every non-word / non-space character. This also
# removes the parentheses, so the separate parenthesis passes (which used
# invalid escape sequences in non-raw strings) are not needed.
df_afspraak1['thema'] = (
    df_afspraak1['subthema']
    .str.lower()
    .str.replace(r'[^\w\s]', '', regex=True)
    .str.replace('  ', ' ', regex=False)
    .str.strip()
)

# Topic: missing values become '' instead of being cast to the literal text
# 'nan' / 'None', which would otherwise leak into the combined topic text later.
df_afspraak1['onderwerp'] = (
    df_afspraak1['onderwerp']
    .str.lower()
    .fillna('')
    .str.replace('ov-', '', regex=False)
    .str.replace('ov -', '', regex=False)
    .str.replace('ov ', '', regex=False)
    .str.replace('-', ' ', regex=False)
    .str.replace(r'[^\w\s]', '', regex=True)
    .str.replace('  ', ' ', regex=False)
    .str.strip()
)

df_afspraak1['keyphrases'] = (
    df_afspraak1['keyphrases']
    .str.lower()
    .str.replace(r'[^\w\s]', '', regex=True)
    .str.replace('  ', ' ', regex=False)
    .str.strip()
)

# 'subthema' has been replaced by the cleaned 'thema' column.
df_afspraak1 = df_afspraak1.drop(columns=['subthema']).drop_duplicates()

df_afspraak1.shape

(4861, 4)

### Merge afspraken data with account and contact data

In [296]:
# Attach appointment text twice: once matched on the contact, once on the account.
# The overlapping text columns receive the pandas default suffixes _x (contact) and _y (account).
df = df.merge(df_afspraak, on=['contactID'], how='left')
df = df.merge(df_afspraak1, on=['accountID'], how='left')


def _unique_tokens(text):
    """Return the distinct whitespace-separated tokens of ``text``, sorted and space-joined.

    Sorting gives a canonical form. Joining a plain ``set`` yields an order that
    changes between kernel sessions (string hashing is randomised), which makes
    both the text and the ``drop_duplicates`` result below irreproducible.
    ``split()`` without arguments also discards the empty tokens that runs of
    spaces would otherwise produce.
    """
    return ' '.join(sorted(set(text.split())))


for col in ('onderwerp', 'keyphrases', 'thema'):
    # Missing text on either side counts as empty before the two sides are combined.
    combined = df[f'{col}_x'].fillna('') + ' ' + df[f'{col}_y'].fillna('')
    df[col] = combined.apply(_unique_tokens)

df = df.drop(
    columns=['onderwerp_x', 'onderwerp_y', 'keyphrases_x', 'keyphrases_y', 'thema_x', 'thema_y']
).drop_duplicates()
df.shape

(31314, 24)

### Campagne data

In [297]:
campagne_cols = ['campagneID', 'campagneNaam', 'campagneType', 'campagneSoort']

# No WHERE clause: every campaign is loaded.
campagne_query = create_query('DimCampagne', campagne_cols)
df_campagne = pd.read_sql(campagne_query, conn)

# Normalise the campaign name: drop the 'OV-' / 'ov-' prefix text, turn hyphens
# into spaces, strip punctuation, collapse double spaces, trim and lowercase.
df_campagne['campagneNaam'] = (
    df_campagne['campagneNaam']
    .str.replace('OV-', '')
    .str.replace('ov-', '')
    .str.replace('-', ' ')
    .str.replace(r'[^\w\s]', '', regex=True)
    .str.replace('  ', ' ')
    .str.strip()
    .str.lower()
    .str.replace('  ', ' ')
)

df_campagne.shape

(4101, 4)

### Inschrijvingen data

In [298]:
inschrijving_cols = ['campagneID', 'contactID', 'bron']

# No WHERE clause: every registration is loaded.
inschrijving_query = create_query('FactInschrijving', inschrijving_cols)
df_inschrijving = pd.read_sql(inschrijving_query, conn)

# Recode the registration source to integers: 'unknown' -> -1, 'Website' -> 0, 'Email' -> 1.
df_inschrijving['bron'] = (
    df_inschrijving['bron']
    .astype(str)
    .str.replace('unknown', '-1')
    .str.replace('Website', '0')
    .str.replace('Email', '1')
    .astype(int)
)

df_inschrijving.shape

(91851, 3)

### Campagne en inschrijvingen data samenvoegen

In [300]:
# Inner join: keep registrations that belong to a known campaign, without exact duplicate rows.
df_campagne_inschrijving = (
    df_campagne
    .merge(df_inschrijving, on='campagneID', how='inner')
    .drop_duplicates()
)
df_campagne_inschrijving.shape

(48531, 6)

### Sessie data

In [301]:
sessie_cols = ['campaignID', 'themaNaam']

sessie_query = create_query('DimSessie', sessie_cols)
df_sessie = pd.read_sql(sessie_query, conn)

# One row per campaign holding its distinct theme names.
# - sorted(...) makes the joined text reproducible; the iteration order of a
#   plain set of strings changes between kernel sessions.
# - non-string entries (missing theme names) are skipped because str.join
#   would raise a TypeError on them.
df_sessie = df_sessie.groupby('campaignID')['themaNaam'].apply(list).reset_index()
df_sessie['themaNaam'] = df_sessie['themaNaam'].apply(
    lambda names: ', '.join(sorted({name for name in names if isinstance(name, str)}))
)

# Same text normalisation as applied to the campaign names.
df_sessie['themaNaam'] = (
    df_sessie['themaNaam']
    .str.replace('OV-', '', regex=False)
    .str.replace('ov-', '', regex=False)
    .str.replace('-', ' ', regex=False)
    .str.replace(r'[^\w\s]', '', regex=True)
    .str.replace('  ', ' ', regex=False)
    .str.strip()
    .str.lower()
    .str.replace('  ', ' ', regex=False)
)

df_sessie.shape

(1954, 2)

### Sessie en campagne/inschrijvingen data samenvoegen

In [302]:
# The key is spelled differently in the two frames: 'campagneID' on the left, 'campaignID' on the right.
df_campagne_inschrijving_sessie = (
    df_campagne_inschrijving
    .merge(df_sessie, left_on='campagneID', right_on='campaignID', how='inner')
    .drop_duplicates()
)
df_campagne_inschrijving_sessie.shape

(48531, 8)

### Campagne, inschrijvingen en sessie data samenvoegen met account en contact data

In [303]:
# Inner join on contactID: contacts without any campaign registration are dropped here.
df = df.merge(df_campagne_inschrijving_sessie, how='inner', on='contactID')
df.shape

(41122, 31)

### Visit data

In [304]:
# TODO: add more columns to the DWH
visit_cols = ['contactID', 'visit_first_visit', 'visit_total_pages', 'mailing_onderwerp', 'mailSent_clicks', 'mailSent']

visit_query = create_query('DimVisit', visit_cols)
df_visit = pd.read_sql(visit_query, conn)

# Exact duplicate rows are removed before anything is aggregated.
df_visit.drop_duplicates(inplace=True)

# Recode the first-visit flag to integers: 'Ja' -> 0, 'Nee' -> 1, 'unknown' -> -1.
df_visit['visit_first_visit'] = (
    df_visit['visit_first_visit']
    .str.replace('Ja', '0')
    .str.replace('Nee', '1')
    .str.replace('unknown', '-1')
    .astype(int)
)

# Whole-value replace (not a substring replace): 'unknown' page counts become -1.0.
df_visit['visit_total_pages'] = (
    df_visit['visit_total_pages'].replace('unknown', '-1.0').astype(float)
)

# Per-contact aggregates, broadcast back onto every row of that contact.
# All four are computed from the columns as they are at this point.
per_contact = df_visit.groupby(['contactID'])
df_visit = df_visit.assign(
    aantal_mails=per_contact['mailSent'].transform('nunique'),
    clicks_total=per_contact['mailSent_clicks'].transform('sum'),
    visit_total_pages=per_contact['visit_total_pages'].transform('sum'),
    visit_first_visit=per_contact['visit_first_visit'].transform('sum'),
)

# Clicks per distinct mail.
# NOTE(review): a contact with no non-null 'mailSent' has aantal_mails == 0,
# which makes this a division by zero — confirm that cannot occur in the data.
df_visit['mail_click_freq'] = df_visit['clicks_total'] / df_visit['aantal_mails']

# Drop the helper columns; rows of the same contact now collapse where the remaining columns agree.
df_visit = df_visit.drop(
    columns=['mailSent', 'mailSent_clicks', 'clicks_total', 'aantal_mails']
).drop_duplicates()
df_visit.shape

(6124, 5)

### Final merge

In [305]:
# Inner join on contactID: only contacts that also have visit data remain.
df = df.merge(df_visit, how='inner', on='contactID')

# TODO: build keyphrases

# De-duplicated in place because a later step assigns a new column on this same frame.
df.drop_duplicates(inplace=True)
df.shape

(21419, 35)

### Preprocessing + marketing_pressure calculation

In [306]:
def calc_marketing_pressure(df):
    """Collapse the mail-preference and engagement columns into one integer score.

    Every column whose name contains ``persoon_mail_type``, ``persoon_mail_thema``
    or ``persoon_marketingcommunicatie`` is summed row-wise together with
    ``bron``, ``visit_first_visit``, ``visit_total_pages`` and
    ``mail_click_freq``. The sum is cast to ``int`` and stored in a new
    ``marketing_pressure`` column, after which the source columns are dropped.

    The frame is modified in place and the same object is returned.
    """
    name_fragments = ('persoon_mail_type', 'persoon_mail_thema', 'persoon_marketingcommunicatie')
    engagement_cols = ['bron', 'visit_first_visit', 'visit_total_pages', 'mail_click_freq']

    # Preference columns in frame order, followed by the fixed engagement columns.
    score_cols = [name for name in df.columns if any(part in name for part in name_fragments)]
    score_cols += engagement_cols

    df['marketing_pressure'] = df[score_cols].sum(axis=1).astype(int)
    df.drop(columns=score_cols, inplace=True)
    return df

In [307]:
df = calc_marketing_pressure(df)

# Collapse to one row per contact; every remaining column becomes a list of its values.
df = df.groupby('contactID').agg(list)

# Mean pressure per contact, rounded to the nearest integer.
df['marketing_pressure'] = df['marketing_pressure'].apply(lambda x: np.mean(x).round(0).astype(int))
# Distinct keyphrase strings per contact. sorted(...) keeps the joined text
# reproducible; the iteration order of a plain set of strings changes between
# kernel sessions.
df['keyphrases'] = df['keyphrases'].apply(lambda x: ','.join(sorted(set(x))))

# Bring contactID back as a regular column.
df.reset_index(inplace=True)

### Recommendation system

In [None]:
def recommend(df, new_keyphrase: str, top_n=10, id_col=None):
    """Rank contacts for a new campaign by keyphrase similarity and marketing pressure.

    The ``top_n`` rows of ``df`` whose ``keyphrases`` text is most similar
    (TF-IDF cosine similarity) to ``new_keyphrase`` are selected. Their contact
    ids are returned ordered by min-max scaled ``marketing_pressure``, lowest
    first; contacts with equal pressure keep their similarity order.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``keyphrases`` (text), ``marketing_pressure``
        (numeric) and a contact id column. The frame is not modified.
    new_keyphrase : str
        Keyphrase text describing the new campaign.
    top_n : int, default 10
        Number of most similar rows to consider.
    id_col : str, optional
        Name of the contact id column. When omitted,
        ``contact_contactpersoon_id`` is used if present, otherwise
        ``contactID`` (the column name produced earlier in this notebook).

    Returns
    -------
    list of tuple
        ``(contact_id, scaled_marketing_pressure)`` pairs, ascending by pressure.

    Raises
    ------
    KeyError
        If no contact id column can be found in ``df``.
    """
    if id_col is None:
        # The legacy name is tried first so frames that worked before behave the same.
        id_col = next(
            (name for name in ('contact_contactpersoon_id', 'contactID') if name in df.columns),
            None,
        )
    if id_col is None or id_col not in df.columns:
        raise KeyError(
            "no contact id column found; expected 'contactID' or "
            "'contact_contactpersoon_id', or pass id_col explicitly"
        )

    # Plain arrays are used so row positions are valid regardless of the frame's index labels.
    contact_ids = df[id_col].to_numpy()

    # Scale into a local array; writing back into df would alter the caller's
    # column on every call.
    scaler = MinMaxScaler()
    pressure = np.asarray(scaler.fit_transform(df[['marketing_pressure']])).ravel()

    # Vectorise the existing keyphrases and the new one in the same TF-IDF space.
    tfidf = TfidfVectorizer()
    tfidf_matrix = tfidf.fit_transform(df['keyphrases'])
    similarity = cosine_similarity(tfidf.transform([new_keyphrase]), tfidf_matrix)[0]

    # Stable descending sort: equally similar rows keep their original order.
    top_positions = np.argsort(-similarity, kind='stable')[:top_n]

    # Pressure per contact id, taken from the first row of df carrying that id.
    first_pressure = {}
    for contact_id, value in zip(contact_ids, pressure):
        first_pressure.setdefault(contact_id, value)

    # Distinct ids in similarity order; the stable sort below then breaks
    # pressure ties by similarity instead of by arbitrary set order.
    ranked_ids = list(dict.fromkeys(contact_ids[pos] for pos in top_positions))
    ranked_ids.sort(key=first_pressure.__getitem__)

    print("Recommended Contact Persons for the New Campaign:")
    return [(contact_id, first_pressure[contact_id]) for contact_id in ranked_ids]

### Test the recommendation system

In [None]:
# Keyphrases describing an example campaign, used to try out the recommender.
campaign_keyphrase = 'volwassen, ov, gent, merelbeke, ondernemen, winst, ceo, bedrijf, netwerkevenement'
recommend(df, new_keyphrase=campaign_keyphrase)