In [19]:
import os
import numpy as np
import pandas as pd
from dotenv import load_dotenv
from sqlalchemy import create_engine
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

### Create connection to DWH

In [21]:
# Load the DWH credentials from the .env file one directory above the
# current working directory.
ENV_URL = os.path.join(os.getcwd(), '../.env')
load_dotenv(ENV_URL)

DWH_NAME = os.environ.get('DWH_NAME')
SERVER_NAME = os.environ.get('SERVER_NAME')
DB_USER = os.environ.get('DB_USER')
DB_PASSWORD = os.environ.get('DB_PASSWORD')

# Fail early with a readable message instead of building a URL that contains
# the literal text "None" and failing later inside the database driver.
_missing_settings = [
    name for name in ('DWH_NAME', 'SERVER_NAME', 'DB_USER', 'DB_PASSWORD')
    if not os.environ.get(name)
]
if _missing_settings:
    raise RuntimeError(
        f"Missing settings in environment/.env: {', '.join(_missing_settings)}"
    )

# NOTE(review): the credentials are interpolated unescaped; a password that
# contains characters such as '@' or '/' would presumably need URL-escaping
# before being placed in the connection URL -- confirm against the driver docs.
URL = f'mssql+pymssql://{DB_USER}:{DB_PASSWORD}@{SERVER_NAME}/{DWH_NAME}'

# The cells below pass `conn` to pd.read_sql, so the connection has to be
# created here for the notebook to run from a fresh kernel.
engine = create_engine(URL)
conn = engine.connect()

### Create a dataframe with all the data from the DWH

In [22]:
def create_query(table_name, columns, condition=None):
    """Build a ``SELECT`` statement for a table in the ``dbo`` schema of the DWH.

    The database name is read from the module-level ``DWH_NAME``.

    Parameters
    ----------
    table_name : str
        Name of the table to select from.
    columns : iterable of str, or str
        Column names to select. A single string is treated as one column
        name rather than being iterated character by character.
    condition : str, optional
        Raw SQL placed verbatim after ``WHERE``. It is not escaped, so it
        must never be built from untrusted input.

    Returns
    -------
    str
        The assembled query text.

    Raises
    ------
    ValueError
        If ``columns`` is empty (the result would be invalid SQL).
    """
    if isinstance(columns, str):
        columns = [columns]
    columns = list(columns)
    if not columns:
        raise ValueError("create_query needs at least one column name")

    def quote(identifier):
        # Inside a bracket-delimited identifier a closing bracket is escaped
        # by doubling it.
        return '[' + str(identifier).replace(']', ']]') + ']'

    column_list = ', '.join(quote(column) for column in columns)
    query = f"SELECT {column_list} FROM {quote(DWH_NAME)}.[dbo].{quote(table_name)}"

    if condition:
        query += f" WHERE {condition}"

    return query

#### Account data

In [None]:
# Account columns to fetch
acc_cols = ['account_account_id', 'account_adres_plaats', 'account_adres_geografische_subregio', 'account_ondernemingstype',
            'account_ondernemingsaard', 'account_primaire_activiteit']

# Account filter
acc_condition = "account_reden_van_status = 'Actief' AND account_adres_provincie = 'Oost-Vlaanderen'"
# Build the query and read the result into a dataframe
acc_query = create_query('DimAccount', acc_cols, acc_condition)
df_account = pd.read_sql(acc_query, conn)

# Merge the address parts into one lower-cased column
df_account['account_adres'] = (
    df_account['account_adres_plaats'].str.lower()
    + ' '
    + df_account['account_adres_geografische_subregio'].str.lower()
)
# Drop parenthesised suffixes and collapse the double space they leave behind.
# `regex` is passed explicitly because its default differs between pandas versions.
df_account['account_adres'] = (
    df_account['account_adres']
    .str.replace(r'\([a-z.-]+\)', '', regex=True)
    .str.replace('  ', ' ', regex=False)
)

# Merge the company descriptors into one column
df_account['account_onderneming'] = (
    df_account['account_ondernemingstype'] + ', '
    + df_account['account_ondernemingsaard'] + ', '
    + df_account['account_primaire_activiteit'] + ' '
)

df_account['account_onderneming'] = (
    df_account['account_onderneming']
    .str.replace('unknown', '', regex=False)
    # An emptied middle part leaves ', , ' behind; collapse it to a single
    # separator so the neighbouring parts are not glued into one token.
    .str.replace(', , ', ', ', regex=False)
    .str.strip()
    .str.lower()
    .str.replace(r',$|^,', '', regex=True)
    .str.replace('&', '', regex=False)
    .str.replace('-', '', regex=False)
    # Removing a leading comma can expose a leading space.
    .str.strip()
)

# Drop the columns that were merged above
df_account = df_account.drop(columns=['account_adres_plaats', 'account_adres_geografische_subregio', 'account_ondernemingstype',
            'account_ondernemingsaard', 'account_primaire_activiteit'])

#### Contact data

In [None]:
# Columns read from DimContact: identifiers, the job title, and the
# persoon_mail_thema_* / persoon_mail_type_* / persoon_marketingcommunicatie columns.
contact_cols = [
    'contact_contactpersoon_id',
    'contact_account',
    'contact_functietitel',
    'persoon_mail_thema_duurzaamheid',
    'persoon_mail_thema_financieel_fiscaal',
    'persoon_mail_thema_innovatie',
    'persoon_mail_thema_internationaal_ondernemen',
    'persoon_mail_thema_mobiliteit',
    'persoon_mail_thema_omgeving',
    'persoon_mail_thema_sales_marketing_communicatie',
    'persoon_mail_thema_strategie_en_algemeen_management',
    'persoon_mail_thema_talent',
    'persoon_mail_thema_welzijn',
    'persoon_mail_type_bevraging',
    'persoon_mail_type_communities_en_projecten',
    'persoon_mail_type_netwerkevenementen',
    'persoon_mail_type_nieuwsbrieven',
    'persoon_mail_type_opleidingen',
    'persoon_mail_type_persberichten_belangrijke_meldingen',
    'persoon_marketingcommunicatie',
]

# Only rows whose contact_status equals 'Actief' are selected.
contact_condition = "contact_status = 'Actief'"
contact_query = create_query(
    table_name='DimContact',
    columns=contact_cols,
    condition=contact_condition,
)
df_contact = pd.read_sql(sql=contact_query, con=conn)

### Preprocessing + marketing_pressure calculation

In [None]:
def calc_marketing_pressure(df):
    """Collapse the marketing-related columns into one ``marketing_pressure`` column.

    The score is the row-wise sum of every column whose name contains
    ``persoon_mail_type``, ``persoon_mail_thema`` or
    ``persoon_marketingcommunicatie``, plus the columns ``inschrijving_bron``,
    ``visit_first_visit``, ``visit_total_pages`` and ``mail_click_freq``.
    The sum is cast to ``int`` and the source columns are dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the columns listed above. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with ``marketing_pressure`` added and the
        summed columns removed.

    Raises
    ------
    KeyError
        If one of the four explicitly named columns is missing.
    """
    name_markers = ('persoon_mail_type', 'persoon_mail_thema', 'persoon_marketingcommunicatie')
    required_cols = ['inschrijving_bron', 'visit_first_visit', 'visit_total_pages', 'mail_click_freq']

    # Check up front so the error names the missing columns and the frame is
    # left untouched when the input is incomplete.
    missing = [col for col in required_cols if col not in df.columns]
    if missing:
        raise KeyError(f"calc_marketing_pressure: missing columns {missing}")

    marketing_pressure_cols = [
        col for col in df.columns
        if any(marker in col for marker in name_markers)
    ]
    marketing_pressure_cols.extend(required_cols)

    df['marketing_pressure'] = df[marketing_pressure_cols].sum(axis=1).astype(int)
    # In-place drop is kept on purpose: callers receive the same object they passed in.
    df.drop(columns=marketing_pressure_cols, inplace=True)

    return df

In [None]:
# Score every row, then collapse the frame to one row per contact person.
df = calc_marketing_pressure(df)

# Every remaining column becomes a list of that contact's values.
df = df.groupby('contact_contactpersoon_id').agg(list)

# Average pressure per contact, rounded to a whole number.
df['marketing_pressure'] = df['marketing_pressure'].apply(
    lambda values: np.mean(values).round(0).astype(int)
)
# De-duplicate the keyphrases; sorting makes the joined string reproducible,
# because plain set iteration order for strings varies between interpreter runs.
df['keyphrases'] = df['keyphrases'].apply(lambda values: ','.join(sorted(set(values))))

df = df.reset_index()

### Recommendation system

In [None]:
def recommend(df, new_keyphrase: str, top_n=10):
    """Pick the contacts most similar to a campaign, least-pressured first.

    Each row's ``keyphrases`` text is TF-IDF vectorised and compared with
    ``new_keyphrase`` by cosine similarity. The ``top_n`` most similar rows
    are kept, reduced to unique contact ids, and ordered by ascending
    min-max-scaled ``marketing_pressure``.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``contact_contactpersoon_id``, ``keyphrases`` and
        ``marketing_pressure``. The frame is not modified.
    new_keyphrase : str
        Keyphrase text describing the new campaign.
    top_n : int, default 10
        Number of most-similar rows to consider.

    Returns
    -------
    list of tuple
        ``(contact_id, scaled_marketing_pressure)`` pairs, lowest pressure
        first; contacts with equal pressure stay in similarity order.
    """
    contact_ids = df['contact_contactpersoon_id'].to_numpy()

    # Scale into a local array so the caller's marketing_pressure column is
    # not overwritten with the scaled values.
    scaled_pressure = MinMaxScaler().fit_transform(df[['marketing_pressure']]).ravel()

    # Vectorise the known keyphrases and the new one in the same vocabulary.
    tfidf = TfidfVectorizer()
    tfidf_matrix = tfidf.fit_transform(df['keyphrases'])
    new_keyphrase_tfidf = tfidf.transform([new_keyphrase])
    sim_scores = cosine_similarity(new_keyphrase_tfidf, tfidf_matrix)[0]

    # Row positions by descending similarity; the stable sort keeps equal
    # scores in their original row order.
    top_positions = np.argsort(-sim_scores, kind='stable')[:top_n]

    # Pressure of the first row seen for each contact id, looked up by
    # position so a non-default dataframe index cannot select the wrong row.
    pressure_by_contact = {}
    for contact_id, pressure in zip(contact_ids, scaled_pressure):
        pressure_by_contact.setdefault(contact_id, pressure)

    # dict.fromkeys removes duplicates while keeping similarity order, which
    # makes the tie order of the stable sort below deterministic.
    recommended_contact_ids = list(dict.fromkeys(contact_ids[pos] for pos in top_positions))
    recommended_contact_ids.sort(key=pressure_by_contact.__getitem__)

    print("Recommended Contact Persons for the New Campaign:")
    return [
        (contact_id, pressure_by_contact[contact_id])
        for contact_id in recommended_contact_ids
    ]

### Test the recommendation system

In [None]:
# Keyphrases describing the example campaign; the result of the call below
# is the cell's displayed output.
campaign_keyphrase = 'volwassen, ov, gent, merelbeke, ondernemen, winst, ceo, bedrijf, netwerkevenement'
recommend(df, new_keyphrase=campaign_keyphrase)