In [24]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.snowball import SnowballStemmer

# Fetch the NLTK resources used further down: the stop-word lists (stopwords.words)
# and the 'punkt' tokenizer models (word_tokenize). quiet=True suppresses the download log.
# NOTE(review): recent NLTK releases may look for 'punkt_tab' instead of 'punkt' —
# confirm against the installed NLTK version.
nltk.download('stopwords', quiet=True)
nltk.download('punkt', quiet=True)

True

In [25]:
# Load the cleaned merge and leave out the columns that are not used in this notebook.
data = pd.read_csv('../data_clean/final_merge_clean.csv')
df = data.drop(columns=[
    'avg_waarde_jaar',
    'afspraak_account_gelinkt',
    'campagne_campagne_id',
    'inschrijving_aanwezig_afwezig',
    'inschrijving_facturatie_bedrag',
])
df.shape

(211429, 24)

In [27]:
def calc_marketing_pressure(df):
    """Collapse the marketing-related columns into one ``marketing_pressure`` score.

    The score is the row-wise sum of every column whose name contains
    ``persoon_mail_type``, ``persoon_mail_thema`` or
    ``persoon_marketingcommunicatie``, plus the columns ``inschrijving_bron``,
    ``visit_first_visit``, ``visit_total_pages`` and ``mail_click_freq``.
    The summed columns are not part of the result.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the columns listed above. It is not modified, so the
        function can be called repeatedly on the same input.

    Returns
    -------
    pandas.DataFrame
        A new frame without the summed columns and with an integer
        ``marketing_pressure`` column.

    Raises
    ------
    KeyError
        If one of the four fixed columns is missing from ``df``.
    """
    name_fragments = (
        'persoon_mail_type',
        'persoon_mail_thema',
        'persoon_marketingcommunicatie',
    )
    marketing_pressure_cols = [
        col for col in df.columns
        if any(fragment in col for fragment in name_fragments)
    ]
    marketing_pressure_cols += [
        'inschrijving_bron',
        'visit_first_visit',
        'visit_total_pages',
        'mail_click_freq',
    ]

    pressure = df[marketing_pressure_cols].sum(axis=1).astype(int)

    # Build a new frame instead of mutating the argument: the original version
    # dropped the source columns in place, so a second call raised KeyError.
    return df.drop(columns=marketing_pressure_cols).assign(marketing_pressure=pressure)

# Replace the individual marketing columns of df with the summed marketing_pressure score.
df = calc_marketing_pressure(df)

In [21]:
# Preview the first rows after the marketing_pressure column was added.
df.head()

Unnamed: 0,contact_contactpersoon_id,keyphrases,marketing_pressure
0,D9303EA2-57E5-EB11-8121-001DD8B72B61,"medewerker,offlin,mell,dienst,kick,netwerk,bou...",-1
1,451DF235-4B73-E111-B43A-00505680000A,"geraardsberg,consultancy,offlin,lidmaatschap,a...",-3
2,10446D11-F363-ED11-9561-6045BD895B5A,"verwerk,organisaties,dienst,organisch,industri...",32
3,10446D11-F363-ED11-9561-6045BD895B5A,"verwerk,organisaties,dienst,organisch,industri...",22
4,10446D11-F363-ED11-9561-6045BD895B5A,"verwerk,organisaties,dienst,organisch,nieuw,in...",29


In [22]:
# group by contact_id
df = df.groupby('contact_contactpersoon_id').agg(list)
df['marketing_pressure'] = df['marketing_pressure'].apply(lambda x: np.mean(x).round(0).astype(int))
df['keyphrases'] = df['keyphrases'].apply(lambda x: ','.join(list(set(x))))
df.reset_index(inplace=True)
df.shape

(12225, 3)

In [23]:
# Preview the per-contact frame produced by the group-by step.
df.head()

Unnamed: 0,contact_contactpersoon_id,keyphrases,marketing_pressure
0,00169619-E322-E911-80FB-001DD8B72B62,"zaakvoerder,dienst,waasmunster,jong,lidmaatsch...",12
1,0017416A-2C6E-E111-B43A-00505680000A,"leiestreekmeetjesland,eeklo,dienst,voorjaarsag...",16
2,00223C8E-467F-E311-BBFD-005056B06EB4,"agrarisch,happen,dienst,familiebedrijv,family,...",0
3,00231824-53EA-ED11-8849-6045BD895420,"offlin,oudenaard,bedrijfsleider,zaakvoerder,ne...",-1
4,0025D44A-C19F-E311-B1AE-005056B06EC4,"activiteit,werknemer,opleid,leiestreekmeetjesl...",0


### Functies om de nieuwe campagne data te cleanen en om te recommenden

In [6]:
def remove_stopwords(text):
    """Return ``text`` without Dutch stop words.

    The text is tokenized with NLTK's Dutch tokenizer; every token that appears
    in NLTK's Dutch stop-word list (exact, case-sensitive match) is discarded and
    the remaining tokens are joined with ``', '``.
    """
    dutch_stopwords = set(stopwords.words('dutch'))

    kept_tokens = []
    for token in word_tokenize(text, language='dutch'):
        if token in dutch_stopwords:
            continue
        kept_tokens.append(token)

    return ', '.join(kept_tokens)


def team_name_change(text):
    """Expand two-letter team codes in ``text`` to their full team names.

    The text is tokenized with NLTK's Dutch tokenizer, every token that equals a
    key of the team table is replaced by the corresponding name, the result is
    tokenized again (so multi-word names become separate tokens), commas are
    dropped and duplicate tokens are removed.

    Parameters
    ----------
    text : str
        Text that may contain team codes as separate tokens.

    Returns
    -------
    str
        The unique tokens joined with ``', '``, in order of first appearance.
        (The previous set-based dedupe returned them in an order that could
        differ between interpreter runs.)
    """
    # NOTE(review): codes are matched as whole tokens, so an ordinary word that
    # equals a code (e.g. Dutch 'in') is expanded as well — confirm this is intended.
    teams_dict = {
        'jo': ' jong ondernemen ',
        'do': ' duurzaam ondernemen ',
        'in': ' innovatie digitalisering ',
        'io': ' internationaal ondernemen ',
        'ao': ' arbeidsmarkt ',
        'ex': ' expert ',
        'gr': ' groei ',
        'bb': ' belangenbehartiging ',
        'co': ' communicatie ',
        'nw': ' netwerking ',
        'ha': ' haven ',
        'ma': ' match '
    }
    word_tokens = word_tokenize(text, language='dutch')
    # Substitute the codes, then re-tokenize so that multi-word team names are split.
    expanded_text = ', '.join(teams_dict.get(word, word) for word in word_tokens)
    expanded_tokens = [
        token for token in word_tokenize(expanded_text, language='dutch')
        if token != ','
    ]
    # dict.fromkeys removes duplicates while keeping first-seen order.
    return ', '.join(dict.fromkeys(expanded_tokens))


def stemmer(text):
    """Stem every comma-separated word of ``text`` with the Dutch Snowball stemmer.

    Parameters
    ----------
    text : str
        Words separated by commas; whitespace around each word is ignored.

    Returns
    -------
    str
        The stems joined with ``', '``. Pieces that are empty after stripping
        whitespace are skipped.
    """
    # Named differently from the function so the local does not shadow it.
    dutch_stemmer = SnowballStemmer(language='dutch')
    # Strip first: the input is usually ', '-joined, so splitting on ',' alone
    # used to hand words with a leading space to the stemmer and return stems
    # that still contained that space.
    words = (piece.strip() for piece in text.split(','))
    return ', '.join(dutch_stemmer.stem(word) for word in words if word)


def clean_text(df, col='keyphrase'):
    """Normalize the keyphrase text in column ``col`` and return a cleaned copy of ``df``.

    Per row: team codes are expanded (``team_name_change``), stop words are
    removed (``remove_stopwords``), duplicate tokens are dropped and the remaining
    words are stemmed (``stemmer``). Afterwards the whole column is stripped of
    'voka', 'ov', 'unknown', words of one to three characters, digits and spaces,
    leaving a comma-separated list of keywords.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the text column. It is not modified.
    col : str, default 'keyphrase'
        Name of the text column to clean.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with ``col`` replaced by the cleaned text.
    """
    df_copy = df.copy()

    def _clean_one(raw_text):
        # Expand team codes, drop stop words, dedupe (first-seen order) and stem.
        name_change = team_name_change(raw_text)
        no_stopwords = remove_stopwords(name_change)
        tokens = [
            token for token in word_tokenize(no_stopwords, language='dutch')
            if token != ','
        ]
        unique_tokens = dict.fromkeys(tokens)
        return stemmer(', '.join(unique_tokens))

    # Apply value by value instead of looking rows up by label: the previous
    # df_copy[col][row] access only worked on a default 0..n-1 index.
    df_copy[col] = df_copy[col].apply(_clean_one)

    # Literal and regex replacements are flagged explicitly because the default of
    # `regex` differs between pandas versions.
    # NOTE(review): 'ov' is removed as a plain substring, so it is also cut out of
    # longer words; kept unchanged so the vocabulary stays the same — confirm
    # whether a whole-word match was intended.
    df_copy[col] = (
        df_copy[col]
        .str.replace('voka', ' ', regex=False)
        .str.replace('ov', '', regex=False)
        .str.replace('unknown', '', regex=False)
        .str.replace(r'\b\w{1,3}\b', '', regex=True)
        .str.replace(r'\d+', '', regex=True)
        # Same matches as the former '(\s{2},\s{2}),*+' but without the possessive
        # quantifier, which the `re` module only accepts from Python 3.11 on.
        .str.replace(r'\s{2},\s{2},*', '', regex=True)
        .str.replace(' ', '', regex=False)
        .str.replace(r'^,+|,+$', '', regex=True)
        .str.replace(r',,+', ',', regex=True)
    )

    return df_copy


def clean_new_campaign_data(df):
    """Build the cleaned keyphrase text for new campaign rows.

    The campaign name is stripped of the 'OV-'/'ov-' marker, hyphens and
    punctuation and lower-cased; it is then combined with the campaign type and
    campaign kind into one ``keyphrase`` string per row, which is finally passed
    through ``clean_text``.

    Parameters
    ----------
    df : pandas.DataFrame
        Campaign rows. Must contain ``campagne_naam``, ``campagne_type_campagne``,
        ``campagne_soort_campagne`` and the administrative columns dropped below.
        It is not modified.

    Returns
    -------
    pandas.Series
        The cleaned ``keyphrase`` column, one entry per input row.

    Raises
    ------
    KeyError
        If one of the expected columns is missing.
    """
    df_campagne = df.copy()

    # Clean the campaign name. Every replacement states `regex` explicitly because
    # the default of that argument differs between pandas versions.
    df_campagne['campagne_naam'] = (
        df_campagne['campagne_naam']
        .str.replace('OV-', '', regex=False)
        .str.replace('ov-', '', regex=False)
        .str.replace('-', ' ', regex=False)
        .str.replace(r'[^\w\s]', '', regex=True)
        .str.replace('  ', ' ', regex=False)
        .str.strip()
        .str.lower()
        .str.replace('  ', ' ', regex=False)
    )

    # Drop the columns that do not contribute to the keyphrase.
    df_campagne = df_campagne.drop(columns=[
        'campagne_einddatum', 'campagne_startdatum', 'campagne_campagne_nr',
        'campagne_naam_in_email', 'campagne_reden_van_status', 'campagne_status',
        'campagne_url_voka_be',
    ])

    # Create the keyphrase column from the three descriptive columns.
    cols_for_key = ['campagne_naam', 'campagne_type_campagne', 'campagne_soort_campagne']

    for col in cols_for_key:
        df_campagne[col] = df_campagne[col].astype(str).str.split().str.join(', ')

    df_campagne['keyphrase'] = df_campagne[cols_for_key].apply(lambda x: ', '.join(x), axis=1)

    # Clean the keyphrases.
    df_campagne['keyphrase'] = (
        df_campagne['keyphrase']
        .str.replace(', ,', ',', regex=False)
        # This pattern was previously passed without regex=True, so whether it was
        # treated as a regex depended on the pandas version. The possessive '*+'
        # is also avoided because `re` only supports it from Python 3.11 on.
        .str.replace(r'\s{2},\s{2},*', '', regex=True)
        .str.replace('  ', ' ', regex=False)
        .str.replace(r'[^\w\s]', '', regex=True)
        .str.replace('  ', ' ', regex=False)
        .str.strip()
        .str.lower()
    )

    final_df = clean_text(df_campagne, 'keyphrase')

    return final_df['keyphrase']


def recommend(df, new_keyphrase: str, top_n=10):
    """Print the contacts whose keyphrases best match a new campaign.

    A TF-IDF model is fitted on ``df['keyphrases']``; the ``top_n`` rows with the
    highest cosine similarity to ``new_keyphrase`` are selected and printed in
    order of increasing min-max-scaled marketing pressure (least pressured
    contact first).

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``contact_contactpersoon_id``, ``keyphrases`` and
        ``marketing_pressure``. It is not modified.
    new_keyphrase : str
        Cleaned keyphrase text of the new campaign.
    top_n : int, default 10
        Number of most similar rows to consider.

    Returns
    -------
    None
        The recommendation is written to standard output.
    """
    # Preprocessing: scale into a local array. The previous version wrote the
    # scaled values back into df['marketing_pressure'], silently changing the
    # caller's frame.
    scaler = MinMaxScaler()
    scaled_pressure = scaler.fit_transform(df[['marketing_pressure']]).ravel()
    contact_ids = df['contact_contactpersoon_id'].to_numpy()

    # Vectorize the known keyphrases and the new one, then compare.
    tfidf = TfidfVectorizer()
    tfidf_matrix = tfidf.fit_transform(df['keyphrases'])
    new_keyphrase_tfidf = tfidf.transform([new_keyphrase])
    sim_scores = cosine_similarity(new_keyphrase_tfidf, tfidf_matrix)[0]

    # Row positions ordered from most to least similar (stable for equal scores).
    ranked_positions = sorted(
        range(len(sim_scores)), key=lambda pos: sim_scores[pos], reverse=True
    )

    # Keep each contact once, in similarity order. Positions are used with the
    # arrays above, so this also works when df does not have a 0..n-1 index.
    # If a contact occurs in several rows, the pressure of its most similar row
    # is used.
    pressure_by_contact = {}
    for pos in ranked_positions[:top_n]:
        pressure_by_contact.setdefault(contact_ids[pos], scaled_pressure[pos])

    # Lowest marketing pressure first; the stable sort keeps the similarity order
    # for contacts with equal pressure.
    ranked_contacts = sorted(pressure_by_contact.items(), key=lambda item: item[1])

    # Result
    print("Recommended Contact Persons for the New Campaign:")
    for contact_id, marketing_pressure in ranked_contacts:
        print(f"{contact_id} (marketing_pressure={marketing_pressure:.2f})")

### Testen van de functies hierboven

In [7]:
df_campagne = pd.read_csv('../data_clean/Campagne_fixed.csv')
# Take the third campaign (position 2) and turn that single record back into a
# one-row frame with a fresh 0-based index.
new_campaign = df_campagne.iloc[2]
df_new_campaign = new_campaign.to_frame().T.reset_index(drop=True)
df_new_campaign

Unnamed: 0,campagne_campagne_id,campagne_campagne_nr,campagne_einddatum,campagne_naam,campagne_naam_in_email,campagne_reden_van_status,campagne_startdatum,campagne_status,campagne_type_campagne,campagne_url_voka_be,campagne_soort_campagne
0,001B42E8-76F8-E411-ABE8-005056B06EB4,15-OV-01-0104,2015-06-30,OV-IO-Economische missie wereldexpo Milaan,OV-IO-Economische missie wereldexpo Milaan,Nieuw,2015-06-28,Actief,Netwerkevenement,unknown,Offline


In [8]:
# Run the cleaning pipeline on the one-row campaign frame and show the resulting keyphrase text.
new_campaign_cleaned = clean_new_campaign_data(df_new_campaign)
new_campaign_cleaned.tolist()

['netwerkevenement,offlin,ondernem,wereldexpo,economisch,missie,international,milan']

In [18]:
# Pass the cleaned keyphrase string itself. str() on the Series would hand its printed
# representation to the vectorizer: the index label, the "Name: ..., dtype: ..." footer
# and text shortened with "..." once it exceeds pandas' display width.
recommend(df, new_campaign_cleaned.iloc[0])

Recommended Contact Persons for the New Campaign:
C93F5776-5A47-EA11-810C-001DD8B72B62 (marketing_pressure=0.00)
D83D2995-E3E0-EC11-BB3C-000D3ABCA38B (marketing_pressure=0.01)
77F1037D-FFCE-E911-8105-001DD8B72B62 (marketing_pressure=0.01)
15ABD17B-E3E0-EC11-BB3C-000D3ABCA38B (marketing_pressure=0.01)
2CB90A2D-708A-EC11-93B0-6045BD91D12C (marketing_pressure=0.02)
3194D173-A56A-E611-80DE-001DD8B72B61 (marketing_pressure=0.02)
7AE184A8-6A79-E411-8007-005056B06EB4 (marketing_pressure=0.02)
BFC4DC0C-56D0-EC11-A7B5-000D3A480EA6 (marketing_pressure=0.02)
104374F4-7400-EA11-8107-001DD8B72B62 (marketing_pressure=0.03)
03435871-2C18-E611-BEEF-005056B06EB4 (marketing_pressure=0.07)
