In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.snowball import SnowballStemmer

nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\buyse\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\buyse\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [8]:
df = pd.read_csv('../data_clean/final_merge_clean.csv')
df.drop(['avg_waarde_jaar', 'afspraak_account_gelinkt', 'campagne_campagne_id', 'inschrijving_aanwezig_afwezig', 'inschrijving_facturatie_bedrag'], axis=1, inplace=True)
df.shape

(174436, 3)

In [9]:
df.head()

Unnamed: 0,contact_contactpersoon_id,marketing_pressure,keyphrases
0,D9303EA2-57E5-EB11-8121-001DD8B72B61,1,"dienst,vastgoed,mell,kick,bouw,familiebedrijf,..."
1,451DF235-4B73-E111-B43A-00505680000A,-1,"dienst,consultancy,opleid,round,geraardsberg,s..."
2,10446D11-F363-ED11-9561-6045BD895B5A,23,"ledenbezoek,dienst,oost,fiscal,nieuwjaarsrecep..."
3,10446D11-F363-ED11-9561-6045BD895B5A,21,"ledenbezoek,dienst,oost,fiscal,nieuwjaarsrecep..."
4,10446D11-F363-ED11-9561-6045BD895B5A,28,"ledenbezoek,dienst,oost,fiscal,nieuwjaarsrecep..."


In [4]:
# group by contact_id
df = df.groupby('contact_contactpersoon_id').agg(list)
df['marketing_pressure'] = df['marketing_pressure'].apply(lambda x: np.mean(x).round(0).astype(int))
df['keyphrases'] = df['keyphrases'].apply(lambda x: ','.join(list(set(x))))
df.reset_index(inplace=True)
df.shape

(11975, 3)

In [54]:
df.head()

Unnamed: 0,contact_contactpersoon_id,marketing_pressure,keyphrases
0,00169619-E322-E911-80FB-001DD8B72B62,13,"aanvull,account,retentie,bedrijfsleider,opmerk..."
1,0017416A-2C6E-E111-B43A-00505680000A,13,"bedrijfsleider,ontdek,vlaander,contact,dienst,..."
2,00223C8E-467F-E311-BBFD-005056B06EB4,1,"opleid,bioindustrie,dienst,bedrijf,offlin,mark..."
3,00231824-53EA-ED11-8849-6045BD895420,1,"zaakvoerder,balegem,offlin,bedrijfsleider,fami..."
4,0025D44A-C19F-E311-B1AE-005056B06EC4,1,"interes,retentie,groei,toekomst,bedrijfsleider..."


### Functies om de nieuwe campagne data te cleanen en om te recommenden

In [55]:
def remove_stopwords(text):
    stop_words_nl = set(stopwords.words('dutch'))
    
    word_tokens = word_tokenize(text, language='dutch')

    result = [x for x in word_tokens if x not in stop_words_nl]

    seperator = ', '
    return seperator.join(result)


def team_name_change(text):
    teams_dict = {
        'jo': ' jong ondernemen ',
        'do': ' duurzaam ondernemen ',
        'in': ' innovatie digitalisering ',
        'io': ' internationaal ondernemen ',
        'ao': ' arbeidsmarkt ',
        'ex': ' expert ',
        'gr': ' groei ',
        'bb': ' belangenbehartiging ',
        'co': ' communicatie ',
        'nw': ' netwerking ',
        'ha': ' haven ',
        'ma': ' match '
    }
    word_tokens = word_tokenize(text, language='dutch')
    # apply dict to list
    result = [teams_dict.get(word, word) for word in word_tokens]
    # join list to string
    cleaned_list = ', '.join(result)
    # tokenize string
    tokenize_list = word_tokenize(cleaned_list, language='dutch')
    # remove comma
    tokenize_list_no_comma = [x for x in tokenize_list if x != ',']
    # join list to string and remove duplicates from list
    return ', '.join(list(set(tokenize_list_no_comma)))


def stemmer(text):
    stemmer = SnowballStemmer(language='dutch')
    stem_sentence=[]
    for word in text.split(','):
        stem_sentence.append(stemmer.stem(word))
    stem_sentence= ', '.join(stem_sentence)
    return stem_sentence


def clean_text(df, col='keyphrase'):

    df_copy = df.copy()

    for row in range(len(df_copy)):
        name_change = team_name_change(df_copy[col][row])
        no_stopwords = remove_stopwords(name_change)
        tokenize_list = word_tokenize(no_stopwords, language='dutch')
        tokenize_list = [x for x in tokenize_list if x != ',']
        df_copy.at[row, col] = ', '.join(list(set(tokenize_list)))
        stemmer_list= stemmer(df_copy[col][row])
        df_copy.at[row, col] = stemmer_list
    
    df_copy[col] = df_copy[col].str.replace('voka', ' ') \
        .str.replace('ov', '').str.replace('unknown', '').str.replace(r'\b\w{1,3}\b', '', regex=True).str.replace(r'\d+', '', regex=True) \
        .str.replace(r'(\s{2},\s{2}),*+', '', regex=True).str.replace(' ', '').str.replace(r'^,+|,+$', '', regex=True) \
        .str.replace(r',,+', ',', regex=True)

    return df_copy


def clean_new_campaign_data(df):
    df_campagne = df.copy()
    # campagne naam cleanen
    df_campagne['campagne_naam'] = df_campagne['campagne_naam'].str.replace('OV-', '').str.replace('ov-', '') \
                                                            .str.replace('-', ' ').str.replace(r'[^\w\s]', '', regex=True) \
                                                            .str.replace('  ', ' ').str.strip().str.lower().str.replace('  ', ' ')

    # Drop kolommen
    df_campagne.drop(['campagne_einddatum', 'campagne_startdatum', 'campagne_campagne_nr', 
                  'campagne_naam_in_email', 'campagne_reden_van_status', 'campagne_status',
                  'campagne_url_voka_be'
                  ], axis=1, inplace=True)

    # Create keyphrase column
    cols_for_key = ['campagne_naam', 'campagne_type_campagne', 'campagne_soort_campagne']

    for col in cols_for_key:
        df_campagne[col] = df_campagne[col].astype(str).str.split().str.join(', ')

    df_campagne['keyphrase'] = df_campagne[cols_for_key].apply(lambda x: ', '.join(x), axis=1)

    # keyphrases cleanen
    df_campagne['keyphrase'] = df_campagne['keyphrase'].str.replace(', ,', ',').str.replace(r'(\s{2},\s{2}),*+', '') \
        .str.replace('  ', ' ').str.replace(r'[^\w\s]', '', regex=True).str.replace('  ', ' ').str.strip().str.lower()
    
    final_df = clean_text(df_campagne, 'keyphrase') 

    return final_df['keyphrase']


def recommend(df, new_keyphrase: str, top_n=10):
    # preprocessing
    scaler = MinMaxScaler()
    df['marketing_pressure'] = scaler.fit_transform(df[['marketing_pressure']])
    # vectorization
    tfidf = TfidfVectorizer()
    tfidf_matrix = tfidf.fit_transform(df['keyphrases'])
    # vectorize the new keyphrase and calculate similarity
    new_keyphrase_tfidf = tfidf.transform([new_keyphrase])
    sim_score_new = cosine_similarity(new_keyphrase_tfidf, tfidf_matrix)
    # sort the similarity scores
    contact_person_similarity = list(enumerate(sim_score_new[0]))
    sorted_contact_persons = sorted(contact_person_similarity, key=lambda x: x[1], reverse=True)
    # get the top n similar contact persons
    top_contact_persons = sorted_contact_persons[:top_n]
    # Create a set to keep track of recommended contact IDs
    recommended_contact_ids = set()
    # Iterate through the sorted contact persons and add unique contact IDs to the set
    for index, _ in top_contact_persons:
        contact_id = df['contact_contactpersoon_id'][index]
        recommended_contact_ids.add(contact_id)
    # Convert the set back to a list
    recommended_contact_ids = list(recommended_contact_ids)
    # This would not remove the duplicates
    # recommended_contact_ids = [df['contact_contactpersoon_id'][index] for index, _ in top_contact_persons]
    # sort the contact ids by marketing pressure
    recommended_contact_ids = sorted(recommended_contact_ids, key=lambda x: df[df['contact_contactpersoon_id'] == x]['marketing_pressure'].values[0], reverse=False)
    # result
    print("Recommended Contact Persons for the New Campaign:")
    for contact_id in recommended_contact_ids:
        marketing_pressure = df[df['contact_contactpersoon_id'] == contact_id]['marketing_pressure'].values[0]
        print(f"{contact_id} (marketing_pressure={marketing_pressure:.2f})")

### Testen van de functies hierboven

In [56]:
df_campagne = pd.read_csv('../data_clean/Campagne_fixed.csv')
# get the 3rd row of the df
new_campaign = df_campagne.iloc[2]
df_new_campaign = pd.DataFrame(new_campaign).T
df_new_campaign.reset_index(inplace=True, drop=True)
df_new_campaign

Unnamed: 0,campagne_campagne_id,campagne_campagne_nr,campagne_einddatum,campagne_naam,campagne_naam_in_email,campagne_reden_van_status,campagne_startdatum,campagne_status,campagne_type_campagne,campagne_url_voka_be,campagne_soort_campagne
0,001B42E8-76F8-E411-ABE8-005056B06EB4,15-OV-01-0104,30-6-2015 00:00:00,OV-IO-Economische missie wereldexpo Milaan,OV-IO-Economische missie wereldexpo Milaan,Nieuw,28-6-2015 00:00:00,Actief,Netwerkevenement,unknown,Offline


In [57]:
new_campaign_cleaned = clean_new_campaign_data(df_new_campaign)
list(new_campaign_cleaned)

['milan,wereldexpo,economisch,international,missie,offlin,ondernem,netwerkevenement']

In [58]:
recommend(df, str(new_campaign_cleaned))

Recommended Contact Persons for the New Campaign:
7AE184A8-6A79-E411-8007-005056B06EB4 (marketing_pressure=0.02)
5FF83F62-B3B6-E411-9B05-005056B06EB4 (marketing_pressure=0.22)
501292DF-DDCD-EB11-8123-001DD8B72B62 (marketing_pressure=0.22)
06CC213F-C94E-E411-8F25-005056B06EB4 (marketing_pressure=0.24)
E058455F-456B-E111-B43A-00505680000A (marketing_pressure=0.24)
6EC3B81F-9972-E111-B43A-00505680000A (marketing_pressure=0.24)
8C7BBB4E-466B-E111-B43A-00505680000A (marketing_pressure=0.24)
E2085BF9-ED8F-E811-80F5-001DD8B72B61 (marketing_pressure=0.24)
2F6DAD08-1BA7-EA11-8110-001DD8B72B61 (marketing_pressure=0.24)
0359E83F-3373-E111-B43A-00505680000A (marketing_pressure=0.34)
