In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os

In [3]:
# Load the merged dataset and replace every missing value, in every column,
# with the placeholder string 'unknown'.
df = pd.read_csv('../data_clean/merged_total.csv').fillna('unknown')
df.shape

(250666, 19)

### Embedding

In [4]:
import nltk
import gensim
from gensim.models import Word2Vec
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
# Download the NLTK data packages used further down: the stop-word corpus
# (read through `stopwords.words`) and the 'punkt' tokenizer data
# (presumably what `word_tokenize` relies on).
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\buyse\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\buyse\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [5]:

# Model name handed to the embedding API call.
embedding_model = "text-embedding-ada-002"

# Object-dtype columns to clean and embed, minus the three columns that are
# explicitly left out.
excluded_cols = ['visit_bounce', 'contact_contactpersoon_id', 'account_account_id']
object_cols = df.select_dtypes(include=['object']).columns
cat_cols = object_cols[~object_cols.isin(excluded_cols)]

cat_cols

Index(['afspraak_keyphrases', 'account_keyphrases', 'campagne_keyphrases',
       'sessie_keyphrases', 'visit_keyphrases', 'mailing_keyphrases'],
      dtype='object')

In [6]:
# Preview the first rows of the selected text columns.
df[cat_cols].head()

Unnamed: 0,afspraak_keyphrases,account_keyphrases,campagne_keyphrases,sessie_keyphrases,visit_keyphrases,mailing_keyphrases
0,",","familiebedrijf, diensten, vastgoed, melle gent...","offline, netwerkevenement, ov kick off communi...","ma events, netwerking, netwerkactiviteit project",unknown,unknown
1,",","bedrijf, diensten, consultancy, geraardsbergen...","offline, opleiding, corona round tables sales","gr werking, marketing sales, opleidingen",unknown,unknown
2,"retentie lidmaatschap, ledenbezoek 2023, indus...","bedrijf, diensten, milieu, meilegem oudenaarde...","offline, netwerkevenement, nw nieuwjaarsrecept...","nw voka connect gent, netwerking, netwerkactiv...","chrome, windows, ghent, belgium, pro, event su...","jo stamgasten 2023, uitnodiging stamgasten 23 ..."
3,"retentie lidmaatschap, ledenbezoek 2023, indus...","bedrijf, diensten, milieu, meilegem oudenaarde...","offline, netwerkevenement, nw nieuwjaarsrecept...","nw voka connect gent, netwerking, netwerkactiv...","chrome, windows, ghent, belgium, telenet, acti...",nw nieuwjaarsreceptie vlaamse ardennen leiestr...
4,"retentie lidmaatschap, ledenbezoek 2023, indus...","bedrijf, diensten, milieu, meilegem oudenaarde...","offline, netwerkevenement, nw nieuwjaarsrecept...","nw voka connect gent, netwerking, netwerkactiv...","chrome, windows, ghent, belgium, pro, activite...","nieuwsbrief 09052023, nieuws van voka oost vla..."


In [7]:
# Number of rows in the 'afspraak_keyphrases' column.
len(df['afspraak_keyphrases'])

250666

In [18]:
from nltk.stem.snowball import SnowballStemmer
def remove_stopwords(text):
    """Drop Dutch stop words from ``text``.

    ``text`` is split with NLTK's ``word_tokenize`` (Dutch); every token that
    is not an exact match for an entry in NLTK's Dutch stop-word list is kept
    in its original position. Any other token the tokenizer emits (commas
    included) is kept as well.

    Returns the surviving tokens joined with ``', '``.
    """
    dutch_stop_words = frozenset(stopwords.words('dutch'))
    kept_tokens = filter(
        lambda token: token not in dutch_stop_words,
        word_tokenize(text, language='dutch'),
    )
    return ', '.join(kept_tokens)


def team_name_change(text):
    """Expand two-letter team codes in ``text`` and return its unique tokens.

    The text is tokenized (Dutch ``word_tokenize``); every token that exactly
    matches a key of ``teams_dict`` is replaced by the mapped team name. The
    result is tokenized again so that multi-word team names become separate
    tokens, comma tokens are dropped, and duplicates are removed.

    Returns the unique tokens joined with ``', '``, in order of first
    appearance.

    NOTE(review): 'in' and 'do' are also ordinary words; every standalone
    token equal to a key is expanded regardless of context - confirm this is
    intended.
    """
    teams_dict = {
        'jo': ' jong ondernemen ',
        'do': ' duurzaam ondernemen ',
        'in': ' innovatie digitalisering ',
        'io': ' internationaal ondernemen ',
        'ao': ' arbeidsmarkt ',
        'ex': ' expert ',
        'gr': ' groei ',
        'bb': ' belangenbehartiging ',
        'co': ' communicatie ',
        'nw': ' netwerking ',
        'ha': ' haven ',
        'ma': ' match '
    }
    word_tokens = word_tokenize(text, language='dutch')
    # Swap team codes for their full names; other tokens pass through as-is.
    expanded_text = ', '.join(teams_dict.get(word, word) for word in word_tokens)
    # Tokenize again so each word of a multi-word team name is its own token,
    # then discard the comma separators.
    tokens = [
        token
        for token in word_tokenize(expanded_text, language='dutch')
        if token != ','
    ]
    # dict.fromkeys de-duplicates while keeping first-seen order. The previous
    # list(set(...)) produced an arbitrary order that changes between
    # interpreter runs (string hash randomisation), which made the cleaned
    # text - and anything derived from it - non-reproducible.
    return ', '.join(dict.fromkeys(tokens))


def stemmer(text):
    """Stem every comma-separated token of ``text`` with the Dutch Snowball stemmer.

    ``text`` is split on ``','``; each piece is stripped of surrounding
    whitespace before stemming, and empty pieces are skipped.

    Returns the stems joined with ``', '`` (an empty string when no token
    remains).
    """
    # Named differently from the function itself to avoid shadowing it.
    dutch_stemmer = SnowballStemmer(language='dutch')
    stems = []
    for word in text.split(','):
        # Input is usually ', '-joined, so every piece after the first starts
        # with a space. Without stripping, that space is fed to the stemmer
        # and ends up in the output as a double space.
        word = word.strip()
        if word:
            stems.append(dutch_stemmer.stem(word))
    return ', '.join(stems)



def _clean_keyphrases(text):
    """Run the cleaning steps on one keyphrase string and return the result.

    Steps: expand team codes, remove Dutch stop words, tokenize, drop comma
    tokens, de-duplicate (keeping first-seen order), then stem.
    """
    expanded = team_name_change(text)
    without_stopwords = remove_stopwords(expanded)
    tokens = [
        token
        for token in word_tokenize(without_stopwords, language='dutch')
        if token != ','
    ]
    # Order-preserving de-duplication keeps the output deterministic; a set
    # would yield a different token order from one interpreter run to the next.
    unique_tokens = ', '.join(dict.fromkeys(tokens))
    return stemmer(unique_tokens)


def clean_text(df, cat_cols=cat_cols):
    """Return a copy of ``df`` with the text in ``cat_cols`` cleaned.

    Parameters
    ----------
    df : DataFrame whose ``cat_cols`` columns hold strings.
    cat_cols : iterable of column names to clean. The default is the value the
        module-level ``cat_cols`` had when this function was defined.

    Returns
    -------
    A new DataFrame; ``df`` itself is not modified.
    """
    df_copy = df.copy()

    for col in cat_cols:
        # Clean each distinct value once and map the result back onto the
        # column. This avoids the per-row Python loop and no longer depends on
        # the frame having a default 0..n-1 index.
        cleaned_by_value = {
            value: _clean_keyphrases(value) for value in df_copy[col].unique()
        }
        df_copy[col] = df_copy[col].map(cleaned_by_value)

    return df_copy


def flatten_vector_columns(df, col):
    """Stack the variable-length vectors in ``df[col]`` into one 2-D array.

    Each vector is right-padded with ``0.0`` up to the length of the longest
    vector in the column.

    Parameters
    ----------
    df : mapping of column name to an iterable of vectors (e.g. a DataFrame).
    col : name of the column holding the vectors. Each vector may be any
        sized iterable (list, tuple, 1-D array).

    Returns
    -------
    ``numpy.ndarray`` with one row per vector; shape ``(0, 0)`` when the
    column is empty.
    """
    # Copy every vector into a plain list: `+` on a tuple would raise, and on
    # a numpy array it would add element-wise instead of appending padding.
    vectors = [list(vector) for vector in df[col]]
    if not vectors:
        # max() of an empty sequence would raise; return an empty matrix.
        return np.empty((0, 0))
    max_vector_length = max(len(vector) for vector in vectors)
    padded_matrix = np.array(
        [vector + [0.0] * (max_vector_length - len(vector)) for vector in vectors]
    )
    return padded_matrix

Het aanmaken van `df_clean` duurt ongeveer 11,5 minuten op 10 cores en ongeveer 30 minuten op 4 cores.

In [None]:
# Run the text-cleaning pipeline over the selected columns. clean_text works
# on a copy, so `df` itself is left untouched.
df_clean = clean_text(df=df, cat_cols=cat_cols)

In [121]:
# Drop the reference to the raw frame; the cells below only use df_clean.
del df

In [None]:
# Preview the cleaned text columns.
df_clean[cat_cols].head()

In [123]:
def _tidy_keyphrases(text):
    """Rebuild a comma-separated keyphrase string without empty tokens.

    Splits ``text`` on ``','``, strips whitespace around each piece, drops the
    pieces that are empty and joins the rest with ``', '``. Returns
    ``'unknown'`` when nothing is left.
    """
    tokens = [piece.strip() for piece in text.split(',')]
    tokens = [token for token in tokens if token]
    return ', '.join(tokens) if tokens else 'unknown'


for col in cat_cols:
    # Remove every digit, then rebuild the token list. A single literal
    # replace of ', ,' left behind runs of empty tokens, leading/trailing
    # separators and whitespace-only strings, so the "empty -> 'unknown'"
    # fallback almost never fired.
    df_clean[col] = (
        df_clean[col]
        .str.replace(r'\d', '', regex=True)
        .map(_tidy_keyphrases)
    )

In [None]:
# Inspect one cleaned value (row label 2) as a spot check.
df_clean['campagne_keyphrases'][2]

Het berekenen van de embeddings voor `campagne_keyphrases` duurt ongeveer 6 minuten.

In [127]:
def get_embedding(text):
    """Return the embedding vector for ``text``.

    Sends ``text`` to ``openai.Embedding.create`` with the module-level
    ``embedding_model`` and returns the ``'embedding'`` entry of the first
    item in the response's ``'data'`` list.

    NOTE(review): no visible cell imports ``openai`` or sets an API key, so
    this presumably relies on state from a cell that is not shown - confirm
    before a Restart & Run All.
    """
    api_response = openai.Embedding.create(input=text, model=embedding_model)
    first_item = api_response['data'][0]
    return first_item['embedding']


def embed_col(df, col):
    """Replace text column ``col`` of ``df`` with a column of embeddings.

    ``get_embedding`` is called once per distinct value of ``df[col]``. The
    results are stored in a new column named ``col + '_embed'`` and the
    original column is dropped.

    ``df`` is modified in place; the same object is returned.
    """
    # One embedding request per distinct text, looked up by value afterwards.
    embedding_by_text = {
        text: get_embedding(text) for text in df[col].unique().tolist()
    }

    df[col+'_embed'] = df[col].map(embedding_by_text)
    df.drop(columns=[col], inplace=True)

    return df

In [None]:
# Embed 'campagne_keyphrases'. embed_col modifies df_clean in place and
# returns it, so df_clean_emb1 and df_clean refer to the same frame.
df_clean_emb1 = embed_col(df=df_clean, col='campagne_keyphrases')

In [None]:
# Length of the embedding vector stored at row label 2.
len(df_clean_emb1['campagne_keyphrases_embed'][2])

1536

##### Met de OpenAI-embedding: 1536 getallen per keyphrase

### CLUSTERING

In [None]:
from sklearn.cluster import DBSCAN

In [None]:
# Collapse one embedding vector to a single scalar.
def reduce_embedding(embedded):
    """Return the arithmetic mean (``np.mean``) of the values in ``embedded``.

    NOTE(review): averaging all components into one number discards most of
    the information in the vector - confirm this is the intended reduction.
    """
    mean_value = np.mean(embedded)
    return mean_value

# Replace the vectors in each embedded column with their scalar mean.
# NOTE(review): none of the visible cells creates columns with these names
# (embed_col produces '<col>_embed') - presumably they come from a cell that
# is not shown; confirm before re-running.
embedded_cols = [
    'campagne_naam_embedded',
    'visit_ip_embedded',
    'afspraak_keyphrases_embedded',
    'mailing_name_embedded',
    'mailing_onderwerp_embedded',
]
for embedded_col in embedded_cols:
    df_clean[embedded_col] = df_clean[embedded_col].apply(reduce_embedding)

df_clean.shape

In [None]:
# Fit DBSCAN on the whole of df_clean; fit() returns the fitted estimator.
dbscan = DBSCAN(eps=0.5, min_samples=5).fit(df_clean)

In [None]:
# Cluster label per fitted sample; the label -1 is treated as noise below.
labels_db = dbscan.labels_

# Count the distinct cluster labels (noise label excluded) and the number of
# samples labelled as noise.
distinct_labels = set(labels_db)
distinct_labels.discard(-1)
n_clusters_ = len(distinct_labels)
n_noise_ = sum(1 for label in labels_db if label == -1)

print("Estimated number of clusters: %d" % n_clusters_)
print("Estimated number of noise points: %d" % n_noise_)

Estimated number of clusters: 60
Estimated number of noise points: 9604


In [None]:
# Show the parameters the fitted estimator was configured with.
dbscan.get_params(deep=True)

{'algorithm': 'auto',
 'eps': 0.5,
 'leaf_size': 30,
 'metric': 'euclidean',
 'metric_params': None,
 'min_samples': 5,
 'n_jobs': None,
 'p': None}