In [16]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os

In [40]:
# Load the merged dataset; the path is relative to the notebook's working directory.
df = pd.read_csv('../data_clean/merged_total.csv')
# Display (rows, columns) as a quick sanity check on the load.
df.shape

(250666, 19)

### Embedding

In [46]:
import openai
from dotenv import load_dotenv

In [47]:
# Read environment variables from the project-level .env file and hand the
# OpenAI key to the client library.
load_dotenv("../.env")
openai.api_key = os.getenv('OPENAI_API_KEY')

# Model name that get_embedding() passes to the embeddings endpoint.
embedding_model = "text-embedding-ada-002"

# Candidate text columns: every object-dtype column except the ones listed
# in the exclusion mask below.
cat_cols = df.select_dtypes(include=['object']).columns
cat_cols = cat_cols[
    ~cat_cols.isin(['visit_bounce', 'contact_contactpersoon_id', 'account_account_id'])
]

cat_cols

Index(['afspraak_keyphrases', 'account_keyphrases', 'campagne_keyphrases',
       'sessie_keyphrases', 'visit_keyphrases', 'mailing_keyphrases'],
      dtype='object')

In [48]:
def get_embedding(text):
    """Return the embedding vector for ``text``.

    Sends ``text`` to ``openai.Embedding.create`` using the module-level
    ``embedding_model`` and returns the ``'embedding'`` entry of the first
    item in the response's ``'data'`` list.

    NOTE(review): ``openai.Embedding`` is the pre-1.0 SDK interface -- confirm
    the installed ``openai`` version still provides it.
    """
    result = openai.Embedding.create(model=embedding_model, input=text)
    first_item = result['data'][0]
    return first_item['embedding']


def embed_col(df, col, embed_fn=None):
    """Replace a text column with a column of embedding vectors.

    Every distinct, non-missing value of ``df[col]`` is embedded exactly once;
    the results are mapped back onto the rows as ``<col>_embed`` and the
    original column is dropped.

    Args:
        df: DataFrame to transform. It is modified in place.
        col: Name of the column to embed.
        embed_fn: Optional callable that takes one value and returns its
            embedding. Defaults to ``get_embedding``.

    Returns:
        The same ``df`` object that was passed in, after modification.
    """
    if embed_fn is None:
        embed_fn = get_embedding

    # Missing values are skipped rather than sent to the embedder; they are
    # absent from the lookup table and therefore stay NaN after ``map``.
    unique_values = df[col].dropna().unique().tolist()
    embeddings = {value: embed_fn(value) for value in unique_values}

    df[col + '_embed'] = df[col].map(embeddings)
    df.drop(columns=[col], inplace=True)

    print(f'Column {col} embedded\nDataframe shape: {df.shape}')

    return df

In [49]:
# Inspect the distinct key-phrase strings in the mailing column.
df['mailing_keyphrases'].unique()

array(['unknown',
       'jong ondernemen stamgasten 2023, uitnodiging stamgasten 23 mei 2023',
       'netwerking nieuwjaarsreceptie vlaamse ardennen   leiestreek 2023, nieuwjaarsreceptie vlaamse ardennen leiestreek  voka oost vlaanderen',
       'nieuwsbrief , nieuws van voka oost vlaanderen, rechtstreeks in je inbox',
       'netwerking nieuwjaarsreceptie 2023 gent, nieuwjaarsreceptie gent  voka oost vlaanderen',
       'netwerking community event groei grootmeesters, grootmeesters van de groei   inspirerend community event',
       'expert groeien door overname 2023, groeien door overname',
       'netwerking nieuwjaarsreceptie vlar 2023 reminder, last call  nieuwjaarsreceptie vlaamse ardennen & leiestreek',
       'netwerking nieuwjaarsreceptie 2023 gastsprekers vlar, nieuwjaarsreceptie vlaamse ardennen & leiestreek  voka oost vlaanderen',
       "netwerking praktische info connect gent, praktische info voka's nieuwjaarsreceptie in gent",
       'nw/arbeidsmarkt quick refresh arbe

### Clustering

In [6]:
from sklearn.cluster import DBSCAN

In [None]:
# Collapse one embedding vector into a single scalar.
def reduce_embedding(embedded):
    """Return the arithmetic mean of ``embedded`` as computed by ``np.mean``."""
    scalar = np.mean(embedded)
    return scalar

# NOTE(review): these `*_embedded` names do not match the `_embed` suffix that
# embed_col() appends -- confirm the columns exist in `df` before running.
embedded_cols = [
    'campagne_naam_embedded',
    'visit_ip_embedded',
    'afspraak_keyphrases_embedded',
    'mailing_name_embedded',
    'mailing_onderwerp_embedded',
]

# Collapse every embedding vector to its scalar mean, one column at a time.
for embedded_col in embedded_cols:
    df[embedded_col] = df[embedded_col].apply(reduce_embedding)

df.shape

In [None]:
# Fit DBSCAN on the whole frame. Only eps and min_samples are set explicitly;
# every other parameter keeps its library default.
dbscan = DBSCAN(
    eps=0.5,
    min_samples=5,
).fit(df)

In [None]:
labels_db = dbscan.labels_

# The label -1 marks noise, so it is counted separately and left out of the
# cluster total.
n_noise_ = sum(1 for label in labels_db if label == -1)
n_clusters_ = len(set(labels_db) - {-1})

print("Estimated number of clusters: %d" % n_clusters_)
print("Estimated number of noise points: %d" % n_noise_)

Estimated number of clusters: 60
Estimated number of noise points: 9604


In [None]:
# Show the estimator's full parameter set, including values left at their defaults.
dbscan.get_params(deep=True)

{'algorithm': 'auto',
 'eps': 0.5,
 'leaf_size': 30,
 'metric': 'euclidean',
 'metric_params': None,
 'min_samples': 5,
 'n_jobs': None,
 'p': None}