# epic 4(sh)

Als een key user kan ik voor een contact met weinig transacties een lookalike met veel transacties identificeren. Ik kan ook een clustering maken van contactpersonen die qua jobinhoud, type bedrijf, voorkeuren en (verwacht) gedrag op elkaar lijken.

In [1]:
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import numpy as np
import os
from dotenv import load_dotenv
from sqlalchemy import create_engine

In [2]:
# Load connection settings from the .env file one directory above the
# current working directory.
ENV_URL = os.path.join(os.getcwd(), '../.env')
load_dotenv(ENV_URL)

# Each value is None when the corresponding variable is not set.
DWH_NAME = os.getenv('DWH_NAME')
SERVER_NAME = os.getenv('SERVER_NAME')
DB_USER = os.getenv('DB_USER')
DB_PASSWORD = os.getenv('DB_PASSWORD')

# Username/password URL (pymssql). Not used by the engine created below.
# NOTE(review): the password is interpolated without URL-escaping; a password
# containing characters such as '@' or '/' would presumably break this URL.
URL = f'mssql+pymssql://{DB_USER}:{DB_PASSWORD}@{SERVER_NAME}/{DWH_NAME}'
# Trusted-connection URL (pyodbc); this is the one actually used.
URL_LOCAL = f'mssql+pyodbc://{SERVER_NAME}/{DWH_NAME}?trusted_connection=yes&driver=ODBC+Driver+17 for SQL Server'

engine = create_engine(URL_LOCAL)
# The connection is opened once here and shared by every read_sql call.
conn = engine.connect()

In [3]:
def _quote_identifier(name):
    """Wrap a SQL Server identifier in brackets, doubling any ']' inside it."""
    return "[" + str(name).replace("]", "]]") + "]"


def create_query(table_name, columns, condition=None, database=None):
    """Build a ``SELECT`` statement for a table in the ``dbo`` schema.

    Parameters
    ----------
    table_name : str
        Name of the table to read from.
    columns : iterable of str, or str
        Column names to select. A single string is treated as one column.
    condition : str, optional
        Raw SQL placed after ``WHERE``. It is inserted verbatim, so it must
        come from trusted code and never from user input.
    database : str, optional
        Database name. Defaults to the module-level ``DWH_NAME``.

    Returns
    -------
    str
        The query text, e.g. ``SELECT [a], [b] FROM [db].[dbo].[t] WHERE ...``.

    Raises
    ------
    ValueError
        If ``columns`` is empty (the result would be invalid SQL).
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(columns, str):
        columns = [columns]
    columns = list(columns)
    if not columns:
        raise ValueError("create_query needs at least one column")

    if database is None:
        database = DWH_NAME

    select_list = ", ".join(_quote_identifier(column) for column in columns)
    query = (
        f"SELECT {select_list} "
        f"FROM {_quote_identifier(database)}.[dbo].{_quote_identifier(table_name)}"
    )

    if condition:
        query += f" WHERE {condition}"

    return query

### 1 Alles van account selecteren

In [4]:
# Account columns to load: the accountID join key plus descriptive attributes.
acc_cols = [
    'accountID',
    'plaats',
    'subregio',
    'ondernemingsaard',
    'ondernemingstype',
    'activiteitNaam',
]
# Restrict the selection to accounts whose province is Oost-Vlaanderen.
acc_condition = "provincie = 'Oost-Vlaanderen'"
acc_query = create_query('DimAccount', acc_cols, acc_condition)
# Execute the query over the shared connection.
df_account = pd.read_sql(acc_query, conn)
df_account.shape

(4124, 6)

### 2 Mergen van contact en account

In [5]:
# Contact columns to load; 'functieNaam' is currently left out.
contact_cols = ['contactID', 'accountID', 'functietitel']

contact_query = create_query('DimContact', contact_cols)
df_contact = pd.read_sql(contact_query, conn)

# Lower-case the job title so equal titles with different casing match.
df_contact = df_contact.assign(
    functietitel=df_contact['functietitel'].str.lower()
)

df_contact.shape

(194192, 3)

In [17]:
# Keep only contacts whose account is in the selected account set.
accounts_merged = df_contact.merge(df_account, on='accountID', how='inner')
accounts_merged.shape

(61885, 8)

### 3 Mergen van account en afspraak

In [7]:
# Appointment columns to load: the account key and the key phrases.
afspraak_cols = ['accountID', 'keyphrases']
afspraak_query = create_query('DimAfspraak', afspraak_cols)

df_afspraak = pd.read_sql(afspraak_query, conn)

df_afspraak.shape

(7167, 2)

In [18]:
# Attach appointment key phrases to every contact of the same account.
# NOTE(review): the row count grows after this inner join, so an account
# presumably has several appointment rows - confirm this fan-out is intended.
acc_con_afs = accounts_merged.merge(df_afspraak, on='accountID', how='inner')
acc_con_afs.shape

(63673, 9)

### 4 Mergen van Campagne en account

In [9]:
# Campaign columns to load: the key plus type, name and kind.
campagne_cols = [
    'campagneID',
    'campagneType',
    'campagneNaam',
    'campagneSoort',
]
campagne_query = create_query('DimCampagne', campagne_cols)

df_campagne = pd.read_sql(campagne_query, conn)

df_campagne.shape

(468, 4)

In [20]:
# Registration rows, reduced to the campaign and contact keys.
factInschrijving_cols = ['campagneID', 'contactID']
factInschrijving_query = create_query('FactInschrijving', factInschrijving_cols)

df_factInschrijving = pd.read_sql(factInschrijving_query, conn)

df_factInschrijving.shape

(8730, 2)

In [21]:
# Add the campaign attributes to each registration.
camp_fact = df_campagne.merge(df_factInschrijving, on='campagneID', how='inner')
camp_fact.shape

(8730, 5)

In [34]:
# Join contact/account/appointment rows with campaign registrations, then keep
# the first row for every (contactID, campagneID) pair.
df = (
    acc_con_afs
    .merge(camp_fact, on='contactID', how='inner')
    .drop_duplicates(subset=['contactID', 'campagneID'], keep='first')
)
df.shape

(421, 13)

## Data Cleaning

In [23]:
# Combine every column except the first into one comma-separated text per row;
# missing values are skipped. (The first column is presumably contactID -
# confirm against the merge order.)
df3 = df.copy()  # snapshot without the 'data' column, used for inspection later
# Work on a copy: the original `df2 = df` aliased `df`, so this cell mutated
# `df` in place and a second run folded the 'data' text into itself.
df2 = df.copy()
# Skip a leftover 'data' column in case `df` was already mutated in this kernel.
feature_cols = [col for col in df.columns[1:] if col != 'data']
df2['data'] = df2[feature_cols].apply(
    lambda row: ','.join(row.dropna().astype(str)), axis=1
)
print(df2['data'].head())

0    41037B31-0969-E111-B43A-00505680000A,unknown,G...
1    41037B31-0969-E111-B43A-00505680000A,unknown,G...
2    41037B31-0969-E111-B43A-00505680000A,onderzoek...
3    41037B31-0969-E111-B43A-00505680000A,teamleide...
4    41037B31-0969-E111-B43A-00505680000A,student,G...
Name: data, dtype: object


In [24]:
from sklearn.feature_extraction.text import CountVectorizer

# Turn each row's combined text into a vector of token counts.
vectorizer = CountVectorizer()
vectorized = vectorizer.fit_transform(df2['data'])

In [25]:
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity between all rows; the original author measured
# roughly 2m38s for this step.
similarities = cosine_similarity(vectorized)

In [26]:
# Print the pairwise similarity matrix; large arrays are shown abbreviated.
print(similarities)


[[1.         1.         0.96392539 ... 0.         0.         0.        ]
 [1.         1.         0.96392539 ... 0.         0.         0.        ]
 [0.96392539 0.96392539 1.         ... 0.         0.         0.        ]
 ...
 [0.         0.         0.         ... 1.         1.         1.        ]
 [0.         0.         0.         ... 1.         1.         1.        ]
 [0.         0.         0.         ... 1.         1.         1.        ]]


In [27]:
# Label rows and columns of the similarity matrix with the contact IDs, then
# move the row labels into a regular 'contactID' column.
# NOTE(review): `df` is rebound here from the merged data to the similarity
# table. contactID may repeat (de-duplication was per contact AND campaign),
# which would produce duplicate column labels - confirm.
contact_ids = df['contactID']
df = pd.DataFrame(
    similarities,
    index=contact_ids,
    columns=contact_ids,
).reset_index()
df.head()

contactID,contactID.1,00008922-92DA-E711-80EE-001DD8B72B61,01C7C34F-90DA-E711-80EE-001DD8B72B61,0217E600-1375-E211-A85C-005056B06EC4,0542DA63-2C64-ED11-9561-6045BD895B5A,09E85092-AF88-EC11-93B0-6045BD91D362,0B39B000-708D-EC11-B400-000D3A24B6E5,126C6BF1-6A9A-E511-A092-005056B06EB4,13AFFF54-DA36-E711-80E6-001DD8B72B61,143B2AE3-1B6F-E111-B43A-00505680000A,...,CDC9B99B-61DA-EC11-BB3D-6045BD8B2E69,03ECA1DC-3267-EE11-9AE7-6045BD895B5A,47233332-2B05-EE11-8F6E-6045BD895420,AA8614A8-EA67-EE11-9AE7-6045BD9745FB,700DB6C4-25CD-E811-80F5-001DD8B72B62,F43AC180-E2C9-EC11-A7B5-00224881D030,7713E350-0556-EE11-BE6E-6045BD895420,1F0BD26E-6C66-EE11-9AE7-6045BD895B5A,FCDBE8E9-3868-EE11-9AE7-000D3A4AB78E,1C2A0BA5-899A-E511-A092-005056B06EB4
0,00008922-92DA-E711-80EE-001DD8B72B61,1.0,1.0,0.963925,0.93124,0.97561,0.97561,0.97561,0.963925,0.963925,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,01C7C34F-90DA-E711-80EE-001DD8B72B61,1.0,1.0,0.963925,0.93124,0.97561,0.97561,0.97561,0.963925,0.963925,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0217E600-1375-E211-A85C-005056B06EC4,0.963925,0.963925,1.0,0.920087,0.963925,0.963925,0.963925,0.952381,0.952381,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0542DA63-2C64-ED11-9561-6045BD895B5A,0.93124,0.93124,0.920087,1.0,0.93124,0.93124,0.93124,0.920087,0.920087,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,09E85092-AF88-EC11-93B0-6045BD91D362,0.97561,0.97561,0.963925,0.93124,1.0,0.97561,0.97561,0.963925,0.963925,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [28]:
# Contact to find lookalikes for, and how many lookalikes to return.
input_person_id = '01C7C34F-90DA-E711-80EE-001DD8B72B61'
N_RECOMMENDATIONS = 10

# Remove the query contact BEFORE ranking. The original took the top 11 and
# filtered afterwards, which returns 11 rows whenever more than ten other
# contacts tie ahead of the query contact and push it out of the top 11.
candidates = df[df['contactID'] != input_person_id]
# Rank the remaining contacts by their similarity to the query contact.
recommendations = candidates.nlargest(N_RECOMMENDATIONS, input_person_id)[['contactID']]
print(recommendations)

                                contactID
0    00008922-92DA-E711-80EE-001DD8B72B61
22   225548F9-8EDA-E711-80EE-001DD8B72B61
24   23D35156-1341-EB11-8116-001DD8B72B61
27   29B0EB66-82DA-E711-80EE-001DD8B72B61
41   415EC187-F409-EA11-8107-001DD8B72B62
86   8F165595-90DA-E711-80EE-001DD8B72B61
96   9B832ACA-81DA-E711-80EE-001DD8B72B61
107  B269EBE4-6CDA-E711-80EE-001DD8B72B61
136  DE9E5F4B-8349-E811-80F0-001DD8B72B62
137  DF8AE390-8349-E811-80F0-001DD8B72B62


## test

In [33]:
# Inspect the source rows of one contact in the pre-vectorisation snapshot.
df3.loc[df3['contactID'] == '143B2AE3-1B6F-E111-B43A-00505680000A'].head()

Unnamed: 0,contactID,accountID,functietitel,plaats,subregio,ondernemingsaard,ondernemingstype,activiteitNaam,keyphrases,campagneID,campagneType,campagneNaam,campagneSoort
8,143B2AE3-1B6F-E111-B43A-00505680000A,41037B31-0969-E111-B43A-00505680000A,algemeen directeur,Gent,Gent,Diensten,Onderwijs,Overige industrie & diensten,"profiel, tak, social, china, restaurant, ...",,,,


In [30]:
# Inspect the source rows of a second contact for comparison.
df3.loc[df3['contactID'] == '00008922-92DA-E711-80EE-001DD8B72B61'].head()

Unnamed: 0,contactID,accountID,functietitel,plaats,subregio,ondernemingsaard,ondernemingstype,activiteitNaam,keyphrases,campagneID,campagneType,campagneNaam,campagneSoort
0,00008922-92DA-E711-80EE-001DD8B72B61,41037B31-0969-E111-B43A-00505680000A,unknown,Gent,Gent,Diensten,Onderwijs,Overige industrie & diensten,"profiel, tak, social, china, restaurant, ...",,,,


In [31]:
# List the column labels of the similarity table.
df.columns

Index(['contactID', '00008922-92DA-E711-80EE-001DD8B72B61',
       '01C7C34F-90DA-E711-80EE-001DD8B72B61',
       '0217E600-1375-E211-A85C-005056B06EC4',
       '0542DA63-2C64-ED11-9561-6045BD895B5A',
       '09E85092-AF88-EC11-93B0-6045BD91D362',
       '0B39B000-708D-EC11-B400-000D3A24B6E5',
       '126C6BF1-6A9A-E511-A092-005056B06EB4',
       '13AFFF54-DA36-E711-80E6-001DD8B72B61',
       '143B2AE3-1B6F-E111-B43A-00505680000A',
       ...
       'CDC9B99B-61DA-EC11-BB3D-6045BD8B2E69',
       '03ECA1DC-3267-EE11-9AE7-6045BD895B5A',
       '47233332-2B05-EE11-8F6E-6045BD895420',
       'AA8614A8-EA67-EE11-9AE7-6045BD9745FB',
       '700DB6C4-25CD-E811-80F5-001DD8B72B62',
       'F43AC180-E2C9-EC11-A7B5-00224881D030',
       '7713E350-0556-EE11-BE6E-6045BD895420',
       '1F0BD26E-6C66-EE11-9AE7-6045BD895B5A',
       'FCDBE8E9-3868-EE11-9AE7-000D3A4AB78E',
       '1C2A0BA5-899A-E511-A092-005056B06EB4'],
      dtype='object', name='contactID', length=26349)

In [32]:
# Display the snapshot of the merged data taken before the combined 'data'
# column was added.
df3

Unnamed: 0,contactID,accountID,functietitel,plaats,subregio,ondernemingsaard,ondernemingstype,activiteitNaam,keyphrases,campagneID,campagneType,campagneNaam,campagneSoort
0,00008922-92DA-E711-80EE-001DD8B72B61,41037B31-0969-E111-B43A-00505680000A,unknown,Gent,Gent,Diensten,Onderwijs,Overige industrie & diensten,"profiel, tak, social, china, restaurant, ...",,,,
1,01C7C34F-90DA-E711-80EE-001DD8B72B61,41037B31-0969-E111-B43A-00505680000A,unknown,Gent,Gent,Diensten,Onderwijs,Overige industrie & diensten,"profiel, tak, social, china, restaurant, ...",,,,
2,0217E600-1375-E211-A85C-005056B06EC4,41037B31-0969-E111-B43A-00505680000A,onderzoek & dienstverlening,Gent,Gent,Diensten,Onderwijs,Overige industrie & diensten,"profiel, tak, social, china, restaurant, ...",,,,
3,0542DA63-2C64-ED11-9561-6045BD895B5A,41037B31-0969-E111-B43A-00505680000A,"teamleider graduaatsopleidingen enw bmg, artev...",Gent,Gent,Diensten,Onderwijs,Overige industrie & diensten,"profiel, tak, social, china, restaurant, ...",,,,
4,09E85092-AF88-EC11-93B0-6045BD91D362,41037B31-0969-E111-B43A-00505680000A,student,Gent,Gent,Diensten,Onderwijs,Overige industrie & diensten,"profiel, tak, social, china, restaurant, ...",,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
75093,F43AC180-E2C9-EC11-A7B5-00224881D030,,,,,,,,,FFE56B2D-B848-EE11-BE6E-6045BD895D78,Project,OV-P-GROEP U StartUp Bryo 2023,Offline
75095,7713E350-0556-EE11-BE6E-6045BD895420,,,,,,,,,FFE56B2D-B848-EE11-BE6E-6045BD895D78,Project,OV-P-GROEP U StartUp Bryo 2023,Offline
75097,1F0BD26E-6C66-EE11-9AE7-6045BD895B5A,,,,,,,,,FFE56B2D-B848-EE11-BE6E-6045BD895D78,Project,OV-P-GROEP U StartUp Bryo 2023,Offline
75099,FCDBE8E9-3868-EE11-9AE7-000D3A4AB78E,,,,,,,,,FFE56B2D-B848-EE11-BE6E-6045BD895D78,Project,OV-P-GROEP U StartUp Bryo 2023,Offline
