# epic 4(sh)

Als een key user kan ik voor een contact met weinig transacties een lookalike met veel transacties identificeren. Ik kan ook een clustering maken van contactpersonen die qua jobinhoud, type bedrijf, voorkeuren en (verwacht) gedrag

In [22]:
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import numpy as np
import os
from dotenv import load_dotenv
from sqlalchemy import create_engine

In [23]:
ENV_URL = os.path.join(os.getcwd(), '../.env')
load_dotenv(ENV_URL)

DWH_NAME = os.environ.get('DWH_NAME')
SERVER_NAME = os.environ.get('SERVER_NAME')
DB_USER = os.environ.get('DB_USER')
DB_PASSWORD = os.environ.get('DB_PASSWORD')

URL = f'mssql+pymssql://{DB_USER}:{DB_PASSWORD}@{SERVER_NAME}/{DWH_NAME}'
URL_LOCAL = f'mssql+pyodbc://{SERVER_NAME}/{DWH_NAME}?trusted_connection=yes&driver=ODBC+Driver+17 for SQL Server'

engine = create_engine(URL_LOCAL)
conn = engine.connect()

In [24]:
def create_query(table_name, columns, condition=None):

    query = f"SELECT "

    for i, column in enumerate(columns):
        if i == 0:
            query += f"[{column}]"
        else:
            query += f", [{column}]"
    
    query += f" FROM [{DWH_NAME}].[dbo].[{table_name}]"
    
    if condition:
        query += f" WHERE {condition}"

    return query

### 1 Alles van account selecteren

In [25]:
acc_cols = ['accountID', 'plaats','subregio','ondernemingsaard','ondernemingstype','activiteitNaam']
# account conditie
acc_condition = "provincie = 'Oost-Vlaanderen'"
# create query
acc_query = create_query('DimAccount', acc_cols, acc_condition)
# read sql
df_account = pd.read_sql(acc_query, conn)
df_account.shape

(4124, 6)

### 2 Mergen van contact en account

In [26]:
contact_cols = ['contactID', 'accountID', 'functietitel','functieNaam']

contact_query = create_query('DimContact', contact_cols)
df_contact = pd.read_sql(contact_query, conn)

df_contact['functietitel'] = df_contact['functietitel'].str.lower()

df_contact['functieNaam'] = df_contact['functieNaam'].str.lower()
df_contact.shape

(194192, 4)

In [27]:
accounts_merged = pd.merge(df_contact, df_account, on='accountID', how='inner')
accounts_merged.shape

(61885, 9)

### 3 Mergen van account en afspraak

In [28]:
afspraak_cols = ['accountID', 'keyphrases']

afspraak_query = create_query('DimAfspraak', afspraak_cols)

df_afspraak = pd.read_sql(afspraak_query, conn)
df_afspraak.shape

(7167, 2)

In [29]:
acc_con_afs = pd.merge(accounts_merged, df_afspraak, on='accountID', how='inner')
acc_con_afs.shape

(63673, 10)

### 4 Mergen van Campagne en account

In [30]:
campagne_cols = ['campagneID','campagneType','campagneNaam','campagneSoort']

campagne_query = create_query('DimCampagne', campagne_cols)

df_campagne = pd.read_sql(campagne_query, conn)
df_campagne.shape

(4101, 4)

In [31]:
factInschrijving_cols = ['campagneID','contactID']

factInschrijving_query = create_query('FactInschrijving', factInschrijving_cols)

df_factInschrijving = pd.read_sql(factInschrijving_query, conn)
df_factInschrijving.shape

(78790, 2)

In [32]:
camp_fact = pd.merge(df_campagne, df_factInschrijving, on='campagneID', how='inner')
camp_fact.shape

(78790, 5)

In [37]:
df = pd.merge(acc_con_afs, camp_fact, on='contactID', how='inner')
df = df.drop_duplicates(subset=['contactID','campagneID'], keep='first')
df = df[['contactID','plaats','subregio','ondernemingsaard','ondernemingstype','activiteitNaam','campagneType','campagneNaam','campagneSoort','keyphrases','functietitel','functieNaam']]
df.shape

(12963, 12)

## Data Cleaning

In [38]:
# combine all the data 
df3 = df.copy()
df2 = df
df2['data'] =df[df.columns[1:]].apply(lambda x: ','.join(x.dropna().astype(str)),axis=1)
print(df['data'].head())

0    Gent,Gent,Diensten,Onderwijs,Overige industrie...
2    Gent,Gent,Diensten,Onderwijs,Overige industrie...
5    Gent,Gent,Diensten,Onderwijs,Overige industrie...
6    Gent,Gent,Diensten,Onderwijs,Overige industrie...
7    Gent,Gent,Diensten,Onderwijs,Overige industrie...
Name: data, dtype: object


In [39]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()
vectorized= vectorizer.fit_transform(df2['data'])

In [40]:
from sklearn.metrics.pairwise import cosine_similarity

#duurt 2m38s
similarities = cosine_similarity(vectorized)

In [41]:
print(similarities)


[[1.         0.83172394 0.84887469 ... 0.10938056 0.10938056 0.12550125]
 [0.83172394 1.         0.85732141 ... 0.09205746 0.09205746 0.1086429 ]
 [0.84887469 0.85732141 1.         ... 0.1127469  0.1127469  0.14784425]
 ...
 [0.10938056 0.09205746 0.1127469  ... 1.         0.91525424 0.90012503]
 [0.10938056 0.09205746 0.1127469  ... 0.91525424 1.         0.90012503]
 [0.12550125 0.1086429  0.14784425 ... 0.90012503 0.90012503 1.        ]]


In [42]:
df = pd.DataFrame(similarities,columns=df['contactID'],index=df['contactID']).reset_index()
df.head()

contactID,contactID.1,0542DA63-2C64-ED11-9561-6045BD895B5A,09E85092-AF88-EC11-93B0-6045BD91D362,153B9FE0-68BA-E811-80F4-001DD8B72B62,1FE13719-73A4-EC11-983F-00224884C0D3,1FE13719-73A4-EC11-983F-00224884C0D3.1,1FE13719-73A4-EC11-983F-00224884C0D3.2,225548F9-8EDA-E711-80EE-001DD8B72B61,225548F9-8EDA-E711-80EE-001DD8B72B61.1,225548F9-8EDA-E711-80EE-001DD8B72B61.2,...,E8B5ABFC-C8E6-E611-80E5-001DD8B72B61,E8B5ABFC-C8E6-E611-80E5-001DD8B72B61.1,E8B5ABFC-C8E6-E611-80E5-001DD8B72B61.2,F39FA0B4-5736-E711-80E6-001DD8B72B61,F39FA0B4-5736-E711-80E6-001DD8B72B61.1,FC89CE9B-17CA-E711-80EC-001DD8B72B62,FC89CE9B-17CA-E711-80EC-001DD8B72B62.1,FC89CE9B-17CA-E711-80EC-001DD8B72B62.2,FC89CE9B-17CA-E711-80EC-001DD8B72B62.3,FC89CE9B-17CA-E711-80EC-001DD8B72B62.4
0,0542DA63-2C64-ED11-9561-6045BD895B5A,1.0,0.831724,0.848875,0.884779,0.776736,0.769373,0.848875,0.780156,0.862745,...,0.20444,0.2111,0.192187,0.210042,0.186704,0.14462,0.12983,0.109381,0.109381,0.125501
1,09E85092-AF88-EC11-93B0-6045BD91D362,0.831724,1.0,0.857321,0.81588,0.804076,0.777029,0.857321,0.828325,0.87133,...,0.160591,0.149241,0.150966,0.188562,0.164992,0.127802,0.11239,0.092057,0.092057,0.108643
2,153B9FE0-68BA-E811-80F4-001DD8B72B62,0.848875,0.857321,1.0,0.832704,0.800641,0.793052,0.875,0.804166,0.889297,...,0.187317,0.174078,0.17609,0.240563,0.19245,0.167705,0.133826,0.112747,0.112747,0.147844
3,1FE13719-73A4-EC11-983F-00224884C0D3,0.884779,0.81588,0.832704,1.0,0.895279,0.886792,0.832704,0.765295,0.84631,...,0.200545,0.207079,0.188526,0.228934,0.183147,0.141865,0.127357,0.107297,0.107297,0.140698
4,1FE13719-73A4-EC11-983F-00224884C0D3,0.776736,0.804076,0.800641,0.895279,1.0,0.895279,0.800641,0.772618,0.815572,...,0.179969,0.209061,0.211477,0.231125,0.1849,0.107417,0.128576,0.108324,0.108324,0.142044


In [43]:
input_person_id = '0542DA63-2C64-ED11-9561-6045BD895B5A'
recommendations = pd.DataFrame(df.nlargest(11,input_person_id)['contactID'])
recommendations = recommendations[recommendations['contactID']!=input_person_id]
print(recommendations)

                               contactID
9   236D00C5-A765-ED11-9561-6045BD8956C9
21  39DF7E64-2C66-ED11-9561-6045BD895BFB
72  EBBF1717-7556-ED11-BBA2-6045BD895BFB
49  945F58FA-9A6C-ED11-9561-6045BD895B5A
3   1FE13719-73A4-EC11-983F-00224884C0D3
17  29B0EB66-82DA-E711-80EE-001DD8B72B61
42  8F165595-90DA-E711-80EE-001DD8B72B61
69  DE9E5F4B-8349-E811-80F0-001DD8B72B62
65  DE9E5F4B-8349-E811-80F0-001DD8B72B62
68  DE9E5F4B-8349-E811-80F0-001DD8B72B62


## test

In [44]:
df3[(df3['contactID']=='0542DA63-2C64-ED11-9561-6045BD895B5A')].head()

Unnamed: 0,contactID,plaats,subregio,ondernemingsaard,ondernemingstype,activiteitNaam,campagneType,campagneNaam,campagneSoort,keyphrases,functietitel,functieNaam
0,0542DA63-2C64-ED11-9561-6045BD895B5A,Gent,Gent,Diensten,Onderwijs,Overige industrie & diensten,Netwerkevenement,OV-NW-Nieuwjaarsreceptie regio Oost-Vlaanderen,Offline,"workshop, plantages, student, pakket, onli...","teamleider graduaatsopleidingen enw bmg, artev...",medewerker


In [46]:
df3[(df3['contactID']=='DE9E5F4B-8349-E811-80F0-001DD8B72B62')].head()

Unnamed: 0,contactID,plaats,subregio,ondernemingsaard,ondernemingstype,activiteitNaam,campagneType,campagneNaam,campagneSoort,keyphrases,functietitel,functieNaam
110,DE9E5F4B-8349-E811-80F0-001DD8B72B62,Gent,Gent,Diensten,Onderwijs,Overige industrie & diensten,Netwerkevenement,OV-NW-Voka Update: The Big Refresh 6 - voorjaa...,Offline,"workshop, plantages, student, pakket, onli...",unknown,medewerker
114,DE9E5F4B-8349-E811-80F0-001DD8B72B62,Gent,Gent,Diensten,Onderwijs,Overige industrie & diensten,Netwerkevenement,OV-NW-Voka Bilan 2018,Offline,"workshop, plantages, student, pakket, onli...",unknown,medewerker
115,DE9E5F4B-8349-E811-80F0-001DD8B72B62,Gent,Gent,Diensten,Onderwijs,Overige industrie & diensten,Infosessie,OV-Webinar: E-commerce op de Chinese markt,Online,"workshop, plantages, student, pakket, onli...",unknown,medewerker
116,DE9E5F4B-8349-E811-80F0-001DD8B72B62,Gent,Gent,Diensten,Onderwijs,Overige industrie & diensten,Netwerkevenement,OV-NW-Verderkijkers 2018-Think Customer,Offline,"workshop, plantages, student, pakket, onli...",unknown,medewerker
117,DE9E5F4B-8349-E811-80F0-001DD8B72B62,Gent,Gent,Diensten,Onderwijs,Overige industrie & diensten,Netwerkevenement,OV-NW-Voka Politica XL,Offline,"workshop, plantages, student, pakket, onli...",unknown,medewerker


In [None]:
df.head().columns

Index(['contactID', '00008922-92DA-E711-80EE-001DD8B72B61',
       '01C7C34F-90DA-E711-80EE-001DD8B72B61',
       '0217E600-1375-E211-A85C-005056B06EC4',
       '0542DA63-2C64-ED11-9561-6045BD895B5A',
       '09E85092-AF88-EC11-93B0-6045BD91D362',
       '0B39B000-708D-EC11-B400-000D3A24B6E5',
       '126C6BF1-6A9A-E511-A092-005056B06EB4',
       '13AFFF54-DA36-E711-80E6-001DD8B72B61',
       '143B2AE3-1B6F-E111-B43A-00505680000A',
       ...
       'CDC9B99B-61DA-EC11-BB3D-6045BD8B2E69',
       '03ECA1DC-3267-EE11-9AE7-6045BD895B5A',
       '47233332-2B05-EE11-8F6E-6045BD895420',
       'AA8614A8-EA67-EE11-9AE7-6045BD9745FB',
       '700DB6C4-25CD-E811-80F5-001DD8B72B62',
       'F43AC180-E2C9-EC11-A7B5-00224881D030',
       '7713E350-0556-EE11-BE6E-6045BD895420',
       '1F0BD26E-6C66-EE11-9AE7-6045BD895B5A',
       'FCDBE8E9-3868-EE11-9AE7-000D3A4AB78E',
       '1C2A0BA5-899A-E511-A092-005056B06EB4'],
      dtype='object', name='contactID', length=26349)

In [None]:
df3

Unnamed: 0,contactID,accountID,functietitel,functieNaam,plaats,subregio,ondernemingsaard,ondernemingstype,activiteitNaam,keyphrases,campagneID,campagneType,campagneNaam,campagneSoort
0,0542DA63-2C64-ED11-9561-6045BD895B5A,41037B31-0969-E111-B43A-00505680000A,"teamleider graduaatsopleidingen enw bmg, artev...",medewerker,Gent,Gent,Diensten,Onderwijs,Overige industrie & diensten,"workshop, plantages, student, pakket, onli...",317CD023-2B1E-ED11-B83D-000D3AAD783A,Netwerkevenement,OV-NW-Nieuwjaarsreceptie regio Oost-Vlaanderen,Offline
2,09E85092-AF88-EC11-93B0-6045BD91D362,41037B31-0969-E111-B43A-00505680000A,student,medewerker,Gent,Gent,Diensten,Onderwijs,Overige industrie & diensten,"workshop, plantages, student, pakket, onli...",C6D72260-E451-EC11-8C62-000D3ABFCF4A,Netwerkevenement,OV-NW-VokaUpdate-Big Refresh-Voorjaar 2022,Online
5,153B9FE0-68BA-E811-80F4-001DD8B72B62,41037B31-0969-E111-B43A-00505680000A,marketingverantwoordelijke,medewerker marketing,Gent,Gent,Diensten,Onderwijs,Overige industrie & diensten,"workshop, plantages, student, pakket, onli...",7835D8C0-F488-E811-80F3-001DD8B72B61,Netwerkevenement,OV-NW-Verderkijkers 2018-Think Customer,Offline
6,1FE13719-73A4-EC11-983F-00224884C0D3,41037B31-0969-E111-B43A-00505680000A,community manager & coördinator postgraduaat o...,medewerker,Gent,Gent,Diensten,Onderwijs,Overige industrie & diensten,"workshop, plantages, student, pakket, onli...",317CD023-2B1E-ED11-B83D-000D3AAD783A,Netwerkevenement,OV-NW-Nieuwjaarsreceptie regio Oost-Vlaanderen,Offline
7,1FE13719-73A4-EC11-983F-00224884C0D3,41037B31-0969-E111-B43A-00505680000A,community manager & coördinator postgraduaat o...,medewerker,Gent,Gent,Diensten,Onderwijs,Overige industrie & diensten,"workshop, plantages, student, pakket, onli...",4C8FF159-145C-EC11-8F8F-000D3A2BCF4B,Infosessie,OV-JO Breakfastclub april 2022,Offline
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
72039,FC89CE9B-17CA-E711-80EC-001DD8B72B62,4AFDE787-17CA-E711-80EC-001DD8B72B62,unknown,"contact lidmaatschap, bedrijfsleider",ZOTTEGEM,Aalst,Productie & Diensten,Bedrijf,Bouw,"workshop, tijdsinvester, creatief, aankled,...",24D8FBE3-7BB6-E811-80F4-001DD8B72B62,Netwerkevenement,OV-NW-Voka Ambassadeur - Verkiezing 2018,Offline
72040,FC89CE9B-17CA-E711-80EC-001DD8B72B62,4AFDE787-17CA-E711-80EC-001DD8B72B62,unknown,"contact lidmaatschap, bedrijfsleider",ZOTTEGEM,Aalst,Productie & Diensten,Bedrijf,Bouw,"workshop, tijdsinvester, creatief, aankled,...",2B01F3C0-E1C4-E911-8104-001DD8B72B61,Netwerkevenement,OV-JO-FinFinder3,Offline
72042,FC89CE9B-17CA-E711-80EC-001DD8B72B62,4AFDE787-17CA-E711-80EC-001DD8B72B62,unknown,"contact lidmaatschap, bedrijfsleider",ZOTTEGEM,Aalst,Productie & Diensten,Bedrijf,Bouw,"workshop, tijdsinvester, creatief, aankled,...",39BBF8F7-6F3A-E911-80FC-001DD8B72B61,Project,OV-P-Groep J2 - StartUp-Bryo2019,Offline
72055,FC89CE9B-17CA-E711-80EC-001DD8B72B62,4AFDE787-17CA-E711-80EC-001DD8B72B62,unknown,"contact lidmaatschap, bedrijfsleider",ZOTTEGEM,Aalst,Productie & Diensten,Bedrijf,Bouw,"workshop, tijdsinvester, creatief, aankled,...",52A5376C-6851-E911-80FD-001DD8B72B61,Projectgebonden,OV-P-Plato-Overkoepelende sessie: Veerkracht,Offline
