<a href="https://colab.research.google.com/github/Catisyf/Python-projects-chez-Toucan/blob/main/Copy_of_Welcome_to_Colaboratory.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
# Mount Google Drive at /content/drive; the CSV files read below live under 'My Drive'.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
pip install -U sentence-transformers

In [None]:
# Sanity check: list the CRM export to confirm it is visible on the mounted drive.
!ls '/content/drive/My Drive/crm_contacts.csv'

'/content/drive/My Drive/crm_contacts.csv'


In [208]:
import pandas as pd
from scipy import spatial
from collections import Counter 
import numpy as np
import re

In [271]:
# Load the CRM contacts export and the buyer-persona definitions from the mounted drive.
# In the contacts file, cells holding a single space are read as NaN.
df_contacts = pd.read_csv(
    '/content/drive/My Drive/crm_contacts.csv',
    na_values=' ',
)
df_persona = pd.read_csv('/content/drive/My Drive/buyer_persona_definition.csv')

## **Data Preparation**

Step 1: remove null & irrelevant values from CRM data

Step 2: remove stop words (e.g. "of", "de") from the job titles in both datasets. A short hand-written list is enough here; a full stop-word dictionary is not necessary.

In [272]:
# Treat the '?' placeholder as missing, then drop every row with any missing value.
# The replacement must be the float sentinel np.nan: the string 'np.Nan' is an
# ordinary value that dropna() would keep.
df_contacts = df_contacts.replace('?', np.nan).dropna()

In [273]:
# Remove contacts whose job title contains 'site' or 'try-for-free' (irrelevant info).
# The alternation has no capturing groups, so pandas does not emit its
# "pattern has match groups" UserWarning; the matched rows are the same.
# .copy() makes the filtered frame independent, so adding columns to it later
# does not write to a slice of the original frame.
irrelevant_title = df_contacts.jobtitle.str.contains('site|try-for-free')
df_contacts = df_contacts[~irrelevant_title].copy()

  return func(self, *args, **kwargs)


In [274]:
#create stopword list
# Separator used when re-joining the words of a title.
space = ' '
# Words to drop from job titles; matching is exact and case-sensitive.
stopwords = [
    'of', 'de', 'du', 'for', 'For', 'the', 'et', "d'",
]
# Hash-based container so that `word in stopwords_dict` is a constant-time lookup.
stopwords_dict = Counter(stopwords)

In [275]:
#function to remove stopwords from job title
def remove_stopwords (df_name, column_name):
    """Return the values of ``df_name[column_name]`` with stop words removed.

    Each value is split on whitespace, words found in the module-level
    ``stopwords_dict`` (exact, case-sensitive match) are dropped, and the
    remaining words are re-joined with the module-level ``space`` separator.

    Returns a plain list in the same order as the column.
    """
    cleaned_titles = []
    for title in df_name[column_name]:
        kept_words = [token for token in title.split() if token not in stopwords_dict]
        cleaned_titles.append(space.join(kept_words))
    return cleaned_titles

In [276]:
#function to remove special characters from job title
# Every character replaced by a space is listed explicitly. The hyphen is escaped so
# it is a literal '-' and not a range operator; the previous unescaped "$-." formed a
# range that also covered % ' * and , which are therefore listed here by name.
_SPECIAL_CHARS = re.compile(r"[|!@#$%&'()*+,\-./_={}]")


def remove_sign(df_name, column_name):
    """Return the values of ``df_name[column_name]`` with special characters blanked.

    Each character matched by ``_SPECIAL_CHARS`` is replaced by a single space
    (runs are not collapsed, so adjacent symbols yield adjacent spaces).

    Parameters
    ----------
    df_name : mapping or DataFrame
        Container indexable by ``column_name`` that yields strings.
    column_name : str
        Name of the column to clean.

    Returns
    -------
    list of str
        Cleaned strings, in the same order as the input column.
    """
    return [_SPECIAL_CHARS.sub(' ', text) for text in df_name[column_name]]

In [277]:
#add cleaned job titles to datasets
# The two frames name their job-title column differently, hence the (frame, column) pairs.
for frame, title_column in ((df_persona, 'job_title'), (df_contacts, 'jobtitle')):
    frame['clean_title'] = remove_stopwords(frame, title_column)

In [278]:
# Second cleaning pass: blank out special characters in the stop-word-free titles.
for frame in (df_persona, df_contacts):
    frame['clean_title'] = remove_sign(frame, 'clean_title')

## **Use NLP model for matching**

In [279]:
#pre-trained sentence transformer model
# The checkpoint name indicates a multilingual, cased DistilUSE model — presumably chosen
# because the job titles mix languages; TODO confirm.
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('distiluse-base-multilingual-cased-v1')

In [280]:
def text_vectorizer (df_name, column_name):
    """Embed every value of ``df_name[column_name]`` with the module-level ``model``.

    All titles are passed to ``model.encode`` in a single call so the model can
    batch them, instead of paying the per-call overhead once per title.

    Returns a list with one 1-D embedding array per input row, in input order,
    suitable for assignment to a DataFrame column.
    """
    titles = list(df_name[column_name])
    embeddings = model.encode(titles)
    # Split the 2-D result into one vector per row.
    return list(embeddings)

In [281]:
#add vector to datasets
# Embed the cleaned titles of both frames (personas first, then contacts).
for frame in (df_persona, df_contacts):
    frame['vector'] = text_vectorizer(frame, 'clean_title')

In [282]:
#function to calculate cosine similarity
def cosine_similarity(v1,v2):
    """Return the cosine similarity of two 1-D vectors.

    SciPy exposes the cosine *distance* (1 - similarity), so the value is
    converted back to a similarity before returning.
    """
    cosine_distance = spatial.distance.cosine(v1, v2)
    return 1 - cosine_distance

In [233]:
#find a match for each job title in CRM data using the max value of cosine similarity -- REWRITE

# Maps each contact's position in df_contacts to [best score, position of the best persona row].
similarity = {}
for contact_pos, contact_vec in enumerate(df_contacts['vector']):
    persona_scores = [cosine_similarity(contact_vec, persona_vec)
                      for persona_vec in df_persona['vector']]
    best_score = max(persona_scores)
    # list.index returns the first occurrence, so ties go to the earliest persona row.
    similarity[contact_pos] = [best_score, persona_scores.index(best_score)]


In [236]:
# One row per contact position: the best similarity score and the matching persona row position.
table = (
    pd.DataFrame.from_dict(similarity, orient='index')
    .set_axis(['similarity_score', 'persona_job_index'], axis=1)
)

In [251]:
#join tables using index
# `table` is keyed by each contact's position (0..n-1) in df_contacts, and
# 'persona_job_index' holds row positions in df_persona. df_contacts keeps its
# original row labels after the dropna()/filter steps, so both frames are
# re-indexed positionally first; joining on the stale labels would attach scores
# to the wrong contacts and leave gaps.
contact_columns = df_contacts[['jobtitle', 'email', 'vector']].reset_index(drop=True)
persona_columns = df_persona[['buyer_persona', 'job_title', 'vector']].reset_index(drop=True)
table_full = table.join(contact_columns).merge(
    persona_columns, left_on='persona_job_index', right_index=True)

In [255]:
# Keep only the matches whose best similarity score is strictly above 0.9.
is_good_score = table_full['similarity_score'] > 0.9
table_good_score = table_full.loc[is_good_score]

In [295]:
# Write the full match table to Drive; the DataFrame index is not written.
table_full.to_csv(
    '/content/drive/My Drive/table_full.csv',
    index=False,
)

------------------------------------------------------------------------------

In [283]:
# Collect every (contact, persona) pair whose cosine similarity reaches the threshold.
# The three lists are parallel: entry k of each list describes the same pair.
vec_crm = []
vec_persona = []
similarity = []
for vec1 in df_contacts['vector']:
    for vec2 in df_persona['vector']:
        score = cosine_similarity(vec1, vec2)  # computed once per pair and reused below
        if score >= 0.85:  # threshold for high-level similarity
            vec_crm.append(vec1)
            vec_persona.append(vec2)
            similarity.append(score)

In [284]:
# One row per retained pair: contact vector, persona vector and their similarity.
pair_columns = {
    'vec_crm': vec_crm,
    'vec_persona': vec_persona,
    'similarity': similarity,
}
table = pd.DataFrame(pair_columns)

In [285]:
#find job titles & persona
# Keep only the persona / contact rows whose vector appears in at least one retained pair.
persona_in_pairs = df_persona['vector'].isin(vec_persona)
contacts_in_pairs = df_contacts['vector'].isin(vec_crm)
table_persona = df_persona[persona_in_pairs]
table_contacts = df_contacts[contacts_in_pairs]

In [286]:
#convert np array to str so I can use the vector as a key for joining tables
# Arrays cannot serve as merge keys, so each vector is rendered to its string form.
list1 = [np.array_str(vector) for vector in table['vec_crm']]
list2 = [np.array_str(vector) for vector in table['vec_persona']]

table['vec1'] = list1
table['vec2'] = list2

In [None]:
# Add the string form of each contact vector as the merge key 'vec_str'.
# .assign() returns a new frame, so no column is written onto a filtered slice
# (which raises SettingWithCopyWarning), and no variable named `ls` is created
# to shadow the IPython `ls` alias.
table_contacts = table_contacts.assign(
    vec_str=[np.array_str(vector) for vector in table_contacts['vector']]
)

In [None]:
# Add the string form of each persona vector as the merge key 'vec_str'.
# .assign() returns a new frame, so no column is written onto a filtered slice
# (which raises SettingWithCopyWarning), and no variable named `ls` is created
# to shadow the IPython `ls` alias.
table_persona = table_persona.assign(
    vec_str=[np.array_str(vector) for vector in table_persona['vector']]
)

In [289]:
#join table
# Attach contact details via the stringified contact vector, then persona details
# via the stringified persona vector, and keep only the reporting columns.
output_columns = ['similarity', 'jobtitle', 'email', 'buyer_persona', 'job_title']
table1 = table.merge(
    table_contacts,
    left_on='vec1',
    right_on='vec_str',
)
table2 = table1.merge(
    table_persona,
    left_on='vec2',
    right_on='vec_str',
)
final = table2[output_columns]

In [292]:
# Collapse fully identical rows, keeping the first occurrence of each.
result = final.drop_duplicates(keep='first')

In [None]:
#final output
# NOTE(review): the displayed frame includes the 'email' column — clear this output
# or anonymise the addresses before sharing the notebook.
final.drop_duplicates(keep='first')

Unnamed: 0,similarity,jobtitle,email,buyer_persona,job_title
0,0.874570,Senior Engineer,chibuzor.obilom@gmail.com,Practitioner,Senior BI Engineer
1,0.874570,Senior Engineer,vishal.dwivedi9@gmail.com,Practitioner,Senior BI Engineer
2,0.874570,Senior Engineer,skadir@tereos.com,Practitioner,Senior BI Engineer
9,0.879964,Senior Associate Engineer,rkalavakuntla@tasengg.com,Practitioner,Senior BI Engineer
10,0.932112,Manager of Sales,iamguan@yeah.net,Business Leader,Sales Director
...,...,...,...,...,...
2049,0.900356,Business Intelligence Project Leader,kasper.vrees@danone.com,Practitioner,Chef De Projet Business Intelligence
2050,0.850429,Chargée de projet Business Intelligence,claire.jourdan-sestier@isere.fr,Practitioner,Chef De Projet Business Intelligence
2051,0.866969,Business Intelligence Project Leader,kasper.vrees@danone.com,Practitioner,Chef de Projet Business Intelligence
2052,0.866108,Respo Analyse Données Finance,michel.ndiaye@fnacdarty.com,Practitioner,Financial Analyst


In [None]:
#use faker to anonymise email data

In [None]:
from faker import Faker
faker = Faker()
# Replace every distinct e-mail address in `result` with a generated company address.
# A mapping is built first so the same real address always gets the same fake one,
# which keeps rows belonging to one contact linkable after anonymisation.
fake_email_by_real = {
    real_email: faker.company_email()
    for real_email in result['email'].unique()
}
result_anonymised = result.assign(email=result['email'].map(fake_email_by_real))

In [None]:
#work on documentation 