<a href="https://colab.research.google.com/github/Catisyf/Python-projects-chez-Toucan/blob/main/Backfill_text_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# Mount Google Drive into the Colab runtime; the CSV files read below live under '/content/drive/My Drive'.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
pip install -U sentence-transformers

In [None]:
# Sanity check: confirm the CRM export is visible on the mounted drive before loading it.
!ls '/content/drive/My Drive/crm_contacts.csv'

'/content/drive/My Drive/crm_contacts.csv'


In [4]:
import pandas as pd
from scipy import spatial
from collections import Counter #reduce processing time

In [5]:
# CRM contacts export: a cell holding a single space is parsed as a missing value.
df_contacts = pd.read_csv(
  '/content/drive/My Drive/crm_contacts.csv',
  na_values=' ',
)
# Buyer-persona definitions (persona name plus the job titles that belong to it).
df_persona = pd.read_csv(
  '/content/drive/My Drive/buyer_persona_definition.csv',
)

## **Data Preparation**

Step 1: remove null & irrelevant values from CRM data

Step 2: remove stop words (of, de, etc.) from both datasets. A short hand-written list is enough here; a full stop-word dictionary is not necessary.

In [6]:
# Treat the '?' placeholder as a missing value, then drop every row that has any missing value.
# The replacement has to be a real NaN: the string 'np.Nan' is ordinary text that dropna() ignores,
# which left the '?' rows in the data.
df_contacts = df_contacts.replace('?', float('nan')).dropna() #remove null

In [7]:
# Drop contacts whose job title contains 'site' or 'try-for-free' (irrelevant info, not a real title).
# The pattern has no capture groups: str.contains only tests for a match, and pandas emits a
# "match groups" UserWarning when the regex contains groups.
df_contacts = df_contacts[~df_contacts.jobtitle.str.contains(r'site|try-for-free', regex=True)]

  return func(self, *args, **kwargs)


In [8]:
# Hand-picked English/French filler tokens to strip from job titles.
# Matching is exact and case-sensitive, which is why both 'for' and 'For' are listed.
stopwords = ['of', 'de', 'du', 'for', 'For', 'the', 'et', '&', '/']
space = ' '
# A frozenset gives constant-time membership tests; the name is kept so the
# `word not in stopwords_dict` checks in the cleaning cells keep working unchanged.
stopwords_dict = frozenset(stopwords)

In [9]:
# Remove stop words from every CRM job title: split on whitespace, drop the stop words,
# and re-join the remaining words with a single space.
# Iterating the column directly avoids building a full row Series per title via .iloc[n].
clean = [
  space.join(word for word in title.split() if word not in stopwords_dict)
  for title in df_contacts['jobtitle']
]

df_contacts['clean_title'] = clean

In [10]:
# Remove stop words from every persona job title: split on whitespace, drop the stop words,
# and re-join the remaining words with a single space.
# Iterating the column directly avoids building a full row Series per title via .iloc[n].
clean = [
  space.join(word for word in title.split() if word not in stopwords_dict)
  for title in df_persona['job_title']
]

df_persona['clean_title'] = clean

## **Use NLP model for matching**

In [11]:
# Load the pre-trained sentence-embedding model used to vectorize the job titles.
from sentence_transformers import SentenceTransformer

model_name = 'distiluse-base-multilingual-cased-v1'
model = SentenceTransformer(model_name)

In [12]:
#Vectorize cleaned job titles

# encode() accepts a list of sentences and batches them internally, so all titles are embedded
# in one call instead of one model call per row. The result is a 2-D array; list() splits it
# into one 1-D embedding per contact, in row order.
# NOTE(review): batched encoding may differ from per-sentence encoding by floating-point noise.
vector1 = list(model.encode(df_contacts['clean_title'].tolist()))

df_contacts['vector'] = vector1

In [24]:
# Embed every cleaned persona job title in a single batched encode() call (one model call
# instead of one per row); list() splits the 2-D result into one 1-D embedding per row.
# NOTE(review): batched encoding may differ from per-sentence encoding by floating-point noise.
vector2 = list(model.encode(df_persona['clean_title'].tolist()))

df_persona['vector'] = vector2

In [22]:
# Persist the cleaned and vectorized CRM contacts to Drive, without the index column.
# NOTE(review): the 'vector' column holds embedding arrays, which CSV stores as text -- confirm any reader can parse it.
df_contacts.to_csv('/content/drive/My Drive/crm.csv', index=False)

In [25]:
# Persist the cleaned and vectorized persona definitions to Drive, without the index column.
# NOTE(review): the 'vector' column holds embedding arrays, which CSV stores as text -- confirm any reader can parse it.
df_persona.to_csv('/content/drive/My Drive/persona.csv', index=False)

In [None]:
# Disabled helper, not executed: cosine similarity written as 1 minus scipy's cosine distance.
#def cosine_similarity (x,y):
  #result = 1-spatial.distance.cosine(x, y)
  #return result

In [34]:
# Minimum cosine similarity for a CRM job title to count as a match for a persona job title.
SIMILARITY_THRESHOLD = 0.85

crm_vectors = list(df_contacts['vector'])
persona_vectors = list(df_persona['vector'])

vec_crm = []
vec_persona = []
similarity = []
if crm_vectors and persona_vectors:  # cdist requires non-empty 2-D inputs
  # Score every (contact, persona) pair in one vectorized call instead of a Python double loop
  # that evaluated the distance twice per match. cdist returns cosine *distance*,
  # so similarity = 1 - distance.
  sim_matrix = 1 - spatial.distance.cdist(crm_vectors, persona_vectors, metric='cosine')
  # nonzero() reports hits in row-major order (contacts outer, personas inner),
  # i.e. the same order the nested loops produced.
  crm_idx, persona_idx = (sim_matrix >= SIMILARITY_THRESHOLD).nonzero()
  for i, j in zip(crm_idx, persona_idx):
    vec_crm.append(crm_vectors[i])          # same array objects as stored in df_contacts['vector']
    vec_persona.append(persona_vectors[j])  # same array objects as stored in df_persona['vector']
    similarity.append(float(sim_matrix[i, j]))

In [59]:
# One row per matched pair: the CRM embedding, the persona embedding, and their similarity score.
match_columns = {
  'vec_crm': vec_crm,
  'vec_persona': vec_persona,
  'similarity': similarity,
}
table = pd.DataFrame(match_columns)

In [60]:
# Display the matched pairs.
table

Unnamed: 0,vec_crm,vec_persona,similarity
0,"[0.016104918, 0.01589653, -0.037263863, -0.016...","[0.012951195, 0.011007603, -0.002596906, -0.01...",0.874570
1,"[0.02564861, 0.005234625, -0.077426195, 0.0211...","[0.0062408005, 0.017177565, -0.086252905, 0.02...",0.932112
2,"[-0.047285523, 0.06932076, -0.0064845555, 0.02...","[-0.025777908, 0.06288852, -0.019013725, 0.022...",0.945942
3,"[0.02134378, 0.023986489, -0.02848963, -0.0211...","[0.012951195, 0.011007603, -0.002596906, -0.01...",0.879964
4,"[-0.029905481, 0.095552064, 0.0024888797, 0.00...","[-0.04500205, 0.096511535, 0.03392309, 0.04132...",0.856016
...,...,...,...
405,"[0.0124193635, -0.06058235, -0.0136015555, 0.0...","[0.012346001, -0.034419905, 0.010265383, 0.040...",0.907719
406,"[-0.039011903, -0.04080392, 0.020483527, 0.045...","[-0.034476522, -0.027893739, 0.024168175, 0.02...",0.921947
407,"[-0.050758168, 0.027393581, -0.00058877113, 0....","[-0.039283436, 0.028995229, 0.026557157, 0.043...",0.874555
408,"[-0.004294157, 0.01826609, 0.0045741037, 0.037...","[-0.009512056, 0.01465146, 0.001745046, 0.0302...",0.992154


In [37]:
# Keep only the persona / contact rows whose embedding took part in at least one match.
# ndarrays are unhashable, so Series.isin() on a column of arrays depends on how pandas
# happens to hash such objects. Compare by value instead, using each array's raw bytes
# as a hashable key.
matched_persona_keys = {vec.tobytes() for vec in vec_persona}
matched_crm_keys = {vec.tobytes() for vec in vec_crm}

# .astype(bool) keeps the mask boolean even when the frame is empty.
persona_mask = df_persona['vector'].map(lambda vec: vec.tobytes() in matched_persona_keys).astype(bool)
contacts_mask = df_contacts['vector'].map(lambda vec: vec.tobytes() in matched_crm_keys).astype(bool)

table_persona = df_persona[persona_mask]
table_contacts = df_contacts[contacts_mask]

In [44]:
# Show the matched CRM contacts with their job title and embedding.
# NOTE(review): the rendered output contains contact email addresses -- clear it before sharing the notebook.
table_contacts[['jobtitle', 'email', 'vector']]

Unnamed: 0,jobtitle,email,vector
5,Senior Engineer,chibuzor.obilom@gmail.com,"[0.016104918, 0.01589653, -0.037263863, -0.016..."
24,Manager of Sales,iamguan@yeah.net,"[0.02564861, 0.005234625, -0.077426195, 0.0211..."
100,Technical Lead,prasadsunny1@gmail.com,"[-0.047285523, 0.06932076, -0.0064845555, 0.02..."
107,Senior Associate Engineer,rkalavakuntla@tasengg.com,"[0.02134378, 0.023986489, -0.02848963, -0.0211..."
264,Head of Technology,christian@penetrace.com,"[-0.029905481, 0.095552064, 0.0024888797, 0.00..."
...,...,...,...
24280,Reponsable Datamining,vpiot@oui.sncf,"[0.0124193635, -0.06058235, -0.0136015555, 0.0..."
24328,Chargées d'Analyses Financières,isabelle.duffau@pole-emploi.fr,"[-0.039011903, -0.04080392, 0.020483527, 0.045..."
24394,Chef de Projets,nicolas.varenne@medusims.com,"[-0.050758168, 0.027393581, -0.00058877113, 0...."
24498,Directeur Systèmes d'Information,philippe-alexandre.schajer@eurovia.com,"[-0.004294157, 0.01826609, 0.0045741037, 0.037..."


In [45]:
# Show the persona rows that matched at least one CRM contact.
table_persona[['buyer_persona', 'job_title', 'vector']]

Unnamed: 0,buyer_persona,job_title,vector
0,"Data Leader - CIO, CDO",CIO,"[0.018791175, 0.0241993, 0.04187469, -0.000740..."
1,"Data Leader - CIO, CDO",CDO,"[0.052183516, -0.003516074, -0.0060044792, -0...."
2,"Data Leader - CIO, CDO",Chief Information Officer,"[0.029007144, 0.08815744, 0.008515886, 0.01692..."
3,"Data Leader - CIO, CDO",CIO,"[0.018791175, 0.0241993, 0.04187469, -0.000740..."
5,"Data Leader - CIO, CDO",Head of Information Technology,"[-0.04500205, 0.096511535, 0.03392309, 0.04132..."
...,...,...,...
115,Practitioner,Chef de Projet Décisionnel,"[-0.042411648, 0.027226837, 0.01959621, 0.0461..."
119,Practitioner,Chef de Projet Business Intelligence,"[-0.01577634, -0.0404313, 0.04310881, 0.006635..."
120,Practitioner,Chef de Projet Insights,"[-0.039283436, 0.028995229, 0.026557157, 0.043..."
122,Practitioner,Chef de Projet SI,"[-0.06391318, 0.056288775, 0.05590724, 0.01934..."
