In [1]:
import pandas as pd
from sentence_transformers import SentenceTransformer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
skills=pd.read_excel("data.xlsx",sheet_name=0)
skills.head()

Unnamed: 0,JobID,Job,CompID,SkillsID,Competency/Skills
0,1,Data Analyst,C1,S1,Data Visualization Power BI
1,1,Data Analyst,C1,S2,Data Visualization Tableau
2,1,Data Analyst,C1,S3,Data Visualization Looker
3,1,Data Analyst,C1,S4,Data Visualization Matplotlib
4,1,Data Analyst,C2,S5,Data Preparation Pandas


In [3]:
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

In [4]:
sentences = ["eat apple", "rabbit dead","mother loves fruits"]
embeddings = model.encode(sentences)
print(embeddings)

[[ 0.0519857   0.03726526  0.05079734 ... -0.01896069  0.06716993
   0.02756945]
 [ 0.04755198 -0.01106954 -0.01533367 ...  0.00871179  0.01541569
  -0.01033784]
 [ 0.00875989  0.00015454  0.04707264 ... -0.00274575  0.11228129
  -0.01075796]]


In [5]:
print(embeddings[0].reshape(1,-1).shape)

(1, 384)


In [6]:
from sklearn.metrics.pairwise import cosine_similarity
similarity=cosine_similarity(embeddings[0].reshape(1,-1),embeddings[1].reshape(1,-1))
print(similarity)
similarity=cosine_similarity(embeddings[0].reshape(1,-1),embeddings[2].reshape(1,-1))
print(similarity)
similarity=cosine_similarity(embeddings[1].reshape(1,-1),embeddings[2].reshape(1,-1))
print(similarity)

[[0.07827369]]
[[0.43070382]]
[[0.0440757]]


In [7]:
import re
import nltk
from nltk.corpus import stopwords
nltk.download("stopwords")
stopFrench=stopwords.words("french")
def normalize(text):
  text=re.sub(r'http\S+', '', text)
  text=re.sub(r'@\w+', '', text)
  text=re.sub(r'[^a-zA-Z\s]', '', text)
  text=text.lower()
  words=text.split()
  words=[w for w in words if w not in stopFrench]
  text=" ".join(words)
  return text

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\asami\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [8]:
user_input="Son travail repose sur une vigilance constante et une capacité à distinguer les fausses alertes des véritables menaces. Lorsqu’un incident est confirmé, il enclenche les procédures de réponse : isolation des systèmes compromis, analyse des causes, et coordination avec les équipes techniques pour rétablir la sécurité. Chaque incident est une opportunité d’apprentissage, lui permettant d’affiner les règles de détection et de mieux préparer l’organisation aux attaques futures."
cleaned_user_input=normalize(user_input)
print(cleaned_user_input)

travail repose vigilance constante capacit distinguer fausses alertes vritables menaces lorsquun incident confirm enclenche procdures rponse isolation systmes compromis analyse causes coordination quipes techniques rtablir scurit chaque incident opportunit dapprentissage permettant daffiner rgles dtection mieux prparer lorganisation attaques futures


In [9]:
embedded_skills=model.encode(skills["Competency/Skills"].apply(normalize))
embedded_user=model.encode(cleaned_user_input)
similarities=[model.similarity(embedded_user,skills)[0][0].item() for skills in embedded_skills]
skills["Score"]=similarities

In [10]:
skills.head()

Unnamed: 0,JobID,Job,CompID,SkillsID,Competency/Skills,Score
0,1,Data Analyst,C1,S1,Data Visualization Power BI,0.131679
1,1,Data Analyst,C1,S2,Data Visualization Tableau,0.113234
2,1,Data Analyst,C1,S3,Data Visualization Looker,0.114878
3,1,Data Analyst,C1,S4,Data Visualization Matplotlib,0.026105
4,1,Data Analyst,C2,S5,Data Preparation Pandas,0.123545


In [11]:
skills.sort_values(by=["Score"],ascending=False)

Unnamed: 0,JobID,Job,CompID,SkillsID,Competency/Skills,Score
83,7,SOC Analyst,C13,S41,Threat Intelligence MITRE ATT&CK,0.409731
110,10,Threat Intelligence Analyst,C13,S41,Threat Intelligence MITRE ATT&CK,0.409731
82,7,SOC Analyst,C13,S40,Threat Intelligence MISP,0.380052
109,10,Threat Intelligence Analyst,C13,S40,Threat Intelligence MISP,0.380052
111,10,Threat Intelligence Analyst,C13,S42,Threat Intelligence ThreatConnect,0.375090
...,...,...,...,...,...,...
38,3,Machine Learning Engineer,C5,S18,Deep Learning Keras,-0.028811
28,2,Data Scientist,C5,S18,Deep Learning Keras,-0.028811
10,1,Data Analyst,C3,S11,Machine Learning LightGBM,-0.031967
21,2,Data Scientist,C3,S11,Machine Learning LightGBM,-0.031967


In [12]:
import numpy as np

In [13]:
jobs=set(skills["Job"])
jobDf=pd.DataFrame(jobs,columns=["Job"])
jobDf.head()

Unnamed: 0,Job
0,Machine Learning Engineer
1,Data Analyst
2,Data Engineer
3,Pentester (Ethical Hacker)
4,Threat Intelligence Analyst


In [14]:
means=[]
for job in jobs:
    means.append(np.mean(skills[skills["Job"]==job]["Score"][0:3]))
jobDf["MeanScore"]=means
finalJob=jobDf.sort_values(by="MeanScore",ascending=False).reset_index(drop=True)
finalJob.head()

Unnamed: 0,Job,MeanScore
0,Cybersecurity Analyst,0.213775
1,SOC Analyst,0.213775
2,Pentester (Ethical Hacker),0.174229
3,Threat Intelligence Analyst,0.174229
4,Data Analyst,0.119931


In [15]:
print("Le metier qui vous correspond le plus est : ",finalJob.loc[0,"Job"])

Le metier qui vous correspond le plus est :  Cybersecurity Analyst
