# Installations and data loading

In [None]:
!pip install tensorflow -q

In [None]:
!pip install tensorflow-text --no-dependencies -q

In [None]:
!pip install tensorflow_hub -q

In [None]:
!pip install --upgrade --user tensorflow tensorflow-hub -q

In [None]:
!pip install --upgrade tensorflow



In [None]:
!pip install --upgrade tensorflow-text



In [None]:
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text

# Report the versions of the three TensorFlow packages imported above.
for package_label, package_module in (
    ("TensorFlow", tf),
    ("TensorFlow Hub", hub),
    ("TensorFlow Text", text),
):
    print(f"{package_label} version:", package_module.__version__)

TensorFlow version: 2.13.0
TensorFlow Hub version: 0.14.0
TensorFlow Text version: 2.13.0


In [None]:
#Instantiate BERT preprocessor

# The preprocessing model must use the same vocabulary and casing as the
# encoder it feeds. The encoder loaded in this notebook is an *uncased* small
# BERT (bert_en_uncased_L-8_H-128_A-2), so the matching *uncased* preprocessor
# is required; the cased one produces token ids from a different vocabulary.
# NOTE(review): the job descriptions are French while this is an English BERT;
# a multilingual encoder/preprocessor pair may be more appropriate -- confirm.
preprocessor_url = "https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3"
preprocessor = hub.KerasLayer(preprocessor_url)

In [None]:
# Load the small uncased English BERT encoder (L-8, H-128, A-2) from TF Hub
# and wrap it as a Keras layer.
embedding_url = "https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-8_H-128_A-2/2"
embedding_model = hub.KerasLayer(handle=embedding_url)

In [None]:
# Read the cleaned job-offers table into a DataFrame and display it.

import numpy as np
import pandas as pd

# NOTE(review): the file lives under /content/drive, so Google Drive is
# presumably already mounted when this cell runs -- confirm.
csv_path = '/content/drive/MyDrive/Jedha_Fullstack/DataMatch_Project/Model/df_clean.csv'
df = pd.read_csv(csv_path)
df

Unnamed: 0,job_title,job_class,job_company,job_description,location,skills,salary,platform
0,Manager Data,Data Manager,CGI,"Manager data description de postechez cgi, lea...",France,"Excel,",45000.000000,HelloWork
1,Développeur Big Data,Data Architect,Inetum,"Nous sommes une esn agile, un groupe internati...",France,"Python, Sql, Spark, Java, Scala, Hadoop, Nosql...",45000.000000,HelloWork
2,Data Developer - Cdd,Développeur,Crédit Agricole Consumer Finance,"Crédit agricole consumer finance, filiale à 10...",Paris,"Python, Sql, R , Excel, , Sas, Vba, Sas",45000.000000,HelloWork
3,Responsable Bi Et Data,Data/BI Analyst,Timac AGRO France,Spécialiste de la production d'amendements de ...,France,"Sql, Tableau, Sas, Sas",45000.000000,HelloWork
4,Tech Lead Big Data,Data Architect,SG Services centraux,"Chez société générale, nous sommes convaincus ...",France,"Spark, Scala, Hadoop",45000.000000,HelloWork
...,...,...,...,...,...,...,...,...
8026,Data Manager Lead,Data Manager,Keyrus,Descriptif de poste nous recherchons un data ...,Neuilly-Sur-Seine,"Excel,",45221.142178,LinkedIn
8027,Ingénieur Data,Data Engineer,Mosica,"Véritable agent de carrière, mosica créée et m...",Poissy,"Python, Aws, Spark, Java, Scala, Hadoop, Kafka...",45221.142178,LinkedIn
8028,Stage - Bras Droit Co-Fondateur - Data & Opera...,Autres,Climb,L'entreprise : notre mission : permettre à...,Clichy,"Sql, Tableau, Excel,",45221.142178,LinkedIn
8029,Consultant Stratégie Cloud -,Data Architect,Thales,Qui etes-vous ? diplômé d’une grande école...,Courbevoie,,45221.142178,LinkedIn


# Preprocessing

In [None]:
!pip install langdetect -q

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/981.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.2/981.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m122.9/981.5 kB[0m [31m1.6 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m972.8/981.5 kB[0m [31m9.2 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m981.5/981.5 kB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for langdetect (setup.py) ... [?25l[?25hdone


In [None]:
#Removal of offers in English if any

from langdetect import DetectorFactory, detect

# langdetect's algorithm is non-deterministic unless seeded; fix the seed so
# the set of descriptions flagged as English is the same on every run.
DetectorFactory.seed = 0

def detect_language(text):
    """Return the language code detected for `text`, or None on failure.

    Parameters
    ----------
    text : value passed straight to `langdetect.detect` (expected: a string).

    Returns
    -------
    The value returned by `langdetect.detect`, or None when detection raises
    (for example on text with no usable features or on a non-string value).
    """
    try:
        return detect(text)
    except Exception:
        # Best effort: an undetectable description is simply kept.
        # `Exception` (not a bare `except`) lets KeyboardInterrupt/SystemExit
        # propagate so the cell can still be interrupted.
        return None

languages = df['job_description'].apply(detect_language)

# Select the descriptions detected as English and drop those rows from df.
english_occurrences = df['job_description'][languages == 'en']
print(f"{len(english_occurrences)} descriptions in english on {df.shape[0]} in overall" + '\n')
df = df.drop(english_occurrences.index)
print(f"{df.shape[0]} descriptions now")

2 descriptions in english on 8031 in overall

8029 descriptions now


In [None]:
#Cleaning descriptions text

import re
def remove_emoji(string):
    """Return `string` with every run of characters in the listed code-point
    ranges removed.

    The ranges are concatenated into a single regex character class; any
    character falling inside one of them is deleted, everything else is kept.
    """
    codepoint_ranges = (
        u"\U0001F600-\U0001F64F",  # emoticons
        u"\U0001F300-\U0001F5FF",  # symbols & pictographs
        u"\U0001F680-\U0001F6FF",  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF",  # flags (iOS)
        u"\U00002702-\U000027B0",
        u"\U000024C2-\U0001F251",
    )
    matcher = re.compile("[" + "".join(codepoint_ranges) + "]+", flags=re.UNICODE)
    return matcher.sub(r'', string)

# Build the cleaned description column:
#   1. cast to str and strip emoji,
#   2. turn every run of non-alphanumeric characters (punctuation, symbols,
#      underscores, and any whitespace including newlines/tabs) into ONE space,
#   3. trim and lowercase.
#
# `Series.str.replace(..., regex=True)` is required here: `Series.replace`
# without `regex=True` only replaces cells whose *entire value* equals the
# pattern string, so it silently did nothing. Punctuation and newlines were
# then deleted without a separator, gluing neighbouring words together.
# `\W` is Unicode-aware for str patterns, so accented letters are preserved.
df['clean_text'] = (
    df['job_description']
    .astype(str)
    .apply(remove_emoji)
    .str.replace(r"[\W_]+", " ", regex=True)
    .str.strip()
    .str.lower()
)
df['clean_text'].head()

0    manager data description de postechez cgi lead...
1    nous sommes une esn agile un groupe internatio...
2    crédit agricole consumer finance filiale à 100...
3    spécialiste de la production damendements de s...
4    chez société générale nous sommes convaincus q...
Name: clean_text, dtype: object

In [None]:
# Plain Python list of the cleaned descriptions, in DataFrame row order.
descs_list = df['clean_text'].to_list()

In [None]:
#Preprocessing by BERT preprocessor

%%time

input_dict = preprocessor(descs_list)

CPU times: user 55.5 s, sys: 3.63 s, total: 59.1 s
Wall time: 1min 12s


In [None]:
#Embedding

%%time

embeddings = embedding_model(input_dict)["pooled_output"]

CPU times: user 3min 56s, sys: 1min 46s, total: 5min 43s
Wall time: 4min 5s


In [None]:
# Print the pooled-output tensor (values, shape and dtype) as a sanity check.
print(embeddings)

tf.Tensor(
[[-0.70695114 -0.9963507   0.8429316  ... -0.9717219  -0.13150772
   0.88539803]
 [-0.44034722 -0.9982226   0.9099451  ... -0.952992    0.2780985
   0.9194428 ]
 [-0.5240503  -0.9932733   0.69654274 ... -0.9456916   0.49436834
   0.88399047]
 ...
 [-0.52654946 -0.9943937   0.9190644  ... -0.9270223   0.3241336
   0.79668534]
 [-0.65563375 -0.9873713   0.530454   ... -0.9550935  -0.22138757
   0.8804832 ]
 [-0.45294285 -0.9918248   0.94975317 ... -0.9059304   0.27188954
   0.8435341 ]], shape=(8029, 128), dtype=float32)


# KMeans clustering

In [None]:
#Clustering by KMeans

import numpy as np
from sklearn.cluster import KMeans

# Materialise the embedding tensor as a NumPy array for scikit-learn.
embedded_documents_array = np.array(embeddings)

n_clusters = 3
# random_state pins the centroid initialisation so cluster ids and assignments
# are reproducible across runs; n_init is set explicitly (10 restarts) so the
# result does not depend on the library's changing default.
kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=42)
kmeans.fit(embedded_documents_array)





In [None]:
# Attach the KMeans label of each description as a new `kmeans_cluster` column
# (labels are in the same row order as the embeddings) and preview the result.
df = df.assign(kmeans_cluster=kmeans.labels_)
df.head()

Unnamed: 0,job_title,job_class,job_company,job_description,location,skills,salary,platform,clean_text,kmeans_cluster
0,Manager Data,Data Manager,CGI,"Manager data description de postechez cgi, lea...",France,"Excel,",45000.0,HelloWork,manager data description de postechez cgi lead...,1
1,Développeur Big Data,Data Architect,Inetum,"Nous sommes une esn agile, un groupe internati...",France,"Python, Sql, Spark, Java, Scala, Hadoop, Nosql...",45000.0,HelloWork,nous sommes une esn agile un groupe internatio...,1
2,Data Developer - Cdd,Développeur,Crédit Agricole Consumer Finance,"Crédit agricole consumer finance, filiale à 10...",Paris,"Python, Sql, R , Excel, , Sas, Vba, Sas",45000.0,HelloWork,crédit agricole consumer finance filiale à 100...,2
3,Responsable Bi Et Data,Data/BI Analyst,Timac AGRO France,Spécialiste de la production d'amendements de ...,France,"Sql, Tableau, Sas, Sas",45000.0,HelloWork,spécialiste de la production damendements de s...,2
4,Tech Lead Big Data,Data Architect,SG Services centraux,"Chez société générale, nous sommes convaincus ...",France,"Spark, Scala, Hadoop",45000.0,HelloWork,chez société générale nous sommes convaincus q...,2


In [None]:
# Number of descriptions assigned to each cluster.
cluster_sizes = df['kmeans_cluster'].value_counts()
cluster_sizes

1    2714
0    2666
2    2649
Name: kmeans_cluster, dtype: int64

In [None]:
# Cluster centres learned by KMeans; show their (n_clusters, n_features) shape.
centroids = kmeans.cluster_centers_
np.shape(centroids)

(3, 128)

# Cluster summaries

In [None]:
#Manipulations to visualize the composition of the clusters

# Build the vocabulary of keywords (skills + job classes) searched for in the
# cluster descriptions.
#
# The `skills` column holds comma-separated values with irregular spacing,
# trailing commas and empty items (e.g. "Excel," or "R , Excel, , Sas").
# Splitting on the bare comma and stripping each item avoids tokens such as
# "excel,", "r " or "" -- the empty string is a substring of every description
# and "excel," would slip past the `irrelevant_words` filter below.
skill_tokens = (
    df['skills']
    .dropna()
    .str.lower()
    .str.split(",")
    .explode()
    .str.strip()
)
skills_list = skill_tokens[skill_tokens != ""].unique().tolist()

# Lower-cased distinct job classes (missing values ignored).
job_classes_list = df['job_class'].dropna().str.lower().unique().tolist()
skills_and_jobs_list = skills_list + job_classes_list

# Keywords judged too generic to characterise a cluster.
irrelevant_words = ["autres", "excel", "git", "sql"]
skills_and_jobs_list = [elt for elt in skills_and_jobs_list if elt not in irrelevant_words]

In [None]:
#Viz of the composition of each cluster in terms of skills required

import re

from sklearn.metrics.pairwise import cosine_similarity

# Number of descriptions closest to each centroid that are inspected.
nb_top_motifs = 100

# One whole-word pattern per keyword, compiled once outside the loops.
# Plain substring tests (`elt in motif`) count short keywords such as "r" or
# "sas" whenever they occur inside an unrelated word; the look-arounds require
# the keyword not to be glued to other word characters. Blank keywords are
# skipped (they would match every text) and duplicates are collapsed.
keyword_patterns = {
    elt.strip(): re.compile(r"(?<!\w)" + re.escape(elt.strip()) + r"(?!\w)")
    for elt in skills_and_jobs_list
    if isinstance(elt, str) and elt.strip()
}

for i, centroid in enumerate(centroids):
    # Rank every description by cosine similarity to this centroid and keep
    # the positions of the `nb_top_motifs` most similar ones.
    similarities = cosine_similarity(centroid.reshape(1, -1), embedded_documents_array)
    top_indices = np.argsort(similarities.ravel())[::-1][:nb_top_motifs]
    top_motifs = [descs_list[idx] for idx in top_indices]

    print("Cluster", i, ":", top_indices, "\n")
    print("Cluster", i, ":", " ".join(top_motifs), "\n")

    # Each keyword is counted at most once per description it appears in.
    keyword_in = [
        elt
        for motif in top_motifs
        for elt, pattern in keyword_patterns.items()
        if pattern.search(motif)
    ]
    # Explicit dtype keeps an empty result well-defined.
    print("Keywords", i, ":", "\n", pd.Series(keyword_in, dtype="object").value_counts(), '\n')

Cluster 0 : [ 169  406 1344  521 6956  349  706 7229 4344  644 1145 7376 6913 5187
 1039 3595 3197 3760 2719 1770 1771 3118 1772 3975 1773  756  755  753
 3725 3698 2861  807 1589 6687 6327 6226 3194 7283 3567  128  167 3060
  196 4948 4992 1996 3058 2067 3372  267 3395 2351 6521 1004 4336 4213
 3166 2333   75 2986 4129 6709 6926 6079 5955 2856 6422 2598  535 4938
 7686 3099 1582 2057 4573  244 4702 4701 3702 5542 3934 6096 6319 3512
  987 6554 1571 4437  653 6684 5849  780 7253 5412 1836 7093 1089  982
 3515 2663] 

Cluster 0 : nous recherchons un développeur data pour un client proposant une solution davantages salariaux à destination des dirigeants dentreprise il sagit dune société en pleine expansion avec une équipe de 150 personnes elle travaille avec plus de 30 partenaires qui lui font confiance dans les domaines du transport de la pharmacie du bâtiment et de lexpertise comptableen qualité de développeur data votre rôle va permettre à lentreprise de développer ses applicatifs rap

In [None]:
#Graphical viz of clusters

from sklearn.decomposition import TruncatedSVD
import plotly.graph_objects as go

# Project the embeddings onto 3 components for plotting. TruncatedSVD's default
# solver is randomized, so random_state is fixed to make the projection (and
# therefore the figure) reproducible from run to run.
svd = TruncatedSVD(n_components=3, random_state=42)
X_3d = svd.fit_transform(embedded_documents_array)

# One marker per description, coloured by its KMeans label.
scatter3d_trace = go.Scatter3d(
    x=X_3d[:, 0],
    y=X_3d[:, 1],
    z=X_3d[:, 2],
    mode='markers',
    marker=dict(
        size=5,
        color=kmeans.labels_,
        colorscale='Viridis',
        opacity=0.8,
        colorbar=dict(
            title='Clusters'
        )
    ),
    name='Cluster'
)

fig = go.Figure(data=[scatter3d_trace])
fig.update_layout(
    scene=dict(
        xaxis_title='Dimension 1',
        yaxis_title='Dimension 2',
        zaxis_title='Dimension 3'
    ),
    title='Visualisation des clusters en 3D'
)

fig.show()

In [None]:
# One sub-DataFrame per cluster, keyed 'df_cluster_<id>'.
clusters_dic = {
    f'df_cluster_{cluster_id}': df.loc[df['kmeans_cluster'] == cluster_id]
    for cluster_id in range(n_clusters)
}

In [None]:
#Viz of the composition of each cluster in terms of companies that recruit

# Number of offers per recruiting company in cluster 0.
companies_cluster_0 = clusters_dic['df_cluster_0']['job_company']
companies_cluster_0.value_counts()

Atos                       102
CS GROUP                    67
Safran                      55
Thales                      52
CGI                         41
                          ... 
Hensoldt Nexeya France       1
Leyton France                1
Altim                        1
DRT                          1
 Malakoff Humanis            1
Name: job_company, Length: 1004, dtype: int64

In [None]:
# Number of offers per recruiting company in cluster 1.
companies_cluster_1 = clusters_dic['df_cluster_1']['job_company']
companies_cluster_1.value_counts()

KPMG                 81
Open                 64
Inetum               56
CGI                  41
Verisure             35
                     ..
Expanscience          1
Captain Contrat       1
Hopscotch Groupe      1
Cheerz                1
 Cartier              1
Name: job_company, Length: 1114, dtype: int64

In [None]:
# Number of offers per recruiting company in cluster 2.
companies_cluster_2 = clusters_dic['df_cluster_2']['job_company']
companies_cluster_2.value_counts()

GROUPE GEMA - ESI BUSINESS SCHOOL / IA SCHOOL (Siège National)     74
Sup de Vinci Rennes                                                68
CGI                                                                35
BNP Paribas                                                        35
Crédit Agricole Group Infrastructure Plateform                     30
                                                                   ..
AFD.TECH                                                            1
CERFRANCE Poitou-Charentes                                          1
BIOCODEX                                                            1
Partnaire                                                           1
 Mosica                                                             1
Name: job_company, Length: 1045, dtype: int64