In [96]:
import numpy as np
import pandas as pd
import re
import seaborn as sns
import matplotlib.pyplot as plt

In [47]:
df = pd.read_csv('jobs.csv')

In [48]:
df

Unnamed: 0,country,title,text
0,AT,Architect,https://www.linkedin.com/jobs/view/2589036509/...
1,AT,Manager,www.linkedin.com/jobs/view/2540581439/\n\nInfo...
2,AU,Inform,https://www.linkedin.com/company/global-blue/l...
3,CZ,IPSecu,https://www.linkedin.com/jobs/view/2410398234/...
4,CZ,Senior,https://www.linkedin.com/jobs/view/2487448959/...
...,...,...,...
106,RS,IncidentResponder,https://www.linkedin.com/jobs/view/26120892329...
107,RS,ProfessionalAD,https://www.linkedin.com/jobs/view/2507722470/...
108,RS,ProfessionalAD,https://www.linkedin.com/jobs/view/2507722470/...
109,SE,CyberSec,https://www.linkedin.com/jobs/view/2507722470/...


In [49]:
# Removal of stopwords
from nltk.corpus import stopwords


# Removal of links
df['text'] = df['text'].apply(lambda x: re.sub(r'https?://\S+|www\.\S+', '', x))

# Extract Job Title from text
df['job_title'] = df['text'].apply(lambda x: (re.sub(r'^.*?\n', '\n', x).strip()).split('\n')[0] )

# Removal of punctuations
df['text'] = df['text'].str.replace('[^\w\s]','')

stop = stopwords.words('english')
df['text'] = df['text'].apply(lambda x: " ".join(x for x in x.split() if x not in stop))

  df['text'] = df['text'].str.replace('[^\w\s]','')


In [50]:
# Lemmatization
from textblob import Word
df['text'] = df['text'].apply(lambda x: " ".join([Word(word).lemmatize() for word in x.split()]))
df['text'].head()

0    Information Security Architect Copmany SWAROVS...
1    Information Technology Security Manager Copman...
2    Information Technology Security Manager Global...
3    IP Security Architect About job Description Th...
4    Senior Information Security Analyst About job ...
Name: text, dtype: object

In [92]:
df['job_title'].value_counts()

Security Engineer                                          7
Information Security Engineer                              4
Security Analyst                                           4
Cyber Security Consultant                                  2
Penetration Tester                                         2
                                                          ..
Cyber Security Analyst at Honeywell                        1
System Engineer at SolarWinds                              1
Associate Offensive Security Engineer, Managed Services    1
Senior Security Operations Engineer at Pure Storage        1
Incident Response Engineer                                 1
Name: job_title, Length: 91, dtype: int64

In [51]:
# Let's import text feature extraction TfidfVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Import Cosien Similarity metric
from sklearn.metrics.pairwise import cosine_similarity

In [52]:
# Create TFidfVectorizer 
tfidf = TfidfVectorizer()

# Fit and transform the documents 
tfidf_vector = tfidf.fit_transform(df['text'])

In [53]:
# Compute cosine similarity
cosine_sim=cosine_similarity(tfidf_vector, tfidf_vector)
cosine_sim

array([[1.        , 0.23469631, 0.17998257, ..., 0.12238441, 0.17857406,
        0.15370278],
       [0.23469631, 1.        , 0.63815828, ..., 0.16047038, 0.23225287,
        0.14487311],
       [0.17998257, 0.63815828, 1.        , ..., 0.14749756, 0.16859459,
        0.12079596],
       ...,
       [0.12238441, 0.16047038, 0.14749756, ..., 1.        , 0.1774875 ,
        0.19516535],
       [0.17857406, 0.23225287, 0.16859459, ..., 0.1774875 , 1.        ,
        0.21243609],
       [0.15370278, 0.14487311, 0.12079596, ..., 0.19516535, 0.21243609,
        1.        ]])

In [54]:
cosine_sim.shape

(111, 111)

## Find Cosine Similarity using Spacy

In [67]:
import en_core_web_md 

nlp = en_core_web_md.load()

In [68]:
doc1 = nlp(df['text'][0])
doc2 = nlp(df['text'][1])

doc1.similarity(doc2) 

0.9873073048514656

In [107]:
N = (len(df['text']))
# N = 20
matrix = np.zeros(( N, N))
for i in range(N):
    for j in range(N):
        matrix[i][j] = nlp(df['text'][i]).similarity( nlp(df['text'][j] ) )

sp_df = pd.DataFrame(matrix)

sp_df.columns = list(df['job_title'])[:N]
sp_df.index = list(df['job_title'])[:N]

In [110]:
sp_df.to_csv('pairwise_similarity_matrix.csv', index=False)

In [112]:
sp_df.head()

Unnamed: 0,Information Security Architect,Information Technology Security Manager,Information Technology Security Manager.1,IP Security Architect,Senior Information Security Analyst,Security Analyst,IT Security Analyst,CYBER SECURITY & IT COMPLIANCE MANAGER (M/F),Infrastructure Security Architect,Group Product Manager,...,Penetration Tester,Senior AWS Developer,Cybersecurity Engineer,Information System Security Engineer,Information Security Engineer,Cyber Security Incident Responder,Cybersecurity Professional for Active Directory (w/m/d),Cybersecurity Professional for Active Directory (w/m/d).1,Cyber Security Developer,Incident Response Engineer
Information Security Architect,1.0,0.987307,0.987196,0.912618,0.977619,0.958724,0.97543,0.980403,0.969762,0.961146,...,0.971345,0.923491,0.966627,0.855451,0.962628,0.973688,0.974058,0.974258,0.97426,0.961629
Information Technology Security Manager,0.987307,1.0,0.989958,0.915909,0.981832,0.962427,0.981442,0.985367,0.973421,0.971123,...,0.97912,0.921126,0.976189,0.8591,0.959546,0.980038,0.974344,0.974397,0.974973,0.963911
Information Technology Security Manager,0.987196,0.989958,1.0,0.908403,0.968996,0.949813,0.975077,0.976106,0.963371,0.966552,...,0.973725,0.944414,0.964231,0.869746,0.960925,0.96999,0.982104,0.981855,0.969121,0.964307
IP Security Architect,0.912618,0.915909,0.908403,1.0,0.916763,0.904748,0.934176,0.936655,0.958638,0.908865,...,0.926996,0.837852,0.938838,0.872094,0.961394,0.915433,0.906894,0.907253,0.952997,0.86882
Senior Information Security Analyst,0.977619,0.981832,0.968996,0.916763,1.0,0.969186,0.974976,0.984082,0.975657,0.949615,...,0.964745,0.884707,0.969846,0.836062,0.957407,0.974982,0.948862,0.949035,0.970302,0.941297


In [126]:
print(sp_df.iloc[0].argmin())
print(sp_df.iloc[0].min())

104
0.8554507131019559


In [133]:
sp_df.index[104]

'Information System Security Engineer'

In [134]:
sp_df['Information System Security Engineer']

Information Security Architect                             0.855451
Information Technology Security Manager                    0.859100
Information Technology Security Manager                    0.869746
IP Security Architect                                      0.872094
Senior Information Security Analyst                        0.836062
                                                             ...   
Cyber Security Incident Responder                          0.847908
Cybersecurity Professional for Active Directory (w/m/d)    0.863739
Cybersecurity Professional for Active Directory (w/m/d)    0.860770
Cyber Security Developer                                   0.875392
Incident Response Engineer                                 0.826858
Name: Information System Security Engineer, Length: 111, dtype: float64

In [77]:
matrix[0] = list(df['job_title'])[:10]

ValueError: could not convert string to float: 'Information Security Architect'