In [1]:
import numpy as np
import pandas as pd
import re

In [2]:
df = pd.read_csv('jobs.csv')

In [3]:
df.head()

Unnamed: 0,country,title,text
0,AT,Architect,https://www.linkedin.com/jobs/view/2589036509/...
1,AT,Manager,www.linkedin.com/jobs/view/2540581439/\n\nInfo...
2,AU,Inform,https://www.linkedin.com/company/global-blue/l...
3,CZ,IPSecu,https://www.linkedin.com/jobs/view/2410398234/...
4,CZ,Senior,https://www.linkedin.com/jobs/view/2487448959/...


In [4]:
df['text']

0      https://www.linkedin.com/jobs/view/2589036509/...
1      www.linkedin.com/jobs/view/2540581439/\n\nInfo...
2      https://www.linkedin.com/company/global-blue/l...
3      https://www.linkedin.com/jobs/view/2410398234/...
4      https://www.linkedin.com/jobs/view/2487448959/...
                             ...                        
106    https://www.linkedin.com/jobs/view/26120892329...
107    https://www.linkedin.com/jobs/view/2507722470/...
108    https://www.linkedin.com/jobs/view/2507722470/...
109    https://www.linkedin.com/jobs/view/2507722470/...
110    https://www.linkedin.com/jobs/view/25077224709...
Name: text, Length: 111, dtype: object

### Word Count 

In [5]:
df['word_count'] = df['text'].apply(lambda x: len( x.strip(' ')))
df[[ 'text', 'word_count']]

Unnamed: 0,text,word_count
0,https://www.linkedin.com/jobs/view/2589036509/...,2861
1,www.linkedin.com/jobs/view/2540581439/\n\nInfo...,4849
2,https://www.linkedin.com/company/global-blue/l...,10293
3,https://www.linkedin.com/jobs/view/2410398234/...,2673
4,https://www.linkedin.com/jobs/view/2487448959/...,3394
...,...,...
106,https://www.linkedin.com/jobs/view/26120892329...,4025
107,https://www.linkedin.com/jobs/view/2507722470/...,3936
108,https://www.linkedin.com/jobs/view/2507722470/...,3757
109,https://www.linkedin.com/jobs/view/2507722470/...,2750


In [6]:
df['word_count']

0       2861
1       4849
2      10293
3       2673
4       3394
       ...  
106     4025
107     3936
108     3757
109     2750
110     4192
Name: word_count, Length: 111, dtype: int64

## Stopwords

In [7]:
from nltk.corpus import stopwords
stop = stopwords.words('english')
df['stopwords'] = df['text'].apply(lambda x: len([x for x in x.split() if x in stop]))

In [8]:
df.head()

Unnamed: 0,country,title,text,word_count,stopwords
0,AT,Architect,https://www.linkedin.com/jobs/view/2589036509/...,2861,134
1,AT,Manager,www.linkedin.com/jobs/view/2540581439/\n\nInfo...,4849,199
2,AU,Inform,https://www.linkedin.com/company/global-blue/l...,10293,300
3,CZ,IPSecu,https://www.linkedin.com/jobs/view/2410398234/...,2673,82
4,CZ,Senior,https://www.linkedin.com/jobs/view/2487448959/...,3394,105


## Remove links

In [9]:
# df['text'] = df['text'].apply(lambda x: '\n\n'+x)

In [10]:
df['lf_text'] = df['text'].apply(lambda x: re.sub(r'https?://\S+|www\.\S+', '', x))

df['lf_text'][:20]

0     \n\nInformation Security Architect\n\nCopmany:...
1     \n\nInformation Technology Security Manager\n\...
2     \n\nInformation Technology Security Manager\n\...
3     \n\n  IP Security Architect\n\n\n      About t...
4     \n    \n  Senior Information Security Analyst\...
5     \n\n  Security Analyst\n\n\n      About the jo...
6     \n\n  IT Security Analyst\n\n\n      About the...
7     \n   \n\n  CYBER SECURITY & IT COMPLIANCE MANA...
8     \n\n  Infrastructure Security Architect \n\nPr...
9     \n\n  Group Product Manager\n\n      About the...
10    \n\n  Application Security Architect\n\n\nEasy...
11    \n\n  Security Engineer\n\n\n      About the j...
12    ï»¿\n\nSpace Cybersecurity Specialist \n\n    ...
13    ï»¿\n\nCyber Security Specialist / Penetration...
14    ï»¿\n Cyber-Security Architect at Eaton\n\n\n ...
15    ï»¿\n\n\nSecurity and Risk Manager at IBM\n\n\...
16    ï»¿\n\n\n Cyber Security Expert at Zentiva\n\n...
17    ï»¿ \n\nCyber Security Architect at Vodafo

In [11]:
# Remove the 'ï»¿' from some rows and get job title position
df['job_title'] = df['lf_text'].apply(lambda x: (re.sub(r'^.*?\n', '\n', x).strip()).split('\n')[0] )

In [12]:
df.iloc[57]

country                                                      HU
title                                                  Engineer
text          https://www.linkedin.com/jobs/view/2494545243/...
word_count                                                 2689
stopwords                                                    72
lf_text       \n\nSecurity Engineer\n\n\nCompany: Emarsys\nL...
job_title                                     Security Engineer
Name: 57, dtype: object

In [13]:
for i in range(len(df['job_title'])):
    print(i, df['job_title'][i])

0 Information Security Architect
1 Information Technology Security Manager
2 Information Technology Security Manager
3 IP Security Architect
4 Senior Information Security Analyst
5 Security Analyst
6 IT Security Analyst
7 CYBER SECURITY & IT COMPLIANCE MANAGER (M/F)
8 Infrastructure Security Architect 
9 Group Product Manager
10 Application Security Architect
11 Security Engineer
12 Space Cybersecurity Specialist 
13 Cyber Security Specialist / Penetration Tester at Resideo
14 Cyber-Security Architect at Eaton
15 Security and Risk Manager at IBM
16 Cyber Security Expert at Zentiva
17 Cyber Security Architect at Vodafone
18 Tier 1 Security Engineer
19 Sr. Security Engineer
20 INFORMATION SECURITY ANALYST I at TE Connectivity
21 Security QA Engineer 
22 Information Security Engineer 
23 Principal Security Architect
24 Threat Hunting & Response Senior Analyst
25 Associate Security Engineer
26 Backend Developer Go/ C++ (CyberSecurity) at Acronis
27 Security Architect - Network
28 Senior Se

In [14]:
df['lf_text']

0      \n\nInformation Security Architect\n\nCopmany:...
1      \n\nInformation Technology Security Manager\n\...
2      \n\nInformation Technology Security Manager\n\...
3      \n\n  IP Security Architect\n\n\n      About t...
4      \n    \n  Senior Information Security Analyst\...
                             ...                        
106    \n\nCyber Security Incident Responder\n\nCompa...
107    \n\nCybersecurity Professional for Active Dire...
108    \n\nCybersecurity Professional for Active Dire...
109    \n\nCyber Security Developer \n\nAbout the job...
110    \n\nIncident Response Engineer\n \n\nWHO YOU A...
Name: lf_text, Length: 111, dtype: object

### Removing punctuations

In [15]:
# df['tweet'] = 
df['lf_text'] = df['lf_text'].str.replace('[^\w\s]','')

  df['lf_text'] = df['lf_text'].str.replace('[^\w\s]','')


### Remove stop words

In [16]:
from nltk.corpus import stopwords
stop = stopwords.words('english')

In [17]:
df['lf_text'] = df['lf_text'].apply(lambda x: " ".join(x for x in x.split() if x not in stop))

In [18]:
df['lf_text'].head()

0    Information Security Architect Copmany SWAROVS...
1    Information Technology Security Manager Copman...
2    Information Technology Security Manager Global...
3    IP Security Architect About job Description Th...
4    Senior Information Security Analyst About job ...
Name: lf_text, dtype: object

### Common words

In [60]:
freq = pd.Series(' '.join(df['lf_text']).split()).value_counts()[-20:]
freq

calling         1
O2              1
special         1
path            1
SAFE            1
ondemand        1
Visio           1
complexities    1
suitable        1
DDI             1
versions        1
Ciscoâs         1
Github          1
Promote         1
labs            1
costs           1
UCS             1
6500            1
9500            1
applicable      1
dtype: int64

### Lemmatization

In [19]:
from textblob import Word
df['text'] = df['lf_text'].apply(lambda x: " ".join([Word(word).lemmatize() for word in x.split()]))
df['text'].head()

0    Information Security Architect Copmany SWAROVS...
1    Information Technology Security Manager Copman...
2    Information Technology Security Manager Global...
3    IP Security Architect About job Description Th...
4    Senior Information Security Analyst About job ...
Name: text, dtype: object

### Tokenization

In [25]:
import nltk
from textblob import TextBlob
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\elsha\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\elsha\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [27]:
TextBlob(df['text'][1]).words[:10]

WordList(['Information', 'Technology', 'Security', 'Manager', 'Copmany', 'Global', 'Blue', 'Location', 'Vienna', 'Austria'])

### N grams

In [28]:
TextBlob(df['text'][0]).ngrams(2)[:4]

[WordList(['Information', 'Security']),
 WordList(['Security', 'Architect']),
 WordList(['Architect', 'Copmany']),
 WordList(['Copmany', 'SWAROVSKI'])]

In [29]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(max_features=1000, lowercase=True, analyzer='word',
 stop_words= 'english',ngram_range=(1,1))

In [30]:
vect = tfidf.fit_transform(df['text'])

In [31]:
tf_df = pd.DataFrame(vect[0].T.todense(), index=tfidf.get_feature_names_out(), 
             columns=["TF-IDF"]).sort_values('TF-IDF', ascending=False)
tf_df.head()

Unnamed: 0,TF-IDF
information,0.397172
security,0.356113
program,0.298509
salary,0.217892
define,0.166721


In [39]:
tf_df.head(10)

Unnamed: 0,TF-IDF
information,0.397172
security,0.356113
program,0.298509
salary,0.217892
define,0.166721
policy,0.164488
control,0.152612
unique,0.151267
gross,0.151267
framework,0.149255


In [37]:
tf_df.tail(10)

Unnamed: 0,TF-IDF
excellent,0.0
exchange,0.0
exciting,0.0
execute,0.0
executive,0.0
exercise,0.0
existing,0.0
expanding,0.0
expect,0.0
zenitech,0.0


## Co-Occurence Matrix

In [110]:
from sklearn.feature_extraction.text import CountVectorizer
# Using in built english stop words to remove noise
count_vectorizer = CountVectorizer(stop_words = 'english', max_features=50)
vectorized_matrix = count_vectorizer.fit_transform(df['text'])

In [111]:
co_occurrence_matrix = (vectorized_matrix.T * vectorized_matrix)
pd.DataFrame(co_occurrence_matrix.A, 
                   columns=count_vectorizer.get_feature_names_out(),
                   index=count_vectorizer.get_feature_names_out())

Unnamed: 0,ability,analysis,application,business,cloud,company,computer,control,customer,cyber,...,technical,technology,testing,threat,tool,understanding,vulnerability,work,working,year
ability,263,98,100,160,93,115,113,118,83,138,...,133,234,61,138,93,114,69,254,88,116
analysis,98,219,64,92,39,70,110,67,79,71,...,90,177,59,155,99,84,72,163,57,91
application,100,64,301,110,120,100,57,55,44,56,...,124,238,135,94,136,60,105,181,68,75
business,160,92,110,416,128,236,92,179,85,136,...,160,385,53,140,92,99,49,270,146,168
cloud,93,39,120,128,601,35,83,46,171,44,...,167,210,42,75,83,65,59,198,95,95
company,115,70,100,236,35,528,73,102,84,125,...,157,333,72,178,100,75,70,248,108,180
computer,113,110,57,92,83,73,154,73,92,121,...,86,173,36,119,59,82,71,193,75,86
control,118,67,55,179,46,102,73,196,59,103,...,82,194,40,95,63,73,56,143,92,78
customer,83,79,44,85,171,84,92,59,392,107,...,150,138,44,53,46,69,31,187,61,97
cyber,138,71,56,136,44,125,121,103,107,372,...,179,228,41,134,72,96,89,255,105,140
