In [1]:
import spacy
from spacy.pipeline import EntityRuler
from spacy.lang.en import English
from spacy.tokens import Doc

from spacy import displacy
import matplotlib.pyplot as plt

import pandas as pd
import numpy as np

import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
nltk.download(['stopwords','wordnet'])

import warnings 

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
# Load the job-requirements dataset; the path is relative to the notebook's
# working directory, so the CSV must sit next to the notebook.
data = pd.read_csv("Updated_Job_req.csv")

In [3]:
# Display the loaded frame; pandas shows only the first and last rows of a long frame.
data

Unnamed: 0,job_title,industry,location,type,minimum_job_year,needed,company_name,Requirements
0,Sr. Frontend Web Developer,Software Development,Jakarta Pusat,Full-time,4,1,Worlder Team Pte Ltd,ideal candidate position minimum 5 year experi...
1,Sr. Backend Engineer,Software Development,Jakarta Pusat,Full-time,4,1,Worlder Team Pte Ltd,ideal candidate position strong technical back...
2,Jr Project Manager,Software Development,Jakarta Pusat,Full-time,1,1,Worlder Team Pte Ltd,successful candidate junior project manager po...
3,Finance Accounting AR,Retail,Jakarta Selatan,Full-time,1,1,PT Aditya Saraana Graha,currently seeking candidate position accountan...
4,Product Designer,Technology,Bandung,Full-time,1,1,Dicoding Indonesia,designer essential understand plan conduct des...
...,...,...,...,...,...,...,...,...
117,Microsoft 365 Specialist,Technology Consultant,Jakarta Pusat,Contract,1,2,Nomura Research Institute Indonesia,ideal candidate must demonstrate proven track ...
118,Frontend Engineer (Flutter),Technology Consultant,Jakarta Pusat,Contract,1,2,Nomura Research Institute Indonesia,ideal candidate demonstrate strong proficiency...
119,Project Manager,Jasa TI dan Konsultan IT,Jakarta Selatan,Contract,1,2,Adiraja Integrasi,applicant posse minimum 2 year experience busi...
120,Social Media Specialist / Digital Marketing,Technology,Jakarta Pusat,Full-time,0,2,PT VEF Solusi Indonesia,candidate demonstrate proficiency social mediu...


In [4]:
import spacy

# Load the large English model. The `en_core_web_lg` package has to be
# installed in the kernel's environment, otherwise spacy.load() fails.
nlp = spacy.load('en_core_web_lg')

In [5]:
# Run the pipeline on the requirement text of row 1 and highlight the
# entities it finds inline in the notebook.
doc = nlp(data.Requirements[1])
displacy.render(doc, style="ent", jupyter=True)

In [6]:
# The jobzilla skill dataset is a JSONL file containing different skills that can be used to create a spaCy entity_ruler.
# Each entry holds a label and a pattern, i.e. the different words used to describe skills in various resumes.
skill_pattern_path = "skill_patterns.jsonl"

In [7]:
# List the pipeline components before the entity ruler is added.
nlp.pipe_names

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

In [8]:
# Reuse the entity ruler if the pipeline already has one (for example when this
# cell is re-run); otherwise create it and insert it right after the `ner` component.
ruler = (
    nlp.get_pipe("entity_ruler")
    if "entity_ruler" in nlp.pipe_names
    else nlp.add_pipe("entity_ruler", after="ner")
)

In [9]:
# Load skill patterns into the entity_ruler.
# from_disk() returns the ruler itself, which is why the cell echoes its repr.
ruler.from_disk(skill_pattern_path)

<spacy.pipeline.entityruler.EntityRuler at 0x1d60cefbb40>

In [10]:
# Confirm that `entity_ruler` is now part of the pipeline.
nlp.pipe_names

['tok2vec',
 'tagger',
 'parser',
 'attribute_ruler',
 'lemmatizer',
 'ner',
 'entity_ruler']

In [11]:
def get_skills(text):
    """Extract skill mentions from a piece of text.

    Runs the notebook-level ``nlp`` pipeline on ``text`` and collects the
    text of every entity whose label is ``"SKILL"``.

    Parameters
    ----------
    text : str
        Text to scan. Any non-string value (for example the NaN that pandas
        uses for a missing cell) yields an empty list instead of an error
        from the pipeline.

    Returns
    -------
    list of str
        The matched entity texts in the order ``doc.ents`` yields them.
        Duplicates are kept; see ``unique_skills`` for de-duplication.
    """
    # `.str.lower()` turns missing values into NaN, which `nlp` cannot process.
    if not isinstance(text, str):
        return []

    doc = nlp(text)
    return [ent.text for ent in doc.ents if ent.label_ == "SKILL"]


def unique_skills(x):
    """Remove duplicate entries from a collection of skills.

    Parameters
    ----------
    x : iterable of hashable
        Skill strings, possibly containing repeats.

    Returns
    -------
    list
        The distinct items of ``x`` in order of first appearance.
    """
    # A dict keeps insertion order, so the result is the same on every run.
    # list(set(x)) would order strings by their randomised hash, which makes
    # displayed outputs and exported files differ between kernel sessions.
    return list(dict.fromkeys(x))

In [12]:
# Preview the first five rows.
data.head(5)

Unnamed: 0,job_title,industry,location,type,minimum_job_year,needed,company_name,Requirements
0,Sr. Frontend Web Developer,Software Development,Jakarta Pusat,Full-time,4,1,Worlder Team Pte Ltd,ideal candidate position minimum 5 year experi...
1,Sr. Backend Engineer,Software Development,Jakarta Pusat,Full-time,4,1,Worlder Team Pte Ltd,ideal candidate position strong technical back...
2,Jr Project Manager,Software Development,Jakarta Pusat,Full-time,1,1,Worlder Team Pte Ltd,successful candidate junior project manager po...
3,Finance Accounting AR,Retail,Jakarta Selatan,Full-time,1,1,PT Aditya Saraana Graha,currently seeking candidate position accountan...
4,Product Designer,Technology,Bandung,Full-time,1,1,Dicoding Indonesia,designer essential understand plan conduct des...


In [13]:
# Lower-case the requirement text of the first 200 rows, extract the SKILL
# entities from each, then drop repeated skills within a row. Rows beyond the
# first 200 (if any) receive NaN because the assignment aligns on the index.
data["skills"] = (
    data.head(200)["Requirements"]
    .str.lower()
    .map(get_skills)
    .map(unique_skills)
)
data.head()

Unnamed: 0,job_title,industry,location,type,minimum_job_year,needed,company_name,Requirements,skills
0,Sr. Frontend Web Developer,Software Development,Jakarta Pusat,Full-time,4,1,Worlder Team Pte Ltd,ideal candidate position minimum 5 year experi...,"[cs, github, strong technical background, web ..."
1,Sr. Backend Engineer,Software Development,Jakarta Pusat,Full-time,4,1,Worlder Team Pte Ltd,ideal candidate position strong technical back...,"[framework, github, source code management, st..."
2,Jr Project Manager,Software Development,Jakarta Pusat,Full-time,1,1,Worlder Team Pte Ltd,successful candidate junior project manager po...,"[jira, computer science, communication skill, ..."
3,Finance Accounting AR,Retail,Jakarta Selatan,Full-time,1,1,PT Aditya Saraana Graha,currently seeking candidate position accountan...,"[diligent, accounting, google, work independen..."
4,Product Designer,Technology,Bandung,Full-time,1,1,Dicoding Indonesia,designer essential understand plan conduct des...,"[design, professional, growth mindset]"


In [14]:
# Render the entities found in the requirement text of row 41.
doc = nlp(data.Requirements[41])
displacy.render(doc, style="ent", jupyter=True)

In [16]:
# Keep only the 'skills' column as a one-column DataFrame
skills_df = data["skills"].to_frame()

# Write it to skills.csv without the index column.
# Each cell holds a Python list, so the CSV stores its string representation.
skills_df.to_csv("skills.csv", index=False)

In [17]:
# Spot-check the extracted skills of the row labelled 50.
data.skills[50]

['collaboration',
 'cs',
 'google',
 'html',
 'google analytics',
 'communication',
 'problemsolving',
 'marketing',
 'javascript',
 'multidisciplinary',
 'data analysis',
 'seo']

### Extracting required years of experience with `re`

In [21]:
import re
def get_experience(text):
    """Extract the years of experience mentioned in a requirement text.

    Looks for phrases of the form ``<number> year(s)|yr(s) experience``
    (case-insensitive) and returns each number found.

    Parameters
    ----------
    text : str
        Requirement text to search. Any non-string value (for example the
        NaN that pandas uses for a missing cell) yields an empty list.

    Returns
    -------
    list of int
        One integer per match, in order of appearance; empty when nothing
        matches.
    """
    if not isinstance(text, str):
        return []

    pattern = re.compile(r'(\d+)\s+(?:years?|yrs?)\s+experience')
    # The pattern has exactly one capture group, so findall() returns plain
    # strings (not tuples). Convert the whole string: indexing it, as in
    # match[0], would keep only the first digit and turn "10" into 1.
    return [int(digits) for digits in pattern.findall(text.lower())]

# Extract the years of experience from every requirement text
data["experience"] = data["Requirements"].map(get_experience)

In [24]:
# Show the full requirement text of row 2 so the extracted values can be checked by eye.
data.Requirements[2]

'successful candidate junior project manager position posse bachelor degree engineering computer science along minimum 1 year experience project management expediting previous experience mobile app development software development industry highly preferred ideal candidate excellent organisational communication skill well ability work well team collaborate multiple stakeholder attention detail ability prioritise task essential role proficiency project management tool software particularly jira required knowledge project management methodology agile waterfall plus additionally holding project management professional pmp certification equivalent would advantageous'

In [25]:
# Preview the frame, now including the `experience` column.
data.head()

Unnamed: 0,job_title,industry,location,type,minimum_job_year,needed,company_name,Requirements,skills,experience
0,Sr. Frontend Web Developer,Software Development,Jakarta Pusat,Full-time,4,1,Worlder Team Pte Ltd,ideal candidate position minimum 5 year experi...,"[cs, github, strong technical background, web ...",[5]
1,Sr. Backend Engineer,Software Development,Jakarta Pusat,Full-time,4,1,Worlder Team Pte Ltd,ideal candidate position strong technical back...,"[framework, github, source code management, st...",[5]
2,Jr Project Manager,Software Development,Jakarta Pusat,Full-time,1,1,Worlder Team Pte Ltd,successful candidate junior project manager po...,"[jira, computer science, communication skill, ...",[1]
3,Finance Accounting AR,Retail,Jakarta Selatan,Full-time,1,1,PT Aditya Saraana Graha,currently seeking candidate position accountan...,"[diligent, accounting, google, work independen...",[3]
4,Product Designer,Technology,Bandung,Full-time,1,1,Dicoding Indonesia,designer essential understand plan conduct des...,"[design, professional, growth mindset]",[]


In [45]:
# NOTE(review): the output of this cell shows an `education` column, but no
# cell visible in this section creates it, and the execution counts around
# here are out of order (45 before 44). Restore the cell that builds
# `education` so the notebook survives Restart Kernel -> Run All.
data.head()

Unnamed: 0,job_title,industry,location,type,minimum_job_year,needed,company_name,Requirements,skills,experience,education
0,Sr. Frontend Web Developer,Software Development,Jakarta Pusat,Full-time,4,1,Worlder Team Pte Ltd,ideal candidate position minimum 5 year experi...,"[cs, github, strong technical background, web ...",[5],[]
1,Sr. Backend Engineer,Software Development,Jakarta Pusat,Full-time,4,1,Worlder Team Pte Ltd,ideal candidate position strong technical back...,"[framework, github, source code management, st...",[5],[]
2,Jr Project Manager,Software Development,Jakarta Pusat,Full-time,1,1,Worlder Team Pte Ltd,successful candidate junior project manager po...,"[jira, computer science, communication skill, ...",[1],[bachelor degree]
3,Finance Accounting AR,Retail,Jakarta Selatan,Full-time,1,1,PT Aditya Saraana Graha,currently seeking candidate position accountan...,"[diligent, accounting, google, work independen...",[3],[d3]
4,Product Designer,Technology,Bandung,Full-time,1,1,Dicoding Indonesia,designer essential understand plan conduct des...,"[design, professional, growth mindset]",[],[]


In [44]:
# NOTE(review): `data.education` is not created by any cell visible in this
# section; on a fresh kernel this lookup raises AttributeError unless the
# missing cell is restored above.
data.education[10]

['bachelor']

In [48]:
# Define the required skills (user input).
# Entries are written in lower case because the skills column was extracted
# from lower-cased requirement text and the comparison is an exact string match.
required_skills = ['data science', 'professional', 'python', 'data analysis']

# Function to calculate the points between the skills and the required skills
def calculate_points(skills, required_skills):
    """Score how closely a job's skills match the required skills.

    The score is ``|skills ∩ required| / max(|skills|, |required|)``, computed
    on the *distinct* entries of each argument. It is 1.0 only when both
    contain exactly the same skills; skills missing from either side lower it.

    Parameters
    ----------
    skills : iterable of str, None or NaN
        Skills extracted for one job posting. A missing value (None or a
        float such as NaN) is treated as "no skills".
    required_skills : iterable of str, None or NaN
        Skills requested by the user; missing values are treated the same way.

    Returns
    -------
    float
        A value between 0.0 and 1.0; 0.0 when both sides are empty.
    """
    def _distinct(items):
        # Missing DataFrame cells arrive as None or float NaN, neither of
        # which is iterable.
        if items is None or isinstance(items, float):
            return set()
        return set(items)

    # De-duplicate both sides first so that the counts in the denominator are
    # measured the same way as the intersection in the numerator.
    skill_set = _distinct(skills)
    required_set = _distinct(required_skills)

    # Determine the denominator (size of the bigger set)
    denominator = max(len(skill_set), len(required_set))
    if denominator == 0:
        return 0.0

    return len(skill_set & required_set) / denominator

# Score each posting's extracted skills against the requested skills
data["points"] = data["skills"].map(
    lambda job_skills: calculate_points(job_skills, required_skills)
)

# Show the first rows, now including the points column
print(data.head())

                    job_title              industry         location  \
0  Sr. Frontend Web Developer  Software Development    Jakarta Pusat   
1        Sr. Backend Engineer  Software Development    Jakarta Pusat   
2          Jr Project Manager  Software Development    Jakarta Pusat   
3       Finance Accounting AR                Retail  Jakarta Selatan   
4            Product Designer            Technology          Bandung   

        type  minimum_job_year  needed             company_name  \
0  Full-time                 4       1     Worlder Team Pte Ltd   
1  Full-time                 4       1     Worlder Team Pte Ltd   
2  Full-time                 1       1     Worlder Team Pte Ltd   
3  Full-time                 1       1  PT Aditya Saraana Graha   
4  Full-time                 1       1       Dicoding Indonesia   

                                        Requirements  \
0  ideal candidate position minimum 5 year experi...   
1  ideal candidate position strong technical back...

In [50]:
# Spot-check the extracted skills of the row labelled 11.
data.skills[11]

['data visualization',
 'data science',
 'professional',
 'python',
 'data analysis']

In [51]:
# Spot-check the score of the row labelled 11.
data.points[11]

0.8