In [37]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
from sklearn.metrics.pairwise import linear_kernel
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
data = pd.read_csv('/content/job_descriptions.csv')

In [4]:
data.head(3)

Unnamed: 0,Job Id,Experience,Qualifications,Salary Range,location,Country,latitude,longitude,Work Type,Company Size,...,Contact,Job Title,Role,Job Portal,Job Description,Benefits,skills,Responsibilities,Company,Company Profile
0,1089843540111562,5 to 15 Years,M.Tech,$59K-$99K,Douglas,Isle of Man,54.2361,-4.5481,Intern,26801,...,001-381-930-7517x737,Digital Marketing Specialist,Social Media Manager,Snagajob,Social Media Managers oversee an organizations...,"{'Flexible Spending Accounts (FSAs), Relocatio...","Social media platforms (e.g., Facebook, Twitte...","Manage and grow social media accounts, create ...",Icahn Enterprises,"{""Sector"":""Diversified"",""Industry"":""Diversifie..."
1,398454096642776,2 to 12 Years,BCA,$56K-$116K,Ashgabat,Turkmenistan,38.9697,59.5563,Intern,100340,...,461-509-4216,Web Developer,Frontend Web Developer,Idealist,Frontend Web Developers design and implement u...,"{'Health Insurance, Retirement Plans, Paid Tim...","HTML, CSS, JavaScript Frontend frameworks (e.g...","Design and code user interfaces for websites, ...",PNC Financial Services Group,"{""Sector"":""Financial Services"",""Industry"":""Com..."
2,481640072963533,0 to 12 Years,PhD,$61K-$104K,Macao,"Macao SAR, China",22.1987,113.5439,Temporary,84525,...,9687619505,Operations Manager,Quality Control Manager,Jobs2Careers,Quality Control Managers establish and enforce...,"{'Legal Assistance, Bonuses and Incentive Prog...",Quality control processes and methodologies St...,Establish and enforce quality control standard...,United Services Automobile Assn.,"{""Sector"":""Insurance"",""Industry"":""Insurance: P..."


In [5]:
data.shape

(19434, 23)

In [6]:
data.columns

Index(['Job Id', 'Experience', 'Qualifications', 'Salary Range', 'location',
       'Country', 'latitude', 'longitude', 'Work Type', 'Company Size',
       'Job Posting Date', 'Preference', 'Contact Person', 'Contact',
       'Job Title', 'Role', 'Job Portal', 'Job Description', 'Benefits',
       'skills', 'Responsibilities', 'Company', 'Company Profile'],
      dtype='object')

In [7]:
data.isnull().sum()

Job Id               0
Experience           0
Qualifications       0
Salary Range         0
location             0
Country              0
latitude             0
longitude            0
Work Type            0
Company Size         0
Job Posting Date     0
Preference           0
Contact Person       0
Contact              0
Job Title            0
Role                 0
Job Portal           0
Job Description      0
Benefits             0
skills               0
Responsibilities     1
Company              1
Company Profile     65
dtype: int64

In [8]:
data.duplicated().sum()

0

In [9]:
data['Job Title'].unique()

array(['Digital Marketing Specialist', 'Web Developer',
       'Operations Manager', 'Network Engineer', 'Event Manager',
       'Software Tester', 'Teacher', 'UX/UI Designer', 'Wedding Planner',
       'QA Analyst', 'Litigation Attorney', 'Mechanical Engineer',
       'Network Administrator', 'Account Manager', 'Brand Manager',
       'Social Worker', 'Social Media Coordinator',
       'Email Marketing Specialist', 'HR Generalist', 'Legal Assistant',
       'Nurse Practitioner', 'Account Director', 'Software Engineer',
       'Purchasing Agent', 'Sales Consultant', 'Civil Engineer',
       'Network Security Specialist', 'UI Developer', 'Financial Planner',
       'Event Planner', 'Psychologist', 'Electrical Designer',
       'Data Analyst', 'Technical Writer', 'Tax Consultant',
       'Account Executive', 'Systems Administrator',
       'Database Administrator', 'Research Analyst', 'Data Entry Clerk',
       'Registered Nurse', 'Investment Analyst', 'Speech Therapist',
       'Sales M

In [10]:
data['skills'].unique()

array(['Social media platforms (e.g., Facebook, Twitter, Instagram) Content creation and scheduling Social media analytics and insights Community engagement Paid social advertising',
       'HTML, CSS, JavaScript Frontend frameworks (e.g., React, Angular) User experience (UX)',
       'Quality control processes and methodologies Statistical process control (SPC) Root cause analysis and corrective action Quality management systems (e.g., ISO 9001) Compliance and regulatory knowledge',
       'Wireless network design and architecture Wi-Fi standards and protocols RF (Radio Frequency) planning and optimization Wireless security protocols Troubleshooting wireless network issues',
       'Event planning Conference logistics Budget management Vendor coordination Marketing and promotion Client relations',
       'Quality assurance processes Testing methodologies (e.g., manual, automated) Bug tracking and reporting Test case development Regression testing',
       'Teaching pedagogy Classroom 

In [11]:
data.head(10)

Unnamed: 0,Job Id,Experience,Qualifications,Salary Range,location,Country,latitude,longitude,Work Type,Company Size,...,Contact,Job Title,Role,Job Portal,Job Description,Benefits,skills,Responsibilities,Company,Company Profile
0,1089843540111562,5 to 15 Years,M.Tech,$59K-$99K,Douglas,Isle of Man,54.2361,-4.5481,Intern,26801,...,001-381-930-7517x737,Digital Marketing Specialist,Social Media Manager,Snagajob,Social Media Managers oversee an organizations...,"{'Flexible Spending Accounts (FSAs), Relocatio...","Social media platforms (e.g., Facebook, Twitte...","Manage and grow social media accounts, create ...",Icahn Enterprises,"{""Sector"":""Diversified"",""Industry"":""Diversifie..."
1,398454096642776,2 to 12 Years,BCA,$56K-$116K,Ashgabat,Turkmenistan,38.9697,59.5563,Intern,100340,...,461-509-4216,Web Developer,Frontend Web Developer,Idealist,Frontend Web Developers design and implement u...,"{'Health Insurance, Retirement Plans, Paid Tim...","HTML, CSS, JavaScript Frontend frameworks (e.g...","Design and code user interfaces for websites, ...",PNC Financial Services Group,"{""Sector"":""Financial Services"",""Industry"":""Com..."
2,481640072963533,0 to 12 Years,PhD,$61K-$104K,Macao,"Macao SAR, China",22.1987,113.5439,Temporary,84525,...,9687619505,Operations Manager,Quality Control Manager,Jobs2Careers,Quality Control Managers establish and enforce...,"{'Legal Assistance, Bonuses and Incentive Prog...",Quality control processes and methodologies St...,Establish and enforce quality control standard...,United Services Automobile Assn.,"{""Sector"":""Insurance"",""Industry"":""Insurance: P..."
3,688192671473044,4 to 11 Years,PhD,$65K-$91K,Porto-Novo,Benin,9.3077,2.3158,Full-Time,129896,...,+1-820-643-5431x47576,Network Engineer,Wireless Network Engineer,FlexJobs,"Wireless Network Engineers design, implement, ...","{'Transportation Benefits, Professional Develo...",Wireless network design and architecture Wi-Fi...,"Design, configure, and optimize wireless netwo...",Hess,"{""Sector"":""Energy"",""Industry"":""Mining, Crude-O..."
4,117057806156508,1 to 12 Years,MBA,$64K-$87K,Santiago,Chile,-35.6751,-71.5429,Intern,53944,...,343.975.4702x9340,Event Manager,Conference Manager,Jobs2Careers,A Conference Manager coordinates and manages c...,"{'Flexible Spending Accounts (FSAs), Relocatio...",Event planning Conference logistics Budget man...,Specialize in conference and convention planni...,Cairn Energy,"{""Sector"":""Energy"",""Industry"":""Energy - Oil & ..."
5,116831420231957,4 to 12 Years,MCA,$59K-$93K,Brussels,Belgium,50.5039,4.4699,Full-Time,23196,...,(973)791-5355x52199,Software Tester,Quality Assurance Analyst,Snagajob,A Quality Assurance Analyst tests software and...,"{'Life and Disability Insurance, Stock Options...",Quality assurance processes Testing methodolog...,Test software applications and systems to iden...,Adani Ports and Special Economic Zone,"{""Sector"":""Infrastructure"",""Industry"":""Ports a..."
6,1292168246729889,3 to 15 Years,PhD,$63K-$103K,George Town,Cayman Islands,19.3133,-81.2546,Temporary,26119,...,001-268-510-4362x789,Teacher,Classroom Teacher,FlexJobs,A Classroom Teacher educates students in a spe...,"{'Flexible Spending Accounts (FSAs), Relocatio...",Teaching pedagogy Classroom management Curricu...,"Plan and deliver engaging lessons, adapting te...",FedEx,"{""Sector"":""Logistics"",""Industry"":""Mail, Packag..."
7,1498778686197107,2 to 8 Years,M.Com,$65K-$102K,SÃ£o TomÃ©,Sao Tome and Principe,0.1864,6.6131,Contract,40558,...,667.202.6824x15893,UX/UI Designer,User Interface Designer,Indeed,User Interface Designers focus on the visual a...,"{'Employee Assistance Programs (EAP), Tuition ...",UI design principles and best practices Graphi...,Create visually appealing user interfaces (UI)...,Ryder System,"{""Sector"":""Transportation"",""Industry"":""Transpo..."
8,1680293940995740,2 to 9 Years,BBA,$65K-$102K,Male,Maldives,3.2028,73.2207,Temporary,105343,...,+1-337-946-9956x550,UX/UI Designer,Interaction Designer,Indeed,Interaction Designers specialize in designing ...,"{'Transportation Benefits, Professional Develo...",Interaction design principles User behavior an...,"Work on interaction design, defining how users...",Zee Entertainment Enterprises,"{""Sector"":""Media & Entertainment"",""Industry"":""..."
9,255627812588102,1 to 10 Years,BBA,$60K-$80K,Saint John's,Antigua and Barbuda,17.0608,-61.7964,Full-Time,102069,...,001-318-990-0531x978,Wedding Planner,Wedding Consultant,Stack Overflow Jobs,A Wedding Consultant assists couples in planni...,"{'Legal Assistance, Bonuses and Incentive Prog...",Wedding planning Vendor coordination Event man...,Offer expert advice and guidance to couples pl...,CSX,"{""Sector"":""Transportation"",""Industry"":""Railroa..."


In [12]:
data['Qualifications'].unique()

array(['M.Tech', 'BCA', 'PhD', 'MBA', 'MCA', 'M.Com', 'BBA', 'B.Tech',
       'B.Com', 'BA'], dtype=object)

In [13]:
data.columns

Index(['Job Id', 'Experience', 'Qualifications', 'Salary Range', 'location',
       'Country', 'latitude', 'longitude', 'Work Type', 'Company Size',
       'Job Posting Date', 'Preference', 'Contact Person', 'Contact',
       'Job Title', 'Role', 'Job Portal', 'Job Description', 'Benefits',
       'skills', 'Responsibilities', 'Company', 'Company Profile'],
      dtype='object')

In [14]:
nltk.download('averaged_perceptron_tagger')
phrase =  'Network security protocols Intrusion detection and prevention Security incident response Firewall administration Vulnerability assessment Security policies and procedures Log analysis Ethical hacking Problem-solving Communication skills Attention to detail'
tokens = word_tokenize(phrase)
tags = pos_tag(tokens)
competence_tags = ['NN', 'NNS', 'NNP', 'JJ']
competences = []
for token, tag in tags:
    if tag in competence_tags:
        competences.append(token)

print("Compétences extraites :", competences)


[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


Compétences extraites : ['Network', 'security', 'protocols', 'Intrusion', 'detection', 'prevention', 'Security', 'incident', 'response', 'Firewall', 'administration', 'Vulnerability', 'assessment', 'Security', 'policies', 'procedures', 'Log', 'analysis', 'Ethical', 'Problem-solving', 'Communication', 'skills', 'Attention']


In [15]:
def extract_skills(phrase):
    tokens = word_tokenize(phrase)
    tags = pos_tag(tokens)
    competence_tags = ['NN', 'NNS', 'NNP', 'JJ']
    competences = []
    for token, tag in tags:
        if tag in competence_tags:
            competences.append(token)
    return competences

In [16]:
phrase = 'Network security protocols Intrusion detection and prevention Security incident response Firewall administration Vulnerability assessment Security policies and procedures Log analysis Ethical hacking Problem-solving Communication skills Attention to detail'
competences_extraites = extract_skills(phrase)
print("Compétences extraites :", competences_extraites)

Compétences extraites : ['Network', 'security', 'protocols', 'Intrusion', 'detection', 'prevention', 'Security', 'incident', 'response', 'Firewall', 'administration', 'Vulnerability', 'assessment', 'Security', 'policies', 'procedures', 'Log', 'analysis', 'Ethical', 'Problem-solving', 'Communication', 'skills', 'Attention']


In [17]:
data.columns

Index(['Job Id', 'Experience', 'Qualifications', 'Salary Range', 'location',
       'Country', 'latitude', 'longitude', 'Work Type', 'Company Size',
       'Job Posting Date', 'Preference', 'Contact Person', 'Contact',
       'Job Title', 'Role', 'Job Portal', 'Job Description', 'Benefits',
       'skills', 'Responsibilities', 'Company', 'Company Profile'],
      dtype='object')

In [18]:
data.isnull().sum()

Job Id               0
Experience           0
Qualifications       0
Salary Range         0
location             0
Country              0
latitude             0
longitude            0
Work Type            0
Company Size         0
Job Posting Date     0
Preference           0
Contact Person       0
Contact              0
Job Title            0
Role                 0
Job Portal           0
Job Description      0
Benefits             0
skills               0
Responsibilities     1
Company              1
Company Profile     65
dtype: int64

In [19]:
data['extracted_skills'] = data['skills'].apply(extract_skills)

In [20]:
data.columns

Index(['Job Id', 'Experience', 'Qualifications', 'Salary Range', 'location',
       'Country', 'latitude', 'longitude', 'Work Type', 'Company Size',
       'Job Posting Date', 'Preference', 'Contact Person', 'Contact',
       'Job Title', 'Role', 'Job Portal', 'Job Description', 'Benefits',
       'skills', 'Responsibilities', 'Company', 'Company Profile',
       'extracted_skills'],
      dtype='object')

In [21]:
data['extracted_skills']

0        [Social, media, platforms, e.g., Facebook, Twi...
1        [HTML, CSS, JavaScript, Frontend, frameworks, ...
2        [Quality, control, methodologies, Statistical,...
3        [Wireless, network, design, architecture, Wi-F...
4        [Event, Conference, logistics, Budget, managem...
                               ...                        
19429    [Investment, management, Financial, analysis, ...
19430    [Construction, management, Structural, design,...
19431    [Organizational, time, management, skills, Cal...
19432    [analytics, Data, analysis, tools, e.g., Googl...
19433    [Brand, management, Public, relations, Brand, ...
Name: extracted_skills, Length: 19434, dtype: object

In [22]:
data.columns

Index(['Job Id', 'Experience', 'Qualifications', 'Salary Range', 'location',
       'Country', 'latitude', 'longitude', 'Work Type', 'Company Size',
       'Job Posting Date', 'Preference', 'Contact Person', 'Contact',
       'Job Title', 'Role', 'Job Portal', 'Job Description', 'Benefits',
       'skills', 'Responsibilities', 'Company', 'Company Profile',
       'extracted_skills'],
      dtype='object')

In [23]:
data.drop(columns= ['latitude' , 'longitude' , 'location' , 'Preference', 'Job Portal' , 'Benefits' , 'Company' , 'Company Profile' , 'Company Size' ],inplace = True)

In [24]:
data.columns

Index(['Job Id', 'Experience', 'Qualifications', 'Salary Range', 'Country',
       'Work Type', 'Job Posting Date', 'Contact Person', 'Contact',
       'Job Title', 'Role', 'Job Description', 'skills', 'Responsibilities',
       'extracted_skills'],
      dtype='object')

In [25]:
data.drop(columns = ['Experience' , 'Salary Range' , 'Contact Person' , 'Country' , 'Job Posting Date'],inplace = True)

In [26]:
data.columns

Index(['Job Id', 'Qualifications', 'Work Type', 'Contact', 'Job Title', 'Role',
       'Job Description', 'skills', 'Responsibilities', 'extracted_skills'],
      dtype='object')

In [27]:
data.drop(columns = ['Work Type', 'Contact'],inplace=True)

In [28]:
selected_columns = ['Job Title','Job Description', 'skills', 'Responsibilities', 'extracted_skills']

In [29]:
print(selected_columns)

['Job Title', 'Job Description', 'skills', 'Responsibilities', 'extracted_skills']


In [30]:
selected_data = data[selected_columns]

In [31]:
selected_data['text'] = selected_data['Job Title'] + ' ' + selected_data['Job Description'] + ' ' + selected_data['skills']+' '+selected_data['Responsibilities']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  selected_data['text'] = selected_data['Job Title'] + ' ' + selected_data['Job Description'] + ' ' + selected_data['skills']+' '+selected_data['Responsibilities']


In [33]:
selected_data['text'] = selected_data['text'].fillna('')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  selected_data['text'] = selected_data['text'].fillna('')


In [34]:
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf_vectorizer.fit_transform(selected_data['text'])

In [38]:
def predictions(skills, tfidf_vectorizer=tfidf_vectorizer, tfidf_matrix=tfidf_matrix, top_n=8):
    skills_vector = tfidf_vectorizer.transform([skills])
    cosine_similarities = cosine_similarity(skills_vector, tfidf_matrix).flatten()
    job_scores = {}
    for skill in skills.split():
        similar_jobs_indices = cosine_similarities.argsort()[::-1]
        for index in similar_jobs_indices:
            job_title = selected_data.iloc[index]['Job Title']
            if job_title not in job_scores:
                job_scores[job_title] = 0
            if skill in selected_data.iloc[index]['skills']:
                job_scores[job_title] += 1
    sorted_jobs = sorted(job_scores.items(), key=lambda x: x[1], reverse=True)
    predicted_job_titles = [job[0] for job in sorted_jobs[:top_n]]
    return predicted_job_titles

In [44]:
skills = "java php html networking"
predicted_jobs = predictions(skills)
print("8 Posts predicted : ", skills, ":")
for i, job_title in enumerate(predicted_jobs):
    print(i+1, ".", job_title)


8 Posts predicted :  java php html networking :
1 . Sales Representative
2 . Network Administrator
3 . Java Developer
4 . Web Designer
5 . UI Developer
6 . Software Engineer
7 . Back-End Developer
8 . Front-End Developer
