In [1]:
# library imports
import ast
import functools
import json
import re
import string
from datetime import datetime
from pathlib import Path

import nltk
import pandas as pd
import spacy
from sklearn.model_selection import train_test_split
from spacy import displacy
from spacy.tokens import DocBin
from tqdm import tqdm
nltk.download('stopwords')

#Stop words present in the library
stopwords = nltk.corpus.stopwords.words('english')

import warnings
warnings.filterwarnings('ignore')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\m84246307\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Load the job postings, then drop fully duplicated rows (the last copy of each is kept)
df = (
    pd.read_csv("data/jobpostings.csv")
    .drop_duplicates(keep='last')
)

In [3]:
df.columns

Index(['Job Id', 'Job Title', 'SOC Code', 'Job Description', 'Company Name',
       'Skills', 'Qualification', 'City', 'State', 'Zipcode',
       'Job Opening Date', 'Job Closing Date', 'Status', 'Website Url'],
      dtype='object')

In [4]:
df["Skills"][0]

'[Local Media, Editing, Journalism]'

# Preprocessing

In [5]:
def remove_punctuation(text):
    """Return ``text`` with every ASCII punctuation character removed.

    Args:
        text (str): the string to clean.

    Returns:
        str: ``text`` without any character listed in ``string.punctuation``.
    """
    # str.translate deletes the characters in one pass instead of testing
    # every character in a Python-level loop.
    return text.translate(str.maketrans("", "", string.punctuation))

# defining the function to remove stopwords from whitespace-separated text
def remove_stopwords(text):
    """Drop every word of ``text`` that appears in the module-level ``stopwords``.

    Args:
        text: value to filter; it is converted with ``str()`` and split on
            whitespace.

    Returns:
        str: the remaining words joined by single spaces. The comparison is
        case-sensitive, exactly as written in ``stopwords``.
    """
    # Build the set once per call so each word is a hash lookup rather than a
    # linear scan of the stopword list.
    stopword_set = set(stopwords)
    return " ".join(word for word in str(text).split() if word not in stopword_set)

def preprocessing(description):
    """Normalise a job description before keyword matching.

    The value is converted to ``str``, stripped of punctuation, lower-cased and
    finally filtered for stopwords, in that order.
    """
    without_punctuation = remove_punctuation(str(description))
    lowered = without_punctuation.lower()
    return remove_stopwords(lowered)

In [6]:
# Preprocessing...
# Assign the whole column in one statement. The previous chained form
# (df[col][idx] = value) writes through an intermediate object and is not
# guaranteed to update df; with pandas Copy-on-Write enabled it never does.
df['Job Description'] = [
    preprocessing(description)
    for description in tqdm(df['Job Description'], total=len(df))
]

188627it [06:22, 493.49it/s]


In [7]:
# Load the technology-skills spreadsheet; its "Example" column is used below as keyword source
df_tech = pd.read_excel("data/Technology Skills.xlsx")

In [8]:
df_tech.shape

(31078, 6)

In [9]:
# Distinct values of the "Example" column as a plain list
# NOTE(review): unique() keeps a NaN entry if the column has blank cells — confirm none end up as keywords
tech_skills = list(df_tech["Example"].unique())

In [10]:
len(tech_skills)

8869

In [11]:
tech_skills[:10]

['Adobe Systems Adobe Acrobat',
 'AdSense Tracker',
 'Atlassian JIRA',
 "Blackbaud The Raiser's Edge",
 'ComputerEase Construction Accounting',
 'Database reporting software',
 'Databox',
 'Email software',
 'Enterprise resource planning ERP software',
 'Exact Software Macola ES Labor Performance']

In [12]:
# Load the general skills spreadsheet; its "Element Name" column is used below as keyword source
df_skill = pd.read_excel("data/Skills.xlsx")

In [13]:
df_skill.columns

Index(['O*NET-SOC Code', 'Title', 'Element ID', 'Element Name', 'Scale ID',
       'Scale Name', 'Data Value', 'N', 'Standard Error', 'Lower CI Bound',
       'Upper CI Bound', 'Recommend Suppress', 'Not Relevant', 'Date',
       'Domain Source'],
      dtype='object')

In [14]:
df_skill.shape

(61110, 15)

In [15]:
# Distinct values of the "Element Name" column as a plain list
regular_skills = list(df_skill["Element Name"].unique())

In [16]:
regular_skills[:10]

['Reading Comprehension',
 'Active Listening',
 'Writing',
 'Speaking',
 'Mathematics',
 'Science',
 'Critical Thinking',
 'Active Learning',
 'Learning Strategies',
 'Monitoring']

In [17]:
len(regular_skills)

35

In [18]:
# Single keyword list: technology skills first, then the general skills
skills = tech_skills + regular_skills

In [19]:
# skills should be unique
# dict.fromkeys drops duplicates while keeping first-seen order, so the keyword
# order (and with it the order in which entities are generated) is identical on
# every run; the iteration order of a set of strings is not.
skills = list(dict.fromkeys(skills))

In [20]:
len(skills)

8904

## Prepare training data

In [27]:
# this dictionary will contain all annotated examples
collective_dict = {'TRAINING_DATA': []}

# compiled keyword patterns, keyed by the stripped keyword; the same keywords
# are searched in every document, so each pattern is compiled only once
_KEYWORD_PATTERNS = {}


def _keyword_pattern(needle):
    """Return the cached, compiled pattern matching ``needle`` as a standalone phrase."""
    pattern = _KEYWORD_PATTERNS.get(needle)
    if pattern is None:
        # re.escape: the keyword is always matched literally ("c++", ".net", ...).
        # The lookarounds require whitespace or a text boundary on both sides
        # without making that whitespace part of the match.
        pattern = re.compile(rf"(?<!\S){re.escape(needle)}(?!\S)", flags=re.IGNORECASE)
        _KEYWORD_PATTERNS[needle] = pattern
    return pattern


def structure_training_data(text, kw_list, label=None):
    """Annotate ``text`` with every occurrence of the given keywords.

    Each keyword is searched case-insensitively as a literal, whitespace-delimited
    phrase. When at least one occurrence is found, ``[text, {"entities": [...]}]``
    is appended to ``collective_dict['TRAINING_DATA']``; otherwise nothing is stored.

    Args:
        text (str): the document to annotate.
        kw_list (iterable): keywords to look for. Entries that are not strings
            or are empty after stripping are skipped.
        label (str, optional): entity label stored with every match. When
            omitted, the keyword itself is used as the label, as before.
            NOTE(review): that yields one distinct label per keyword; pass a
            shared label such as "SKILL" if a single entity type is intended.

    Returns:
        None. Results are accumulated in ``collective_dict``.
    """
    entities = []

    for kw in kw_list:
        # skip NaN / None / empty keywords instead of searching for them
        if not isinstance(kw, str):
            continue
        needle = kw.strip()
        if not needle:
            continue

        tag = kw if label is None else label

        # (start, end) cover exactly the keyword, without surrounding spaces
        entities.extend(
            (match.start(), match.end(), tag)
            for match in _keyword_pattern(needle).finditer(text)
        )

    # add any found entities into a JSON format within collective_dict
    if entities:
        collective_dict['TRAINING_DATA'].append([text, {"entities": entities}])

In [28]:
# 80/20 shuffled split with a fixed seed; reset_index() renumbers the rows from 0
# and keeps the previous row labels as an extra column
train, test = (
    part.reset_index()
    for part in train_test_split(df, test_size=0.2, random_state=14, shuffle=True)
)

In [29]:
print(train.shape)
print(test.shape)

(150901, 15)
(37726, 15)


In [30]:
# import nest_asyncio
# nest_asyncio.apply()

import asyncio

def background(f):
    """Decorator that runs ``f`` in the current event loop's default executor.

    Calling the decorated function schedules ``f(*args, **kwargs)`` on the
    executor and immediately returns the resulting future.

    Args:
        f (callable): the blocking function to off-load.

    Returns:
        callable: a wrapper with the same name/docstring as ``f`` that returns
        the future produced by ``run_in_executor``.
    """
    @functools.wraps(f)
    def wrapped(*args, **kwargs):
        # run_in_executor() only forwards positional arguments, so bind
        # everything (keywords included) with functools.partial first.
        bound_call = functools.partial(f, *args, **kwargs)
        return asyncio.get_event_loop().run_in_executor(None, bound_call)

    return wrapped

def prepare_training_data(row):
    """Annotate one job posting with the global skills plus its own listed skills.

    Args:
        row: a mapping/Series with a 'Skills' entry (a string shaped like a list)
            and a 'Job Description' entry (the text to annotate).

    Returns:
        None. ``structure_training_data`` stores the annotated example.
    """
    # Convert string to list (None when the value cannot be parsed)
    annotated_skills = convert_to_list(row['Skills'])

    keywords = skills
    if annotated_skills:
        known_skills = set(skills)
        # Clear the whitespaces at the beginning and the end of each item, then
        # keep only non-empty skills that are not already in the global list, so
        # no empty or duplicated keyword is searched.
        stripped = dict.fromkeys(item.strip() for item in annotated_skills)
        extra_skills = [s for s in stripped if s and s not in known_skills]
        if extra_skills:
            # Merge skills list with already annotated skills
            keywords = skills + extra_skills

    # shape the training data
    structure_training_data(row['Job Description'], keywords)

In [31]:
def convert_to_list(string_shape_list):
    """Parse a string shaped like ``'[a, b, c]'`` into a list of items.

    Square brackets are removed, the rest is split on commas, each item is
    stripped of surrounding whitespace and empty items are dropped, so
    ``'[]'`` yields ``[]`` rather than ``['']``.

    Args:
        string_shape_list: the value to parse.

    Returns:
        list[str] | None: the parsed items, or None when the value is not a
        string (for example a missing value).
    """
    if not isinstance(string_shape_list, str):
        return None

    without_brackets = string_shape_list.replace("[", "").replace("]", "")
    items = (item.strip() for item in without_brackets.split(","))
    return [item for item in items if item]

In [None]:
# Annotate every training row. iterrows() is a generator without a length, so
# the total is passed explicitly to let the progress bar show a percentage and ETA.
for _, row in tqdm(train.iterrows(), total=len(train)):
    prepare_training_data(row)

1847it [30:25,  1.06it/s]

In [None]:
# define our training data to TRAIN_DATA
# (same list object as collective_dict['TRAINING_DATA'], not a copy)
TRAIN_DATA = collective_dict['TRAINING_DATA']

# create a blank model
# (used below only through nlp.make_doc to tokenize each text)
nlp = spacy.blank('en')

def create_training_set(TRAIN_DATA):
    """Convert annotated examples into a spaCy ``DocBin``.

    Args:
        TRAIN_DATA: iterable of ``(text, {"entities": [(start, end, label), ...]})``
            pairs with character offsets into ``text``.

    Returns:
        DocBin: one Doc per example, tokenized with the module-level ``nlp``
        and carrying the entities that could be applied.
    """
    db = DocBin()
    for text, annot in tqdm(TRAIN_DATA):
        doc = nlp.make_doc(text)
        ents = []

        # create span objects
        for start, end, label in annot["entities"]:
            span = doc.char_span(start, end, label=label, alignment_mode="contract")

            # skip if the character indices do not map to a valid span
            if span is None:
                continue

            # handle erroneous entity annotations by removing them: a span that
            # cannot be set together with the ones already accepted is dropped.
            # Only ValueError is caught so that interrupts and unrelated errors
            # are no longer silently swallowed.
            try:
                doc.ents = ents + [span]
            except ValueError:
                continue
            ents.append(span)

        # make sure the Doc ends up with exactly the accepted spans
        doc.ents = ents

        # pack Doc objects into DocBin
        db.add(doc)
    return db

TRAIN_DATA_DOC = create_training_set(TRAIN_DATA)

# Export results (here I add it to a TRAIN_DATA folder within the directory)
# Create the folder first so the export cannot fail on a missing directory
# after the long conversion above has finished.
train_data_path = Path("./TRAIN_DATA/TRAIN_DATA.spacy")
train_data_path.parent.mkdir(parents=True, exist_ok=True)
TRAIN_DATA_DOC.to_disk(train_data_path)

46057it [10:12:41,  1.06it/s]                                                                                          

In [None]:
# for text, annot in tqdm(TRAIN_DATA):
#     doc = nlp.make_doc(text)
#     span = doc.char_span(272, 277, label='SKILL')
#     print(span)
#     break

# Model Results

In [None]:
test.columns

In [None]:
# row of the test split to inspect; the description and the reported Job Id
# are both read from this row so that they always belong to the same posting
sample_idx = 9
model_test = test["Job Description"][sample_idx]
# load the trained model
nlp_output = spacy.load("output/model-best")

# pass our test instance into the trained pipeline
doc = nlp_output(model_test)

# # customize the label colors
# colors = {"SERVICE": "linear-gradient(90deg, #E1D436, #F59710)"}
# options = {"ents": ["SERVICE"], "colors": colors}

# # visualize the identified entities
# displacy.render(doc, style="ent", options=options)

# print out the identified entities
{"Job Id": test["Job Id"][sample_idx],"Entity Values": list(doc.ents)}

In [None]:
test["Job Description"][1]