# Data Preparation - Get Data and Label it!

## Import Package for Scraping

In [1]:
import requests, time, os, spacy, unicodedata, json
from spacy import displacy
from bs4 import BeautifulSoup
from tqdm import tqdm

## Scrape data
I scrape [SEEK](https://www.seek.com.au) to extract the text of job advertisements for data analysts, data scientists and data engineers. This part has two steps.
1. Get Job Urls: parse the search pages to get the url of each job advertisement.
2. Get Job Description: extract the text content for all jobs from the urls I scraped before.

In [2]:
def getUrlsSeek(keyWord, page):
    """Collect SEEK job-advertisement URLs for a search keyword.

    Args:
        keyWord: Search phrase such as "data analyst"; spaces are replaced
            by hyphens to build the search URL.
        page: Number of result pages to walk, starting from page 1.

    Returns:
        A list of URLs of the form ``https://www.seek.com.au/job/<id>`` in
        the order they were found. The list is not de-duplicated.
    """
    keyString = keyWord.replace(' ', '-')
    mainUrl = 'https://www.seek.com.au'
    jobUrlList = []
    for i in tqdm(range(1, 1+page)):
        # Pause before and after every request (presumably to throttle the
        # load put on the site).
        time.sleep(1)
        url = f"{mainUrl}/{keyString}-jobs?page={i}"
        # A timeout keeps one stalled connection from hanging the whole run.
        response = requests.get(url, timeout=30)
        soup = BeautifulSoup(response.text, "html.parser")
        for job in soup.find_all("article"):
            jobid = job.get("data-job-id")
            if jobid is None:
                # Not every <article> is guaranteed to carry a job id;
                # skip those instead of failing with a KeyError.
                continue
            jobUrlList.append(f"{mainUrl}/job/{jobid}")
        time.sleep(1)
    return jobUrlList

def getContent(urlList):
    """Download each job page and gather the text of its description elements.

    Args:
        urlList: Iterable of job-advertisement URLs.

    Returns:
        A de-duplicated list with the text of every ``li``, ``p`` and
        ``h1``-``h6`` element found inside the
        ``div[data-automation="jobAdDetails"]`` container of each page.
        The order is arbitrary because duplicates are removed through a set.
    """
    tags = ["li", "p", "h1", "h2", "h3", "h4", "h5", "h6"]
    # Accumulate straight into a set: the result is de-duplicated anyway and
    # this avoids copying the whole accumulator for every tag of every page.
    text = set()
    for url in tqdm(urlList):
        time.sleep(0.5)
        # A timeout keeps one stalled connection from hanging the whole run.
        response = requests.get(url, timeout=30)
        soup = BeautifulSoup(response.text, "html.parser")
        details = soup.find("div", {"data-automation": "jobAdDetails"})
        if details is None:
            # find() returns None when the page has no job-details container;
            # skip the page rather than fail on ``None.find_all``.
            continue
        for tag in tags:
            text.update(desc_tag.text for desc_tag in details.find_all(tag))
    return list(text)


# Search SEEK for each role, merge the job URLs, drop duplicates and then
# download the advertisement text for every unique URL.
job_title = ["data analyst", "data scientist", "data engineer"]
urlList = []
for job in job_title:
    urlList.extend(getUrlsSeek(job, page=10))
urlList = list(set(urlList))
text = getContent(urlList)

100%|███████████████████████████████████████████| 10/10 [00:24<00:00,  2.43s/it]
100%|███████████████████████████████████████████| 10/10 [00:24<00:00,  2.43s/it]
100%|███████████████████████████████████████████| 10/10 [00:23<00:00,  2.38s/it]
100%|█████████████████████████████████████████| 506/506 [07:01<00:00,  1.20it/s]


## Split the text into sentences and write them to a text file
Here I use spaCy's `en_core_web_sm` model to split the job descriptions into sentences, then write these sentences, normalized to NFKD with `unicodedata`, to a text file.

In [3]:
# Split every scraped text snippet into sentence spans with spaCy.
nlp = spacy.load('en_core_web_sm')
sentences = []
for t in tqdm(text):
    # Extend in place: rebuilding the list on every iteration copies all
    # previously collected sentences again and makes the loop quadratic.
    sentences.extend(nlp(t).sents)

100%|██████████████████████████████████████| 9445/9445 [00:47<00:00, 198.80it/s]


In [4]:
# Preview the first six sentence spans.
sentences[:6]

[If you have a passion for learning new technologies, want to deliver real client impact and work with like-minded technologists, Hypetap is the place to grow your career and have fun in the process!,
 We are considering applications Australia wide.,
 Advanced programming skills in SQL & Python,
 To be eligible for this position please refer to the following criteria:,
 Salary range $4566.20 to $4900.40 p.f.,
 Accelerated career growth opportunities.]

In [5]:
# Mode "w" truncates an existing file, so no prior delete is needed. The
# encoding is explicit so the output does not depend on the platform locale
# (NFKD text can contain characters a legacy code page cannot encode).
with open("../data/data_ad.txt", "w", encoding="utf-8") as f:
    # One NFKD-normalized sentence per line.
    f.writelines(
        unicodedata.normalize("NFKD", str(sen)) + '\n' for sen in sentences
    )

## Create patterns for EntityRuler
The skill entity list was created by scraping [AngelList](https://angel.co/)'s skill report, but the page is no longer available. Here is what the page looked like. ![](https://i.imgur.com/K9QCrAU.png)

In [6]:
import pandas as pd

# Load the entity list and discard rows with missing values.
df = pd.read_csv('../data/entitylist.csv').dropna()

# Each (Text, Type) row yields two EntityRuler patterns: the phrase as
# written and its lower-cased form, both under the same label.
pairs = list(zip(df['Text'], df['Type']))
ad_skills_regular = [{"label": kind, "pattern": phrase} for phrase, kind in pairs]
ad_skills_lower = [{"label": kind, "pattern": phrase.lower()} for phrase, kind in pairs]
ad_skills = ad_skills_regular + ad_skills_lower

df.head()

Unnamed: 0,Text,Type
0,HTML,SKILL
1,Java,SKILL
2,Javascript,SKILL
3,Python,SKILL
4,CSS,SKILL


### The format of one pattern 

In [7]:
# Preview the first six patterns.
ad_skills[:6]

[{'label': 'SKILL', 'pattern': 'HTML'},
 {'label': 'SKILL', 'pattern': 'Java'},
 {'label': 'SKILL', 'pattern': 'Javascript'},
 {'label': 'SKILL', 'pattern': 'Python'},
 {'label': 'SKILL', 'pattern': 'CSS'},
 {'label': 'SKILL', 'pattern': 'C++'}]

## Build simple NLP model with EntityRuler and then annotate the data

In [8]:
from spacy.lang.en import English

def generate_rule_based_nlp(patterns):
    """Build a blank English pipeline whose only component is an EntityRuler.

    Args:
        patterns: List of ``{"label": ..., "pattern": ...}`` dicts to load
            into the ruler.

    Returns:
        The pipeline with the ``entity_ruler`` component added and the
        patterns registered.
    """
    pipeline = English()
    pipeline.add_pipe("entity_ruler").add_patterns(patterns)
    return pipeline

def test_model(model, text):
    """Annotate ``text`` with ``model`` and return it as a labelled record.

    Args:
        model: Callable that turns a string into a document exposing ``ents``.
        text: The sentence to annotate.

    Returns:
        ``[text, {"entities": [(start_char, end_char, label), ...]}]`` when
        at least one entity was found, otherwise an empty list.
    """
    spans = [(ent.start_char, ent.end_char, ent.label_) for ent in model(text).ents]
    if not spans:
        return []
    return [text, {"entities": spans}]


# Annotate every stored sentence with the rule-based pipeline.
nlp = generate_rule_based_nlp(ad_skills)
ad_data = []

# Decode explicitly as UTF-8 instead of relying on the platform locale.
with open("../data/data_ad.txt", "r", encoding="utf-8") as f:
    data = f.read().splitlines()

for line in data:
    result = test_model(nlp, line)
    # test_model returns an empty list when no entity matched; keep only
    # annotated sentences. append() avoids recopying the list every time.
    if result:
        ad_data.append(result)


## Test annotation and save model

In [9]:
# Run the rule-based pipeline on a sample sentence, render the entities it
# finds, then save the pipeline to disk.
test_text = (
    "Machine Learning Engineering skills who use advanced techniques such as "
    "Deep Learning (GPU accelerated), NLP, Graph ML as well as other "
    "predictive modelling methods to identify business opportunities from a "
    "variety of data sources"
)
doc = nlp(test_text)
displacy.render(doc, style="ent")
nlp.to_disk('../model/entRuler')

## Write processed data into json

In [10]:
# Mode "w" already truncates an existing file, so it is not deleted first.
with open("../data/ad_data_labeled.json", "w", encoding="utf-8") as f:
    # Entity tuples are serialized as JSON arrays.
    json.dump(ad_data, f)