In [82]:
import requests
from bs4 import BeautifulSoup
def scrape_careers_information(page_url):
    """Scrape one result page of the 'careers' job board.

    Downloads ``page_url`` and, for every
    ``<div class="bti-ui-job-detail-container">`` on it, builds one row:
    ``[title, employer, location, age, absolute_url, 'careers']``.

    The title is the raw text of the card's first ``<a>`` (not stripped);
    employer, location and age come from the matching
    ``bti-ui-job-result-detail-*`` divs with surrounding whitespace removed.
    Returns a list of such rows, empty when the page has no job cards.
    """
    response = requests.get(page_url)
    soup = BeautifulSoup(response.text, "html.parser")

    def detail(card, css_class):
        # Stripped text of the detail <div> carrying the given class.
        return card.find('div', attrs={'class': css_class}).text.strip()

    listings = []
    for card in soup.find_all('div', attrs={'class': "bti-ui-job-detail-container"}):
        link = card.find('a')
        title = link.text
        employer = detail(card, "bti-ui-job-result-detail-employer")
        location = detail(card, "bti-ui-job-result-detail-location")
        age = detail(card, "bti-ui-job-result-detail-age")
        # The href is site-relative, so prefix it with the site root.
        url = 'https://careers.journalists.org{}'.format(link['href'])
        listings.append([title, employer, location, age, url, 'careers'])
    return listings

def scrape_careers_description(description_url):
    """Return the description text of a 'careers' job posting.

    Downloads ``description_url`` and returns the ``.text`` of the first
    ``<div class="bti-jd-description">``, unstripped. If the page has no
    such div the lookup yields ``None`` and the attribute access raises
    ``AttributeError``.
    """
    response = requests.get(description_url)
    soup = BeautifulSoup(response.text, "html.parser")
    description_div = soup.find('div', attrs={'class': "bti-jd-description"})
    return description_div.text

def scrape_indeed_information(page_url):
    """Scrape one result page of the 'indeed' job board.

    Downloads ``page_url`` and builds one row per ``<h2 class="jobtitle">``:
    ``[title, company, location, date, url, 'indeed']``. Company, location
    and date are the stripped texts of the matching ``<span>`` elements in
    the heading's parent element.

    Rows that are already present in the module-level ``all_jobs`` list
    (the accumulator maintained by ``scrape_all_information``) are skipped.
    If ``all_jobs`` has not been defined yet, nothing is treated as a
    duplicate, so the function can also be called on its own.

    Returns the list of new rows; it is empty when the page only repeats
    jobs that were already collected.
    """
    r = requests.get(page_url)
    data = BeautifulSoup(r.text, "html.parser")

    # Fall back to an empty list instead of raising NameError when the
    # accumulator does not exist (e.g. the function is called directly).
    already_scraped = globals().get('all_jobs', [])

    jobs = []
    # Per the original author's note, the last result on a page carries a
    # different class than the other nine, so results are located through
    # their <h2 class="jobtitle"> child and the card is reached via .parent.
    for heading in data.find_all('h2', attrs={'class': "jobtitle"}):
        card = heading.parent
        job = [
            heading.a['title'],
            card.find('span', attrs={'class': "company"}).text.strip(),
            card.find('span', attrs={'class': "location"}).text.strip(),
            card.find('span', attrs={'class': "date"}).text.strip(),
            # The job key is the heading id minus its first three characters
            # (presumably a fixed prefix -- confirm against the page markup).
            'https://www.indeed.com/viewjob?jk={}'.format(heading['id'][3:]),
            'indeed',
        ]
        # Once the page index runs past the last real page the site serves
        # repeated results, so drop anything that was already collected.
        if job not in already_scraped:
            jobs.append(job)
    return jobs

def scrape_indeed_description(description_url):
    """Return the description text of an 'indeed' job posting.

    Downloads ``description_url`` and returns the unstripped ``.text`` of
    the first ``<div>`` matched by the class string
    ``"jobsearch-JobComponent-description icl-u-xs-mt--md"``. When no div
    matches, the ``None`` result makes the ``.text`` access raise
    ``AttributeError``.
    """
    page = requests.get(description_url)
    parsed = BeautifulSoup(page.text, "html.parser")
    container = parsed.find(
        'div',
        attrs={'class': "jobsearch-JobComponent-description icl-u-xs-mt--md"},
    )
    return container.text

def scrape_jobsdb_information(page_url):
    """Scrape one result page of the 'jobsdb' job board.

    Downloads ``page_url`` and builds one six-item row per
    ``<div class="_3ASfTyv _2EUSthc">`` card:

    1. text of the ``<a>`` inside the ``_3gfm7U9 ...`` div,
    2. text of the first ``<span>`` inside the ``_1NdWRqw ...`` div,
    3. text of the first ``<span>`` inside the ``_124cxoK ...`` div,
    4. text of the first ``<span>`` inside the ``JG37Vx2 ...`` span,
    5. ``href`` of that same ``<a>`` (used as-is, no prefix added),
    6. the literal ``'jobsdb'``.

    By position these line up with the Title / Company / Location / Date /
    URL / Source columns used when the saved CSV is loaded. No text is
    stripped. Returns the list of rows (empty if no card is found).
    """
    response = requests.get(page_url)
    soup = BeautifulSoup(response.text, "html.parser")

    def first_span_text(container):
        # Text of the first <span> nested inside the given element.
        return container.find('span').text

    listings = []
    for card in soup.find_all('div', attrs={'class': "_3ASfTyv _2EUSthc"}):
        title_link = card.find('div', attrs={'class': "_3gfm7U9 _3ho-Knb _2swcdgn"}).a
        title = title_link.text
        company = first_span_text(card.find('div', attrs={'class': "_1NdWRqw _3ho-Knb _2swcdgn"}))
        location = first_span_text(card.find('div', attrs={'class': "_124cxoK _3ho-Knb _2swcdgn"}))
        posted = first_span_text(card.find('span', attrs={'class': "JG37Vx2 _3Re95QG _2XGgj_O"}))
        listings.append([title, company, location, posted, title_link['href'], 'jobsdb'])
    return listings
    
def scrape_jobsdb_description(description_url):
    """Return the description text of a 'jobsdb' job posting.

    Downloads ``description_url`` and returns the unstripped ``.text`` of
    the first ``<div class="jobad-primary">``. A page without that div
    leads to ``AttributeError`` because the lookup result is ``None``.
    """
    html = requests.get(description_url).text
    document = BeautifulSoup(html, "html.parser")
    primary = document.find('div', attrs={'class': "jobad-primary"})
    return primary.text

def scrape_all_information(base_url, starting_index, step, function_scrape_information, max_jobs=5):
    """Walk the result pages of one job board and collect its job rows.

    Page URLs are formed as ``base_url`` followed by the page index, which
    starts at ``starting_index`` and grows by ``step`` per page. Each page
    is handed to ``function_scrape_information``; paging stops when

    * a page yields no rows (or the scraper raises an ordinary exception,
      which is treated as "no rows"), or
    * more than ``max_jobs`` rows have been collected.

    Parameters
    ----------
    base_url : str
        URL prefix to which the page index is appended.
    starting_index : int
        Index of the first page.
    step : int
        Increment of the page index between consecutive pages.
    function_scrape_information : callable
        Called with one page URL; expected to return a list of job rows.
    max_jobs : int, optional
        Collection stops once the number of rows exceeds this value.
        Defaults to 5, the threshold the code has always used (an earlier
        comment spoke of 50 jobs, but the code never did that).

    Returns
    -------
    list
        The collected rows. The same list is also stored in the
        module-level ``all_jobs``, which is reset on every call and which
        other code in this notebook reads.
    """
    global all_jobs
    all_jobs = []
    page_index = starting_index
    while True:
        page_url = '{}{}'.format(base_url, page_index)
        try:
            jobs = function_scrape_information(page_url)
        except Exception:
            # Best effort: a failing page ends the crawl for this site.
            # KeyboardInterrupt / SystemExit are deliberately not caught so
            # the user can still abort a long-running scrape.
            jobs = []
        if not jobs:  # nothing (new) on this page: every job has been scraped
            break
        all_jobs.extend(jobs)
        if len(all_jobs) > max_jobs:
            break
        page_index += step  # the index embedded in the URL selects the page
    return all_jobs
#     all_jobs.sort(key=lambda item: item[3], reverse=False) 

def scrape_all_description(urls_and_sources):
    """Fetch the description for every ``(url, source)`` pair.

    For each pair, the first entry of the module-level ``websites`` list
    whose ``"source"`` equals ``source`` supplies the
    ``"function_scrape_description"`` callable, which is invoked with
    ``url``. Results are returned as a list in input order.

    Raises ``ValueError`` when a source is not listed in ``websites``.
    """
    # Two parallel lists: position k holds the source name and the matching
    # description scraper of websites[k].
    sources = [site["source"] for site in websites]
    scrapers = [site["function_scrape_description"] for site in websites]
    return [
        scrapers[sources.index(source)](url)
        for url, source in urls_and_sources
    ]

# Registry of the job boards to scrape. Each entry provides:
#   source                       - tag stored as the last item of every job row
#   base_url                     - URL prefix to which the page index is appended
#   starting_index / step        - first page index and its per-page increment
#   function_scrape_information  - scraper for one result page
#   function_scrape_description  - scraper for one job-detail page
websites = [
    dict(
        source='careers',
        base_url='https://careers.journalists.org/jobs/?keywords=data+OR+journalist&page=',
        starting_index=1,
        step=1,
        function_scrape_information=scrape_careers_information,
        function_scrape_description=scrape_careers_description,
    ),
    dict(
        source='indeed',
        base_url='https://www.indeed.com/jobs?q=Data+Journalist+Internship&start=',
        starting_index=0,
        step=10,
        function_scrape_information=scrape_indeed_information,
        function_scrape_description=scrape_indeed_description,
    ),
    dict(
        source='jobsdb',
        base_url='https://hk.jobsdb.com/hk/search-jobs/data-journalist/',
        starting_index=1,
        step=1,
        function_scrape_information=scrape_jobsdb_information,
        function_scrape_description=scrape_jobsdb_description,
    ),
]

In [84]:
import csv
from os import remove
# Collect the URLs of jobs recorded by earlier runs (column 5 of each row).
# UTF-8 is pinned for all CSV access so the files do not depend on the
# platform default encoding; newline='' is what the csv module expects.
try:
    with open('existedjobs.csv', 'r', newline='', encoding='utf-8') as existed_file:
        # Blank or truncated rows are skipped instead of raising IndexError.
        jobs_existed_url = [row[4] for row in csv.reader(existed_file) if len(row) > 4]
except FileNotFoundError:
    jobs_existed_url = []

# newjobs.csv only ever holds the jobs discovered by the current run.
try:
    remove('newjobs.csv')
except FileNotFoundError:
    pass

# Set for fast membership tests; it also absorbs URLs written during this
# run so the same posting cannot be appended twice.
seen_urls = set(jobs_existed_url)

for site in websites:
    # Fills the module-level `all_jobs` with the rows scraped from this site.
    scrape_all_information(site["base_url"], site["starting_index"], site["step"],
                           site["function_scrape_information"])
    with open('existedjobs.csv', 'a', newline='', encoding='utf-8') as existed_file, \
            open('newjobs.csv', 'a', newline='', encoding='utf-8') as new_file:
        existed_writer = csv.writer(existed_file)
        new_writer = csv.writer(new_file)
        for job in all_jobs:
            job_url = job[4]
            if job_url in seen_urls:  # duplicate checking
                continue
            existed_writer.writerow(job)
            new_writer.writerow(job)
            seen_urls.add(job_url)
            jobs_existed_url.append(job_url)

In [83]:
%%time
import pandas as pd
# Load every job recorded so far. The CSV has no header row, so the column
# names are supplied explicitly.
df = pd.read_csv(
    'existedjobs.csv',
    header=None,
    names=['Title', 'Company', 'Location', 'Date', 'URL', 'Source'],
)
# Pair each posting URL with its source so the matching description scraper
# can be chosen row by row, then store the scraped text in a new column.
urls_and_sources = zip(df['URL'].tolist(), df['Source'].tolist())
df['Description'] = scrape_all_description(urls_and_sources)

CPU times: user 2.42 s, sys: 72.3 ms, total: 2.49 s
Wall time: 52.9 s


In [80]:
# Display the `df` DataFrame currently held in the kernel (rich output of
# the cell's last expression).
df

Unnamed: 0,Title,Company,Location,Date,URL,Source,Description
0,Data Analyst,Memorial Sloan-Kettering Cancer Center,"New York, New York",3 Months Ago,https://careers.journalists.org/jobs/11315414/...,careers,\n Company Overview: At Memorial Sloan Kette...
1,Assistant Professor - Data Journalism,Syracuse University,"Syracuse, New York",2 Weeks Ago,https://careers.journalists.org/jobs/11620737/...,careers,\n Job Description:Syracuse University's S.I. ...
2,Assistant Professor - Data Journalism,SYRACUSE UNIVERSITY,"Syracuse, New York",1 Week Ago,https://careers.journalists.org/jobs/11639944/...,careers,\n Assistant Professor - Data JournalismThe S....
3,Sr. Data Scientist,Memorial Sloan-Kettering Cancer Center,"New York, New York",6 Months Ago,https://careers.journalists.org/jobs/10983607/...,careers,\n Company Overview: At Memorial Sloan Kette...
4,Digital Communications Data Analyst,National Association of Manufacturers (NAM),"Washington, D.C.",3 Months Ago,https://careers.journalists.org/jobs/11256389/...,careers,\n Who We Are: NAM Strategic Communications is...
5,Ops Research Engineer/ Data Scientist,Memorial Sloan-Kettering Cancer Center,"New York, New York",3 Months Ago,https://careers.journalists.org/jobs/11286625/...,careers,\n Company Overview: At Memorial Sloan Kette...
6,Assistant Professor in Computer and Informatio...,University of Michigan - Dearborn,"Dearborn, Michigan",2 Weeks Ago,https://careers.journalists.org/jobs/11625639/...,careers,"\n A cover letter, curriculum vitae, teaching ..."
7,Assistant Professor in Computer and Informatio...,University of Michigan - Dearborn,"Dearborn, Michigan",2 Weeks Ago,https://careers.journalists.org/jobs/11625641/...,careers,"\n A cover letter, curriculum vitae, teaching ..."
8,Senior Investigative Reporter,The Markup,"New York, New York",2 Days Ago,https://careers.journalists.org/jobs/11677622/...,careers,\n Welcome. We are The Markup (https://themark...
9,Deputy online news editor,Arkansas Democrat-Gazette,"Little Rock, Arkansas",6 Days Ago,https://careers.journalists.org/jobs/11661471/...,careers,"\n Arkansas' premier statewide newspaper, the ..."


In [8]:
import pandas
# Load the jobs discovered by the latest scraping run (header-less CSV).
# NOTE(review): seven column names are given, while the rows appended to
# newjobs.csv by the scraping cell carry six fields, so 'Description'
# presumably loads as all-NaN -- confirm this is intended.
df = pandas.read_csv(
    'newjobs.csv',
    header=None,
    names=['Title', 'Company', 'Location', 'Date', 'URL', 'Source', 'Description'],
)
df

Unnamed: 0,Title,Company,Location,Date,URL,source,discription
0,Journalist – Financial Publishing,Initiative Recruitment Hong Kong Limited,Not Specified,09-Nov-18,https://hk.jobsdb.com/hk/en/job/journalist-fin...,,
1,"Sales & Marketing Assistant, Luxury Goods Company",Cochine Limited,Central & Western Area,07-Nov-18,https://hk.jobsdb.com/hk/en/job/sales-marketin...,,
2,"Marketing Executive, Exhibitions (Marketing Co...",Hong Kong Trade Development Council,Wan Chai Area,05-Nov-18,https://hk.jobsdb.com/hk/en/job/marketing-exec...,,
3,Healthcare Industry Editor,"Standard & Poor's International, LLC",Yau Tsim Mong Area,02-Nov-18,https://hk.jobsdb.com/hk/en/job/healthcare-ind...,,
4,Account Director / Associate Account Director ...,Bentley Communications Limited,Sham Shui Po Area,02-Nov-18,https://hk.jobsdb.com/hk/en/job/account-direct...,,
5,Sr. Public Relations Executive,Global Sources,Southern Area,30-Oct-18,https://hk.jobsdb.com/hk/en/job/sr-public-rela...,,
6,Android Developer,Dow Jones & Co,Wan Chai Area,26-Oct-18,https://hk.jobsdb.com/hk/en/job/android-develo...,,
7,"Manager, External Relations (Ref. M(ER))",Urban Renewal Authority,Not Specified,22-Oct-18,https://hk.jobsdb.com/hk/en/job/manager-extern...,,
8,"Lead Manager, Media Relations (Corporate Devel...",Hong Kong Productivity Council,Not Specified,22-Oct-18,https://hk.jobsdb.com/hk/en/job/lead-manager-m...,,
9,Content Expert,Telum Media HK Limited,Eastern Area,19-Oct-18,https://hk.jobsdb.com/hk/en/job/content-expert...,,
