In [1]:
import requests
from bs4 import BeautifulSoup

# Fetch the alba.co.kr landing page. An explicit timeout is passed because
# requests waits indefinitely by default, which would hang the kernel if the
# site stalls.
alba_result = requests.get("http://www.alba.co.kr/", timeout=10)
# Sanity check: show the HTTP status code and the first 100 characters of the body.
print(alba_result.status_code)
print(alba_result.text[:100])

# Parsed landing page, reused by the following cells.
alba_soup = BeautifulSoup(alba_result.text, 'html.parser')

200

<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xh


### Get links to the "슈퍼브랜드" (super brands)

In [2]:
# Collect the links to the "super brands" listed on the main page of
# www.alba.co.kr and store them in `links` as {'company_name', 'url'} dicts.
super_brand = alba_soup.find("div", {"id":"MainSuperBrand"})
super_brand_box = super_brand.find("ul", {"class":"goodsBox"})
lis = super_brand_box.find_all("li")

# One entry per <a class="goodsBox-info"> found inside each list item.
links = [
    {'company_name': anchor.find("span", {"class":"company"}).text,
     'url': anchor['href']}
    for item in lis
    for anchor in item.find_all('a', {"class":"goodsBox-info"})
]

print(f"number of \"super brands\": {len(links)}\n")
# Preview the first three entries as a quick check.
for link in links[:3]:
    print(link['company_name'], link['url'])

number of "super brands": 128

이자녹스/비욘드/네이처컬렉션 http://lghnh.alba.co.kr/
딜리온 http://delion.alba.co.kr/
SSG.COM http://ssg.alba.co.kr/


### Scrape jobs and save them to CSV files

In [4]:
import math
import csv

def get_last_page(soup, jobs_per_page=50):
    """Return how many listing pages a brand has, based on its job count.

    The total job count is read from the ``<strong>`` element inside
    ``<p class="jobCount">`` under ``<div id="NormalInfo">``; thousands
    separators (``,``) are removed before it is converted to an integer.

    Args:
        soup: Parsed brand page exposing a BeautifulSoup-style ``find``.
        jobs_per_page: Number of jobs per listing page. Defaults to 50, the
            value this function previously hard-coded.

    Returns:
        int: The job count divided by ``jobs_per_page``, rounded up
        (0 when the job count is 0).

    Raises:
        ValueError: If ``jobs_per_page`` is not positive.
    """
    if jobs_per_page <= 0:
        raise ValueError("jobs_per_page must be a positive number")

    normal_info = soup.find("div", {"id": "NormalInfo"})
    job_count = normal_info.find("p", {"class": "jobCount"}).find("strong").string
    total_jobs = int(job_count.replace(',', ''))

    # Round up so a partially filled final page is still counted.
    last_page = math.ceil(total_jobs / jobs_per_page)
    print(f"last page : {last_page}")
    return last_page

def extract_job_from_a_page(URL, timeout=10):
    """Download one listing page and return the jobs found in its table rows.

    Args:
        URL: Address of the listing page to download.
        timeout: Seconds to wait for the server, passed to ``requests.get``.
            Without it a stalled connection would block forever.

    Returns:
        list[dict]: One dict per job row with the keys ``place``, ``title``,
        ``working_hours``, ``remuneration`` and ``last_update``; every value
        is the whitespace-stripped text of the matching element.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status, so
            an error page is not silently parsed as "no jobs".
    """
    print(f"extracting {URL}...")
    result = requests.get(URL, timeout=timeout)
    result.raise_for_status()
    soup = BeautifulSoup(result.text, 'html.parser')

    jobs = []
    for tr in soup.find_all("tr"):  # one <tr> contains one job
        # Look the location cell up once; rows without it are skipped.
        place_cell = tr.find("td", {"class": "local first"})
        if place_cell is None:
            continue
        jobs.append({
            'place': place_cell.text.strip(),
            'title': tr.find("span", {"class": "title"}).text.strip(),
            'working_hours': tr.find("td", {"class": "data"}).text.strip(),
            'remuneration': tr.find("td", {"class": "pay"}).text.strip(),
            'last_update': tr.find("td", {"class": "regDate last"}).text.strip(),
        })

    return jobs

def save_to_file(company_name, jobs):
    """Write the scraped jobs of one company to ``<company_name>.csv``.

    The file is created in the current working directory and overwritten if
    it already exists. It is always encoded as UTF-8 so that Korean text is
    written the same way on every platform instead of depending on the
    locale's default encoding.

    Args:
        company_name: Brand name used as the file name; ``/`` is replaced by
            ``_`` because it cannot appear inside a file name.
        jobs: Iterable of dicts with the keys ``place``, ``title``,
            ``working_hours``, ``remuneration`` and ``last_update``.
    """
    company_name = company_name.replace('/', '_')
    columns = ["place", "title", "working_hours", "remuneration", "last_update"]
    # newline="" lets the csv module control line endings itself.
    with open(f"{company_name}.csv", 'w', newline="", encoding="utf-8") as csv_file:
        writer = csv.DictWriter(csv_file, fieldnames=columns)
        writer.writeheader()
        writer.writerows(jobs)

def extract_from_one_brand(company_name, URL, timeout=10):
    """Scrape every listing page of one brand and save the jobs to a CSV file.

    Downloads the brand's landing page to determine the number of listing
    pages, fetches ``<URL>/job/brand/?page=N`` for each page, and writes the
    combined jobs with ``save_to_file``.

    Args:
        company_name: Brand name, forwarded to ``save_to_file``.
        URL: Base address of the brand's site, with or without a trailing
            slash.
        timeout: Seconds to wait for the landing page, passed to
            ``requests.get``.

    Raises:
        requests.HTTPError: If the landing page answers with a 4xx/5xx status.
    """
    result = requests.get(URL, timeout=timeout)
    result.raise_for_status()
    soup = BeautifulSoup(result.text, 'html.parser')
    last_page = get_last_page(soup)

    # Normalise the base so the page URL is valid whether or not URL ends in "/".
    base_url = URL.rstrip('/')
    all_jobs = []
    for page in range(1, last_page + 1):
        # extend() appends in place instead of rebuilding the list on every page.
        all_jobs.extend(extract_job_from_a_page(f"{base_url}/job/brand/?page={page}"))

    save_to_file(company_name, all_jobs)


# Entry point of the scrape.
# Only the first `limit` brands are processed, so at most that many CSV files
# are written.
limit = 10
for link in links[:limit]:
    company_name, url = link.get('company_name'), link.get('url')
    print(f"\nscraping {company_name} {url}...")
    extract_from_one_brand(company_name, url)


scraping 이자녹스/비욘드/네이처컬렉션 http://lghnh.alba.co.kr/...
last page : 1
extracting http://lghnh.alba.co.kr/job/brand/?page=1...

scraping 딜리온 http://delion.alba.co.kr/...
last page : 3
extracting http://delion.alba.co.kr/job/brand/?page=1...
extracting http://delion.alba.co.kr/job/brand/?page=2...
extracting http://delion.alba.co.kr/job/brand/?page=3...

scraping SSG.COM http://ssg.alba.co.kr/...
last page : 2
extracting http://ssg.alba.co.kr/job/brand/?page=1...
extracting http://ssg.alba.co.kr/job/brand/?page=2...

scraping 교촌치킨 http://kyochon.alba.co.kr/...
last page : 19
extracting http://kyochon.alba.co.kr/job/brand/?page=1...
extracting http://kyochon.alba.co.kr/job/brand/?page=2...
extracting http://kyochon.alba.co.kr/job/brand/?page=3...
extracting http://kyochon.alba.co.kr/job/brand/?page=4...
extracting http://kyochon.alba.co.kr/job/brand/?page=5...
extracting http://kyochon.alba.co.kr/job/brand/?page=6...
extracting http://kyochon.alba.co.kr/job/brand/?page=7...
extracting http: