# Python으로 웹 스크래퍼 만들기 3

사이트 운영자들은 웹 스크래핑을 매우 싫어한다.

그래서 봇 인증 프로그램 등을 만들어 스크래핑 작업을 까다롭게 해두는 경우가 있다. (이용약관에 명시해 두기도 한다.)

민감한 사항이기 때문에 스크래핑한 내용을 상업적으로 사용할 경우 법적 다툼이 벌어질 가능성도 있다.

교육 목적 이외에 상업적인 서비스를 만들 생각이라면 참고하자.


- URL Formatting

- Requests

- [HTTP Status Codes](https://developer.mozilla.org/ko/docs/Web/HTTP/Status)

- dictionary

- .startswith

- f"{}"

- find_all : list

- find : value (처음 일치하는 요소 하나, 없으면 None)

---

## extract_jobs

- search_term 을 parameter 로 받고, dict 데이터를 담은 리스트(results)를 return 하는 함수 생성


In [1]:
from requests import get 
from bs4 import BeautifulSoup

# extract 폴더, wwr.py 파일, extract_jobs 함수 import

from extract.wwr import extract_jobs

jobs = extract_jobs('python')
print(jobs)

[{'position': 'Full-Time', 'company': 'OpenCraft', 'location': 'Anywhere in the World', 'link': 'https://weworkremotely.com/remote-jobs/opencraft-senior-open-source-developer-devops-python-django-react-aws-openstack'}, {'position': 'Full-Time', 'company': 'Close', 'location': 'USA Only', 'link': 'https://weworkremotely.com/remote-jobs/close-senior-staff-software-engineer-backend-python-usa-100-remote'}, {'position': 'Full-Time', 'company': 'Proxify AB', 'location': 'Latin America Only/Europe Only/EMEA Only', 'link': 'https://weworkremotely.com/remote-jobs/proxify-ab-senior-backend-python-engineer-long-term-job-100-remote'}]


---

## get_page_count

In [6]:
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options


def get_page_count(keyword, max_pages=5):
    """Return how many Indeed result pages should be scraped for ``keyword``.

    Loads the first search-results page in Chrome, counts the direct ``div``
    children of the pagination ``nav`` element, and caps that count at
    ``max_pages``.

    Args:
        keyword: Search term placed in the ``q`` query parameter.
        max_pages: Upper bound on the returned page count. Defaults to 5,
            the limit this function has always applied.

    Returns:
        1 when the pagination element is missing or has no entries,
        otherwise the number of entries, capped at ``max_pages``.
    """
    base_url = "https://kr.indeed.com/jobs"

    options = Options()
    browser = webdriver.Chrome(options=options)
    try:
        browser.get(f"{base_url}?q={keyword}")
        soup = BeautifulSoup(browser.page_source, "html.parser")
    finally:
        # Only the page source is needed; quit the browser even when loading
        # fails so no Chrome process is left running.
        browser.quit()

    # NOTE(review): these look like auto-generated CSS class names and may
    # change when the site is redeployed - confirm against the live page.
    pagination = soup.find('nav', class_='css-jbuxu0 ecydgvn0')
    if pagination is None:
        # No pagination element found; presumably there is only one page
        # of results, so treat it the same as an empty pagination bar.
        return 1

    pages = pagination.find_all('div', recursive=False)
    if not pages:
        return 1

    # Pages beyond the cap are intentionally not processed.
    return min(len(pages), max_pages)

---

## extract_indeed_jobs


- 함수 내에서 크롬 브라우저를 띄워서 실행


```python
options = Options()
browser = webdriver.Chrome(options=options)
```


In [4]:
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options


def extract_indeed_jobs(keyword):
    """Scrape Indeed (kr.indeed.com) job postings for ``keyword``.

    Asks ``get_page_count`` how many result pages to visit, loads each page
    in a single shared Chrome session, and collects one dict per job card.

    Args:
        keyword: Search term placed in the ``q`` query parameter.

    Returns:
        A list of dicts with the keys ``position``, ``link``, ``company``
        and ``location``. ``company`` / ``location`` are ``None`` when the
        matching element is missing from a job card.
    """
    pages = get_page_count(keyword)
    base_url = "https://kr.indeed.com/jobs"
    results = []

    # One browser is reused for every page instead of launching (and never
    # closing) a new Chrome instance per page.
    options = Options()
    browser = webdriver.Chrome(options=options)
    try:
        for page in range(pages):
            # The "start" offset advances by 10 per page.
            final_url = f"{base_url}?q={keyword}&start={page*10}"

            print(f'Request url : {final_url}')
            browser.get(final_url)

            # --- BeautifulSoup parsing of the rendered page
            soup = BeautifulSoup(browser.page_source, "html.parser")

            job_list = soup.find("ul", class_="jobsearch-ResultsList")
            if job_list is None:
                # No results list on this page (layout change, block page,
                # ...): skip it instead of raising AttributeError.
                continue

            for job in job_list.find_all('li', recursive=False):
                # <li> entries containing a "mosaic-zone" div are not job
                # cards; only entries without one are processed.
                if job.find("div", class_="mosaic-zone") is not None:
                    continue

                # select_one returns a single element (or None), whose
                # attributes are read with dict-style indexing.
                anchor = job.select_one('h2 a')
                if anchor is None:
                    continue

                title = anchor['aria-label']
                link = anchor['href']

                company = job.find('span', class_='companyName')
                location = job.find('div', class_='companyLocation')

                # One dict per job posting.
                job_data = {
                    'position': title,
                    'link': f"https://kr.indeed.com{link}",
                    'company': company.string if company is not None else None,
                    'location': location.string if location is not None else None
                }

                results.append(job_data)
    finally:
        # Close Chrome even if a page fails to load or parse.
        browser.quit()

    return results

In [7]:
jobs = extract_indeed_jobs('python')
print(jobs)

Request url : https://kr.indeed.com/jobs?q=python&start=0
Request url : https://kr.indeed.com/jobs?q=python&start=10
Request url : https://kr.indeed.com/jobs?q=python&start=20
Request url : https://kr.indeed.com/jobs?q=python&start=30
Request url : https://kr.indeed.com/jobs?q=python&start=40
[{'position': '2023 Software Engineering Intern, People with Disabilities (장애인 채용)의 전체 세부 정보', 'link': 'https://kr.indeed.com/rc/clk?jk=ed7b7e6eb72ffab8&fccid=a5b4499d9e91a5c6&vjs=3', 'company': 'Google', 'location': '서울'}, {'position': '2023 Student Training in Engineering Program (STEP), People with Disabilities (장애인 채용)의 전체 세부 정보', 'link': 'https://kr.indeed.com/rc/clk?jk=5d24a062cabc9f0d&fccid=a5b4499d9e91a5c6&vjs=3', 'company': 'Google', 'location': '서울'}, {'position': '[송파구] 데이터분석(Python,R,모델,ML)의 전체 세부 정보', 'link': 'https://kr.indeed.com/rc/clk?jk=a0f0a1cb28e20f8c&fccid=59060ddcdf96b101&vjs=3', 'company': '시니어앤파트너즈', 'location': '서울 송파구'}, {'position': '인공지능 관리 및 개발자 (Python, yolo 사용자 우대)의 

In [8]:
print(len(jobs))

75


- 첫 번째 페이지로 이동한 후 이후 페이지가 몇 개 있는지 확인하고, 최대 5페이지까지만 처리하도록 제한


---

## Refactoring

- 함수 처리

In [10]:
# 폴더.파일.py import 함수
# 리스트 리턴,(리스트내 데이터 dict)

from extract.indeed import extract_indeed_jobs
from extract.wwr import extract_jobs

wwr = extract_jobs('python')
indeed = extract_indeed_jobs('python')

jobs = wwr + indeed


Request url : https://kr.indeed.com/jobs?q=python&start=0
Request url : https://kr.indeed.com/jobs?q=python&start=10
Request url : https://kr.indeed.com/jobs?q=python&start=20
Request url : https://kr.indeed.com/jobs?q=python&start=30
Request url : https://kr.indeed.com/jobs?q=python&start=40


In [11]:
len(jobs) # 3 + 15 * 5 

78

In [12]:
print(jobs)

[{'position': 'Full-Time', 'company': 'OpenCraft', 'location': 'Anywhere in the World', 'link': 'https://weworkremotely.com/remote-jobs/opencraft-senior-open-source-developer-devops-python-django-react-aws-openstack'}, {'position': 'Full-Time', 'company': 'Close', 'location': 'USA Only', 'link': 'https://weworkremotely.com/remote-jobs/close-senior-staff-software-engineer-backend-python-usa-100-remote'}, {'position': 'Full-Time', 'company': 'Proxify AB', 'location': 'Latin America Only/Europe Only/EMEA Only', 'link': 'https://weworkremotely.com/remote-jobs/proxify-ab-senior-backend-python-engineer-long-term-job-100-remote'}, {'position': '2023 Software Engineering Intern  People with Disabilities (장애인 채용)의 전체 세부 정보', 'company': 'Google', 'location': '서울', 'link': 'https://kr.indeed.com/rc/clk?jk=ed7b7e6eb72ffab8&fccid=a5b4499d9e91a5c6&vjs=3'}, {'position': '2023 Student Training in Engineering Program (STEP)  People with Disabilities (장애인 채용)의 전체 세부 정보', 'company': 'Google', 'location':

---

## 사용자 입력 및 파일 처리


- csv 파일 생성 

- 파일 open 및 write



In [14]:
keyword = input('what do you want to search for? ')

what do you want to search for? python


In [2]:
from extract.indeed import extract_indeed_jobs
from extract.wwr import extract_jobs

wwr = extract_jobs(keyword)
indeed = extract_indeed_jobs(keyword)

jobs = wwr + indeed

Request url : https://kr.indeed.com/jobs?q=python&start=0
Request url : https://kr.indeed.com/jobs?q=python&start=10
Request url : https://kr.indeed.com/jobs?q=python&start=20
Request url : https://kr.indeed.com/jobs?q=python&start=30
Request url : https://kr.indeed.com/jobs?q=python&start=40


In [15]:
# 입력 받은 키워드 이름으로 파일 오픈, 없으면 새로 생성
# Open the file named after the entered keyword (created if it does not
# exist). "utf-8-sig" writes a UTF-8 byte-order mark at the start of the file.
# The with-block closes the file even if a write fails; a later
# file.close() on the already-closed file is a harmless no-op.
with open(f"{keyword}.csv", "w", encoding="utf-8-sig") as file:
    # Header row, terminated with a newline.
    file.write("Position,Company,Location,URL\n")

    # jobs is a list; each job is a dict.
    for job in jobs:
        file.write(f"{job['position']},{job['company']},{job['location']},{job['link']}\n")

In [16]:
file.close()

---

## 쉼표 (`,`) 처리

In [9]:
'hello,world, happy,toady'.replace(',', ' ')

'hello world  happy toady'

- 딕셔너리 내 데이터가 `,` 를 포함하고 있음. `,` 는 csv 파일에서 열 구분자이므로, 값 안에 `,` 가 있으면 csv 파일의 열이 잘못 나뉘는 문제 발생
    - ex) '구글, 서울' > 구글 입력후 `,` 로 자동 열 분리 
    
- extract 함수 내에서 `,`가 있다면, 공백으로 처리 하여 해결
```python
                job_data = {
                    'position': title.replace(",", " "),
                    'company': company.string.replace(",", " "),
                    'location': location.string.replace(",", " "),
                    'link': f"https://kr.indeed.com{link}"

                }
```



---

## Refactoring

- 사용자 입력 , 파일 생성 까지 모두 함수화

In [17]:
from extract.indeed import extract_indeed_jobs
from extract.wwr import extract_jobs
from extract.file import save_to_file

keyword = input('what do you want to search for ? ')

wwr = extract_jobs(keyword)
indeed = extract_indeed_jobs(keyword)

jobs = wwr + indeed

save_to_file(keyword,jobs)


what do you want to search for ? python
Request url : https://kr.indeed.com/jobs?q=python&start=0
Request url : https://kr.indeed.com/jobs?q=python&start=10
Request url : https://kr.indeed.com/jobs?q=python&start=20
Request url : https://kr.indeed.com/jobs?q=python&start=30
Request url : https://kr.indeed.com/jobs?q=python&start=40


---