# Python으로 웹 스크래퍼 만들기 3

사이트 운영자들은 웹 스크래핑을 매우 싫어한다.

그래서 봇 인증 프로그램 등을 만들어 스크래핑 작업을 까다롭게 해두는 경우가 있다. (이용약관에 기입해 두기도 한다.)

민감한 사항이기 때문에 스크래핑한 내용을 상업적으로 사용할 경우 법적 다툼이 벌어질 가능성도 있다.

교육 목적 이외에 상업적인 서비스를 만들 생각이라면 참고하자.


- URL Formatting

- Requests

- [HTTP Status Codes](https://developer.mozilla.org/ko/docs/Web/HTTP/Status)

- dictionary

- .startswith

- f"{}"

- find_all : list

- find : 조건에 맞는 첫 번째 태그 하나 (없으면 None)

---

## ~ 5.9

In [None]:
from requests import get 

from bs4 import BeautifulSoup


# Search endpoint of weworkremotely.com; the search term is appended to the
# query string.
base_url = "https://weworkremotely.com/remote-jobs/search?utf8=✓&term="

search_term = "java"

# A timeout stops the cell from hanging indefinitely when the site does not
# answer.
response = get(f"{base_url}{search_term}", timeout=10)

if response.status_code != 200:
    print("Can't request website")
else:
    soup = BeautifulSoup(response.text, "html.parser")

    # Every <section class="jobs"> is scanned for <li> job rows.
    jobs = soup.find_all('section', class_="jobs")

    results = []

    for job_section in jobs:
        # Leave out the "view-all" button row. Tag.get() is used so that an
        # <li> without any class attribute is kept instead of raising KeyError.
        job_posts = [
            post
            for post in job_section.find_all('li')
            if 'view-all' not in post.get('class', [])
        ]

        for post in job_posts:
            anchors = post.find_all('a')  # list of tags
            if len(anchors) < 2:
                # The posting data is read from the second anchor; a row
                # without one cannot be parsed, so it is skipped.
                continue
            anchor = anchors[1]

            # Exactly three <span class="company"> tags are expected inside
            # the anchor, read in order as company, position and region.
            company_spans = anchor.find_all('span', class_='company')
            title = anchor.find('span', class_='title')
            if len(company_spans) != 3 or title is None:
                continue
            company, position, region = company_spans

            link = anchor['href']

            # Keep only the text of each tag and turn the relative href into
            # an absolute URL before storing the row.
            data_result = {
                'title': title.string,
                'company': company.string,
                'position': position.string,
                'region': region.string,
                'link': f'https://weworkremotely.com{link}',
            }

            results.append(data_result)

    for result in results:
        print(result)

            

---

## Refactoring -1

- search_term 을 parameter 로 받고, dict 데이터로 results 를 return 하는 함수 생성


In [None]:
from requests import get 
from bs4 import BeautifulSoup

# extract 폴더, wwr.py 파일, extract_jobs 함수 import

from extract.wwr import extract_jobs

# Run the extractor imported from extract.wwr for the keyword 'python' and
# display whatever it returned.
jobs = extract_jobs('python')
print(jobs)

---

## Refactoring -2

- 함수화
- pages 추가


In [None]:
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options


def extract_indeed_jobs(keyword):
    """Print the job postings kr.indeed.com lists for ``keyword``.

    One Chrome session is opened per page reported by
    ``get_page_count(keyword)`` and is always quit again before the page is
    parsed, so no browser is left running between requests.

    NOTE: the request URL carries no ``start`` offset, so every iteration
    loads the same (first) results page.

    Args:
        keyword: Search term placed in the ``q`` query parameter.

    Returns:
        None. Each posting is printed as a dict with the keys ``position``,
        ``link``, ``company`` and ``location``, followed by an empty line.
    """
    pages = get_page_count(keyword)

    base_url = "https://kr.indeed.com/jobs"
    search_term = f'{keyword}'

    for page in range(pages):
        options = Options()
        browser = webdriver.Chrome(options=options)
        try:
            browser.get(f"{base_url}?q={search_term}")
            page_source = browser.page_source
        finally:
            # Shut the browser down even when the request fails; otherwise
            # each loop iteration leaves a Chrome instance behind.
            browser.quit()

        # --- parse the rendered HTML with BeautifulSoup
        soup = BeautifulSoup(page_source, "html.parser")

        job_list = soup.find("ul", class_="jobsearch-ResultsList")
        if job_list is None:
            # The page has no result list (find() returned None), so there
            # is nothing to parse for this iteration.
            continue

        results = []

        for job in job_list.find_all('li', recursive=False):
            # Only <li> items WITHOUT a "mosaic-zone" div are treated as job
            # postings.
            if job.find("div", class_="mosaic-zone") is not None:
                continue

            anchor = job.select_one('h2 a')
            if anchor is None:
                continue

            company = job.find('span', class_='companyName')
            location = job.find('div', class_='companyLocation')

            results.append({
                'position': anchor['aria-label'],
                'link': f"https://kr.indeed.com{anchor['href']}",
                # A missing tag yields None instead of an AttributeError.
                'company': company.string if company is not None else None,
                'location': location.string if location is not None else None,
            })

        for result in results:
            print(result)
            print()

In [None]:
# Page 1 is fetched repeatedly (5 times in the original run): the request URL
# built by this version of the function has no start offset.

extract_indeed_jobs('python')

---

`options = Options()`
`browser = webdriver.Chrome(options=options)`

를 함수 내에서 실행시키게 해야함.

- 켜져있는 상태로, 한번 더 켜지면, 봇 의심

- 함수 내에서 오픈하게 되면, 실행 후, 꺼지고, 다시 실행

- 오픈, 카운트, 클로즈

- 1~5. 오픈, 출력, 클로즈



---

## Refactoring -3

- 1 ~ 5 번페이지 start 처리

- page 에 따라 start 부분 처리 x(0), 10, 20

- start를 0처리 하면, start 없이 요청 하는것과 같은 결과가 나옴

- result 빼서 처리

In [None]:
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options


def extract_indeed_jobs(keyword):
    """Print the job postings kr.indeed.com lists for ``keyword``, page by page.

    The number of pages comes from ``get_page_count(keyword)``. Page ``n`` is
    requested with ``start=n*10``. One Chrome session is opened per page and
    is always quit again before the page is parsed.

    Args:
        keyword: Search term placed in the ``q`` query parameter.

    Returns:
        None. After all pages are processed, each posting is printed as a
        dict with the keys ``position``, ``link``, ``company`` and
        ``location``, followed by an empty line.
    """
    pages = get_page_count(keyword)

    base_url = "https://kr.indeed.com/jobs"
    search_term = f'{keyword}'

    # Shared across pages so the postings of every page end up in one list.
    results = []

    for page in range(pages):
        options = Options()
        browser = webdriver.Chrome(options=options)
        try:
            browser.get(f"{base_url}?q={search_term}&start={page*10}")
            page_source = browser.page_source
        finally:
            # Shut the browser down even when the request fails; otherwise
            # each loop iteration leaves a Chrome instance behind.
            browser.quit()

        # --- parse the rendered HTML with BeautifulSoup
        soup = BeautifulSoup(page_source, "html.parser")

        job_list = soup.find("ul", class_="jobsearch-ResultsList")
        if job_list is None:
            # The page has no result list (find() returned None), so there
            # is nothing to parse for this page.
            continue

        for job in job_list.find_all('li', recursive=False):
            # Only <li> items WITHOUT a "mosaic-zone" div are treated as job
            # postings.
            if job.find("div", class_="mosaic-zone") is not None:
                continue

            anchor = job.select_one('h2 a')
            if anchor is None:
                continue

            company = job.find('span', class_='companyName')
            location = job.find('div', class_='companyLocation')

            results.append({
                'position': anchor['aria-label'],
                'link': f"https://kr.indeed.com{anchor['href']}",
                # A missing tag yields None instead of an AttributeError.
                'company': company.string if company is not None else None,
                'location': location.string if location is not None else None,
            })

    for result in results:
        print(result)
        print()

In [None]:
# Run the paginated scraper for 'python'; the version defined in the cell
# above prints every posting it collected.
extract_indeed_jobs('python')

---

- 완전 함수로 변환

- return 리스트

In [None]:
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options


def extract_indeed_jobs(keyword):
    """Collect the job postings kr.indeed.com lists for ``keyword``.

    The number of pages comes from ``get_page_count(keyword)``. Page ``n`` is
    requested with ``start=n*10`` and the requested URL is printed. One Chrome
    session is opened per page and is always quit again before the page is
    parsed.

    Args:
        keyword: Search term placed in the ``q`` query parameter.

    Returns:
        A list of dicts, one per posting, with the keys ``position``,
        ``link``, ``company`` and ``location``. Pages without a result list
        contribute nothing.
    """
    pages = get_page_count(keyword)

    base_url = "https://kr.indeed.com/jobs"
    search_term = f'{keyword}'

    # Shared across pages so the postings of every page end up in one list.
    results = []

    for page in range(pages):
        final_url = f"{base_url}?q={search_term}&start={page*10}"

        options = Options()
        browser = webdriver.Chrome(options=options)
        try:
            print(f'Request url : {final_url}')
            browser.get(final_url)
            page_source = browser.page_source
        finally:
            # Shut the browser down even when the request fails; otherwise
            # each loop iteration leaves a Chrome instance behind.
            browser.quit()

        # --- parse the rendered HTML with BeautifulSoup
        soup = BeautifulSoup(page_source, "html.parser")

        job_list = soup.find("ul", class_="jobsearch-ResultsList")
        if job_list is None:
            # The page has no result list (find() returned None), so there
            # is nothing to parse for this page.
            continue

        for job in job_list.find_all('li', recursive=False):
            # Only <li> items WITHOUT a "mosaic-zone" div are treated as job
            # postings.
            if job.find("div", class_="mosaic-zone") is not None:
                continue

            anchor = job.select_one('h2 a')
            if anchor is None:
                continue

            company = job.find('span', class_='companyName')
            location = job.find('div', class_='companyLocation')

            results.append({
                'position': anchor['aria-label'],
                'link': f"https://kr.indeed.com{anchor['href']}",
                # A missing tag yields None instead of an AttributeError.
                'company': company.string if company is not None else None,
                'location': location.string if location is not None else None,
            })

    return results

In [None]:
# Collect the postings for 'python' and display the returned list.
jobs = extract_indeed_jobs('python')
print(jobs)

In [None]:
# Show how many postings were collected.
print(len(jobs))

- 첫번째 페이지로 이동후, 그 이후 페이지가 몇개 있는지 확인, 제약사항(5) 후 처리


---

## Refactoring -4 

- 함수 처리

In [None]:
# str.replace check: every ',' in the string is swapped for a single space.
'hello,world, happy,toady'.replace(',', ' ')

In [1]:
# Import style: from <folder>.<file> import <function>
# Each extractor returns a list (the items inside are dicts)

from extract.indeed import extract_indeed_jobs
from extract.wwr import extract_jobs

wwr = extract_jobs('python')
indeed = extract_indeed_jobs('python')

# Concatenate both result lists into one.
jobs = wwr + indeed


Request url : https://kr.indeed.com/jobs?q=python&start=0
Request url : https://kr.indeed.com/jobs?q=python&start=10
Request url : https://kr.indeed.com/jobs?q=python&start=20
Request url : https://kr.indeed.com/jobs?q=python&start=30
Request url : https://kr.indeed.com/jobs?q=python&start=40


In [None]:
# Size of the combined list.
len(jobs) # 5 + 15 * 5  (presumably 5 wwr postings + 15 per Indeed page over 5 pages; confirm)

In [None]:
# Display the combined list of postings.
print(jobs)

---

## 사용자 입력 및 파일 처리


- csv 파일 생성 

- 파일 open 및 write



In [1]:
# Ask the user for the search term; later cells read it from `keyword`.
keyword = input('what do you want to search for? ')

what do you want to search for? python


In [2]:
from extract.indeed import extract_indeed_jobs
from extract.wwr import extract_jobs

# Run both extractors with the user's keyword.
wwr = extract_jobs(keyword)
indeed = extract_indeed_jobs(keyword)

# Concatenate both result lists into one.
jobs = wwr + indeed

Request url : https://kr.indeed.com/jobs?q=python&start=0
Request url : https://kr.indeed.com/jobs?q=python&start=10
Request url : https://kr.indeed.com/jobs?q=python&start=20
Request url : https://kr.indeed.com/jobs?q=python&start=30
Request url : https://kr.indeed.com/jobs?q=python&start=40


In [3]:
import csv

# Write one CSV row per job into "<keyword>.csv".
# - `with` closes the file even if a row fails to write; a later file.close()
#   on the already-closed handle is a harmless no-op.
# - newline="" lets the csv module control line endings itself.
# - csv.writer quotes any field containing a comma, quote or line break, so
#   values such as "Google, Seoul" stay in a single column.
with open(f"{keyword}.csv", "w", encoding="utf-8-sig", newline="") as file:
    writer = csv.writer(file, lineterminator="\n")

    # Header row.
    writer.writerow(["Position", "Company", "Location", "URL"])

    # `jobs` is a list; every `job` is a dict.
    for job in jobs:
        writer.writerow(
            [job['position'], job['company'], job['location'], job['link']]
        )

In [4]:
# Close the CSV file handle so buffered rows are flushed to disk.
file.close()

---

## 쉼표 (`,`) 처리

- 딕셔너리 내 데이터가 `,` 를 포함하고 있음. `,` 는 csv 파일에서 열 구분자이므로, 그대로 저장하면 값이 여러 열로 쪼개지는 문제 발생
    - ex) '구글, 서울' > 구글 입력후 `,` 로 자동 열 분리 
    
- extract 함수 내에서 `,`가 있다면, 공백으로 처리 하여 해결
```python
                job_data = {
                    'position': title.replace(",", " "),
                    'company': company.string.replace(",", " "),
                    'location': location.string.replace(",", " "),
                    'link': f"https://kr.indeed.com{link}"

                }
```



---

## Refactoring

- 파일 생성 함수화

In [5]:
from extract.indeed import extract_indeed_jobs
from extract.wwr import extract_jobs
from extract.file import save_to_file

# Ask the user for the search term.
keyword = input('what do you want to search for ? ')

# Run both extractors with that keyword.
wwr = extract_jobs(keyword)
indeed = extract_indeed_jobs(keyword)

# Concatenate both result lists into one.
jobs = wwr + indeed

# Hand the keyword and the combined list to the helper imported from extract.file.
save_to_file(keyword,jobs)


what do you want to search for? python
Request url : https://kr.indeed.com/jobs?q=python&start=0
Request url : https://kr.indeed.com/jobs?q=python&start=10
Request url : https://kr.indeed.com/jobs?q=python&start=20
Request url : https://kr.indeed.com/jobs?q=python&start=30
Request url : https://kr.indeed.com/jobs?q=python&start=40


---

## Flask

- install