# Python으로 웹 스크래퍼 만들기

사이트 운영자들은 웹 스크래핑을 매우 싫어한다.

그래서 봇 인증 프로그램 등을 만들어 스크래핑 작업을 까다롭게 해두는 경우가 있다. (이용약관에 명시해 두기도 한다.)

민감한 사항이기 때문에 스크래핑한 내용을 상업적으로 사용할 경우 법적 다툼이 벌어질 가능성도 있다.

교육 목적 이외에 상업적인 서비스를 만들 생각이라면 참고하자.

- https://wikidocs.net/82607


- URL Formatting

- Requests

- [HTTP Status Codes](https://developer.mozilla.org/ko/docs/Web/HTTP/Status)

- dictionary

- .startswith

- f"{}"

- find_all : list

- find : v

In [1]:
from requests import get

# Sites to probe. Entries without a scheme are normalized to https:// below.
websites = [
    "google.com",
    "https://httpstat.us/502",
    "https://httpstat.us/404",
    "https://httpstat.us/300",
    "https://httpstat.us/200",
    "https://httpstat.us/101",
]

# Maps each normalized URL to a label describing its status-code class.
results = {}

for website in websites:
    # Prepend a scheme only when none is present. Checking for "https://"
    # alone would turn "http://example.com" into "https://http://example.com".
    if not website.startswith(("http://", "https://")):
        website = f"https://{website}"

    code = get(website).status_code

    # Thresholds are tested from highest to lowest, so the first match wins.
    if code >= 500:
        results[website] = "5xx / server error"
    elif code >= 400:
        results[website] = "4xx / client error"
    elif code >= 300:
        results[website] = "3xx / redirection "
    elif code >= 200:
        results[website] = "2xx / successful"
    elif code >= 100:
        results[website] = "1xx / informational response"

print(results)

{'https://google.com': '2xx / successful', 'https://httpstat.us/502': '5xx / server error', 'https://httpstat.us/404': '4xx / client error', 'https://httpstat.us/300': '3xx / redirection ', 'https://httpstat.us/200': '2xx / successful', 'https://httpstat.us/101': '1xx / informational response'}


---

In [2]:
from requests import get 

# Search endpoint; the search keyword is appended as the `term` query value.
base_url = "https://weworkremotely.com/remote-jobs/search?utf8=✓&term="
search_term = "python"

# Fetch the search results page and dump the raw HTML it returned.
response = get(base_url + search_term)
print(response.text)

<!DOCTYPE html><html><head><meta charset=utf-8 /><script>window.NREUM||(NREUM={});NREUM.info={"beacon":"bam.nr-data.net","errorBeacon":"bam.nr-data.net","licenseKey":"f7ae79e7ca","applicationID":"192262830","transactionName":"d1gPFhEMXVVWQxwMDVZEThEGAkNaWw==","queueTime":2,"applicationTime":56,"agent":""}</script><script>(window.NREUM||(NREUM={})).init={ajax:{deny_list:["bam.nr-data.net"]}};(window.NREUM||(NREUM={})).loader_config={licenseKey:"f7ae79e7ca",applicationID:"192262830"};;(()=>{"use strict";var e,t,n={8768:(e,t,n)=>{n.d(t,{T:()=>r,p:()=>i});const r=/(iPad|iPhone|iPod)/g.test(navigator.userAgent),i=r&&Boolean("undefined"==typeof SharedWorker)},2919:(e,t,n)=>{n.d(t,{P_:()=>h,Mt:()=>p,C5:()=>c,DL:()=>w,OP:()=>N,lF:()=>C,Yu:()=>A,Dg:()=>v,CX:()=>u,GE:()=>y,sU:()=>I});var r={};n.r(r),n.d(r,{agent:()=>x,match:()=>k,version:()=>_});var i=n(6797),o=n(4286);const a={beacon:i.ce.beacon,errorBeacon:i.ce.errorBeacon,licenseKey:void 0,applicationID:void 0,sa:void 0,queueTime:void 0,appli

In [10]:
# Show the type of the response body exposed by `.text` (a plain str).
print(type(response.text))

<class 'str'>


---

## BeautifulSoup

- response.text를 이용해 가져온 데이터는 str 형태의 html

- str 형태의 html 데이터에서 원하는 html 태그를 추출할 수 있도록 도와주는 라이브러리



- https://www.crummy.com/software/BeautifulSoup/bs4/doc/

- https://weworkremotely.com/remote-jobs/search?utf8=%E2%9C%93&term=python

- `pip3 install bs4`

- find_all

In [29]:
from requests import get 

from bs4 import BeautifulSoup


base_url = "https://weworkremotely.com/remote-jobs/search?utf8=✓&term="

search_term = "python"

response = get(f"{base_url}{search_term}")

if response.status_code != 200:
    print("Can't request website")
else:
    # Parse the HTML string from the response into a BeautifulSoup object.
    soup = BeautifulSoup(response.text, "html.parser")

    # Every <section> whose class is "jobs". find_all returns a
    # bs4.element.ResultSet, which behaves like a list.
    jobs = soup.find_all('section', class_="jobs")
    print(len(jobs))
    print()

    for job_section in jobs:
        # Keep every <li> except the "view-all" button row. Tag.get() is used
        # instead of post['class'] so an <li> without a class attribute is
        # kept rather than raising KeyError. The result is a plain list.
        job_posts = [
            post
            for post in job_section.find_all('li')
            if 'view-all' not in post.get('class', [])
        ]

        for post in job_posts:
            print(post)
            print()
            

2

<li class="feature"><div class="highlight-bar"></div><div class="tooltip"><a href="/company/opencraft"><div alt="OpenCraft is hiring a remote Senior Open Source Developer &amp; DevOps (Python, Django, React, AWS/OpenStack) at We Work Remotely." class="flag-logo" loading="lazy" style="background-image:url(https://we-work-remotely.imgix.net/logos/0018/2699/logo.gif?ixlib=rails-4.0.0&amp;w=50&amp;h=50&amp;dpr=2&amp;fit=fill&amp;auto=compress)"></div><span class="tooltiptext">View Company Profile</span></a></div><a href="/remote-jobs/opencraft-senior-open-source-developer-devops-python-django-react-aws-openstack"><span class="company">OpenCraft</span><br/><span class="title">Senior Open Source Developer &amp; DevOps (Python, Django, React, AWS/OpenStack)</span><span class="featured">featured</span><br/><span class="company">Full-Time</span><span>/</span><span class="region company">Anywhere in the World</span></a></li>

<li class="feature"><div class="highlight-bar"></div><div class="to

- jobs 클래스 인 모든 section 에 접근 가능

- 'class_' argument 로 사용해야 함 

- li 안에 있는 각 항목

---

In [16]:
from requests import get 

from bs4 import BeautifulSoup


base_url = "https://weworkremotely.com/remote-jobs/search?utf8=✓&term="

search_term = "python"

response = get(f"{base_url}{search_term}")

if response.status_code != 200:
    print("Can't request website")
else:
    soup = BeautifulSoup(response.text, "html.parser")

    # Every <section> whose class is "jobs".
    jobs = soup.find_all('section', class_="jobs")

    for job_section in jobs:
        # Keep every <li> except the "view-all" button row. Tag.get() avoids
        # a KeyError for an <li> that has no class attribute.
        job_posts = [
            post
            for post in job_section.find_all('li')
            if 'view-all' not in post.get('class', [])
        ]

        # Print all <a> tags found inside each post.
        for post in job_posts:
            anchors = post.find_all('a')

            print(anchors)
            print()
            

[<a href="/company/opencraft"><div alt="OpenCraft is hiring a remote Senior Open Source Developer &amp; DevOps (Python, Django, React, AWS/OpenStack) at We Work Remotely." class="flag-logo" loading="lazy" style="background-image:url(https://we-work-remotely.imgix.net/logos/0018/2699/logo.gif?ixlib=rails-4.0.0&amp;w=50&amp;h=50&amp;dpr=2&amp;fit=fill&amp;auto=compress)"></div><span class="tooltiptext">View Company Profile</span></a>, <a href="/remote-jobs/opencraft-senior-open-source-developer-devops-python-django-react-aws-openstack"><span class="company">OpenCraft</span><br/><span class="title">Senior Open Source Developer &amp; DevOps (Python, Django, React, AWS/OpenStack)</span><span class="date"><time data-format="%b %e" data-local="time" datetime="2023-04-29T20:46:50Z">Apr 29</time></span><br/><span class="company">Full-Time</span><span>/</span><span class="region company">Anywhere in the World</span></a>]

[<a href="/company/proxify-ab"><div alt="Proxify AB is hiring a remote Sen

---

In [39]:
from requests import get 

from bs4 import BeautifulSoup


base_url = "https://weworkremotely.com/remote-jobs/search?utf8=✓&term="

search_term = "python"

response = get(f"{base_url}{search_term}")

if response.status_code != 200:
    print("Can't request website")
else:
    soup = BeautifulSoup(response.text, "html.parser")

    # Every <section> whose class is "jobs".
    jobs = soup.find_all('section', class_="jobs")

    for job_section in jobs:
        # Keep every <li> except the "view-all" button row. Tag.get() avoids
        # a KeyError for an <li> that has no class attribute.
        job_posts = [
            post
            for post in job_section.find_all('li')
            if 'view-all' not in post.get('class', [])
        ]

        for post in job_posts:
            anchors = post.find_all('a')

            # The second <a> is the one holding the job details; the first
            # links to the company profile.
            anchor = anchors[1]
            print(anchor)
            print()
            

<a href="/remote-jobs/opencraft-senior-open-source-developer-devops-python-django-react-aws-openstack"><span class="company">OpenCraft</span><br/><span class="title">Senior Open Source Developer &amp; DevOps (Python, Django, React, AWS/OpenStack)</span><span class="featured">featured</span><br/><span class="company">Full-Time</span><span>/</span><span class="region company">Anywhere in the World</span></a>

<a href="/remote-jobs/proxify-ab-senior-python-engineer-long-term-job-100-remote"><span class="company">Proxify AB</span><br/><span class="title">Senior Python Engineer: Long-term job - 100% remote</span><span class="date"><time data-format="%b %e" data-local="time" datetime="2023-04-24T09:27:26Z">Apr 24</time></span><br/><span class="company">Full-Time</span><span>/</span><span class="region company">Anywhere in the World</span></a>

<a href="/remote-jobs/lemon-io-full-stack-python-developer"><span class="company">Lemon.io</span><br/><span class="title">Full-stack Python Developer 

<img src = '1.png' >

---

In [42]:
from requests import get 

from bs4 import BeautifulSoup


base_url = "https://weworkremotely.com/remote-jobs/search?utf8=✓&term="

search_term = "python"

response = get(f"{base_url}{search_term}")

if response.status_code != 200:
    print("Can't request website")
else:
    soup = BeautifulSoup(response.text, "html.parser")

    # Every <section> whose class is "jobs".
    jobs = soup.find_all('section', class_="jobs")

    for job_section in jobs:
        # Keep every <li> except the "view-all" button row. Tag.get() avoids
        # a KeyError for an <li> that has no class attribute.
        job_posts = [
            post
            for post in job_section.find_all('li')
            if 'view-all' not in post.get('class', [])
        ]

        for post in job_posts:
            anchors = post.find_all('a')

            # The second <a> is the one holding the job details.
            anchor = anchors[1]

            # Print the (site-relative) href of the job link.
            print(anchor['href'])
            print()
            

/remote-jobs/opencraft-senior-open-source-developer-devops-python-django-react-aws-openstack

/remote-jobs/proxify-ab-senior-python-engineer-long-term-job-100-remote

/remote-jobs/lemon-io-full-stack-python-developer

/remote-jobs/onthegosystems-senior-backend-developers-for-mass-market-project-ror-python-are-a-plus

/remote-jobs/close-senior-staff-software-engineer-backend-python-usa-100-remote



---

- span 내 데이터 가져오기

In [22]:
from requests import get 

from bs4 import BeautifulSoup


base_url = "https://weworkremotely.com/remote-jobs/search?utf8=✓&term="

search_term = "python"

response = get(f"{base_url}{search_term}")

if response.status_code != 200:
    print("Can't request website")
else:
    soup = BeautifulSoup(response.text, "html.parser")

    # Every <section> whose class is "jobs".
    jobs = soup.find_all('section', class_="jobs")

    for job_section in jobs:
        # Keep every <li> except the "view-all" button row. Tag.get() avoids
        # a KeyError for an <li> that has no class attribute.
        job_posts = [
            post
            for post in job_section.find_all('li')
            if 'view-all' not in post.get('class', [])
        ]

        for post in job_posts:
            anchors = post.find_all('a')

            # The second <a> is the one holding the job details.
            anchor = anchors[1]
            link = anchor['href']

            # The three <span class="company"> tags are unpacked in document
            # order: company name, position type, region.
            company, position, region = anchor.find_all('span', class_='company')

            title = anchor.find('span', class_='title')

            print(company, position, region, title)
            print()
            

<span class="company">OpenCraft</span> <span class="company">Full-Time</span> <span class="region company">Anywhere in the World</span> <span class="title">Senior Open Source Developer &amp; DevOps (Python, Django, React, AWS/OpenStack)</span>

<span class="company">Proxify AB</span> <span class="company">Full-Time</span> <span class="region company">Anywhere in the World</span> <span class="title">Senior Python Engineer: Long-term job - 100% remote</span>

<span class="company">Lemon.io</span> <span class="company">Full-Time</span> <span class="region company">Latin America Only/Europe Only/UK Only/Canada Only</span> <span class="title">Full-stack Python Developer </span>

<span class="company">OnTheGoSystems</span> <span class="company">Full-Time</span> <span class="region company">Anywhere in the World</span> <span class="title">Senior Backend Developers for Mass-Market Project (RoR / Python are a plus!)</span>

<span class="company">Close</span> <span class="company">Full-Time</spa

---

- 태그 제거 text 형태만 추출 `.string`

In [23]:
from requests import get 

from bs4 import BeautifulSoup


base_url = "https://weworkremotely.com/remote-jobs/search?utf8=✓&term="

search_term = "python"

response = get(f"{base_url}{search_term}")

if response.status_code != 200:
    print("Can't request website")
else:
    soup = BeautifulSoup(response.text, "html.parser")

    # Every <section> whose class is "jobs".
    jobs = soup.find_all('section', class_="jobs")

    for job_section in jobs:
        # Keep every <li> except the "view-all" button row. Tag.get() avoids
        # a KeyError for an <li> that has no class attribute.
        job_posts = [
            post
            for post in job_section.find_all('li')
            if 'view-all' not in post.get('class', [])
        ]

        for post in job_posts:
            anchors = post.find_all('a')

            # The second <a> is the one holding the job details.
            anchor = anchors[1]
            link = anchor['href']

            # The three <span class="company"> tags are unpacked in document
            # order: company name, position type, region.
            company, position, region = anchor.find_all('span', class_='company')

            title = anchor.find('span', class_='title')

            # `.string` yields the text inside each tag, without the markup.
            print(company.string, position.string, region.string, title.string)
            print()
            

OpenCraft Full-Time Anywhere in the World Senior Open Source Developer & DevOps (Python, Django, React, AWS/OpenStack)

Proxify AB Full-Time Anywhere in the World Senior Python Engineer: Long-term job - 100% remote

Lemon.io Full-Time Latin America Only/Europe Only/UK Only/Canada Only Full-stack Python Developer 

OnTheGoSystems Full-Time Anywhere in the World Senior Backend Developers for Mass-Market Project (RoR / Python are a plus!)

Close Full-Time USA Only Senior/Staff Software Engineer - Backend/Python - USA (100% Remote)



---

- dict 

In [27]:
from requests import get 

from bs4 import BeautifulSoup


base_url = "https://weworkremotely.com/remote-jobs/search?utf8=✓&term="

search_term = "python"

response = get(f"{base_url}{search_term}")

if response.status_code != 200:
    print("Can't request website")
else:
    soup = BeautifulSoup(response.text, "html.parser")

    # Every <section> whose class is "jobs".
    jobs = soup.find_all('section', class_="jobs")

    # One dict per job post, collected across all sections.
    results = []

    for job_section in jobs:
        # Keep every <li> except the "view-all" button row. Tag.get() avoids
        # a KeyError for an <li> that has no class attribute.
        job_posts = [
            post
            for post in job_section.find_all('li')
            if 'view-all' not in post.get('class', [])
        ]

        for post in job_posts:
            anchors = post.find_all('a')

            # The second <a> is the one holding the job details.
            anchor = anchors[1]
            link = anchor['href']

            # The three <span class="company"> tags are unpacked in document
            # order: company name, position type, region.
            company, position, region = anchor.find_all('span', class_='company')

            title = anchor.find('span', class_='title')

            # Store only the text of each tag, not the tag itself.
            data_result = {
                'title': title.string,
                'company': company.string,
                'position': position.string,
                'region': region.string,
            }

            results.append(data_result)

    print(results)

            

[{'title': 'Senior Open Source Developer & DevOps (Python, Django, React, AWS/OpenStack)', 'company': 'OpenCraft', 'position': 'Full-Time', 'region': 'Anywhere in the World'}, {'title': 'Senior Python Engineer: Long-term job - 100% remote', 'company': 'Proxify AB', 'position': 'Full-Time', 'region': 'Anywhere in the World'}, {'title': 'Full-stack Python Developer ', 'company': 'Lemon.io', 'position': 'Full-Time', 'region': 'Latin America Only/Europe Only/UK Only/Canada Only'}, {'title': 'Senior Backend Developers for Mass-Market Project (RoR / Python are a plus!)', 'company': 'OnTheGoSystems', 'position': 'Full-Time', 'region': 'Anywhere in the World'}, {'title': 'Senior/Staff Software Engineer - Backend/Python - USA (100% Remote)', 'company': 'Close', 'position': 'Full-Time', 'region': 'USA Only'}]


---

## ~ 5.9

In [33]:
from requests import get 

from bs4 import BeautifulSoup


base_url = "https://weworkremotely.com/remote-jobs/search?utf8=✓&term="

search_term = "java"

response = get(f"{base_url}{search_term}")

if response.status_code != 200:
    print("Can't request website")
else:
    soup = BeautifulSoup(response.text, "html.parser")

    # Every <section> whose class is "jobs".
    jobs = soup.find_all('section', class_="jobs")

    # One dict per job post, collected across all sections.
    results = []

    for job_section in jobs:
        # Keep every <li> except the "view-all" button row. Tag.get() avoids
        # a KeyError for an <li> that has no class attribute.
        job_posts = [
            post
            for post in job_section.find_all('li')
            if 'view-all' not in post.get('class', [])
        ]

        for post in job_posts:
            # List-like ResultSet of the <a> tags inside this post.
            anchors = post.find_all('a')

            # The second <a> Tag is the one holding the job details.
            anchor = anchors[1]

            # A Tag's attributes are read with dict-style indexing.
            link = anchor['href']

            # The three <span class="company"> tags inside the anchor are
            # unpacked in document order: company name, position type, region.
            company, position, region = anchor.find_all('span', class_='company')

            title = anchor.find('span', class_='title')

            # Keep only the text of each tag and turn the site-relative href
            # into an absolute URL.
            data_result = {
                'title': title.string,
                'company': company.string,
                'position': position.string,
                'region': region.string,
                'link': f'https://weworkremotely.com{link}',
            }

            results.append(data_result)

    for result in results:
        print(result)

            

{'title': 'Java Developer (Full-Stack)', 'company': 'Simvoly', 'position': 'Full-Time', 'region': 'Anywhere in the World', 'link': 'https://weworkremotely.com/remote-jobs/simvoly-java-developer-full-stack-6'}
{'title': 'Backend Javascript / Node.js Developer - Remote/SaaS (m/f/d)', 'company': 'SocialHub', 'position': 'Full-Time', 'region': 'Anywhere in the World', 'link': 'https://weworkremotely.com/remote-jobs/socialhub-backend-javascript-node-js-developer-remote-saas-m-f-d-2'}
{'title': 'Full Stack JavaScript/TypeScript Engineer', 'company': 'WeatherWorks', 'position': 'Full-Time', 'region': 'North America Only/Latin America Only/Europe Only/Africa Only', 'link': 'https://weworkremotely.com/remote-jobs/weatherworks-full-stack-javascript-typescript-engineer'}
{'title': 'Software Developer (PHP, Javascript, AWS) ', 'company': 'Tixel', 'position': 'Full-Time', 'region': 'Europe Only/UK Only', 'link': 'https://weworkremotely.com/remote-jobs/tixel-software-developer-php-javascript-aws'}
{

---

## Refactoring

- search_term 을 parameter 로 받고, dictionary 들을 담은 list results 를 return 하는
함수 생성

In [2]:
from requests import get 

from bs4 import BeautifulSoup

from extract.wwr import extract_jobs

# Run the refactored scraper (imported from extract.wwr) for the "python"
# search term and print whatever it returns.
jobs = extract_jobs('python')
print(jobs)

[{'title': 'Senior Open Source Developer & DevOps (Python, Django, React, AWS/OpenStack)', 'company': 'OpenCraft', 'position': 'Full-Time', 'region': 'Anywhere in the World', 'link': 'https://weworkremotely.com/remote-jobs/opencraft-senior-open-source-developer-devops-python-django-react-aws-openstack'}, {'title': 'Senior Python Engineer: Long-term job - 100% remote', 'company': 'Proxify AB', 'position': 'Full-Time', 'region': 'Anywhere in the World', 'link': 'https://weworkremotely.com/remote-jobs/proxify-ab-senior-python-engineer-long-term-job-100-remote'}, {'title': 'Full-stack Python Developer ', 'company': 'Lemon.io', 'position': 'Full-Time', 'region': 'Latin America Only/Europe Only/UK Only/Canada Only', 'link': 'https://weworkremotely.com/remote-jobs/lemon-io-full-stack-python-developer'}, {'title': 'Senior Backend Developers for Mass-Market Project (RoR / Python are a plus!)', 'company': 'OnTheGoSystems', 'position': 'Full-Time', 'region': 'Anywhere in the World', 'link': 'http

---