# Web Crawling
- Web Scraoing with Python Chapter 3

## Traversing a Single Domain
- Six Degrees of separation: 
    - 친구의 친구를 통해 모든 사람은 6단계 이하의 사회망으로 연결
    - 6 Handshakes rule
    - Frigyes Karinthy, 1929
- http://oracleofbacon.org/
    - Kevin Bacon
    - 영화 관계인
    ![oracle_of_bacon](six_degree.png)

### http://en.wikipedia.org/wiki/Kevin_Bacon
- 페이지 소스보기에서 "a href" 검색 => 785개 검색

In [1]:
from urllib.request import urlopen
from bs4 import BeautifulSoup 

html = urlopen('http://en.wikipedia.org/wiki/Kevin_Bacon')
bs = BeautifulSoup(html, 'html.parser')
i = 1
for link in bs.find_all('a'):
    if 'href' in link.attrs:
        print(i, link.attrs['href'])
        i += 1

1 /wiki/Wikipedia:Protection_policy#semi
2 #mw-head
3 #searchInput
4 /wiki/Kevin_Bacon_(disambiguation)
5 /wiki/File:Kevin_Bacon_SDCC_2014.jpg
6 /wiki/Philadelphia
7 /wiki/Kevin_Bacon_filmography
8 /wiki/Kyra_Sedgwick
9 /wiki/Sosie_Bacon
10 #cite_note-1
11 /wiki/Edmund_Bacon_(architect)
12 /wiki/Michael_Bacon_(musician)
13 /wiki/Holly_Near
14 /wiki/Wikipedia:Citation_needed
15 http://baconbros.com/
16 #cite_note-2
17 #cite_note-actor-3
18 /wiki/Footloose_(1984_film)
19 /wiki/JFK_(film)
20 /wiki/A_Few_Good_Men
21 /wiki/Apollo_13_(film)
22 /wiki/Mystic_River_(film)
23 /wiki/Balto_(film)
24 /wiki/Sleepers
25 /wiki/The_Woodsman_(2004_film)
26 /wiki/Animal_House
27 /wiki/Diner_(1982_film)
28 /wiki/Tremors_(1990_film)
29 /wiki/Crazy,_Stupid,_Love
30 /wiki/Friday_the_13th_(1980_film)
31 /wiki/Flatliners
32 /wiki/The_River_Wild
33 /wiki/Wild_Things_(film)
34 /wiki/Stir_of_Echoes
35 /wiki/Hollow_Man
36 /wiki/Frost/Nixon_(film)
37 /wiki/X-Men:_First_Class
38 /wiki/Black_Mass_(film)
39 /wiki/Patr

긁어온 href를 다시 읽어서 또다시 href 추출하는 과정을 반복하면
결국엔 6 handshake rule에 의해 모든 링크를 가져올수있다라는 것이 크롤링의 원리

### wiki 내의 다른 항목 링크만 추출
- div id="bodyContent" 안에 있음
- URL에 ':' 이 없음 => 정규식 [^:]*
- URL이 /wiki/ 로 시작 => 정규식 ^/wiki/

In [2]:
from urllib.request import urlopen 
from bs4 import BeautifulSoup 
import re

html = urlopen('http://en.wikipedia.org/wiki/Kevin_Bacon')
bs = BeautifulSoup(html, 'html.parser')
i = 1
for link in bs.find('div', {'id':'bodyContent'}).find_all('a', href=re.compile('^/wiki/[^:]*$')):
    if 'href' in link.attrs:
        print(i, link.attrs['href'])
        i += 1

1 /wiki/Kevin_Bacon_(disambiguation)
2 /wiki/Philadelphia
3 /wiki/Kevin_Bacon_filmography
4 /wiki/Kyra_Sedgwick
5 /wiki/Sosie_Bacon
6 /wiki/Edmund_Bacon_(architect)
7 /wiki/Michael_Bacon_(musician)
8 /wiki/Holly_Near
9 /wiki/Footloose_(1984_film)
10 /wiki/JFK_(film)
11 /wiki/A_Few_Good_Men
12 /wiki/Apollo_13_(film)
13 /wiki/Mystic_River_(film)
14 /wiki/Balto_(film)
15 /wiki/Sleepers
16 /wiki/The_Woodsman_(2004_film)
17 /wiki/Animal_House
18 /wiki/Diner_(1982_film)
19 /wiki/Tremors_(1990_film)
20 /wiki/Crazy,_Stupid,_Love
21 /wiki/Friday_the_13th_(1980_film)
22 /wiki/Flatliners
23 /wiki/The_River_Wild
24 /wiki/Wild_Things_(film)
25 /wiki/Stir_of_Echoes
26 /wiki/Hollow_Man
27 /wiki/Frost/Nixon_(film)
28 /wiki/Black_Mass_(film)
29 /wiki/Patriots_Day_(film)
30 /wiki/Fox_Broadcasting_Company
31 /wiki/The_Following
32 /wiki/HBO
33 /wiki/Taking_Chance
34 /wiki/Golden_Globe_Award
35 /wiki/Screen_Actors_Guild_Award
36 /wiki/Primetime_Emmy_Award
37 /wiki/I_Love_Dick_(TV_series)
38 /wiki/Golden_G

### 1.2 위키백과 각 페이지에서 무작위 항목선택
- 난수발생기를 이용하여 페이지내의 링크에서 무작위로 링크선택 (6 handshakes rule에 의해 굳이 full-search할 필요가 없음으로)
- random.randint(0, len(links)-1)
- 난수의 seed: 시스템 현재시간, datetime.datetime.now()
- 따라가는 link (즉, link 안에 있는 link) 횟수 제한(depth)

In [3]:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import datetime
import random
import re

import pdb

random.seed(datetime.datetime.now())

def getLinks(articleUrl):
    html = urlopen('http://en.wikipedia.org'+articleUrl)
    bs = BeautifulSoup(html, 'html.parser')
    return bs.find('div', {'id':'bodyContent'}).find_all('a', href=re.compile('^/wiki/[^:]*$'))

# 시작 페이지
depth = 0
links = getLinks('/wiki/Kevin_Bacon')

while len(links) > 0 and depth < 5:      # depth제한
    #pdb.set_trace()
    newArticle = links[random.randint(0, len(links)-1)].attrs['href']
    links = getLinks(newArticle)
    depth +=1    
    print(depth, len(links), newArticle)

since Python 3.9 and will be removed in a subsequent version. The only 
supported seed types are: None, int, float, str, bytes, and bytearray.
  random.seed(datetime.datetime.now())


1 383 /wiki/E!_People%27s_Choice_Awards
2 196 /wiki/Roma_Downey
3 435 /wiki/Order_of_the_British_Empire
4 124 /wiki/Merchant_Navy_Medal_for_Meritorious_Service
5 642 /wiki/Order_of_the_Garter


## 전체 사이트 crawling
- 사이트의 모든 페이지를 분류, 검색
    - site map 생성
    - data 수집
- 검색된 페이지를 저장할 데이터베이스 필요
- 같은 페이지는 한번만 crawling => set 활용

In [4]:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re

pages = set()
depth = 0

def getLinks(pageUrl):
    global pages
    global depth
    html = urlopen('http://ko.wikipedia.org{}'.format(pageUrl))
    bs = BeautifulSoup(html, 'html.parser')
    links = bs.find_all('a', href=re.compile('^(/wiki/)'))
    for link in links:
        if 'href' in link.attrs:
            if link.attrs['href'] not in pages:
                # 새로운 page
                newPage = link.attrs['href']
                print(depth, len(links), "\t", newPage)
                pages.add(newPage)
                try:
                    getLinks(newPage)
                except:
                    print('no page')
                    
                depth += 1
                if (depth > 2):
                    break
getLinks('')

0 167 	 /wiki/%EC%9C%84%ED%82%A4%EB%B0%B1%EA%B3%BC:%EB%8C%80%EB%AC%B8
0 167 	 /wiki/%ED%8A%B9%EC%88%98:%EA%B2%80%EC%83%89
0 20 	 /wiki/%EB%8F%84%EC%9B%80%EB%A7%90:%EC%86%8C%EA%B0%9C
0 45 	 /wiki/%ED%8A%B9%EC%88%98:%EB%82%B4%EC%82%AC%EC%9A%A9%EC%9E%90%ED%86%A0%EB%A1%A0
0 23 	 /wiki/%ED%8A%B9%EC%88%98:%EB%82%B4%EA%B8%B0%EC%97%AC
0 26 	 /wiki/%ED%8A%B9%EC%88%98:%EC%B5%9C%EA%B7%BC%EB%B0%94%EB%80%9C
0 273 	 /wiki/%ED%8F%AC%ED%84%B8:%EC%9A%94%EC%A6%98_%ED%99%94%EC%A0%9C
0 251 	 /wiki/%ED%8A%B9%EC%88%98:%EC%9E%84%EC%9D%98%EB%AC%B8%EC%84%9C
0 50 	 /wiki/%EC%9C%84%ED%82%A4%EB%B0%B1%EA%B3%BC:%EC%82%AC%EB%9E%91%EB%B0%A9
0 234 	 /wiki/%EC%9C%84%ED%82%A4%EB%B0%B1%EA%B3%BC:%EC%82%AC%EC%9A%A9%EC%9E%90_%EB%AA%A8%EC%9E%84
0 246 	 /wiki/%EC%9C%84%ED%82%A4%EB%B0%B1%EA%B3%BC:%EC%9A%94%EC%B2%AD
0 54 	 /wiki/%EC%9C%84%ED%82%A4%EB%B0%B1%EA%B3%BC:%EB%8F%84%EC%9B%80%EB%A7%90
0 92 	 /wiki/%EC%9C%84%ED%82%A4%EB%B0%B1%EA%B3%BC:%EC%A0%95%EC%B1%85%EA%B3%BC_%EC%A7%80%EC%B9%A8
0 206 	 /wiki/%EC%9C%84%ED%82%A4%EB%B0%B

### 전체 사이트에서 Data 수집
- 제목, 첫번째 문단, 편집페이지링크
- 제목: h1 tag (페이지 당 하나만 있음)
- 텍스트 body: div#bodyContent tag
- 첫번째 문단: div#mw-content-text → p
- 편집페이지링크: li#ca-edit → span → a 아래 li#ca-edit tag

In [5]:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re

pages = set()
depth = 0

def getLinks(pageUrl):
    global pages
    global depth
    html = urlopen('http://en.wikipedia.org{}'.format(pageUrl))
#    html = urlopen('http://ko.wikipedia.org{}'.format(pageUrl))
    bs = BeautifulSoup(html, 'html.parser')
    try:
        print(bs.h1.get_text())
        #print(bs.find(id ='mw-content-text').find_all('p')[0])
        #print(bs.find(id='ca-edit').find('span').find('a').attrs['href'])
    except AttributeError:
        print('This page is missing something! Continuing.')
    
    for link in bs.find_all('a', href=re.compile('^/wiki/[^:]*$')):
        if 'href' in link.attrs:
            if link.attrs['href'] not in pages:
                #We have encountered a new page
                newPage = link.attrs['href']
                print('-'*20)
                print(newPage)
                pages.add(newPage)
                getLinks(newPage)
                depth += 1
                if (depth > 1):
                    break
getLinks('') 

Main Page
--------------------
/wiki/Wikipedia
Wikipedia
--------------------
/wiki/Main_Page
Main Page
--------------------
/wiki/Free_content
Free content
--------------------
/wiki/Definition_of_Free_Cultural_Works
Definition of Free Cultural Works
--------------------
/wiki/Free_content_movement
Free-culture movement
--------------------
/wiki/Lawrence_Lessig
Lawrence Lessig
--------------------
/wiki/Lawrence_Lessing
Lawrence Lessing
--------------------
/wiki/Science_writer
Science journalism
--------------------
/wiki/Scientific_writing
Scientific writing
--------------------
/wiki/Science_journalism
Science journalism
--------------------
/wiki/Science_communication
Science communication
--------------------
/wiki/Scientific_literature
Scientific literature
--------------------
/wiki/Medical_literature
Medical literature
--------------------
/wiki/Edwin_Smith_Papyrus
Edwin Smith Papyrus
--------------------
/wiki/New_York_Academy_of_Medicine
New York Academy of Medicine
-------

Order of St. Gregory the Great
--------------------
/wiki/Sylvestrines
Sylvestrines
--------------------
/wiki/Anno_Domini
Anno Domini
--------------------
/wiki/Anno_Domini_(disambiguation)
Anno Domini (disambiguation)
--------------------
/wiki/Anno_Domini_High_Definition
Anno Domini High Definition
--------------------
/wiki/Album
Album
--------------------
/wiki/Album_(disambiguation)
Album (disambiguation)
--------------------
/wiki/Album_%E2%80%93_Generic_Flipper
Album – Generic Flipper
--------------------
/wiki/Flipper_(band)
Flipper (band)
--------------------
/wiki/Washington,_D.C.
Washington, D.C.
--------------------
/wiki/District_of_Columbia_(disambiguation)
District of Columbia (disambiguation)
--------------------
/wiki/District_of_Columbia_(until_1871)
District of Columbia (until 1871)
--------------------
/wiki/History_of_Washington,_D.C.
History of Washington, D.C.
--------------------
/wiki/List_of_capitals_in_the_United_States#National_capitals
List of capitals in 

New England town
--------------------
/wiki/Municipal_corporation
Municipal corporation
--------------------
/wiki/Local_government
Local government
--------------------
/wiki/List_of_forms_of_government
List of forms of government
--------------------
/wiki/Democracy
Democracy
--------------------
/wiki/Liberal_democracy
Liberal democracy
--------------------
/wiki/History_of_democracy
History of democracy
--------------------
/wiki/Denarius
Denarius
--------------------
/wiki/Augustus
Augustus
--------------------
/wiki/Augustus_(title)
Augustus (title)
--------------------
/wiki/Augustus_(disambiguation)
Augustus (disambiguation)
--------------------
/wiki/Augustus_(honorific)
Augustus (title)
--------------------
/wiki/Diocletian
Diocletian
--------------------
/wiki/Diocletian_(band)
Diocletian (band)
--------------------
/wiki/Auckland
Auckland
--------------------
/wiki/Auckland_Region
Auckland Region
--------------------
/wiki/Auckland_(disambiguation)
Auckland (disambiguation)

Administrative division
--------------------
/wiki/Division_(business)
Division (business)
--------------------
/wiki/Business
Business
--------------------
/wiki/Business_(disambiguation)
Business (disambiguation)
--------------------
/wiki/Goods_and_services
Goods and services
--------------------
/wiki/Capitalism
Capitalism
--------------------
/wiki/Capitalism_(disambiguation)
Capitalism (disambiguation)
--------------------
/wiki/Economic_liberalism
Economic liberalism
--------------------
/wiki/Austerity
Austerity
--------------------
/wiki/Austerity_(disambiguation)
Austerity (disambiguation)
--------------------
/wiki/Tim_Yatras
Tim Yatras
--------------------
/wiki/Wollongong,_New_South_Wales
Wollongong
--------------------
/wiki/City_of_Wollongong
City of Wollongong
--------------------
/wiki/Wollongong
Wollongong
--------------------
/wiki/New_South_Wales
New South Wales
--------------------
/wiki/New_Britain_(Canada)
New Britain (Canada)
--------------------
/wiki/New_Franc

University press
--------------------
/wiki/University_of_Cambridge
University of Cambridge
--------------------
/wiki/Coat_of_arms_of_the_University_of_Cambridge
Coat of arms of the University of Cambridge
--------------------
/wiki/Banner_of_arms
Banner of arms
--------------------
/wiki/Flag_of_Scania
Flag of Scania
--------------------
/wiki/Scania
Scania
--------------------
/wiki/Scania_AB
Scania AB
--------------------
/wiki/Sk%C3%A5ne_(disambiguation)
Skåne (disambiguation)
--------------------
/wiki/Sk%C3%A5ne_County
Skåne County
--------------------
/wiki/Counties_of_Sweden
Counties of Sweden
--------------------
/wiki/Swedish_language
Swedish language
--------------------
/wiki/Sweden
Sweden
--------------------
/wiki/Sweden_(disambiguation)
Sweden (disambiguation)
--------------------
/wiki/Sweden_(European_Parliament_constituency)
Sweden (European Parliament constituency)
--------------------
/wiki/European_Parliament_constituency
European Parliament constituency
---------

KeyboardInterrupt: 

## 인터넷 Crawling
- web site마다 layout이 다름
- 수집하려는 data 성격 (언어)
- 특정 web site 도달했을 때 조처

![internet_crawling](fig3_1.png)

In [6]:
from urllib.request import urlopen, Request
from urllib.parse import urlparse
from bs4 import BeautifulSoup
import re
import datetime
import random

import pdb

pages = set()
#random.seed(datetime.datetime.now())
depth = 0

# 내부 링크를 목록으로 만듦
def getInternalLinks(bs, includeUrl):
    includeUrl = '{}://{}'.format(urlparse(includeUrl).scheme, urlparse(includeUrl).netloc)
    internalLinks = []
    #Finds all links that begin with a "/"
    for link in bs.find_all('a', href=re.compile('^(/|.*'+includeUrl+')')):
        if link.attrs['href'] is not None:
            if link.attrs['href'] not in internalLinks:
                if(link.attrs['href'].startswith('/')):                          # begin with 설정
                    internalLinks.append(includeUrl+link.attrs['href'])
                else:
                    internalLinks.append(link.attrs['href'])
    return internalLinks
            
# 외부 링크를 목록으로 만듦
def getExternalLinks(bs, excludeUrl):
    externalLinks = []
    #Finds all links that start with "http" that do
    #not contain the current URL
         
    # 외부 링크와 내부 링크의 차이점인 상위 도메인 네임의 일치여부를 기준으로 파싱
    links = bs.find_all('a', href=re.compile('^(http|www)((?!' + excludeUrl + ').)*$'))    
        
    for link in links:
        if link.attrs['href'] is not None:
            if link.attrs['href'] not in externalLinks:               
                externalLinks.append(link.attrs['href'])
       
    return externalLinks

def getRandomExternalLink(startingPage):    
              
    html = urlopen(Request(startingPage, headers={'User-Agent': 'Mozilla/5.0'}))
    bs = BeautifulSoup(html, 'html.parser')                

    externalLinks = getExternalLinks(bs, urlparse(startingPage).netloc)         

    if len(externalLinks) == 0:
        print('No external links, looking around the site for one')
        domain = '{}://{}'.format(urlparse(startingPage).scheme, urlparse(startingPage).netloc)
        try:
            internalLinks = getInternalLinks(bs, domain)     
        except:
            printf ("예외")

        if len(internalLinks) == 0:            
            print('No internal links')
            return
        
        return getRandomExternalLink(internalLinks[random.randint(0, len(internalLinks)-1)])
    else:      

        return externalLinks[random.randint(0, len(externalLinks)-1)]
  
    
def followExternalOnly(startingSite):
    global depth
    externalLink = getRandomExternalLink(startingSite)
    print(depth, 'Random external link is: {}'.format(externalLink))
    
    if externalLink is not None:
        followExternalOnly(externalLink)
        depth += 1
        if (depth > 5):
            return
            
followExternalOnly('https://oreilly.com')
#followExternalOnly('http://en.wikipedia.org')

0 Random external link is: https://oreilly.hk/
0 Random external link is: https://www.oreilly.com/about/editorial_independence.html
0 Random external link is: https://oreilly.hk/
0 Random external link is: https://www.oreilly.com/about/contact.html
0 Random external link is: https://oreillylearning.in/
0 Random external link is: https://cdn.oreillystatic.com/oreilly/pdfs/OReilly_Technical_Health_isnt_Optional.pdf


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


No external links, looking around the site for one
No internal links
0 Random external link is: None


#### https://python.flowdas.com/library/urllib.parse.html

urlparse에 적용하면 아래와 같은 옵션으로 파티션된다
(scheme는 http, ftp와 같은 요소, netloc는 주소에서 www.cwi.nl:80까지의 요소, path는 netloc을 제외한 나머지 주소)

![urllib_parse](urllib_parse.png)

### 웹사이트의 외부링크 수집

![외부링크수집](fig3_2.png)

In [7]:
# Collects a list of all external URLs found on the site
allExtLinks = set()
allIntLinks = set()


def getAllExternalLinks(siteUrl):
    html = urlopen(siteUrl)
    domain = '{}://{}'.format(urlparse(siteUrl).scheme,
                              urlparse(siteUrl).netloc)
    bs = BeautifulSoup(html, 'html.parser')
    internalLinks = getInternalLinks(bs, domain)
    externalLinks = getExternalLinks(bs, domain)

    for link in externalLinks:
        if link not in allExtLinks:   # 중복처리
            allExtLinks.add(link)
            print(link)
    for link in internalLinks:
        if link not in allIntLinks:  # 중복처리
            allIntLinks.add(link)
            getAllExternalLinks(link)


allIntLinks.add('http://oreilly.com')
getAllExternalLinks('http://oreilly.com')

https://www.oreilly.com
https://www.oreilly.com/member/login/
https://www.oreilly.com/online-learning/try-now.html
https://www.oreilly.com/online-learning/teams.html
https://www.oreilly.com/online-learning/government.html
https://www.oreilly.com/online-learning/academic.html
https://www.oreilly.com/online-learning/individuals.html
https://www.oreilly.com/online-learning/features.html
https://www.oreilly.com/online-learning/feature-certification.html
https://www.oreilly.com/online-learning/intro-interactive-learning.html
https://www.oreilly.com/online-learning/live-events.html
https://www.oreilly.com/online-learning/feature-answers.html
https://www.oreilly.com/online-learning/insights-dashboard.html
https://www.oreilly.com/radar/
https://www.oreilly.com/content-marketing-solutions.html
https://www.oreilly.com/ceros/345800-holiday-card-2022.html
https://learning.oreilly.com/start-trial/
https://www.oreilly.com/online-learning/testimonials.html
https://www.oreilly.com/online-learning/orei

https://www.oreilly.com/partner/optimize-your-community-partnership.html
https://app.oreilly.com/cs/user/register?x-redirect=http://www.oreilly.com/partner/signup.csp
https://app.oreilly.com/cs/user/login?x-redirect=http://www.oreilly.com/partner/signup.csp
https://signup.cj.com/member/brandedPublisherSignUp.do?air_refmerchantid=3812999
http://www.cj.com/
https://www.oreilly.com/oreilly/privacy.html
https://www.oreilly.com/work-with-us.html
https://www.oreilly.com/careers/index.csp#positions
https://www.cio.com/article/3516012/women-in-tech-statistics-the-hard-truths-of-an-uphill-battle.html
https://www.oreilly.com/content-marketing/available-products.html
https://www.oreilly.com/content-sponsorship/testimonial-dataiku.html
https://www.oreilly.com/content-sponsorship/testimonials.html
https://www.oreilly.com/online-learning/support/getting-started.html
https://www.oreilly.com/online-learning/support/features.html#certification
https://www.oreilly.com/online-learning/support/content.htm

https://www.crowncommercial.gov.uk/agreements/RM1557.12
https://www.oreilly.com/pub/pr/3298
https://learning.oreilly.com/playlists/discover/
http://www.brandonhall.com
https://trainingindustry.com/top-training-companies/it-and-technical-training/2020-top-it-training-companies/
https://www.oreilly.com/online-learning/
https://trainingindustry.com/top-training-companies/e-learning/2020-top-online-learning-library-companies/
https://www.katacoda.com/
https://www.oreilly.com/online-learning/certifications.html
https://www.oreilly.com/playlists/discover/
https://www.oreilly.com/pub/pr/3273
https://www.oreilly.com/resource-centers/
https://www.oreilly.com/demo/online-learning-demo.html
https://www.oreilly.com/radar/microservices-adoption-in-2020/
https://learning.oreilly.com/featured/software-architecture/
https://learning.oreilly.com/featured/infrastructure-ops/
https://learning.oreilly.com/featured/oscon/
https://learning.oreilly.com/featured/strata/
https://www.oreilly.com/webcasts/lot/22

https://trainingindustry.com/top-training-companies/e-learning/2019-top-online-learning-library-companies/
https://conferences.oreilly.com/oscon/oscon-or
https://conferences.oreilly.com/oscon/oscon-or/public/schedule/stopic/3130
https://conferences.oreilly.com/oscon/oscon-or/public/schedule/stopic/3131
https://conferences.oreilly.com/oscon/oscon-or/public/schedule/stopic/3132
https://conferences.oreilly.com/oscon/oscon-or/public/schedule/stopic/3133
https://conferences.oreilly.com/oscon/oscon-or/public/schedule/stopic/3134
https://conferences.oreilly.com/oscon/oscon-or/public/schedule/speaker/108386
https://conferences.oreilly.com/oscon/oscon-or/public/schedule/speaker/2314
https://conferences.oreilly.com/oscon/oscon-or/public/schedule/speaker/76203
https://conferences.oreilly.com/oscon/oscon-or/public/schedule/speaker/128567
https://conferences.oreilly.com/oscon/oscon-or/public/schedule/speaker/132932
https://conferences.oreilly.com/oscon/oscon-or/public/schedule/speaker/270627
https:

https://conferences.oreilly.com/strata/strata-ca/public/schedule/topic/2451
https://conferences.oreilly.com/strata/strata-ca/public/schedule/full/data-case-studies
https://conferences.oreilly.com/strata/strata-ca/public/schedule/full/media-and-advertising
https://conferences.oreilly.com/strata/strata-ca/public/schedule/topic/2452
https://conferences.oreilly.com/strata/strata-ca/public/schedule/topic/2453
https://conferences.oreilly.com/strata/strata-ca/public/schedule/detail/64452
https://conferences.oreilly.com/strata/strata-ca/public/schedule/detail/64458
https://conferences.oreilly.com/strata/strata-ca/public/schedule/detail/64460
https://conferences.oreilly.com/strata/strata-ca/public/schedule/detail/64462
https://conferences.oreilly.com/strata/strata-ca/public/schedule/detail/64463
https://conferences.oreilly.com/strata/strata-ca/public/schedule/detail/64469
https://conferences.oreilly.com/strata/strata-ca/public/schedule/detail/64531
https://twitter.com/hashtag/stratadata
http://

HTTPError: HTTP Error 404: Not Found

## 다양한 웹사이트 layout 다루기
- Web Scraping with Python, 4장

### 예제1: Brooking,NYTimes 사이트
- 사이트별로 web crawler 지정(CSS 선택자만 다르게)
- title: 제목 텍스트 추출
- body: 기사의 주요 콘텐츠 선택/추출
- Content(URL, title, body) class 반환

-> 결론적으로 각 사이트마다 다른 layout 구성에서 범용적으로 같은 data를 가지고 오는 것은 힘들기 때문에 
   사이트 별로 web crawler를 지정하여 data를 가져온다

In [8]:
import requests

class Content:
    def __init__(self, url, title, body):
        self.url = url
        self.title = title
        self.body = body

def getPage(url):
    req = requests.get(url)
    return BeautifulSoup(req.text, 'html.parser')
    
def scrapeNYTimes(url):
    bs = getPage(url)
    title = bs.find("h1").text
#    lines = bs.select('div.StoryBodyCompanionColumn div p')
    #lines = bs.find_all("p", {"class":"story-content"})
    lines = bs.find_all("p", {"class":"story-body-content"})
    body = '\n'.join([line.text for line in lines])
    return Content(url, title, body)

def scrapeBrookings(url):
    bs = getPage(url)
    title = bs.find("h1").text
    body = bs.find("div",{"class","post-body"}).text
    return Content(url, title, body)

### Brookings_blog
![Brookings_blog](Brookings_blog.png)

In [9]:
url = "https://www.brookings.edu/blog/education-plus-development/2021/10/05/invest-in-programs-that-boost-childrens-learning-and-development/"

In [10]:
content = scrapeBrookings(url)
print('Title: {}'.format(content.title))
print('URL: {}\n'.format(content.url))
print(content.body)

Title: Invest in programs that boost children’s learning and development
URL: https://www.brookings.edu/blog/education-plus-development/2021/10/05/invest-in-programs-that-boost-childrens-learning-and-development/


Congress will soon determine the extent of the nation’s investment in its youngest citizens. After years of spending on the margins, President Biden’s plan for universal pre-K for 3- and 4-year-olds, child care subsidies, and increased wages for teachers and caregivers acknowledges that the early years of childhood are of unique importance to the public welfare and must be funded as such. Investing in children’s early years pays off both for their and their parent’s success in life, and their contributions to the economy and society—a multigenerational return. 







Robert Pianta

					Dean - School of Education and Human Development, University of Virginia 

					Batten Bicentennial Professor of Early Childhood Education - School of Education and Human Development, Univer

### NYTimes_2020_09_26_opinion
![NYTimes](NYTimes_2020_09_26_opinion.png)

In [11]:
#url = 'https://www.nytimes.com/2021/10/05/opinion/facebook-blackout-2021.html'
url = 'https://www.nytimes.com/books/best-sellers/2022/11/06/'


In [12]:
content = scrapeNYTimes(url)
print('Title: {}'.format(content.title))
print('URL: {}\n'.format(content.url))
print(content.body)

Title: The New York Times Best Sellers- November 06, 2022
URL: https://www.nytimes.com/books/best-sellers/2022/11/06/




### 예제2: Web 사이트 구조에 대한 정보 class

In [13]:
class Content:
    """
    articles/pages 전체에 사용할 공통 기반 class
    """
    def __init__(self, url, title, body):
        self.url = url
        self.title = title
        self.body = body

    def print(self):
        """
        Flexible printing function controls output
        """
        print("URL: {}".format(self.url))
        print("TITLE: {}".format(self.title))
        print("BODY:\n{}".format(self.body))
        print("\n")

In [14]:
class Website:
    """
    website 구조에 대한 정보를 저장(데이터를 수집하는 방법에 대한 지침)
    """
    def __init__(self, name, url, titleTag, bodyTag):
        self.name = name
        self.url = url
        self.titleTag = titleTag
        self.bodyTag = bodyTag

In [15]:
import requests
from bs4 import BeautifulSoup

class Crawler:
    def getPage(self, url):
        try:
            req = requests.get(url)
        except requests.exceptions.RequestException:
            return None
        return BeautifulSoup(req.text, 'html.parser')

    def safeGet(self, pageObj, selector):
        """
        Beautifulsoup 객체와 선택자를 받아 content를 추출하는 함수.
        주어진 selector로 발견된 객체가 없으면 empty 반환
        """
        selectedElems = pageObj.select(selector)
        if selectedElems is not None and len(selectedElems) > 0:
            return '\n'.join([elem.get_text() for elem in selectedElems])
        return ''

    def parse(self, site, url):
        """
        URL을 받아 content 추출
        """
        bs = self.getPage(url)
        if bs is not None:
            title = self.safeGet(bs, site.titleTag)
            body = self.safeGet(bs, site.bodyTag)
            if title != '' and body != '':
                content = Content(url, title, body)
                content.print()

In [16]:
crawler = Crawler()

siteData = [
    ['O\'Reilly Media', 'http://oreilly.com', 'h1', 'section#product-description'],
    ['Reuters', 'http://reuters.com', 'h1', 'div.StandardArticleBody_body_1gnLA'],
    ['Brookings', 'http://www.brookings.edu', 'h1', 'div.post-body'],
    ['New York Times', 'http://nytimes.com', 'h1', 'div.StoryBodyCompanionColumn div p']
]

websites = []
urls = [
    'http://shop.oreilly.com/product/0636920028154.do',
    'http://www.reuters.com/article/us-usa-epa-pruitt-idUSKBN19W2D0',
    'https://www.brookings.edu/blog/education-plus-development/2021/10/05/invest-in-programs-that-boost-childrens-learning-and-development/',
    'https://www.nytimes.com/2018/01/28/business/energy-environment/oil-boom.html'
]

for row in siteData:
    websites.append(Website(row[0], row[1], row[2], row[3]))

crawler.parse(websites[0], urls[0])
crawler.parse(websites[1], urls[1])
crawler.parse(websites[2], urls[2])
crawler.parse(websites[3], urls[3])

URL: https://www.brookings.edu/blog/education-plus-development/2021/10/05/invest-in-programs-that-boost-childrens-learning-and-development/
TITLE: Invest in programs that boost children’s learning and development
Invest in programs that boost children’s learning and development
BODY:

Congress will soon determine the extent of the nation’s investment in its youngest citizens. After years of spending on the margins, President Biden’s plan for universal pre-K for 3- and 4-year-olds, child care subsidies, and increased wages for teachers and caregivers acknowledges that the early years of childhood are of unique importance to the public welfare and must be funded as such. Investing in children’s early years pays off both for their and their parent’s success in life, and their contributions to the economy and society—a multigenerational return. 







Robert Pianta

					Dean - School of Education and Human Development, University of Virginia 

					Batten Bicentennial Professor of Early 

## Crawler 구성
- 필드가 없을 경우
- 다양한 type의 데이터수집
- web site의 일부부만 scrapy
- page에 관한 자세한 정보 저장

### 검색 portal을 통한 site crawling
- URL에 https://www.google.com/search?q=myTopic 으로 검색
- 검색 결과 page는 link 목록 형태로 제공  => Website 속성으로 지정
- 결과 link는 상대URL주소, 절대URL주소 => Website 속성으로 지정

In [1]:
class Content:
    """Common base class for all articles/pages"""

    def __init__(self, topic, url, title, body):
        self.topic = topic
        self.title = title
        self.body = body
        self.url = url

    def print(self):
        """
        Flexible printing function controls output
        """
        print('New article found for topic: {}'.format(self.topic))
        print('URL: {}'.format(self.url))
        print('TITLE: {}'.format(self.title))
        print('BODY:\n{}'.format(self.body))

In [2]:
class Website:
    """Contains information about website structure"""
    # searchUrl: URL을 검색어에 추가한 경우 검색 결과를 어디서 얻는지 정의
    # resultListing: 각 결과에 대한 정보를 담고 있는 박스
    # resultUrl: 결과에서 정확한 URL을 추출할 때 사용할 태그 정보
    # absoluteUrl: 상대URL 혹은 절대URL (boolian)

    def __init__(self, name, url, searchUrl, resultListing, resultUrl, absoluteUrl, titleTag, bodyTag):
        self.name = name
        self.url = url
        self.searchUrl = searchUrl
        self.resultListing = resultListing
        self.resultUrl = resultUrl
        self.absoluteUrl = absoluteUrl
        self.titleTag = titleTag
        self.bodyTag = bodyTag
        

In [3]:
import requests
from bs4 import BeautifulSoup

class Crawler:

    def getPage(self, url):
        try:
            req = requests.get(url)
        except requests.exceptions.RequestException:
            return None
        return BeautifulSoup(req.text, 'html.parser')

    def safeGet(self, pageObj, selector):
        childObj = pageObj.select(selector)
        if childObj is not None and len(childObj) > 0:
            return childObj[0].get_text()
        return ''

    def search(self, topic, site):
        """
        주어진 topic으로 주어진 website 검색, 발견된 모든 page 기록
        """
        bs = self.getPage(site.searchUrl + topic)
        searchResults = bs.select(site.resultListing)
                
        for result in searchResults:
            url = result.select(site.resultUrl)[0].attrs['href']
            
            # Check to see whether it's a relative or an absolute URL
            if(site.absoluteUrl):
                bs = self.getPage(url)
            else:
                bs = self.getPage(site.url + url)
            if bs is None:
                print('Something was wrong with that page or URL. Skipping!')
                return
            title = self.safeGet(bs, site.titleTag)
            body = self.safeGet(bs, site.bodyTag)
            if title != '' and body != '':
                content = Content(topic, title, body, url)
                content.print()

In [4]:
crawler = Crawler()

siteData = [
    ['Reuters', 'http://reuters.com', 'http://www.reuters.com/search/news?blob=', 'div.search-result-content',
        'h3.search-result-title a', False, 'h1', 'div.StandardArticleBody_body_1gnLA'],
    ['Brookings', 'http://www.brookings.edu', 'https://www.brookings.edu/search/?s=',
        'div.list-content article', 'h4.title a', True, 'h1', 'div.post-body']
]
sites = []
for row in siteData:
    sites.append(Website(row[0], row[1], row[2],
                         row[3], row[4], row[5], row[6], row[7]))

topics = ['facebook', 'North Korea']
for topic in topics:
    print('GETTING INFO ABOUT: ' + topic)
    for targetSite in sites:
        crawler.search(topic, targetSite)

GETTING INFO ABOUT: facebook
New article found for topic: facebook
URL: Facebook's Libra 2.0
TITLE: 
Almost a year ago, Facebook announced it would create a global digital currency called “Libra” in order to help the billions of people around the world who lacked access to basic financial services. The currency would be a “stablecoin” backed by a basket of sovereign currencies such as the dollar, the euro and the yen. The proposal provoked widespread skepticism about Mark Zuckerberg’s motives—surely financial inclusion was just a veil for data collection ambitions—as well as criticism that Libra would undermine the U.S. dollar and cause all sorts of other problems. But the proposal also prompted a number of central banks to initiate or speed up research on the possibility of official digital currencies. 







Timothy G. Massad

					Nonresident Senior Fellow - Economic Studies, Center on Regulation and Markets 




Because the proposal was recently significantly modified, and because

New article found for topic: facebook
URL: Solving the problem of racially discriminatory advertising on Facebook
TITLE: 
While Facebook profiles may not explicitly state users’ race or ethnicity, my research demonstrates that Facebook’s current advertising algorithms can discriminate by these factors. Based on research conducted in 2020 and 2021, I used Facebook’s advertising tools to test how advertisers can use their targeting options like “multicultural affinity” groups, Lookalike Audiences, and Special Ad Audiences to ensure their ads are reaching white, African American, Asian, or Hispanic users. What I found is that discrimination by race and ethnicity on Facebook’s platforms is a significant threat to the public interest for two reasons. First, it is a violation of the existing civil rights laws that protect marginalized consumers against advertising harms and discrimination by race and ethnicity, especially in the areas of housing, employment, and credit. Second, these same Fa

New article found for topic: facebook
URL: Could the Facebook papers close the deal on privacy legislation?
TITLE: 
The disclosures from the Facebook Papers have led to a flurry of legislative proposals on Capitol Hill to address data use, kids’ online safety, and malicious content. The single most effective step Congress can take is to enact comprehensive privacy legislation to address the explosion of digital information not covered by existing, narrower privacy laws.







Cameron F. Kerry

					Ann R. and Andrew H. Tisch Distinguished Visiting Fellow - Governance Studies, Center for Technology Innovation 

 Twitter
@Cam_Kerry








Jules Polonetsky

					Chief Executive Officer - Future of Privacy Forum 

 Twitter
JulesPolonetsky





Congress should not let this latest “Facebook moment” pass without meaningful action. Every day that passes without a baseline privacy protection law in effect is another day that not just Facebook, but a multitude of businesses, collect and use da

New article found for topic: facebook
URL: Will the Facebook whistleblower’s testimony spur new US digital regulation?
TITLE: 
Facebook whistleblower Frances Haugen’s testimony on the social media platform’s business practices may have been an eye-opened for members of Congress, says former FCC Chair Tom Wheeler, but this and other recent revelations are just the canary in the coal mine for the broader digital ecosystem. Wheeler points to a lack of government oversight that has left digital platforms to make their own rules, and what responses may be forthcoming from the Federal Trade Commission, the Department of Justice, and state-level actors.

Related material: 

A focused federal agency is necessary to oversee Big Tech
The court’s Facebook decision shows why we need a digital regulatory agency
China’s new regulation of platforms: a message for American policymakers

Listen to Brookings podcasts here, on Apple or Google podcasts or on Spotify, send email feedback to bcp@brookings.e

IndexError: list index out of range

### link를 통한 site crawling
- 특정 URL 패턴과 일치하는 link를 모두 따라가는 crawler
- site 전체에서 데이터를 수집해야 하는 경우
- 어떤 종류의 page를 선택할지 지정하는 규칙이 필요

In [21]:
class Website:

    def __init__(self, name, url, targetPattern, absoluteUrl, titleTag, bodyTag):
        self.name = name
        self.url = url
        self.targetPattern = targetPattern    # URL에 대한 정규식
        self.absoluteUrl = absoluteUrl
        self.titleTag = titleTag
        self.bodyTag = bodyTag

class Content:

    def __init__(self, url, title, body):
        self.url = url
        self.title = title
        self.body = body

    def print(self):
        print('URL: {}'.format(self.url))
        print('TITLE: {}'.format(self.title))
        print('BODY:\n{}'.format(self.body))

In [22]:
import re

class Crawler:
    def __init__(self, site):
        self.site = site
        self.visited = []

    def getPage(self, url):
        try:
            req = requests.get(url)
        except requests.exceptions.RequestException:
            return None
        return BeautifulSoup(req.text, 'html.parser')

    def safeGet(self, pageObj, selector):
        selectedElems = pageObj.select(selector)
        if selectedElems is not None and len(selectedElems) > 0:
            return '\n'.join([elem.get_text() for elem in selectedElems])
        return ''

    def parse(self, url):
        bs = self.getPage(url)
        if bs is not None:
            title = self.safeGet(bs, self.site.titleTag)
            body = self.safeGet(bs, self.site.bodyTag)
            if title != '' and body != '':
                content = Content(url, title, body)
                content.print()

    def crawl(self):
        """
        Get pages from website home page
        """
        bs = self.getPage(self.site.url)
        targetPages = bs.findAll('a', href=re.compile(self.site.targetPattern))
        for targetPage in targetPages:
            targetPage = targetPage.attrs['href']
            if targetPage not in self.visited:
                self.visited.append(targetPage)
                if not self.site.absoluteUrl:
                    targetPage = '{}{}'.format(self.site.url, targetPage)
                self.parse(targetPage)

In [23]:
reuters = Website('Reuters', 'https://www.reuters.com', '^(/business/)',
                  False, 'h1', 'div.StandardArticleBody_body_1gnLA')
crawler = Crawler(reuters)
crawler.crawl()

In [24]:
crawler.visited

['/business/',
 '/business/autos-transportation/elon-musk-sells-22-mln-tesla-shares-worth-36-bln-filing-2022-12-15/',
 '/business/finance/uk-watchdog-narrows-dividend-stripping-investigation-2022-12-15/',
 '/business/energy/germanys-half-a-trillion-dollar-energy-bazooka-may-not-be-enough-2022-12-15/',
 '/business/supply-chain-that-keeps-tech-flowing-russia-2022-12-13/',
 '/business/world-at-work/',
 '/business/autos-transportation/electric-vehicles-confront-leap-mass-market-2022-12-15/',
 '/business/charged/',
 '/business/environment/rising-climate-costs-challenge-countries-companies-2023-2022-12-14/',
 '/business/energy/year-russia-turbocharged-global-energy-crisis-2022-12-13/',
 '/business/energy/',
 '/business/energy/us-fuelmakers-more-than-recoup-pandemic-losses-2022-2022-12-15/',
 '/business/autos-transportation/adding-evs-us-biofuels-law-is-environmental-agencys-2023-task-2022-12-15/',
 '/business/autos-transportation/',
 '/business/energy/eletrobras-shell-mull-co-investment-braz

### 여러 page 유형 crawling
- page 유형 구분
    - URL에 유형이 있음 (예, brooking/blog/...)
    - 특정 필드가 존재하는지 영부에 따라
    - 특정 tag의 여부에 따라 

In [25]:
class Website:

    def __init__(self, pageType, name, url, targetPattern, absoluteUrl, titleTag, bodyTag):
        self.name = name
        self.url = url
        self.targetPattern = targetPattern    # URL에 대한 정규식
        self.absoluteUrl = absoluteUrl
        self.titleTag = titleTag
        self.bodyTag = bodyTag
        self.pageType = pageType

In [26]:
class Website:
    """Common base class for all articles/pages"""

    def __init__(self, name, url, titleTag, bodyTag):
        self.name = name
        self.url = url
        self.titleTag = titleTag
        self.bodyTag = bodyTag

In [27]:
class Product(Website):
    """Contains information for scraping a product page"""

    def __init__(self, name, url, titleTag, productNumber, price):
        Website.__init__(self, name, url, TitleTag)
        self.productNumberTag = productNumberTag
        self.priceTag = priceTag

class Article(Website):
    """Contains information for scraping an article page"""

    def __init__(self, name, url, titleTag, bodyTag, dateTag):
        Website.__init__(self, name, url, titleTag)
        self.bodyTag = bodyTag
        self.dateTag = dateTag

In [28]:
def parsePage(url):
    
    if '/ideas/' in url:
        oreilly = Website('O\'Reilly', 'https://oreilly.com', 'h1' '')        

In [29]:
oreilly = Website('O\'Reilly', 'https://oreilly.com', 'h1', '')

In [30]:
#https://github.com/REMitchell/python-scraping/