In [1]:
import requests

In [2]:
## Fetch a page: send an HTTP GET request and keep the response object.
## A timeout is passed because requests otherwise waits indefinitely
## when the server stops answering, which would hang the notebook.
page = requests.get("https://www.google.com", timeout=10)
type(page)

requests.models.Response

In [3]:
page.text  ## body of the response as a string (here: the page's HTML source)

'<!doctype html><html itemscope="" itemtype="http://schema.org/WebPage" lang="ko"><head><meta content="text/html; charset=UTF-8" http-equiv="Content-Type"><meta content="/images/branding/googleg/1x/googleg_standard_color_128dp.png" itemprop="image"><title>Google</title><script nonce="uWlTfiIJ3lj6saygTDa+ww==">(function(){window.google={kEI:\'6PRSYvmdEY-UmAWipK-gCw\',kEXPI:\'0,1302536,56873,6058,207,2414,2390,2316,383,246,5,1354,4013,1237,1122516,1197760,131,380599,16115,17444,1953,9287,17572,4858,1362,9291,3024,4749,12835,4020,978,13228,3847,4192,6430,22741,5081,1593,1279,2742,149,1103,840,6297,109,3405,606,2023,1777,520,14670,3227,2845,7,17450,16320,1851,2614,13142,3,576,6460,148,13975,4,1528,2302,6464,577,25073,2658,4163,3193,31,13628,4437,9358,7428,652,5124,2581,4097,4049,3,3541,1,16807,25347,2,14022,1931,5589,743,5853,10463,1160,5679,1021,2377,2721,18279,2,6,7737,4567,6256,6720,16701,1252,4588,2,6,1239,11861,280,2826,4333,19,4998,1073,1394,445,2,2,1,1385,124,4885,15052,3333,1406,10

In [4]:
## parsing web page

from bs4 import BeautifulSoup

In [5]:
## Small hand-written HTML document used as parser input in the next cells:
## one <h2>, two <p>, a <ul> with three <li> items and three <img> tags.
html_code = """<!DOCTYPE html>
<html>
<head>
    <title>Sample Website</title>
</head>
<body>
<h2>HTML 연습!</h2>

<p>이것은 첫 번째 문단입니다.</p>
<p>이것은 두 번째 문단입니다!</p>

<ul>
    <li>커피</li>
    <li>녹차</li>
    <li>우유</li>
</ul>

<img src='https://i.imgur.com/bY0l0PC.jpg' alt="coffee"/>
<img src='https://i.imgur.com/fvJLWdV.jpg' alt="green-tea"/>
<img src='https://i.imgur.com/rNOIbNt.jpg' alt="milk"/>

</body>
</html>"""

In [6]:
## Parse the sample HTML string with Python's built-in 'html.parser' backend
soup = BeautifulSoup(html_code, 'html.parser')

## Show which class the parsed document object is
print(type(soup))

<class 'bs4.BeautifulSoup'>


In [7]:
## select() takes a CSS selector; 'li' matches every <li> element in the document
li_tags = soup.select('li')
li_tags

[<li>커피</li>, <li>녹차</li>, <li>우유</li>]

In [8]:
## Each selected element is a Tag object, not a plain string
print(type(li_tags[0]))

<class 'bs4.element.Tag'>


In [9]:
## .text gives the text inside the tag, without the surrounding markup
print(li_tags[0].text)

커피


In [10]:
## Extract the inner text of every <li> tag into a plain list of strings

beverage_names = [tag.text for tag in li_tags]
print(beverage_names)

['커피', '녹차', '우유']


In [11]:
## Reading an attribute of an HTML tag: start by selecting every <img> element

img_tags = soup.select('img')
print(img_tags[0])

<img alt="coffee" src="https://i.imgur.com/bY0l0PC.jpg"/>


In [12]:
## Attributes are read with dict-style indexing on the tag
print(img_tags[0]['src'])

https://i.imgur.com/bY0l0PC.jpg


In [13]:
## Collect the 'src' attribute of every <img> tag
img_srcs = [tag['src'] for tag in img_tags]
img_srcs

['https://i.imgur.com/bY0l0PC.jpg',
 'https://i.imgur.com/fvJLWdV.jpg',
 'https://i.imgur.com/rNOIbNt.jpg']

In [14]:
## practice 1 (web scraping)

In [15]:
## Download the practice page. The timeout keeps the cell from hanging on a
## stalled connection; raise_for_status() raises on a 4xx/5xx reply so an
## error page is never printed or parsed as if it were the real content.
response = requests.get("https://workey.codeit.kr/music/index", timeout=10)
response.raise_for_status()
print(response.text)

<!DOCTYPE html>
<html lang="ko">
<head>
  <script src="https://cdn.jsdelivr.net/npm/vue/dist/vue.js"></script>
  <script src="https://unpkg.com/axios/dist/axios.min.js"></script>
  <meta charset="UTF-8">
  <title>Codeit Music</title>
  <style>
    body {
      margin: 0;
    }

    a {
      text-decoration: none;
      color: inherit;
    }

    img {
      vertical-align: middle;
    }

    ul {
      list-style: none;
      padding: 0;
      margin: 0;
    }

    * {
      box-sizing: border-box;
    }

    .header {
      position: fixed;
      top: 0;
      left: 0;
      width: 100%;
      height: 60px;
      background-color: #ffffff;
      overflow: hidden;
      z-index: 1;
    }

    .nav-warpper {
      max-width: 1163px;
      margin: 0 auto;
      height: 100%;
      line-height: 60px;
      padding: 0 20px
    }

    .header__logo .logo-img {
      width: 34px;
      margin-right: 25px;
    }

    .header__nav {
      display: inline-block;
      vertical-align: middle;
 

In [16]:
## Parse the downloaded HTML; printing the soup shows the document as BeautifulSoup re-serializes it
soup = BeautifulSoup(response.text, 'html.parser')
print(soup)

<!DOCTYPE html>

<html lang="ko">
<head>
<script src="https://cdn.jsdelivr.net/npm/vue/dist/vue.js"></script>
<script src="https://unpkg.com/axios/dist/axios.min.js"></script>
<meta charset="utf-8"/>
<title>Codeit Music</title>
<style>
    body {
      margin: 0;
    }

    a {
      text-decoration: none;
      color: inherit;
    }

    img {
      vertical-align: middle;
    }

    ul {
      list-style: none;
      padding: 0;
      margin: 0;
    }

    * {
      box-sizing: border-box;
    }

    .header {
      position: fixed;
      top: 0;
      left: 0;
      width: 100%;
      height: 60px;
      background-color: #ffffff;
      overflow: hidden;
      z-index: 1;
    }

    .nav-warpper {
      max-width: 1163px;
      margin: 0 auto;
      height: 100%;
      line-height: 60px;
      padding: 0 20px
    }

    .header__logo .logo-img {
      width: 34px;
      margin-right: 25px;
    }

    .header__nav {
      display: inline-block;
      vertical-align: middle;
    }

  

In [17]:
## Descendant selector: every <li> located anywhere inside an element with class 'popular__order'
li_tags = soup.select('.popular__order li')

In [18]:
## Text of each list item with leading/trailing whitespace removed
popular_artists = [item.text.strip() for item in li_tags]
popular_artists

['1 아이유 (IU)',
 '2 방탄소년단',
 '3 Red Velvet (레드벨벳)',
 '4 IKON',
 '5 멜로망스',
 '6 다비치',
 '7 윤딴딴',
 '8 수지 (SUZY)',
 '9 김동률',
 '10 폴킴']

In [20]:
## practice 2 (web crawling)
## find only for necessary pages
import time

In [22]:
## Crawl the search result pages one by one and keep each parsed page.
pages = []

page_num = 1
max_pages = 10  ## upper bound on how many result pages are fetched

## Browser-like user-agent header sent with every request
headers = {
    'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 11_2_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.114 Safari/537.36',
}

## The page cap is part of the loop condition, so no request is sent for a
## page past the cap (previously page 11 was downloaded and then discarded).
while page_num <= max_pages:
    response = requests.get(
        f"https://www.ssg.com/search.ssg?target=all&query=nintendo&page={page_num}",
        headers=headers,
        timeout=10,  ## do not wait forever on a stalled connection
    )
    soup = BeautifulSoup(response.text, 'html.parser')

    ## Stop as soon as the page contains an element with class 'csrch_tip'
    ## (presumably the site's "no more results" notice -- verify against the page markup)
    if soup.select('.csrch_tip'):
        break

    pages.append(soup)
    print(f"{page_num} 번째 페이지 가져오기 완료")
    page_num += 1
    time.sleep(3)  ## pause between requests to avoid hammering the server

print(len(pages))

1 번째 페이지 가져오기 완료
2 번째 페이지 가져오기 완료
3 번째 페이지 가져오기 완료
4 번째 페이지 가져오기 완료
5 번째 페이지 가져오기 완료
6 번째 페이지 가져오기 완료
7 번째 페이지 가져오기 완료
8 번째 페이지 가져오기 완료
9 번째 페이지 가져오기 완료
10 번째 페이지 가져오기 완료
10


In [23]:
## (**) practice (web crawling & web scraping & parsing to dataframe)

In [24]:
## Crawl the search result pages and scrape one [name, price, image url]
## record per product.
records = []

page_num = 1
max_pages = 30  ## upper bound on how many result pages are fetched

## Browser-like user-agent header sent with every request
headers = {
    'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 11_2_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.114 Safari/537.36',
}

## The page cap is part of the loop condition, so no request is sent for a
## page past the cap (previously page 31 was downloaded and then discarded).
while page_num <= max_pages:
    response = requests.get(
        f"https://www.ssg.com/search.ssg?target=all&query=nintendo&page={page_num}",
        headers=headers,
        timeout=10,  ## do not wait forever on a stalled connection
    )
    soup = BeautifulSoup(response.text, 'html.parser')

    ## Stop as soon as the page contains an element with class 'csrch_tip'
    ## (presumably the site's "no more results" notice -- verify against the page markup)
    if soup.select('.csrch_tip'):
        break

    product_names = soup.select('.cunit_info > div.cunit_md.notranslate > div > a > em.tx_ko')
    product_prices = soup.select('.cunit_info > div.cunit_price.notranslate > div > em')
    product_urls = soup.select('.cunit_prod > div.thmb > a > img')

    ## zip() stops at the shortest list, so a page where the three selectors
    ## return different numbers of elements no longer raises IndexError.
    ## NOTE(review): the three lists are selected independently over the whole
    ## page and paired purely by position; if any product lacks one of the
    ## elements, the following rows are shifted -- confirm against the markup.
    for name_tag, price_tag, img_tag in zip(product_names, product_prices, product_urls):
        records.append([
            name_tag.text,
            price_tag.text.strip(),
            img_tag.get('src'),
        ])

    page_num += 1
    time.sleep(1)  ## pause between requests to avoid hammering the server

print(len(records))

1190


In [26]:
import pandas as pd

## One row per scraped record; the column labels follow the order in which
## the fields were appended to each record (name, price, image url).
column_names = ['이름', '가격', '이미지 주소']
df = pd.DataFrame(records, columns=column_names)

In [27]:
## Show the scraped table (a bare last expression uses the notebook's rich display)
df

Unnamed: 0,이름,가격,이미지 주소
0,닌텐도 스위치 스포츠 한글판(레그 스트랩 포함) 스포츠 타올 선택 4월 28일 출고,49800,//sitem.ssgcdn.com/72/55/47/item/1000333475572...
1,[닌텐도스위치] 별의커비 디스커버리,52978,//sitem.ssgcdn.com/72/55/47/item/1000333475572...
2,★닌텐도 공식판매처★닌텐도 스위치 본체 OLED 타이틀 패키지(옵션 선택),60916,//sitem.ssgcdn.com/73/38/35/item/1000309353873...
3,★오후2시이전 주문 당일출고★[닌텐도 스위치] OLED 모델(화이트)(옵션 택1),62800,//sitem.ssgcdn.com/73/38/35/item/1000309353873...
4,[닌텐도 스위치] 포켓몬스터 레전드 아르세우스,415000,//sitem.ssgcdn.com/53/29/62/item/1000271622953...
...,...,...,...
1185,[닌텐도 스위치] 모여봐요 동물의 숲 (한글화),1500,//sitem.ssgcdn.com/15/40/28/item/1000038284015...
1186,★닌텐도 공식판매처★닌텐도 스위치 본체 동물의숲 아미보카드 패키지(옵션 선택),10900,//sitem.ssgcdn.com/15/40/28/item/1000038284015...
1187,"[닌텐도 스위치] 포켓몬스터 브릴리언트 다이아몬드 (정가 64,800원)",2500,//sitem.ssgcdn.com/95/31/99/item/1000037993195...
1188,[닌텐도 스위치] 포켓몬스터 소드/실드,8500,//sitem.ssgcdn.com/95/31/99/item/1000037993195...
