# Парсинг HTML. BeautifulSoup.

### Скрейпинг и парсинг данных одной страницы.

In [1]:
# Импорт необходимых библиотек
from bs4 import BeautifulSoup
import requests
import pandas as pd

In [2]:
# URL of the catalogue page that holds the data
website = "https://books.toscrape.com/catalogue/category/books_1/index.html"

In [3]:
# GET request to the server. The timeout (in seconds) makes a stalled
# connection raise an exception instead of blocking the notebook forever.
page = requests.get(website, timeout=10)


In [4]:
# Check the HTTP status code of the server response
page.status_code

200

In [5]:
# Parse the downloaded page content with the 'html.parser' parser
soup = BeautifulSoup(page.content, 'html.parser')

In [6]:
# Find every <li> listing item by its CSS class string.
# class_ is used explicitly: a positional ('class', value) tuple is not a
# key/value filter, BeautifulSoup reads it as a list of alternative class names.
result = soup.find_all('li', class_='col-xs-6 col-sm-4 col-md-3 col-lg-3')
len(result)

20

### Скрейпинг ссылок на каждый товар

In [15]:
# First part of the link (site root)
url_1 = 'https://books.toscrape.com/'

In [40]:
# Build the list of site-relative product links, one per image container.
# href[5:] drops the first five characters of each link before 'catalogue' is
# put in front (presumably a leading '../..' - confirm against the page markup).
url_2 = [
  'catalogue' + container.find('a').get('href')[5:]
  for item in result
  for container in item.find_all('div', class_='image_container')
]
url_2

['catalogue/a-light-in-the-attic_1000/index.html',
 'catalogue/tipping-the-velvet_999/index.html',
 'catalogue/soumission_998/index.html',
 'catalogue/sharp-objects_997/index.html',
 'catalogue/sapiens-a-brief-history-of-humankind_996/index.html',
 'catalogue/the-requiem-red_995/index.html',
 'catalogue/the-dirty-little-secrets-of-getting-your-dream-job_994/index.html',
 'catalogue/the-coming-woman-a-novel-based-on-the-life-of-the-infamous-feminist-victoria-woodhull_993/index.html',
 'catalogue/the-boys-in-the-boat-nine-americans-and-their-epic-quest-for-gold-at-the-1936-berlin-olympics_992/index.html',
 'catalogue/the-black-maria_991/index.html',
 'catalogue/starving-hearts-triangular-trade-trilogy-1_990/index.html',
 'catalogue/shakespeares-sonnets_989/index.html',
 'catalogue/set-me-free_988/index.html',
 'catalogue/scott-pilgrims-precious-little-life-scott-pilgrim-1_987/index.html',
 'catalogue/rip-it-up-and-start-again_986/index.html',
 'catalogue/our-band-could-be-your-life-scene

In [13]:
import urllib.parse

In [41]:
# Join the site root with every relative link to get the absolute URL of each
# product found on the page
url_joined = [urllib.parse.urljoin(url_1, relative_link) for relative_link in url_2]

url_joined

['https://books.toscrape.com/catalogue/a-light-in-the-attic_1000/index.html',
 'https://books.toscrape.com/catalogue/tipping-the-velvet_999/index.html',
 'https://books.toscrape.com/catalogue/soumission_998/index.html',
 'https://books.toscrape.com/catalogue/sharp-objects_997/index.html',
 'https://books.toscrape.com/catalogue/sapiens-a-brief-history-of-humankind_996/index.html',
 'https://books.toscrape.com/catalogue/the-requiem-red_995/index.html',
 'https://books.toscrape.com/catalogue/the-dirty-little-secrets-of-getting-your-dream-job_994/index.html',
 'https://books.toscrape.com/catalogue/the-coming-woman-a-novel-based-on-the-life-of-the-infamous-feminist-victoria-woodhull_993/index.html',
 'https://books.toscrape.com/catalogue/the-boys-in-the-boat-nine-americans-and-their-epic-quest-for-gold-at-the-1936-berlin-olympics_992/index.html',
 'https://books.toscrape.com/catalogue/the-black-maria_991/index.html',
 'https://books.toscrape.com/catalogue/starving-hearts-triangular-trade-tr

### Извлечение данных одного товара

In [116]:
# Link to the first product
first_link = url_joined[0]

In [117]:
# GET request for the product page. The timeout (in seconds) makes a stalled
# connection raise an exception instead of blocking the notebook forever.
response = requests.get(first_link, timeout=10)

In [118]:
# Parse the product page content with the 'html.parser' parser
soup = BeautifulSoup(response.content, 'html.parser')

In [48]:
# Product title: text of the <h1> inside the main product container.
# class_ is used explicitly: a positional ('class', value) tuple is read by
# BeautifulSoup as a list of alternative class names, not as a key/value filter.
name = soup.find('div', class_='col-sm-6 product_main').find('h1').text
name

'A Light in the Attic'

In [49]:
# Product price: text of the first <p> inside the main product container.
# class_ is used explicitly: a positional ('class', value) tuple is read by
# BeautifulSoup as a list of alternative class names, not as a key/value filter.
price = soup.find('div', class_='col-sm-6 product_main').find('p').text
price

'£51.77'

In [58]:
# Stock availability text of the product.
# Only newlines are removed here; the padding spaces around the text stay.
# class_ is used explicitly: a positional ('class', value) tuple is read by
# BeautifulSoup as a list of alternative class names, not as a key/value filter.
availability = soup.find('p', class_='instock availability').text.replace('\n', '')
availability

'            In stock (22 available)    '

In [119]:
# Product description: locate the "product_description" header block first,
# then take the first <p> that follows it in the document.
description_header = soup.find('div', id='product_description', class_='sub-header')
description_p = description_header.find_next('p')
description = description_p.text
description


"It's hard to imagine a world without A Light in the Attic. This now-classic collection of poetry and drawings from Shel Silverstein celebrates its 20th anniversary with this special edition. Silverstein's humorous and creative verse can amuse the dowdiest of readers. Lemon-faced adults and fidgety kids sit still and read these rhythmic words and laugh and smile and love th It's hard to imagine a world without A Light in the Attic. This now-classic collection of poetry and drawings from Shel Silverstein celebrates its 20th anniversary with this special edition. Silverstein's humorous and creative verse can amuse the dowdiest of readers. Lemon-faced adults and fidgety kids sit still and read these rhythmic words and laugh and smile and love that Silverstein. Need proof of his genius? RockabyeRockabye baby, in the treetopDon't you know a treetopIs no safe place to rock?And who put you up there,And your cradle, too?Baby, I think someone down here'sGot it in for you. Shel, you never sounde

### Объединяем всё вместе — скрейпинг и парсинг информации о всех товарах одной страницы

In [120]:
# Four separate empty lists, one per scraped field, filled by the loop below
name, price, availability, description = [], [], [], []

In [121]:
# Empty dictionary that will receive all of the collected data
output = dict()

In [122]:
# Loop over the product links collected from the listing page.
for i in url_joined:
  # The timeout (in seconds) makes a stalled connection raise instead of hanging.
  response = requests.get(i, timeout=10)
  soup = BeautifulSoup(response.content, 'html.parser')

  # The title and the price are both read from this container.
  product_main = soup.find('div', class_='col-sm-6 product_main')

  # Product title. When an element is missing, find() returns None and the
  # attribute access raises AttributeError; '' is stored instead so that all
  # four lists keep the same length. Only AttributeError is caught, so
  # Ctrl-C and unrelated bugs are no longer swallowed.
  try:
    name.append(product_main.find('h1').text)
  except AttributeError:
    name.append('')

  # Product price: text of the first <p> inside the container.
  try:
    price.append(product_main.find('p').text)
  except AttributeError:
    price.append('')

  # Stock availability, with newlines removed and the padding spaces stripped.
  try:
    availability.append(soup.find('p', class_='instock availability').text.replace('\n', '').strip())
  except AttributeError:
    availability.append('')

  # Product description: the first <p> after the "product_description" header.
  try:
    description.append(soup.find('div', id='product_description', class_='sub-header').find_next('p').text)
  except AttributeError:
    description.append('')

# Build the dictionary once, after the loop, instead of on every iteration.
output = {'Name' : name, 'Price' : price, 'Availability' : availability, 'Description' : description}

In [123]:
# Display the collected data
output

{'Name': ['A Light in the Attic',
  'Tipping the Velvet',
  'Soumission',
  'Sharp Objects',
  'Sapiens: A Brief History of Humankind',
  'The Requiem Red',
  'The Dirty Little Secrets of Getting Your Dream Job',
  'The Coming Woman: A Novel Based on the Life of the Infamous Feminist, Victoria Woodhull',
  'The Boys in the Boat: Nine Americans and Their Epic Quest for Gold at the 1936 Berlin Olympics',
  'The Black Maria',
  'Starving Hearts (Triangular Trade Trilogy, #1)',
  "Shakespeare's Sonnets",
  'Set Me Free',
  "Scott Pilgrim's Precious Little Life (Scott Pilgrim #1)",
  'Rip it Up and Start Again',
  'Our Band Could Be Your Life: Scenes from the American Indie Underground, 1981-1991',
  'Olio',
  'Mesaerion: The Best Science Fiction Stories 1800-1849',
  'Libertarianism for Beginners',
  "It's Only the Himalayas"],
 'Price': ['£51.77',
  '£53.74',
  '£50.10',
  '£47.82',
  '£54.23',
  '£22.65',
  '£33.34',
  '£17.93',
  '£22.60',
  '£52.15',
  '£13.99',
  '£20.66',
  '£17.46',

In [124]:
# Create a pandas DataFrame from the collected data
df = pd.DataFrame(output)

In [125]:
# Display the DataFrame
df

Unnamed: 0,Name,Price,Availability,Description
0,A Light in the Attic,£51.77,In stock (22 available),It's hard to imagine a world without A Light i...
1,Tipping the Velvet,£53.74,In stock (20 available),"""Erotic and absorbing...Written with starling ..."
2,Soumission,£50.10,In stock (20 available),"Dans une France assez proche de la nôtre, un h..."
3,Sharp Objects,£47.82,In stock (20 available),"WICKED above her hipbone, GIRL across her hear..."
4,Sapiens: A Brief History of Humankind,£54.23,In stock (20 available),From a renowned historian comes a groundbreaki...
5,The Requiem Red,£22.65,In stock (19 available),Patient Twenty-nine.A monster roams the halls ...
6,The Dirty Little Secrets of Getting Your Dream...,£33.34,In stock (19 available),Drawing on his extensive experience evaluating...
7,The Coming Woman: A Novel Based on the Life of...,£17.93,In stock (19 available),"""If you have a heart, if you have a soul, Kare..."
8,The Boys in the Boat: Nine Americans and Their...,£22.60,In stock (19 available),For readers of Laura Hillenbrand's Seabiscuit ...
9,The Black Maria,£52.15,In stock (19 available),"Praise for Aracelis Girmay:""[Girmay's] every l..."


## Скрейпинг нескольких страниц

In [127]:
# Accumulators for the scraped fields, one entry per product.
name = []
price = []
availability = []
description = []

# Site root. Links below are resolved against the current page URL instead,
# but the name stays defined exactly as before.
url_1 = 'https://books.toscrape.com/'
# First listing page; replaced by the next page's URL at the end of each pass.
url = 'https://books.toscrape.com/catalogue/category/books_1/page-1.html'

while True:

  # The timeout (in seconds) makes a stalled connection raise instead of hanging.
  page = requests.get(url, timeout=10)
  soup = BeautifulSoup(page.content, 'html.parser')
  # Pagination element; None on the last page, which ends the loop below.
  next_page_link = soup.find('li', class_='next')
  result = soup.find_all('li', class_='col-xs-6 col-sm-4 col-md-3 col-lg-3')

  # Absolute product URLs. Each href is resolved against the URL of the
  # listing page it came from, so no prefix has to be sliced off by hand.
  url_joined = []
  for item in result:
    for container in item.find_all('div', class_='image_container'):
      url_joined.append(urllib.parse.urljoin(url, container.find('a').get('href')))

  for product_url in url_joined:
    response = requests.get(product_url, timeout=10)
    # A separate name keeps the listing-page soup from being overwritten.
    product_soup = BeautifulSoup(response.content, 'html.parser')

    # The title and the price are both read from this container.
    product_main = product_soup.find('div', class_='col-sm-6 product_main')

    # Product title. When an element is missing, find() returns None and the
    # attribute access raises AttributeError; '' is stored instead so that
    # all four lists keep the same length.
    try:
      name.append(product_main.find('h1').text)
    except AttributeError:
      name.append('')

    # Product price: text of the first <p> inside the container.
    try:
      price.append(product_main.find('p').text)
    except AttributeError:
      price.append('')

    # Stock availability, with newlines removed and the padding spaces stripped.
    try:
      availability.append(product_soup.find('p', class_='instock availability').text.replace('\n', '').strip())
    except AttributeError:
      availability.append('')

    # Product description: the first <p> after the "product_description" header.
    try:
      description.append(product_soup.find('div', id='product_description', class_='sub-header').find_next('p').text)
    except AttributeError:
      description.append('')

  if next_page_link is None:
    break

  # The "next" href is resolved against the current page URL.
  url = urllib.parse.urljoin(url, next_page_link.find('a')['href'])

# Build the dictionary once, after every page has been processed.
output = {'Name' : name, 'Price' : price, 'Availability' : availability, 'Description' : description}



In [128]:
# Build a DataFrame from the data collected across the pages and display it
df = pd.DataFrame(output)
df

Unnamed: 0,Name,Price,Availability,Description
0,A Light in the Attic,£51.77,In stock (22 available),It's hard to imagine a world without A Light i...
1,Tipping the Velvet,£53.74,In stock (20 available),"""Erotic and absorbing...Written with starling ..."
2,Soumission,£50.10,In stock (20 available),"Dans une France assez proche de la nôtre, un h..."
3,Sharp Objects,£47.82,In stock (20 available),"WICKED above her hipbone, GIRL across her hear..."
4,Sapiens: A Brief History of Humankind,£54.23,In stock (20 available),From a renowned historian comes a groundbreaki...
...,...,...,...,...
995,Alice in Wonderland (Alice's Adventures in Won...,£55.53,In stock (1 available),
996,"Ajin: Demi-Human, Volume 1 (Ajin: Demi-Human #1)",£57.06,In stock (1 available),High school student Kei Nagai is struck dead i...
997,A Spy's Devotion (The Regency Spies of London #1),£16.97,In stock (1 available),"In England’s Regency era, manners and elegance..."
998,1st to Die (Women's Murder Club #1),£53.98,In stock (1 available),"James Patterson, bestselling author of the Ale..."


In [131]:
# Save the DataFrame to books.json, indented by 4 spaces
df.to_json('books.json', indent=4)