In [1]:
from bs4 import BeautifulSoup 
import requests
import urllib.request
import time
import random
import os
from dotenv import load_dotenv

In [2]:
#환경변수 불러오기
load_dotenv(dotenv_path = '.env', verbose=True)
site_url = os.getenv('SITE_URL')
site_name = os.getenv('SITE_NAME')
header = os.getenv('HEADER')

In [3]:
#브랜드별 향수 목록 가져오기
def brand_perfume_crawler(site_url, header):
  input_keyword = input('향수 브랜드 영문 이름을 입력하세요.\n사이트 내에서 정확한 명을 찾아 입력해주세요!(단어간 띄어쓰기, 대소문자 중요) : ')
  keyword = input_keyword.replace(' ','-') # 공백 하이픈(-)처리

  try:
    link = (site_url + '/designers/{}.html').format(keyword)
    res = requests.get(link, headers = {'User-agent': header}) # 헤더 추가해서 429 에러 우회
    bs = BeautifulSoup(res.text, 'html.parser')

    all_perfume_url = []
    for i in bs.select('body div#main-content div#brands div.cell.text-left.prefumeHbox.px1-box-shadow div.flex-child-auto h3 a'):
      perfume_url = i["href"]
      all_perfume_url.append(perfume_url)
    for i in all_perfume_url:
      print(i)
  except AttributeError as e:
   print(e)

  return all_perfume_url

In [4]:
perfume_list = brand_perfume_crawler(site_url, header)

/perfume/Chanel/Antaeus-616.html
/perfume/Chanel/Bois-Noir-34568.html
/perfume/Chanel/Chanel-No-46-22520.html
/perfume/Chanel/Cuir-de-Russie-Parfum-47794.html
/perfume/Chanel/Le-1940-Beige-de-Chanel-45188.html
/perfume/Chanel/Le-1940-Bleu-de-Chanel-45187.html
/perfume/Chanel/Le-1940-Rouge-de-Chanel-45186.html
/perfume/Chanel/Une-Fleur-de-Chanel-4689.html
/perfume/Chanel/Allure-502.html
/perfume/Chanel/Allure-Eau-de-Parfum-176.html
/perfume/Chanel/Allure-Eau-Fraichissante-Pour-l-Ete-12448.html
/perfume/Chanel/Allure-Hair-Mist-46217.html
/perfume/Chanel/Allure-Homme-523.html
/perfume/Chanel/Allure-Homme-Eau-Fraichissante-Pour-l-Ete-12447.html
/perfume/Chanel/Allure-Homme-Edition-Blanche-2653.html
/perfume/Chanel/Allure-Homme-Edition-Blanche-Eau-de-Parfum-28942.html
/perfume/Chanel/Allure-Homme-Sport-607.html
/perfume/Chanel/Allure-Homme-Sport-Cologne-1004.html
/perfume/Chanel/Allure-Homme-Sport-Eau-Extreme-14669.html
/perfume/Chanel/Allure-Parfum-33506.html
/perfume/Chanel/Allure-Sensuel

In [5]:
# 특정 향수 정보 가져오기 
def perfume_info_crawler(perfume_url, header, site_name):  
    try:
        res = requests.get(perfume_url, headers = {'User-agent': header}) # 헤더 추가해서 429 에러 우회
        bs = BeautifulSoup(res.text, 'html.parser')
        perfume_info = {}
        perfume_keyword_list = []
        top_note_list = []
        attribute_error_count = 0

        # body 내부 main content 발췌
        main_content = bs.select_one('body div#main-content div.grid-x.grid-margin-x div.grid-x.bg-white.grid-padding-x.grid-padding-y')

        #향수 이름
        perfume_name = main_content.select_one('div#toptop > h1').text
        print('향수 이름 : {}'.format(perfume_name))

        #향수 이미지
        perfume_img = main_content.select_one('div.cell.small-12 div.grid-x.grid-margin-x.grid-margin-y div.cell.small-6.text-center div.cell.small-12 img')["src"]
        print('향수 이미지 : {}'.format(perfume_img))

        #향수 키워드 목록
        for i in main_content.select('div.cell.small-12 div.grid-x.grid-margin-x.grid-margin-y div.cell.small-6.text-center div.grid-x div.cell.accord-box'):
            keyword = i.select_one('div.accord-bar').text
            perfume_keyword_list.append(keyword)
        print('향수 키워드 목록 : {}'.format(perfume_keyword_list))

        # 향수 스토리 요약 & 본문
        perfume_story_raw= main_content.select_one('div.cell.small-12 div.grid-x.grid-margin-x.grid-margin-y div[itemprop=description]')
        perfume_story_summary = perfume_story_raw.select_one('p').text
        print('향수 스토리 요약 : {}'.format(perfume_story_summary))
        perfume_story_detail_raw = perfume_story_raw.select('div.' + site_name + '-blockquote p')
        perfume_story_detail_str = ''
        for i in perfume_story_detail_raw:
            perfume_story_detail_str += i.text+ ' \n'
        perfume_story_detail = perfume_story_detail_str[:-2]  #문자열 맨 끝 \n 제거
        print('향수 스토리 본문 : {}'.format(perfume_story_detail))

        #향수 노트(top/middle/bottom)
        perfume_notes_raw = main_content.select('div#pyramid div.cell div div[style^="display: flex"]')

        top_raw = perfume_notes_raw[0]
        top_note_list = [i.text for i in top_raw]
        print('향수 top 노트 목록 : {}'.format(top_note_list))
        
        if len(perfume_notes_raw) > 1:
            middle_raw = perfume_notes_raw[1]
            middle_note_list = [i.text for i in middle_raw]
            print('향수 middle 노트 목록 : {}'.format(middle_note_list))

            bottom_raw = perfume_notes_raw[2]
            bottom_note_list = [i.text for i in bottom_raw]
            print('향수 bottom 노트 목록 : {}'.format(bottom_note_list))

        print()

    except AttributeError as e:
        attribute_error_count += 1
        print(e)
        if attribute_error_count > 5 :
            return 0 


In [6]:
for perfume_url in perfume_list:
  time.sleep(random.randrange(30, 60))
  perfume_info_crawler(siperfume_url, site_nameheader)

MissingSchema: Invalid URL '/perfume/Chanel/Antaeus-616.html': No schema supplied. Perhaps you meant http:///perfume/Chanel/Antaeus-616.html?