In [19]:
# import modules
import datetime as dt
import os
import time as time
from urllib.parse import urljoin

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
import scipy.special as special
import scipy.stats as stats
import seaborn as sns
from bs4 import BeautifulSoup

Yes24 Bestseller Data in Dataframe

In [20]:
# get bestseller links
# Fetch the Yes24 bestseller page and collect the product-page URL of every listed book.
yes24_url = 'http://www.yes24.com/24/Category/BestSeller'
result = requests.get(yes24_url, timeout=30)
result.raise_for_status()  # stop here instead of parsing an HTTP error page
raw_html = BeautifulSoup(result.text, 'html.parser')

# The list entries are looked up inside the first <ol> element of the page.
a_html = raw_html.find('ol')
if a_html is None:
    raise RuntimeError('No <ol> element found on ' + yes24_url)
list_yes24_bestseller_data = a_html.find_all('p', class_='copy')

# create list of all bestseller urls
yes24_bestseller_urls = []
for i in list_yes24_bestseller_data:
    urls_in_html = i.find('a')
    # Entries without a link cannot be followed, so they are skipped.
    if urls_in_html is None or not urls_in_html.get('href'):
        continue
    # urljoin resolves both root-relative ("/Product/...") and absolute hrefs,
    # where plain string concatenation would produce "//" or a broken URL.
    yes24_bestseller_urls.append(urljoin('http://www.yes24.com/', urls_in_html.get('href')))

In [21]:
# set frame for data
data = []

# Translation table that deletes the Korean year/month/day characters from the date text.
_DATE_MARKERS = str.maketrans('', '', '월년일')


def _table_value(page, label):
    """Return the text of the 'txt lastCol' cell in the table row headed by `label`.

    Returns None when the page has no such header or the row has no such cell.
    """
    header = page.find(name='th', class_='txt', string=label)
    if header is None:
        return None
    cell = header.parent.find(name='td', class_='txt lastCol')
    return cell.string if cell is not None else None


# get data for each best seller, create 2D array
for url in yes24_bestseller_urls:
    result = requests.get(url, timeout=30)  # get html
    result.raise_for_status()
    raw_html = BeautifulSoup(result.text, 'html.parser')

    title = raw_html.find(name='h2', class_='gd_name').string

    author_wrapper = raw_html.find(name='span', class_='gd_auth')
    author = author_wrapper.find('a').string

    publisher_wrapper = raw_html.find(name='span', class_='gd_pub')
    publisher = publisher_wrapper.find('a').string

    published_date_str = raw_html.find(name='span', class_='gd_date').string
    published_date_num = published_date_str.translate(_DATE_MARKERS)

    # Caution: where possible, look values up by a class/tag/id that matches ONLY the
    # wanted value, because the value that gets picked up changes when the page HTML changes.
    # The original title is optional, so it defaults to None.
    origin = None
    origin_wrapper = raw_html.find(name='span', class_='gd_orgin')
    if origin_wrapper is not None:
        origin_link = origin_wrapper.find('a')
        if origin_link is not None:
            origin = origin_link.string

    # Review fields are reset on every iteration so that a book without reviews does not
    # silently inherit the values scraped for the previous book.
    yes24_review_score = None
    yes24_review_score_tag = raw_html.find(name='em', class_='yes_b')
    if yes24_review_score_tag is not None:
        yes24_review_score = yes24_review_score_tag.string

    yes24_review_number = None
    review_count_wrapper = raw_html.find(name='span', class_='gd_reviewCount')
    if review_count_wrapper is not None and review_count_wrapper.a is not None:
        yes24_review_number_tag = review_count_wrapper.a.em
        if yes24_review_number_tag is not None:
            yes24_review_number = yes24_review_number_tag.string

    # The first two 'yes_m' <em> tags are read as fixed price and selling price.
    price_tags = raw_html.find_all(name='em', class_='yes_m')
    yes24_fixed_price = price_tags[0].string
    yes24_selling_price = price_tags[1].string

    ISBN13 = _table_value(raw_html, 'ISBN13')
    ISBN10 = _table_value(raw_html, 'ISBN10')

    # Keep only the numeric characters of the sales-index text.
    sales_tag = raw_html.find(name='span', class_='gd_sellNum')
    yes24_sales_text = sales_tag.text if sales_tag is not None else ''
    yes24_sales_index = ''.join(ch for ch in yes24_sales_text if ch.isnumeric())

    # The last link next to the 'bl_dot bgYUI' marker is taken as the genre name.
    genre_list = raw_html.find(name='em', class_='bl_dot bgYUI').parent
    yes24_final_genre_name = genre_list.find_all('a')[-1].string

    yes24_bestseller_data = [
        title, author, publisher, published_date_num, origin,
        yes24_review_score, yes24_review_number,
        yes24_fixed_price, yes24_selling_price,
        ISBN13, ISBN10, yes24_sales_index, yes24_final_genre_name,
    ]
    data.append(yes24_bestseller_data)

In [22]:
# Assemble the scraped rows into a DataFrame indexed by position in the scraped list (from 1).
# TODO: change the column names to all caps or all lowercase
columns = [
    'Title', 'Author', 'Publisher', 'Published_date', 'Origin',
    'Yes24_Review_score', 'Yes24_Review_number',
    'Yes24_Fixed_price', 'Yes24_Selling_price',
    'ISBN13', 'ISBN10', 'Yes24_sales_index', 'Yes24_Final_genre_name',
]
number_bestsellers = len(yes24_bestseller_urls)
rank = np.arange(1, number_bestsellers + 1)

# Parse the space-separated "year month day" text into datetimes while building the frame.
yes24_dataframe = pd.DataFrame(data=data, index=rank, columns=columns).assign(
    Published_date=lambda frame: pd.to_datetime(frame['Published_date'], format='%Y %m %d')
)

Aladin Bestseller Data in CSV

In [23]:
# Get excel file from Aladin (TODO: build the year/month/week parameters with datetime)
Aladin_URL = 'https://www.aladin.co.kr/shop/common/wbest_excel.aspx?BestType=Bestseller&BranchType=1&CID=0&Year=2023&Month=2&Week=3' # the link changes every week

req = requests.get(Aladin_URL, timeout=30)
req.raise_for_status()  # do not save an HTTP error page as if it were the data file

Aladin_excel_name = '알라딘_주간+베스트_국내도서_2023년2월3주_20230226.csv' # TODO: generalize & edit later

# Download into the project folder explicitly instead of relying on the current working
# directory happening to be that folder.
aladin_project_dir = r'H:\내 드라이브\파이썬\Analyze_Bestseller_Project'
aladin_download_path = os.path.join(aladin_project_dir, Aladin_excel_name)
aladin_final_path = os.path.join(aladin_project_dir, 'aladin_bestseller_data.csv')

with open(aladin_download_path, 'wb') as f: # binary write; `with` closes the file
    for chunk in req.iter_content(chunk_size=9000):
        if chunk:
            f.write(chunk)

# os.replace overwrites an existing target, so re-running the cell does not fail the way
# os.rename does on Windows when last run's file is still there.
os.replace(aladin_download_path, aladin_final_path)

Kyobo Bestseller Data in CSV

In [24]:
from selenium import webdriver
from selenium.webdriver import ActionChains
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

In [25]:
# Get excel file from Kyobobook with Selenium
import shutil

Kyobo_URL = "https://product.kyobobook.co.kr/bestseller/online?period=001#?page=1&per=20&ymw=&period=001&saleCmdtClstCode=&dsplDvsnCode=000&dsplTrgtDvsnCode=001&saleCmdtDsplDvsnCode="

# Where the file is picked up after the download, and where it ends up.
kyobo_download_path = r'C:\Users\seohy\Downloads\교보문고_온라인_베스트셀러_상품리스트.xlsx'
kyobo_final_path = r'H:\내 드라이브\파이썬\Analyze_Bestseller_Project\kyobo_bestseller_data.xlsx' # TODO: append the date to the file name

driver = webdriver.Chrome(service=Service(ChromeDriverManager().install())) # TODO: get headless mode working
try:
    driver.get(url=Kyobo_URL)
    driver.implicitly_wait(time_to_wait=3)

    download_excel_button = driver.find_element(By.XPATH, '//*[@id="excel_btn"]')
    download_excel_button.click()

    # Fixed pause to give the download time to finish before the browser is shut down.
    time.sleep(10)
finally:
    # quit() ends the whole driver session even when a step above raised;
    # close() would only close the current window.
    driver.quit()

# Move straight to the final name. The earlier move-then-rename sequence failed on re-run
# because os.rename refuses to overwrite an existing file on Windows.
shutil.move(kyobo_download_path, kyobo_final_path)


Bring All Three Files Together 

In [26]:
# Locations of the renamed Aladin and Kyobo exports.
aladin_path, kyobo_path = (
    r'H:\내 드라이브\파이썬\Analyze_Bestseller_Project\aladin_bestseller_data.csv',
    r'H:\내 드라이브\파이썬\Analyze_Bestseller_Project\kyobo_bestseller_data.xlsx',
)

# Read each export into its own DataFrame.
aladin_dataframe, kyobo_dataframe = pd.read_csv(aladin_path), pd.read_excel(kyobo_path)

  warn("Workbook contains no default style, apply openpyxl's default")


In [27]:
# Clean and reorganize yes24 dataframe:
# index the rows by ISBN13 (as text) and keep the listed columns in the listed order.
yes24_dataframe_columns_new_order = [
    'Title', 'Author', 'Publisher', 'Published_date', 'Origin',
    'Yes24_Review_score', 'Yes24_Review_number',
    'Yes24_Fixed_price', 'Yes24_Selling_price',
    'Yes24_sales_index', 'Yes24_Final_genre_name',
]
yes24_dataframe = (
    yes24_dataframe
    .astype({'ISBN13': str})
    .set_index('ISBN13')
)[yes24_dataframe_columns_new_order]


In [28]:
# Clean and organize Aladin Dataframe
# The row labelled 1000 is discarded (NOTE(review): presumably a non-book trailing row —
# confirm against the export). errors='ignore' keeps the cell from raising KeyError when
# the export has no such row.
aladin_dataframe = aladin_dataframe.drop(index=1000, errors='ignore')

# ISBN13 becomes the text index. Only a trailing ".0" is stripped; the earlier
# unanchored replace removed ".0" wherever it occurred in the value.
aladin_dataframe['ISBN13'] = (
    aladin_dataframe['ISBN13']
    .astype(str)
    .str.replace(r'\.0$', '', regex=True)
)
aladin_dataframe = aladin_dataframe.set_index('ISBN13')

# Rename the Korean headers to English, Aladin-prefixed names and keep just those columns.
aladin_column_names = {
    '정가': 'Aladin_Fixed_price',
    '판매가': 'Aladin_Selling_price',
    '마일리지': 'Aladin_Mileage',
    '세일즈포인트': 'Aladin_Sales_Point',
}
aladin_dataframe = aladin_dataframe.rename(columns=aladin_column_names)
picked_aladin_dataframe = aladin_dataframe[list(aladin_column_names.values())]
picked_aladin_dataframe


Unnamed: 0_level_0,Aladin_Fixed_price,Aladin_Selling_price,Aladin_Mileage,Aladin_Sales_Point
ISBN13,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
9791168473690,7200,6480,360점,529205.0
9791169794930,20000,18000,"1,000점",360470.0
9791198013026,18000,16200,900점,145980.0
9791197389450,17000,15300,850점,116360.0
9791169305044,16000,14400,800점,64760.0
...,...,...,...,...
9791162240854,15000,13500,750점,3245.0
9791156467427,30000,27000,"1,500점",3210.0
9791159034596,2900,2610,140점,3080.0
9791197230271,14000,12600,700점,3005.0


In [29]:
# Clean and organize Kyobo Dataframe:
# rename the Korean headers to English, Kyobo-prefixed names and keep just those columns.
kyobo_column_names = {
    '정가': 'Kyobo_Fixed_price',
    '판매가': 'Kyobo_Selling_price',
    '적립예정포인트': 'Kyobo_Sales_Point',
    '분야': 'Kyobo_Genre',
}
kyobo_dataframe = kyobo_dataframe.rename(columns=kyobo_column_names)
picked_kyobo_dataframe = kyobo_dataframe[list(kyobo_column_names.values())]
picked_kyobo_dataframe

Unnamed: 0,Kyobo_Fixed_price,Kyobo_Selling_price,Kyobo_Sales_Point,Kyobo_Genre
0,18000,16200,900,자기계발
1,19000,19000,570,시그니처 향
2,7200,6480,360,자기계발
3,17000,15300,850,자기계발
4,19000,17100,950,경제/경영
...,...,...,...,...
986,18900,17010,940,외국어
987,16800,15120,840,경제/경영
988,13500,12150,670,어린이(초등)
989,9800,8820,490,외국어


In [30]:
# Keep only the Aladin rows whose ISBN13 also appears in the Yes24 frame.
# Index.isin builds the boolean mask in one pass instead of rebuilding the Yes24 index
# list for every Aladin row.
aladin_filter = picked_aladin_dataframe.index.isin(yes24_dataframe.index)
picked_aladin_dataframe = picked_aladin_dataframe[aladin_filter]

# TODO: the Kyobo frame is not merged yet because its index is not ISBN; an earlier,
# disabled attempt matched its '상품명' column against the Yes24 'Title' column.

# Place the Yes24 and Aladin columns side by side, aligned on the ISBN13 index.
adding_frames = [yes24_dataframe, picked_aladin_dataframe]
total_bestseller_data = pd.concat(adding_frames, axis=1)

# The `encoding` keyword of to_excel was never used by pandas, is deprecated in 1.5 and
# removed in 2.0, so it is omitted here.
total_bestseller_data.to_excel('total_bestseller_data.xlsx')


  return func(*args, **kwargs)


In [31]:
# Data collection is complete; the combined frame displayed below still needs cleaning.
total_bestseller_data

Unnamed: 0_level_0,Title,Author,Publisher,Published_date,Origin,Yes24_Review_score,Yes24_Review_number,Yes24_Fixed_price,Yes24_Selling_price,Yes24_sales_index,Yes24_Final_genre_name,Aladin_Fixed_price,Aladin_Selling_price,Aladin_Mileage,Aladin_Sales_Point
ISBN13,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
9791198013026,김미경의 마흔 수업,김미경,어웨이크북스,2023-02-15,,10.0,28,"18,000원",16200,471210,처세술/삶의 자세,18000,16200,900점,145980.0
9791197389450,1퍼센트 부자의 법칙,사이토 히토리,나비스쿨(NAVI SCHOOL),2023-01-30,齋藤一人の絶對成功する千回の法則,10.0,4,"17,000원",15300,191115,처세술/삶의 자세,17000,15300,850점,116360.0
9791169794930,슬램덩크 리소스-THE FIRST SLAM DUNK re:SOURCE,이노우에 타케히코,대원,2023-02-16,THE FIRST SLAM DUNK re:SOURCE,9.7,36,"20,000원",18000,310362,스포츠,20000,18000,"1,000점",360470.0
9791191521221,K 배터리 레볼루션,박순혁,지와인,2023-02-20,,10.0,7,"19,000원",17100,289359,경제전망,19000,17100,950점,67835.0
9791190538510,만일 내가 인생을 다시 산다면 (10만 부 기념 스페셜 에디션),김혜남,메이븐,2022-11-11,,9.6,73,"17,200원",15480,688929,노년 / 죽음,17200,15480,860점,202180.0
9791168473690,세이노의 가르침,세이노,데이원,2023-03-02,,9.4,10,"7,200원",6480,316608,처세술/삶의 자세,7200,6480,360점,529205.0
9791130642147,효기심의 권력으로 읽는 세계사 - 유럽편,효기심(최영효),다산초당,2023-03-03,,10.0,1,"19,800원",17820,65250,세계사/세계문화,19800,17820,990점,14000.0
9791197871269,주식 시세의 비밀,정재호,프런트페이지,2023-02-20,,10.0,2,"22,000원",19800,191778,주식/증권,22000,19800,"1,100점",107270.0
9788997575169,원씽 THE ONE THING,게리 켈러,비즈니스북스,2013-08-30,The One Thing,8.8,161,"14,000원",12600,492360,처세술/삶의 자세,14000,12600,700점,160813.0
9791161571188,불편한 편의점,김호연,나무옆의자,2021-04-20,,9.2,425,"14,000원",12600,1162683,한국 장편소설,14000,12600,700점,601933.0
