In [1]:
import requests
from bs4 import BeautifulSoup
from webdriver_manager.chrome import ChromeDriverManager
import json
import os
from selenium.webdriver.common.keys import Keys
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException

import re
import time
import platform

In [2]:
from Webtoon import Webtoon 
from Episode import Episode 
from Platform import Platform 
from Crawler import Crawler 

In [3]:
# Ask webdriver_manager for a ChromeDriver binary. The call's return value
# (displayed as this cell's output) is the local path of the chromedriver
# executable.
# NOTE(review): the returned path is not stored or passed to webdriver.Chrome()
# in the next cell -- confirm whether this step is still required.
ChromeDriverManager().install()


'C:\\Users\\pyj78\\.wdm\\drivers\\chromedriver\\win64\\119.0.6045.105\\chromedriver-win32/chromedriver.exe'

In [4]:
# Start the Chrome session that is handed to NaverWebtoonCrawler below.
# NOTE(review): a later cell rebinds `driver` to a second webdriver.Chrome()
# without calling quit() on this one -- confirm the first browser is closed.
driver = webdriver.Chrome()

In [5]:
# Weekday keys; each one is appended to the listing URL as the `tab` query value.
days = ["mon", "tue", "wed", "thu", "fri", "sat", "sun"]

# Notebook-wide running webtoon counter. The crawler reads and increments it
# through `global id`, so the name (although it shadows the builtin) must stay.
id = 0

In [6]:
class NaverWebtoonCrawler(Crawler):
    """Crawl one weekday tab of comic.naver.com and save the results as JSON.

    For the given day the crawler collects every webtoon linked from the
    listing page, scrapes each webtoon's metadata and episode list, and writes
    them out through ``saveToJson``.

    Webtoon ids come from the notebook-level counter ``id``; ``crawl``
    increments it after each webtoon whose episodes were saved.

    NOTE(review): ``createFolder`` and ``saveToJson`` are not defined in this
    class; they are presumably inherited from ``Crawler`` -- confirm.
    NOTE(review): the hashed suffixes in the CSS class names (e.g.
    ``--sopnC``) look build-generated, so the selectors may need updating when
    the site changes -- confirm.
    """

    def __init__(self, driver, day):
        """Bind the crawler to a browser session and a weekday tab.

        Args:
            driver: Selenium WebDriver used for every page load.
            day: Weekday key appended to the listing URL (e.g. ``"mon"``).
        """
        self.driver = driver
        self.platformName = "naver"
        self.day = day

        self.platform = Platform()
        self.platform.setName(self.platformName)
        self.platform.setDay(self.day)

        self.createFolder()

    def crawl(self):
        """Scrape every webtoon on this day's tab and persist the results.

        Per webtoon, the episode list is saved to
        ``<platformName>/webtoons/nv<id>``; after the loop the accumulated
        platform object is saved to ``<platformName>/<platformName>_<day>``.
        Webtoons whose detail page cannot be scraped are reported and skipped.
        """
        global id

        mainPageUrl = "https://comic.naver.com/webtoon?tab=" + self.day
        self.driver.get(mainPageUrl)
        self.scrolDown(self.driver)

        # Gather all links up front: the loop below navigates away from the
        # listing page, so the list elements are only read here.
        webtoonUrlList = []
        webtoonElementList = self.driver.find_elements(By.XPATH, '//*[@id="content"]/div[1]/ul/li')
        for webtoonElement in webtoonElementList:
            wait = WebDriverWait(webtoonElement, 5)
            webtoonUrl = wait.until(EC.presence_of_element_located((By.CLASS_NAME, 'Poster__link--sopnC'))).get_attribute("href")
            webtoonUrlList.append(webtoonUrl)

        for webtoonUrl in webtoonUrlList:
            try:
                # Use this instance rather than a notebook-level variable so
                # the class works no matter what the caller names its object.
                webtoon = self.getWebtoon(webtoonUrl)
            except Exception:
                # `Exception` (not a bare except) keeps Ctrl-C / kernel
                # interrupts working during a long crawl.
                # NOTE(review): the message assumes the failure is an
                # age-verification wall; other scrape errors land here too.
                print("age verification " + webtoonUrl)
                continue

            self.platform.addWebtoon(webtoon)
            episodes = self.getEpisodes(webtoonUrl)

            episodeFilePath = os.path.join(self.platformName, "webtoons", "nv" + str(id))
            print(episodeFilePath)
            self.saveToJson(episodeFilePath, episodes)

            # Advance only after a webtoon was stored, so skipped webtoons do
            # not consume an id.
            id += 1

        webtoonFilePath = os.path.join(self.platformName, self.platformName + "_" + self.day)

        self.saveToJson(webtoonFilePath, self.platform)

    def getWebtoon(self, webtoonUrl):
        """Load a webtoon's detail page and return its metadata.

        Args:
            webtoonUrl: URL of the webtoon's episode-list page.

        Returns:
            A ``Webtoon`` populated via ``setWebtoon`` with the current value
            of the notebook-level ``id`` counter.

        Raises:
            Whatever ``WebDriverWait.until`` raises when one of the expected
            elements does not appear within 5 seconds.
        """
        global id

        self.driver.get(webtoonUrl)
        time.sleep(0.8)
        wait = WebDriverWait(self.driver, 5)

        title = wait.until(EC.presence_of_element_located((By.CLASS_NAME, 'EpisodeListInfo__title--mYLjC'))).text
        # title = re.sub(r'[^\w\s]', '', title).replace("\n휴재", "")

        author = wait.until(EC.presence_of_element_located((By.CLASS_NAME, 'ContentMetaInfo__link--xTtO6'))).text

        summary = wait.until(EC.presence_of_element_located((By.CLASS_NAME, 'EpisodeListInfo__summary--Jd1WG'))).text

        imageUrl = wait.until(EC.presence_of_element_located((By.CLASS_NAME, 'Poster__image--d9XTI'))).get_attribute("src")

        genre = wait.until(EC.presence_of_element_located((By.CLASS_NAME, 'TagGroup__tag--xu0OH'))).text

        serializedDay = wait.until(EC.presence_of_element_located((By.CLASS_NAME, 'ContentMetaInfo__info_item--utGrf'))).text
        # Only the first line of the info item is kept.
        serializedDay = serializedDay.split("\n")[0]

        webtoon = Webtoon()
        webtoon.setWebtoon(self.platformName, self.day, id, title, author, serializedDay, webtoonUrl, imageUrl, genre, summary)

        return webtoon

    def getEpisodes(self, webtoonUrl):
        """Walk the paginated episode list of a webtoon.

        Args:
            webtoonUrl: URL of the webtoon's episode-list page; it is loaded
                only when the browser is not already on it.

        Returns:
            A list of ``Episode`` objects in the order they were visited. The
            list is empty (and the URL is printed) when nothing was collected.
        """
        if self.driver.current_url != webtoonUrl:
            self.driver.get(webtoonUrl)

        episodes = []
        while True:
            wait = WebDriverWait(self.driver, 3)
            try:
                time.sleep(0.8)
                pageList = wait.until(EC.presence_of_all_elements_located((By.CLASS_NAME, 'Paginate__page--iRmGj')))
                # pageList = wait.until(EC.presence_of_all_elements_located((By.XPATH, '//*[@id="content"]/div[3]/div[2]/button')))[1:-1]
            except Exception:
                # Not a bare except, so kernel interrupts still propagate.
                print("can't reach pageList " + webtoonUrl)
                break

            for page in pageList:
                time.sleep(0.1)
                page.click()
                time.sleep(0.5)
                episodeElementList = self.driver.find_elements(By.XPATH, '//*[@id="content"]/div[3]/ul/li')

                for episodeElement in episodeElementList:
                    episodes.append(self._parseEpisode(episodeElement))

            nextButton = self.driver.find_element(By.CLASS_NAME, 'Paginate__next--F6rIk')
            # A set `disabled` attribute on the "next" button ends pagination.
            nextDisabled = nextButton.get_attribute("disabled")
            if nextDisabled:
                break
            nextButton.click()
            # Pause so the page buttons are not read before the next group of
            # pages has replaced the current one.
            time.sleep(0.3)

        if not episodes:
            print(webtoonUrl)
        return episodes

    def _parseEpisode(self, episodeElement):
        """Build an ``Episode`` from one ``<li>`` of the episode list."""
        wait = WebDriverWait(episodeElement, 3)
        title = wait.until(EC.presence_of_element_located((By.CLASS_NAME, 'EpisodeListList__title--lfIzU'))).text
        url = wait.until(EC.presence_of_element_located((By.CLASS_NAME, 'EpisodeListList__link--DdClU'))).get_attribute("href")
        thumbnailArea = wait.until(EC.presence_of_element_located((By.CLASS_NAME, 'EpisodeListList__thumbnail_area--EL1aw')))
        imageUrl = thumbnailArea.find_element(By.TAG_NAME, "img").get_attribute("src")
        publicationDate = wait.until(EC.presence_of_element_located((By.CLASS_NAME, 'date'))).text

        return Episode(title, url, imageUrl, publicationDate)

    def scrolDown(self, driver):
        """Send the END key to the page body of ``driver``.

        Args:
            driver: WebDriver whose current page should be scrolled.
        """
        body = driver.find_element(By.CSS_SELECTOR, 'body')
        body.send_keys(Keys.END)

In [7]:
# Run one crawl per weekday tab, reusing the single shared browser session.
# The instance is deliberately bound to the notebook-level name `crawler`
# on every iteration; keep that name when editing this cell.
for day in days:
    crawler = NaverWebtoonCrawler(driver, day)
    crawler.crawl()

Folder 'C:\Users\pyj78\Desktop\webtoon\naver' created successfully
Folder 'naver\webtoons' created successfully
naver\webtoons\nv0
naver\webtoons\nv1
naver\webtoons\nv2
naver\webtoons\nv3
naver\webtoons\nv4
naver\webtoons\nv5
naver\webtoons\nv6
naver\webtoons\nv7
naver\webtoons\nv8
age verification https://comic.naver.com/webtoon/list?titleId=796867&tab=mon
age verification https://comic.naver.com/webtoon/list?titleId=817859&tab=mon
naver\webtoons\nv9
naver\webtoons\nv10
naver\webtoons\nv11
naver\webtoons\nv12
naver\webtoons\nv13
naver\webtoons\nv14
age verification https://comic.naver.com/webtoon/list?titleId=813552&tab=mon
naver\webtoons\nv15
naver\webtoons\nv16
naver\webtoons\nv17
naver\webtoons\nv18
naver\webtoons\nv19
naver\webtoons\nv20
naver\webtoons\nv21
naver\webtoons\nv22
naver\webtoons\nv23
naver\webtoons\nv24
naver\webtoons\nv25
naver\webtoons\nv26
naver\webtoons\nv27
naver\webtoons\nv28
naver\webtoons\nv29
naver\webtoons\nv30
naver\webtoons\nv31
naver\webtoons\nv32
naver\w

In [42]:
# Directory (relative to the notebook's working directory) that is scanned
# for the per-day JSON files patched in the cells below.
directoryPath = "naver"

# Names of every entry in that directory, in whatever order os.listdir returns.
fileList = os.listdir(directoryPath)

In [43]:
# Keep only the JSON files from the directory listing built in the previous
# cell. That cell names the listing `fileList`; the former `file_list`
# reference raised NameError on a fresh kernel.
jsonFiles = [file for file in fileList if file.endswith('.json')]
# Debug override to patch a single day only:
# jsonFiles = ['naver_fri.json']

In [50]:
# Back-fill missing poster URLs in the per-day JSON files.
#
# Every pass reloads each file, asks getImageUrl() for every webtoon whose
# "imageUrl" is null, and writes the file back. Passes repeat while any null
# entry was seen, because getImageUrl() can itself return None.
#
# NOTE: getImageUrl() and saveToJson() are defined in cells further down in
# this notebook; those cells must be executed before this one.
driver = webdriver.Chrome()

# Upper bound on retry passes so a poster that can never be resolved does not
# keep the loop (and the requests to the site) running forever.
MAX_PASSES = 10

flag = True
passCount = 0
while flag and passCount < MAX_PASSES:
    flag = False
    passCount += 1
    for jsonFile in jsonFiles:
        # Print the day part of e.g. "naver_fri.json" as a progress marker.
        print(jsonFile.split("_")[1].split(".")[0])
        filePath = os.path.join(directoryPath, jsonFile)
        with open(filePath, 'r', encoding="utf-8") as file:
            dayMetaData = json.load(file)

        webtoonList = dayMetaData["webtoonList"]
        for webtoon in webtoonList:
            if webtoon["imageUrl"] is None:
                flag = True
                webtoon["imageUrl"] = getImageUrl(webtoon["url"])

        # Rewrite the file only after the read handle above has been closed.
        saveToJson(filePath, dayMetaData)

if flag:
    print("stopped after " + str(MAX_PASSES) + " passes; some imageUrl values may still be missing")

fri
mon
sat
sun
thu
tue
wed
https://image-comic.pstatic.net/webtoon/769209/thumbnail/thumbnail_IMAG21_3511dcdd-6e33-4171-8839-598d6d266215.jpg
https://image-comic.pstatic.net/webtoon/817081/thumbnail/thumbnail_IMAG21_439e78c2-e71e-4671-ad2f-703327c0bd8d.jpeg
https://image-comic.pstatic.net/webtoon/747269/thumbnail/thumbnail_IMAG21_aabd9952-ff45-47a2-a543-33f19a5c6708.jpg
https://image-comic.pstatic.net/webtoon/811721/thumbnail/thumbnail_IMAG21_9a2a959a-666b-4156-8e4f-db64dfe319c6.jpg
https://image-comic.pstatic.net/webtoon/804862/thumbnail/thumbnail_IMAG21_f1b2f920-5c10-4155-a547-0c25a7132d8f.jpeg
https://image-comic.pstatic.net/webtoon/807178/thumbnail/thumbnail_IMAG21_28fcfb22-aae7-41b4-8767-05f6dd176b03.jpg
https://image-comic.pstatic.net/webtoon/814826/thumbnail/thumbnail_IMAG21_1de7535f-9088-4f21-8b16-413cb1e66307.jpg
https://image-comic.pstatic.net/webtoon/817945/thumbnail/thumbnail_IMAG21_120ad315-8518-4430-bb30-739654b49635.jpg
https://image-comic.pstatic.net/webtoon/717481/thu

In [48]:
def getImageUrl(url):
    """Open ``url`` in the shared browser and return the poster image's ``src``.

    Uses the notebook-level ``driver``. The value is printed when it is not
    None and is returned either way.

    Args:
        url: Address of the webtoon page to load.

    Returns:
        The ``src`` attribute of the element with class
        ``Poster__image--d9XTI``, or None when the attribute is absent.
    """
    driver.get(url)
    posterWait = WebDriverWait(driver, 5)
    # Fixed pause after navigation before the poster element is queried.
    time.sleep(1.5)
    posterLocator = (By.CLASS_NAME, 'Poster__image--d9XTI')
    posterElement = posterWait.until(EC.presence_of_element_located(posterLocator))
    imageUrl = posterElement.get_attribute("src")
    if imageUrl is None:
        return imageUrl
    print(imageUrl)
    return imageUrl

In [35]:
def saveToJson(fileName, data):
    """Write ``data`` to ``fileName`` as UTF-8 JSON with 4-space indentation.

    Non-ASCII characters are written as-is, and objects that ``json`` cannot
    encode natively are converted through ``serialize``.

    Args:
        fileName: Path of the file to create or overwrite.
        data: Object to dump.
    """
    dumpOptions = {
        "indent": 4,
        "ensure_ascii": False,
        "default": serialize,
    }
    with open(fileName, "w", encoding='utf-8') as outputHandle:
        json.dump(data, outputHandle, **dumpOptions)
    
def serialize(obj):
    """Fallback encoder for ``json.dump``: expose an object's attribute dict.

    Args:
        obj: Object that the JSON encoder could not serialize natively.

    Returns:
        ``obj.__dict__``.

    Raises:
        TypeError: If ``obj`` has no ``__dict__``. A ``default`` hook is
            expected to signal unsupported objects with TypeError rather than
            leaking an AttributeError.
    """
    try:
        return obj.__dict__
    except AttributeError:
        raise TypeError(
            "Object of type " + type(obj).__name__ + " is not JSON serializable"
        ) from None

In [None]:
# Scratch cell (never executed: its prompt is `In [None]`).
# NOTE(review): `file_path` is not assigned in any cell visible in this
# notebook, so this read would raise NameError as written -- confirm which
# path was intended, or delete the cell.
with open(file_path, 'r') as file:
    content = file.read()
    print(content)

# Open a file for writing
with open('output.txt', 'w') as file:
    file.write('Hello, World!')