In [12]:
import requests as re
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np

In [53]:
## Web scraping to get Webtoons manga views, rating, genre, chapters, summary

# Function to extract manga title
def get_title(soup):
    """Return the cleaned series title found on a parsed series page.

    Looks up the first ``<h1 class="subj">`` element, joins its text pieces
    with single spaces, removes every ``#`` character and replaces the
    typographic apostrophe with a plain ``'``.

    Args:
        soup: Parsed page object exposing a BeautifulSoup-style ``find``.

    Returns:
        The cleaned title, or ``""`` when the lookup raises AttributeError
        (typically because ``find`` returned ``None``).
    """
    try:
        heading = soup.find('h1', class_='subj')
        raw_title = heading.get_text(" ", strip=True)
        # Drop '#' markers and normalise the curly apostrophe.
        return raw_title.replace("#", "").replace("’", "'").strip()
    except AttributeError:
        return ""

# Function to extract manga genre
def get_genre(soup):
    """Return the genre label shown on a parsed series page.

    Reads the ``.string`` of the first ``<h2 class="genre">`` element and
    strips surrounding whitespace.

    Args:
        soup: Parsed page object exposing a BeautifulSoup-style ``find``.

    Returns:
        The genre text, or ``""`` when the element is missing or its
        ``.string`` is ``None`` (both surface as AttributeError).
    """
    try:
        genre_tag = soup.find("h2", attrs={"class": 'genre'})
        return genre_tag.string.strip()
    except AttributeError:
        return ""

# Function to extract chapters count
def get_chapters(new_soup):
    """Return the chapter marker text from a parsed series page.

    Reads the ``.string`` of the first ``<span class="tx">`` element, strips
    whitespace and removes every ``#`` character.

    Args:
        new_soup: Parsed page object exposing a BeautifulSoup-style ``find``.

    Returns:
        The cleaned chapter text (still a string), or ``""`` when the element
        is missing or has no single string child.
    """
    try:
        episode_tag = new_soup.find("span", attrs={"class": 'tx'})
        raw_chapter = episode_tag.string.strip()
        return raw_chapter.replace("#", "").strip()
    except AttributeError:
        return ""

# Function to extract views count
def get_views_count(soup):
    """Return the views counter text from a parsed series page.

    The page is searched for every ``<em class="cnt">`` element and the
    first match is taken as the views counter.

    Args:
        soup: Parsed page object exposing a BeautifulSoup-style ``find_all``.

    Returns:
        The stripped counter text, or ``""`` when there is no matching
        element or the element has no single string child.
    """
    try:
        counters = soup.find_all("em", attrs={"class": 'cnt'})
        return counters[0].string.strip()
    except (AttributeError, IndexError):
        # IndexError: no counters on the page; AttributeError: .string is None.
        # Either way, fall back to "" so one odd page cannot abort the scrape.
        return ""

# Function to extract subscriber count
def get_subscriber_count(soup):
    """Return the subscriber counter text from a parsed series page.

    The page is searched for every ``<em class="cnt">`` element and the
    second match is taken as the subscriber counter.

    Args:
        soup: Parsed page object exposing a BeautifulSoup-style ``find_all``.

    Returns:
        The stripped counter text, or ``""`` when fewer than two matching
        elements exist or the element has no single string child.
    """
    try:
        counters = soup.find_all("em", attrs={"class": 'cnt'})
        return counters[1].string.strip()
    except (AttributeError, IndexError):
        # IndexError: fewer than two counters; AttributeError: .string is None.
        # Either way, fall back to "" so one odd page cannot abort the scrape.
        return ""

# Function to extract rating
def get_rating(soup):
    """Return the average star rating text from a parsed series page.

    Reads the ``.string`` of the ``<em id="_starScoreAverage">`` element and
    strips surrounding whitespace.

    Args:
        soup: Parsed page object exposing a BeautifulSoup-style ``find``.

    Returns:
        The rating as a string, or ``""`` when the element is missing or has
        no single string child.
    """
    try:
        score_tag = soup.find("em", attrs={"id": '_starScoreAverage'})
        return score_tag.string.strip()
    except AttributeError:
        return ""

# Function to extract the daily info
def get_day_info(soup):
    """Return the release-schedule text from a parsed series page.

    Finds the first ``<p class="day_info">`` element, removes every nested
    ``<span>`` from it, then reads the remaining ``.string``.

    Note:
        The nested spans are removed with ``extract()``, so the ``soup``
        object passed in is modified by this call.

    Args:
        soup: Parsed page object exposing a BeautifulSoup-style ``find``.

    Returns:
        The stripped schedule text, or ``""`` when the paragraph is missing
        or does not reduce to a single string after span removal.
    """
    try:
        schedule = soup.find("p", attrs={"class": 'day_info'})

        # Detach nested spans so only the paragraph's own text remains.
        nested_spans = schedule.find_all('span')
        for nested in nested_spans:
            nested.extract()

        return schedule.string.strip()
    except AttributeError:
        return ""

# Function to extract manga summary
def get_summary(soup):
    """Return the series summary text from a parsed series page.

    Reads the ``.string`` of the first ``<p class="summary">`` element and
    strips surrounding whitespace.

    Args:
        soup: Parsed page object exposing a BeautifulSoup-style ``find``.

    Returns:
        The summary text, or the placeholder ``"Not Avaliable"`` when the
        element is missing or has no single string child.
    """
    try:
        summary_tag = soup.find("p", attrs={"class": 'summary'})
        return summary_tag.string.strip()
    except AttributeError:
        # Placeholder spelling is kept exactly as-is so the output is unchanged.
        return "Not Avaliable"

In [54]:
if __name__ == '__main__':
    # Scrape the genres listing page for series links, visit every series page,
    # collect its details and save them to "webtoon_data.csv".

    # Browser-like headers sent with every request (add your own user agent).
    HEADERS = ({'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/133.0.0.0 Safari/537.36', 'Accept-Language': 'en-US, en;q=0.5'})

    # Seconds to wait for a response; without a timeout a stalled
    # connection would block the notebook indefinitely.
    REQUEST_TIMEOUT = 30

    # The webpage URL
    URL = "https://www.webtoons.com/en/genres"

    # HTTP Request
    webpage = re.get(URL, headers=HEADERS, timeout=REQUEST_TIMEOUT)

    # Soup Object containing all data
    soup = BeautifulSoup(webpage.content, "html.parser")

    # Fetch links as List of Tag Objects
    links = soup.find_all("a", attrs={'class':'card_item'})

    # Keep the href of every card; anchors without an href are skipped
    # because they cannot be requested.
    links_list = [link.get('href') for link in links if link.get('href')]

    d = {"title":[], "genre":[], "chapters":[], "views":[], "subscriber":[], "rating":[], "day_info":[], "summary":[]}

    # Loop for extracting series details from each link
    for link in links_list:
        new_webpage = re.get(link, headers=HEADERS, timeout=REQUEST_TIMEOUT)

        new_soup = BeautifulSoup(new_webpage.content, "html.parser")

        # One value per column for this series page
        d['title'].append(get_title(new_soup))
        d['genre'].append(get_genre(new_soup))
        d['chapters'].append(get_chapters(new_soup))
        d['views'].append(get_views_count(new_soup))
        d['subscriber'].append(get_subscriber_count(new_soup))
        d['rating'].append(get_rating(new_soup))
        d['day_info'].append(get_day_info(new_soup))
        d['summary'].append(get_summary(new_soup))

    webtoon_df = pd.DataFrame.from_dict(d)

    # Turn empty titles into NaN so dropna can remove pages that yielded no
    # title. The result is assigned back: an in-place replace on a column
    # selection does not update the frame under pandas Copy-on-Write.
    webtoon_df['title'] = webtoon_df['title'].replace('', np.nan)
    webtoon_df = webtoon_df.dropna(subset=['title'])
    webtoon_df.to_csv("webtoon_data.csv", header=True, index=False)

In [55]:
# Preview the first rows of the per-series details scraped above.
webtoon_df.head()

Unnamed: 0,title,genre,chapters,views,subscriber,rating,day_info,summary
0,Love 4 a Walk,Romance,48,61M,2M,9.77,EVERY FRIDAY,Pam's heart has been broken one too many times...
1,The Remarried Empress,Fantasy,204,494.6M,4.3M,9.81,EVERY SUNDAY,Navier Ellie Trovi was an empress perfect in e...
2,To Whom It No Longer Concerns,Drama,33,23.9M,1.4M,9.44,EVERY THURSDAY,"Exploited by her brother, betrayed by her sist..."
3,Maybe Meant to Be,Romance,92,152.9M,2.2M,9.75,EVERY FRIDAY,"Jia Han, a 32-year-old freelancer with no work..."
4,Selfish Romance,Romance,32,19.4M,1.2M,9.74,EVERY FRIDAY,"Hyeondo and Yumin, an ordinary man and woman i..."


In [49]:
## Web scraping to get Webtoons manga author and likes info

# URL of the website you want to scrape
URL = 'https://www.webtoons.com/en/genres/drama?sortOrder=READ_COUNT'

# Headers for request
HEADERS = ({'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/133.0.0.0 Safari/537.36','Accept-Language':'en-US, en;q=0.5'})

# Seconds to wait for a response; without a timeout a stalled
# connection would block the notebook indefinitely.
REQUEST_TIMEOUT = 30

# HTTP Request
webpage = re.get(URL, headers=HEADERS, timeout=REQUEST_TIMEOUT)

soup = BeautifulSoup(webpage.content, 'html.parser')

# Each column is collected independently from the listing page, with titles
# cleaned like the series-page titles ('#' removed, curly apostrophe
# normalised) so they can serve as a join key.
# NOTE(review): rows are aligned purely by position; this assumes every card
# has exactly one title, author and likes element - confirm against the page.
d = {
    "title": [p.get_text().replace("#", "").replace("’", "'").strip() for p in soup.find_all('p', class_='subj')],
    "author": [p.text for p in soup.find_all('p', class_='author')],
    "likes": [p.text for p in soup.find_all('em', class_='grade_num')],
}

webtoon_info_df = pd.DataFrame.from_dict(d)
webtoon_info_df.to_csv("webtoon_info.csv",header=True, index=False)

In [50]:
# Preview the first rows of the title / author / likes listing table.
webtoon_info_df.head()

Unnamed: 0,title,author,likes
0,Love 4 a Walk,Nuria Sanguino,3.6M
1,The Remarried Empress,Alphatart / Sumpul,39.8M
2,To Whom It No Longer Concerns,LICO / fairydragon,1.8M
3,Maybe Meant to Be,honeyskein / damcho,12.4M
4,Selfish Romance,Gyogyo Park,1.4M


In [56]:
## Merging both scraped tables (the DataFrames behind the two CSV files) on title

# Left join: keep every scraped series row and attach author / likes
# wherever the listing table has a row with the same title.
merge_df = webtoon_df.merge(webtoon_info_df, how='left', on='title')

# Persist the combined table without the index column.
merge_df.to_csv("Webtoon.csv", index=False, header=True)

In [57]:
# Non-null count per column; because the merge is a left join, a lower count
# for author/likes would indicate titles missing from the listing table.
merge_df.count()

title         429
genre         429
chapters      429
views         429
subscriber    429
rating        429
day_info      429
summary       429
author        429
likes         429
dtype: int64

In [58]:
# Preview the first rows of the merged table.
merge_df.head()

Unnamed: 0,title,genre,chapters,views,subscriber,rating,day_info,summary,author,likes
0,Love 4 a Walk,Romance,48,61M,2M,9.77,EVERY FRIDAY,Pam's heart has been broken one too many times...,Nuria Sanguino,3.6M
1,The Remarried Empress,Fantasy,204,494.6M,4.3M,9.81,EVERY SUNDAY,Navier Ellie Trovi was an empress perfect in e...,Alphatart / Sumpul,39.8M
2,To Whom It No Longer Concerns,Drama,33,23.9M,1.4M,9.44,EVERY THURSDAY,"Exploited by her brother, betrayed by her sist...",LICO / fairydragon,1.8M
3,Maybe Meant to Be,Romance,92,152.9M,2.2M,9.75,EVERY FRIDAY,"Jia Han, a 32-year-old freelancer with no work...",honeyskein / damcho,12.4M
4,Selfish Romance,Romance,32,19.4M,1.2M,9.74,EVERY FRIDAY,"Hyeondo and Yumin, an ordinary man and woman i...",Gyogyo Park,1.4M


In [59]:
# Summary of the merged table; the output below reports count / unique / top /
# freq per column rather than numeric statistics.
merge_df.describe()

Unnamed: 0,title,genre,chapters,views,subscriber,rating,day_info,summary,author,likes
count,429,429,429,429,429,429.0,429,429,429,429
unique,429,15,134,258,396,155.0,9,429,401,327
top,Love 4 a Walk,Drama,7,2.2M,1.1M,9.72,COMPLETED,Pam's heart has been broken one too many times...,Ilkwon Ha,1.2M
freq,1,150,72,8,11,11.0,264,1,5,11
