In [1]:
import requests as re
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np

In [12]:
## Web scraping to get Webtoons manga views, rating, genre, chapters, summary

# Function to extract title id
def get_title_id(link):
    """Return the value of the ``title_no`` query parameter of a title URL.

    The URL's query string is parsed properly, so the id is found wherever
    ``title_no`` appears among the parameters and any other parameters are
    not leaked into the result.

    Args:
        link: URL string of a title page (e.g. ``...list?title_no=123``).

    Returns:
        The ``title_no`` value as a string, or ``""`` when ``link`` is not a
        string, cannot be parsed, or carries no ``title_no`` parameter.
    """
    # Function-scope import so this cell does not depend on the import cell.
    from urllib.parse import parse_qs, urlsplit

    if not isinstance(link, str):
        return ""

    try:
        query = urlsplit(link).query
        values = parse_qs(query).get("title_no")
    except ValueError:
        # urlsplit/parse_qs reject some malformed URLs; treat as "no id".
        return ""

    return values[0] if values else ""
    
# Function to extract manga title
def get_title(soup):
    """Return the cleaned title text of a title page.

    Reads the text of the first ``<h1 class="subj">`` element, removes ``#``
    characters, replaces the typographic apostrophe with a plain ``'`` and
    strips surrounding whitespace.

    Args:
        soup: Parsed page exposing a BeautifulSoup-style ``find`` method.

    Returns:
        The cleaned title, or ``"Not Available"`` when the heading is missing
        or cannot be read.
    """
    try:
        heading = soup.find('h1', class_='subj')
        raw_title = heading.get_text(" ", strip=True)
        clean_title = raw_title.replace("#", "").replace("’", "'").strip()

    except (AttributeError, ValueError, TypeError):
        # Same sentinel spelling as the other getters ("Not Available").
        clean_title = "Not Available"

    return clean_title

# Function to extract manga genre
def get_genre(soup):
    """Return the stripped text of the first ``<h2 class="genre">`` element.

    Args:
        soup: Parsed page exposing a BeautifulSoup-style ``find`` method.

    Returns:
        The genre string, or ``"Not Available"`` if the element or its
        single string child is missing.
    """
    fallback = "Not Available"
    try:
        genre_heading = soup.find("h2", attrs={"class": "genre"})
        return genre_heading.string.strip()
    except (AttributeError, ValueError, TypeError):
        return fallback


# Function to extract manga authors
def get_authors(soup):
    """Return the texts of all ``<h3 class="title">`` elements, comma-joined.

    Args:
        soup: Parsed page exposing a BeautifulSoup-style ``find_all`` method.

    Returns:
        The stripped element texts joined with ``", "`` (an empty string when
        no element matches), or ``"Not Available"`` if reading them fails.
    """
    try:
        names = []
        for heading in soup.find_all("h3", class_="title"):
            names.append(heading.text.strip())
        authors = ", ".join(names)

    except (AttributeError, ValueError, TypeError):
        authors = "Not Available"

    return authors


# Function to extract the released days
def get_weekdays(soup):
    """Return the release day read from ``<p class="day_info">``.

    The element text is stripped first. ``"COMPLETED"`` maps to
    ``"Released"``; otherwise the second whitespace-separated token is
    returned (e.g. ``"EVERY FRIDAY"`` -> ``"FRIDAY"``).

    Args:
        soup: Parsed page exposing a BeautifulSoup-style ``find`` method.

    Returns:
        ``"Released"``, the second token of the schedule text, or
        ``"Not Available"`` when the element is missing or its text has
        fewer than two tokens.
    """
    try:
        schedule = soup.find("p", attrs={"class": 'day_info'}).text.strip()
        if schedule == "COMPLETED":
            week = "Released"
        else:
            # split() without arguments ignores leading and repeated whitespace,
            # so padded text still yields the intended token.
            week = schedule.split()[1]

    except (AttributeError, IndexError, ValueError, TypeError):
        # IndexError: the text had a single token, so there is no day to return.
        week = "Not Available"

    return week
    
# Function to extract chapters count
def get_length(soup):
    """Return the number read from the first ``<span class="tx">`` element.

    The element's string is stripped and ``#`` characters are removed; the
    remainder is converted to ``int`` only when it consists of digits.

    Args:
        soup: Parsed page exposing a BeautifulSoup-style ``find`` method.

    Returns:
        The integer value, or ``None`` when the element or its string is
        missing, the text is not purely digits, or conversion fails.
    """
    try:
        span = soup.find("span", attrs={"class": "tx"})
        if not (span and span.string):
            return None

        digits = span.string.strip().replace("#", "")
        if digits.isdigit():
            return int(digits)
        return None
    except (AttributeError, ValueError, TypeError):
        # Missing elements and invalid conversions both map to None.
        return None

# Function to extract subscriber count
def get_subscriber_count(soup):
    """Return the subscriber count from the second ``<em class="cnt">``.

    The text is stripped, thousands separators (``,``) are removed, and a
    trailing ``K``/``M``/``B`` suffix (case-insensitive) scales the number by
    1e3/1e6/1e9 respectively.

    Args:
        soup: Parsed page exposing a BeautifulSoup-style ``find_all`` method.

    Returns:
        The count as a ``float``, or ``None`` when fewer than two counters
        exist, the text is missing, or it cannot be parsed as a number.
    """
    multipliers = {"K": 1000, "M": 1000000, "B": 1000000000}
    try:
        counters = soup.find_all("em", attrs={"class": 'cnt'})
        # Index 1 is the subscriber counter; index 0 is read by get_views_count.
        text = counters[1].string.strip().replace(",", "")
        suffix = text[-1:].upper()
        if suffix in multipliers:
            subscriber = float(text[:-1]) * multipliers[suffix]
        else:
            subscriber = float(text)

    except (AttributeError, IndexError, ValueError, TypeError):
        # IndexError: the page exposed fewer than two counters.
        subscriber = None

    return subscriber

# Function to extract rating
def get_rating(soup):
    """Return the rating from the ``<em id="_starScoreAverage">`` element.

    Args:
        soup: Parsed page exposing a BeautifulSoup-style ``find`` method.

    Returns:
        The element's stripped string converted to ``float``, or ``None``
        when the element is absent or its content cannot be converted.
    """
    try:
        score_node = soup.find("em", attrs={"id": "_starScoreAverage"})
        # A missing element yields None rather than an error.
        return float(score_node.string.strip()) if score_node else None
    except (AttributeError, ValueError, TypeError):
        return None

# Function to extract views count
def get_views_count(soup):
    """Return the view count from the first ``<em class="cnt">``.

    The text is stripped, thousands separators (``,``) are removed, and a
    trailing ``K``/``M``/``B`` suffix (case-insensitive) scales the number by
    1e3/1e6/1e9 respectively.

    Args:
        soup: Parsed page exposing a BeautifulSoup-style ``find_all`` method.

    Returns:
        The count as a ``float``, or ``None`` when no counter exists, the
        text is missing, or it cannot be parsed as a number.
    """
    multipliers = {"K": 1000, "M": 1000000, "B": 1000000000}
    try:
        counters = soup.find_all("em", attrs={"class": 'cnt'})
        # Index 0 is the views counter; index 1 is read by get_subscriber_count.
        text = counters[0].string.strip().replace(",", "")
        suffix = text[-1:].upper()
        if suffix in multipliers:
            views = float(text[:-1]) * multipliers[suffix]
        else:
            views = float(text)

    except (AttributeError, IndexError, ValueError, TypeError):
        # IndexError: the page exposed no counters at all.
        views = None

    return views

# Function to extract the status of manga
def get_status(soup):
    """Classify a title page as ``HIATUS``, ``ONGOING`` or ``COMPLETED``.

    Three elements are read: the text of ``<p class="day_info">``, the
    presence of ``<div class="detail_paywall">``, and the text of the first
    ``<span class="subj">``.

    Args:
        soup: Parsed page exposing a BeautifulSoup-style ``find`` method.

    Returns:
        ``"HIATUS"`` if the paywall div exists or the span text contains
        ``"Finale"``; otherwise ``"COMPLETED"`` when the day-info text is
        exactly ``"COMPLETED"``, else ``"ONGOING"``. ``"Not Available"`` is
        returned when the day-info paragraph or the span is missing.
    """
    try:
        schedule_text = soup.find("p", attrs={'class': "day_info"}).text
        paywall = soup.find("div", attrs={'class': "detail_paywall"})
        episode_label = soup.find("span", {'class': "subj"})
        # Evaluated unconditionally so a missing span always falls through
        # to the "Not Available" branch, even when the paywall div exists.
        has_finale = "Finale" in episode_label.text

        if paywall is None and not has_finale:
            status = "COMPLETED" if schedule_text == "COMPLETED" else "ONGOING"
        else:
            status = "HIATUS"

    except (AttributeError, ValueError, TypeError):
        status = "Not Available"

    return status

# Function to extract the daily-pass field (NOTE: the body currently repeats get_status's status logic)
def get_daily_pass(soup):
    """Return a status label for a title page.

    NOTE(review): despite its name, this function computes the same
    ``HIATUS`` / ``ONGOING`` / ``COMPLETED`` label as ``get_status``; no
    separate daily-pass logic exists here. Confirm the intended behaviour.

    Args:
        soup: Parsed page exposing a BeautifulSoup-style ``find`` method.

    Returns:
        ``"HIATUS"`` if ``<div class="detail_paywall">`` exists or the first
        ``<span class="subj">`` text contains ``"Finale"``; ``"ONGOING"`` if
        the ``<p class="day_info">`` text is not exactly ``"COMPLETED"``;
        otherwise ``"COMPLETED"``. ``"Not Available"`` when the paragraph or
        the span is missing.
    """
    try:
        day_info_text = soup.find("p", attrs={'class': "day_info"}).text
        paywall_div = soup.find("div", attrs={'class': "detail_paywall"})
        first_subject = soup.find("span", {'class': "subj"})
        # Read before branching: a missing span must yield "Not Available".
        finale_flag = "Finale" in first_subject.text

        if finale_flag or paywall_div is not None:
            return "HIATUS"
        if day_info_text != "COMPLETED":
            return "ONGOING"
        return "COMPLETED"

    except (AttributeError, ValueError, TypeError):
        return "Not Available"

# Function to extract manga summary
def get_synopsis(soup):
    """Return the summary text of the first ``<p class="summary">`` element.

    ``get_text`` is used instead of ``.string`` because ``.string`` is
    ``None`` whenever the paragraph has more than one child (for example
    text split by ``<br>``), which would discard a valid summary.

    Args:
        soup: Parsed page exposing a BeautifulSoup-style ``find`` method.

    Returns:
        The paragraph's text fragments, stripped and joined with single
        spaces, or ``"Not Available"`` when the element is missing or empty.
    """
    try:
        summary = soup.find("p", attrs={"class": 'summary'}).get_text(" ", strip=True)

    except (AttributeError, ValueError, TypeError):
        summary = ""

    # An empty paragraph is reported like a missing one.
    return summary or "Not Available"

In [13]:
if __name__ == '__main__':
    # Scrape the originals listing, then every linked title page, and write
    # one row per title to "webtoon_data.csv".

    # add your user agent
    HEADERS = ({'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/133.0.0.0 Safari/537.36', 'Accept-Language': 'en-US, en;q=0.5'})

    # The webpage URL
    URL = "https://www.webtoons.com/en/originals"

    # Seconds to wait on each request; without it a stalled connection hangs the cell.
    REQUEST_TIMEOUT = 30

    # HTTP Request; fail loudly instead of parsing an error page into an empty CSV.
    webpage = re.get(URL, headers=HEADERS, timeout=REQUEST_TIMEOUT)
    webpage.raise_for_status()

    # Soup Object containing all data
    soup = BeautifulSoup(webpage.content, "html.parser")

    # Initializing a dictionary to extract fields
    d = {"title_id":[], "title":[], "genre":[], "authors":[], "weekdays":[], "length":[], "subscriber":[], "rating":[], "views":[], "likes":[], "status":[], "daily_pass":[], "synopsis":[]}

    # Fetch links as List of Tag Objects and keep their targets
    links = soup.find_all("a", attrs={'class':'card_item'})
    links_list = [link.get('href') for link in links]

    # Extracting likes from the listing page; an unparsable value becomes None
    # rather than aborting the whole scrape.
    likes = []
    for like_tag in soup.find_all('em', class_='grade_num'):
        like_text = like_tag.text.strip()
        try:
            likes.append(float(like_text[:-1]) * 1000000 if like_text.endswith("M") else float(like_text))
        except ValueError:
            likes.append(None)

    # Likes are matched to links by position only. If the counts differ the
    # pairing cannot be trusted and DataFrame construction would fail, so the
    # column is blanked.
    if len(likes) != len(links_list):
        print("Found", len(likes), "like counts for", len(links_list), "links; leaving 'likes' empty.")
        likes = [None] * len(links_list)
    d['likes'] = likes

    # Column -> getter for every field read from an individual title page.
    # Every column of `d` must receive one value per link, otherwise the
    # lists end up with unequal lengths.
    page_extractors = {
        'title': get_title,
        'genre': get_genre,
        'authors': get_authors,
        'weekdays': get_weekdays,
        'length': get_length,
        'subscriber': get_subscriber_count,
        'rating': get_rating,
        'views': get_views_count,
        'status': get_status,
        'daily_pass': get_daily_pass,
        'synopsis': get_synopsis,
    }

    # Loop for extracting product details from each link
    for link in links_list:
        d['title_id'].append(get_title_id(link))

        try:
            new_webpage = re.get(link, headers=HEADERS, timeout=REQUEST_TIMEOUT)
        except re.RequestException as exc:
            # Keep the row (with empty fields) so the columns stay aligned and
            # one bad page does not discard the rest of the scrape.
            print("Could not fetch", link, "-", exc)
            for field in page_extractors:
                d[field].append(None)
            continue

        new_soup = BeautifulSoup(new_webpage.content, "html.parser")

        # Function calls to collect all necessary product information
        for field, extractor in page_extractors.items():
            d[field].append(extractor(new_soup))

    webtoon_df = pd.DataFrame.from_dict(d)
    webtoon_df.to_csv("webtoon_data.csv", header=True, index=False)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  webtoon_df['title'].replace('', np.nan, inplace=True)


In [14]:
# Preview the first rows of the scraped per-title table.
webtoon_df.head()

Unnamed: 0,title,genre,weekdays,length,subscriber,rating,views,status,synopsis
0,Love 4 a Walk,Romance,FRIDAY,48,2M,9.77,61.4M,ONGOING,Pam's heart has been broken one too many times...
1,The Remarried Empress,Fantasy,SUNDAY,204,4.4M,9.81,495M,ONGOING,Navier Ellie Trovi was an empress perfect in e...
2,Maybe Meant to Be,Romance,FRIDAY,92,2.2M,9.75,153.2M,ONGOING,"Jia Han, a 32-year-old freelancer with no work..."
3,Selfish Romance,Romance,FRIDAY,32,1.2M,9.74,19.8M,ONGOING,"Hyeondo and Yumin, an ordinary man and woman i..."
4,To Whom It No Longer Concerns,Drama,THURSDAY,34,1.4M,9.43,24.2M,ONGOING,"Exploited by her brother, betrayed by her sist..."


In [8]:
## Web scraping to get Webtoon manga author and likes info

# URL of the website you want to scrape
URL = 'https://www.webtoons.com/en/genres/drama?sortOrder=READ_COUNT'

# Headers for request
HEADERS = ({'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/133.0.0.0 Safari/537.36','Accept-Language':'en-US, en;q=0.5'})

# Seconds to wait for the server; without it a stalled connection hangs the cell.
REQUEST_TIMEOUT = 30

# HTTP Request; raise on an HTTP error instead of parsing an error page
# into an empty table.
webpage = re.get(URL, headers=HEADERS, timeout=REQUEST_TIMEOUT)
webpage.raise_for_status()

soup = BeautifulSoup(webpage.content, 'html.parser')

# The three columns are collected independently and paired by position;
# DataFrame construction raises if their lengths differ.
d = {
    "title": [p.get_text().replace("#", "").replace("’", "'").strip() for p in soup.find_all('p', class_='subj')],
    "author": [p.text for p in soup.find_all('p', class_='author')],
    "likes": [p.text for p in soup.find_all('em', class_='grade_num')],
}

webtoon_info_df = pd.DataFrame.from_dict(d)
webtoon_info_df.to_csv("webtoon_info.csv",header=True, index=False)

In [9]:
# Preview the first rows of the title/author/likes table from the listing page.
webtoon_info_df.head()

Unnamed: 0,title,author,likes
0,Love 4 a Walk,Nuria Sanguino,3.6M
1,The Remarried Empress,Alphatart / Sumpul,39.9M
2,To Whom It No Longer Concerns,LICO / fairydragon,1.8M
3,Maybe Meant to Be,honeyskein / damcho,12.4M
4,Selfish Romance,Gyogyo Park,1.4M


In [10]:
## Combine the per-title table with the author/likes table

# Left join on 'title': every row of webtoon_df is kept, and matching rows of
# webtoon_info_df contribute their columns. Column names present in both
# frames (other than 'title') receive pandas' default _x/_y suffixes.
merge_df = webtoon_df.merge(webtoon_info_df, how='left', on='title')

# Persist the merged table without the index column.
merge_df.to_csv("Webtoon_Merged.csv", index=False, header=True)

In [11]:
# Non-null value count for each column of the merged table.
merge_df.count()

title         429
genre         429
chapters      429
views         429
subscriber    429
rating        429
day_info      429
summary       429
author        429
likes         429
dtype: int64

In [12]:
# Preview the first rows of the merged table.
merge_df.head()

Unnamed: 0,title,genre,chapters,views,subscriber,rating,day_info,summary,author,likes
0,Love 4 a Walk,Romance,48,61.3M,2M,9.77,EVERY FRIDAY,Pam's heart has been broken one too many times...,Nuria Sanguino,3.6M
1,The Remarried Empress,Fantasy,204,494.9M,4.4M,9.81,EVERY SUNDAY,Navier Ellie Trovi was an empress perfect in e...,Alphatart / Sumpul,39.9M
2,To Whom It No Longer Concerns,Drama,33,24M,1.4M,9.44,EVERY THURSDAY,"Exploited by her brother, betrayed by her sist...",LICO / fairydragon,1.8M
3,Maybe Meant to Be,Romance,92,153.1M,2.2M,9.75,EVERY FRIDAY,"Jia Han, a 32-year-old freelancer with no work...",honeyskein / damcho,12.4M
4,Selfish Romance,Romance,32,19.7M,1.2M,9.74,EVERY FRIDAY,"Hyeondo and Yumin, an ordinary man and woman i...",Gyogyo Park,1.4M


In [13]:
# Summary statistics for the merged table's columns.
merge_df.describe()

Unnamed: 0,title,genre,chapters,views,subscriber,rating,day_info,summary,author,likes
count,429,429,429,429,429,429.0,429,429,429,429
unique,429,15,133,263,396,156.0,9,429,401,328
top,FAMILY MAN,Drama,7,1.2M,1.1M,9.72,COMPLETED,"Gang-ho is a laborer at the local factory, sle...",Ilkwon Ha,1.1M
freq,1,150,73,8,11,11.0,264,1,5,11
