In [92]:
import requests as re
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
from datetime import datetime

In [147]:
## Web scraping to collect Webtoons manga views, rating, genre, chapters, summary

# Function to extract title id
def get_title_id(link):
    """Return the ``title_no`` query parameter of a series URL.

    Args:
        link: URL string, e.g. ``".../list?title_no=7109"``.

    Returns:
        The ``title_no`` value as a string, or ``""`` when ``link`` is not a
        string, cannot be parsed, or carries no ``title_no`` parameter.
    """
    # Local import: the helper stays usable even if the import cell is not re-run.
    from urllib.parse import parse_qs, urlsplit

    if not isinstance(link, str):
        return ""

    try:
        # Parsing the query (rather than splitting on "?title_no=") keeps the id
        # clean when other parameters or a fragment follow it, and avoids
        # returning the entire URL when the parameter is missing or not first.
        values = parse_qs(urlsplit(link).query).get("title_no", [])
    except ValueError:
        return ""

    return values[0].strip() if values else ""
    
# Function to extract manga title
def get_title(soup):
    """Return the cleaned series title from a series detail page.

    Args:
        soup: parsed detail page exposing ``find``.

    Returns:
        Text of the ``<h1 class="subj">`` element with ``#`` removed and the
        typographic apostrophe replaced by ``'``; ``"Not Available"`` when the
        element is missing or cannot be read.
    """
    try:
        heading = soup.find('h1', class_='subj')
        title = heading.get_text(" ", strip=True)
        clean_title = title.replace("#", "").replace("’", "'").strip()

    except (AttributeError, ValueError, TypeError):
        # Spelled like the fallback used by the other scrapers so that one
        # sentinel check catches every missing field.
        clean_title = "Not Available"

    return clean_title

# Function to extract manga genre
def get_genre(soup):
    """Return the stripped text of the ``<h2 class="genre">`` element.

    Falls back to ``"Not Available"`` when the element is missing or has no
    single string child.
    """
    fallback = "Not Available"
    try:
        genre_tag = soup.find("h2", attrs={"class": 'genre'})
        label = genre_tag.string
        return label.strip()
    except (AttributeError, ValueError, TypeError):
        return fallback


# Function to extract manga authors
def get_authors(soup):
    """Return the author names on a detail page as one comma-separated string.

    Every ``<h3 class="title">`` element contributes its stripped text. An
    empty string is returned when no such element exists; ``"Not Available"``
    is returned when the page object cannot be queried.
    """
    try:
        names = []
        for heading in soup.find_all("h3", class_="title"):
            names.append(heading.text.strip())
        return ", ".join(names)
    except (AttributeError, ValueError, TypeError):
        return "Not Available"


# Function to extract the released days
def get_weekdays(soup):
    """Return the release day(s) shown in the ``<p class="day_info">`` element.

    Returns:
        * ``"Released"`` when the text is exactly ``"COMPLETED"``.
        * Otherwise the text is split once on the first space. With a second
          part, that part is split on commas and each token is mapped to a
          full day name (unknown tokens are kept as-is), joined by spaces.
          Without a space, the single token is mapped the same way.
        * ``"Not Available"`` when the element is missing or unreadable.
    """
    # Abbreviations and upper-case full names -> capitalised full names.
    full_names = {
        "MON": "Monday", "MONDAY": "Monday",
        "TUE": "Tuesday", "TUESDAY": "Tuesday",
        "WED": "Wednesday", "WEDNESDAY": "Wednesday",
        "THU": "Thursday", "THURSDAY": "Thursday",
        "FRI": "Friday", "FRIDAY": "Friday",
        "SAT": "Saturday", "SATURDAY": "Saturday",
        "SUN": "Sunday", "SUNDAY": "Sunday"
    }

    try:
        raw = soup.find("p", attrs={"class": 'day_info'}).text
        if raw == "COMPLETED":
            return "Released"

        pieces = raw.split(" ", 1)
        if len(pieces) <= 1:
            # No space in the text: treat the whole thing as one day token.
            only = pieces[0].strip()
            return full_names.get(only, only)

        resolved = []
        for token in pieces[1].strip().split(","):
            # Unknown tokens fall through unchanged (not stripped).
            resolved.append(full_names.get(token.strip(), token))
        return " ".join(resolved)

    except (AttributeError, ValueError, TypeError):
        return "Not Available"
    
# Function to extract chapters count
def get_length(soup):
    """Return the number read from the first ``<span class="tx">`` element.

    The element's string is stripped and ``#`` is removed; the result is
    converted to ``int`` only when it consists solely of digits. ``None`` is
    returned when the element or its string is missing, the text is not
    numeric, or the lookup fails.
    """
    try:
        tag = soup.find("span", attrs={"class": "tx"})
        if not (tag and tag.string):
            return None

        digits = tag.string.strip().replace("#", "")
        if digits.isdigit():
            return int(digits)
        return None
    except (AttributeError, ValueError, TypeError):
        # Missing elements or an invalid conversion both mean "unknown".
        return None

# Function to extract subscriber count
def get_subscriber_count(soup):
    """Return the subscriber count from a series detail page as an ``int``.

    The count is read from the second ``<em class="cnt">`` element. Plain
    numbers may contain thousands separators (``"902,400"``); abbreviated
    numbers may end in ``K``, ``M`` or ``B`` (``"1.2M"`` -> ``1200000``).

    Returns:
        The count, or ``None`` when the element is missing or unparsable.
    """
    # Suffix -> multiplier for abbreviated counters.
    multipliers = {"K": 1_000, "M": 1_000_000, "B": 1_000_000_000}

    try:
        counters = soup.find_all("em", attrs={"class": 'cnt'})
        text = counters[1].string.strip().replace(",", "")
        suffix = text[-1:].upper()
        if suffix in multipliers:
            # round() rather than int(): float products such as 8.2 * 1e6 can
            # land just below the intended integer and would be truncated.
            subscriber = round(float(text[:-1]) * multipliers[suffix])
        else:
            subscriber = int(text)

    # IndexError: fewer than two counters on the page.
    except (AttributeError, IndexError, ValueError, TypeError, OverflowError):
        subscriber = None

    return subscriber

# Function to extract rating
def get_rating(soup):
    """Return the value of ``<em id="_starScoreAverage">`` as a ``float``.

    ``None`` is returned when the element is absent, has no string, or its
    text is not a valid number.
    """
    try:
        score_tag = soup.find("em", attrs={"id": '_starScoreAverage'})
        return float(score_tag.string.strip()) if score_tag else None
    except (AttributeError, ValueError, TypeError):
        return None

# Function to extract views count
def get_views_count(soup):
    """Return the view count from a series detail page as an ``int``.

    The count is read from the first ``<em class="cnt">`` element. Plain
    numbers may contain thousands separators (``"62,311"``); abbreviated
    numbers may end in ``K``, ``M`` or ``B`` (``"24.8M"`` -> ``24800000``).

    Returns:
        The count, or ``None`` when the element is missing or unparsable.
    """
    # Suffix -> multiplier for abbreviated counters.
    multipliers = {"K": 1_000, "M": 1_000_000, "B": 1_000_000_000}

    try:
        counters = soup.find_all("em", attrs={"class": 'cnt'})
        text = counters[0].string.strip().replace(",", "")
        suffix = text[-1:].upper()
        if suffix in multipliers:
            # round() rather than int(): float products can land just below
            # the intended integer and would be truncated.
            views = round(float(text[:-1]) * multipliers[suffix])
        else:
            views = int(text)

    # IndexError: no counter elements on the page.
    except (AttributeError, IndexError, ValueError, TypeError, OverflowError):
        views = None

    return views

# Function to extract the likes count
def get_likes_count(main_soup,manga_link):
    """Return the like count shown on a series card of the listing page.

    Args:
        main_soup: parsed listing page.
        manga_link: ``href`` of the series card to look up.

    Returns:
        The count from the card's ``<em class="grade_num">`` element as an
        ``int`` (``K``/``M``/``B`` suffixes and thousands separators are
        expanded), or ``None`` when the card or counter is missing or the
        text cannot be parsed.
    """
    # Suffix -> multiplier for abbreviated counters.
    multipliers = {"K": 1_000, "M": 1_000_000, "B": 1_000_000_000}

    try:
        manga_block = main_soup.find("a", href=manga_link)
        text = manga_block.find('em', {'class':'grade_num'}).text.strip().replace(",", "")
        suffix = text[-1:].upper()
        if suffix in multipliers:
            # round() rather than int(): float products can land just below
            # the intended integer and would be truncated.
            likes = round(float(text[:-1]) * multipliers[suffix])
        else:
            likes = int(text)

    except (AttributeError, ValueError, TypeError, OverflowError):
        # The result variable itself must be reset here; otherwise a failed
        # lookup leaves `likes` unbound (or still holding the raw text).
        likes = None

    return likes

# Function to extract the status of manga
def get_status(main_soup,manga_link):
    """Return the publication status of a series from the listing page.

    Args:
        main_soup: parsed listing page.
        manga_link: ``href`` of the series card to look up.

    Returns:
        * ``"HIATUS"`` when the card's ``<p class="icon_area">`` text contains
          ``"HIATUS"``;
        * ``"ONGOING"`` / ``"COMPLETED"`` when the card's enclosing
          ``<div class="daily_lst">`` holds an ``<h2>`` with that id;
        * ``"Not Available"`` when the card is not found, no rule matches, or
          the lookup fails.
    """
    # Bound up front so every path has a value to return.
    status = "Not Available"

    try:
        manga_block = main_soup.find("a", href=manga_link)
        if manga_block is None:
            return status

        # The hiatus badge sits on the card itself (listing page).
        status_div = manga_block.find('p', {'class':'icon_area'})
        if status_div is not None and "HIATUS" in status_div.get_text():
            return "HIATUS"

        # Otherwise classify by the section of the listing page holding the card.
        section = manga_block.find_parent('div', class_='daily_lst')
        if section.find("h2", {'id':'ongoing'}) is not None:
            status = "ONGOING"
        elif section.find("h2", {'id':'completed'}) is not None:
            status = "COMPLETED"

    except (AttributeError, ValueError, TypeError):
        status = "Not Available"

    return status

# Function to extract the daily pass flag of manga
def get_daily_pass(soup):
    """Report whether a detail page shows the "every day" app banner.

    Args:
        soup: parsed series detail page.

    Returns:
        ``True`` when the ``<div class="detail_install_app">`` element exists
        and its text contains ``"every day"``, ``False`` otherwise, and
        ``"Not Available"`` when the page object cannot be queried.
    """
    try:
        # Query the page passed in, not a notebook-level global: the result
        # must not depend on whichever page the scraping loop parsed last.
        banner = soup.find("div",{'class':"detail_install_app"})
        daily_pass = banner is not None and "every day" in banner.get_text()

    except (AttributeError, ValueError, TypeError):
        daily_pass = "Not Available"

    return daily_pass

# Function to extract manga summary
def get_synopsis(soup):
    """Return the stripped text of the ``<p class="summary">`` element.

    Returns:
        The summary string, or ``"Not Available"`` when the element is
        missing or has no single string child.
    """
    try:
        summary = soup.find("p", attrs={"class":'summary'}).string.strip()

    except (AttributeError, ValueError, TypeError):
        # Spelled like the fallback used by the other scrapers so that one
        # sentinel check catches every missing field.
        summary = "Not Available"

    return summary

In [148]:
if __name__ == '__main__':
    # Scrape every series linked from the listing page and save one CSV row
    # per link. NOTE: `re` in this notebook is the `requests` alias, not the
    # standard-library regex module.

    # Browser-like request headers; replace the User-Agent with your own.
    HEADERS = ({'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/133.0.0.0 Safari/537.36', 'Accept-Language': 'en-US, en;q=0.5'})

    # Listing page that links to every series
    URL = "https://www.webtoons.com/en/originals"

    # Seconds to wait for each response. requests applies no timeout unless
    # one is given, so a stalled connection would otherwise hang the run.
    REQUEST_TIMEOUT = 30

    # Fetch the listing page; fail loudly on an HTTP error instead of parsing
    # an error page and silently writing an empty CSV.
    webpage = re.get(URL, headers=HEADERS, timeout=REQUEST_TIMEOUT)
    webpage.raise_for_status()

    # Soup object for the listing page (also used for likes/status lookups)
    soup = BeautifulSoup(webpage.content, "html.parser")

    # Column name -> list of values; the lists grow in lock-step, one entry per link
    d = {"title_id":[], "title":[], "genre":[], "authors":[], "weekdays":[], "length":[], "subscriber":[], "rating":[], "views":[], "likes":[], "status":[], "daily_pass":[], "synopsis":[]}

    # Series cards on the listing page
    links = soup.find_all("a", attrs={'class':'daily_card_item'})

    # Keep only cards that actually carry an href (a None URL cannot be requested)
    links_list = [card.get('href') for card in links if card.get('href')]

    # Visit every series page and collect its fields
    for link in links_list:
        try:
            new_webpage = re.get(link, headers=HEADERS, timeout=REQUEST_TIMEOUT)
        except re.RequestException as exc:
            # Skip this link before anything is appended so every column list
            # keeps the same length; one bad request must not lose the run.
            print(f"Skipping {link}: {exc}")
            continue

        new_soup = BeautifulSoup(new_webpage.content, "html.parser")

        d['title_id'].append(get_title_id(link))
        d['title'].append(get_title(new_soup))
        d['genre'].append(get_genre(new_soup))
        d['authors'].append(get_authors(new_soup))
        d['weekdays'].append(get_weekdays(new_soup))
        d['length'].append(get_length(new_soup))
        d['subscriber'].append(get_subscriber_count(new_soup))
        d['rating'].append(get_rating(new_soup))
        d['views'].append(get_views_count(new_soup))
        # Likes and status are read from the listing page, keyed by the link
        d['likes'].append(get_likes_count(soup,link))
        d['status'].append(get_status(soup,link))
        d['daily_pass'].append(get_daily_pass(new_soup))
        d['synopsis'].append(get_synopsis(new_soup))


    webtoon_df = pd.DataFrame.from_dict(d)
    filename = "webtoon_data"  # Replace with your desired filename
    today_date = datetime.today().strftime('%Y-%m-%d')  # Format today's date as YYYY-MM-DD
    final_filename = f"{filename}_{today_date}.csv"  # Create final filename
    webtoon_df.to_csv(final_filename, header=True, index=False)

In [149]:
# Preview the first five scraped rows.
webtoon_df.head()

Unnamed: 0,title_id,title,genre,authors,weekdays,length,subscriber,rating,views,likes,status,daily_pass,synopsis
0,7109,Duchess in Ruins,Romance,"Candlebambi, Saedle, Lemon Frog",Monday,16,446741,9.21,6000000,580314,ONGOING,False,Edele Lancaster is the perfect noble lady who ...
1,6107,The Reborn Young Lord is an Assassin,Fantasy,"swingbat, CoffeeLime",Monday,52,902400,9.51,24800000,1700000,ONGOING,False,"Cyan Vert, illegitimate son of the Duke and th..."
2,7121,A Savage Proposal,Romance,"Lee yuna, team IRUKA",Monday,16,611079,7.87,6800000,379339,ONGOING,False,When the ruthless Lord Tiwakan seizes the king...
3,4886,I'm the Queen in This Life,Fantasy,"Themis, Omin, Lefaljinf",Monday,121,1200000,9.57,92700000,8500000,ONGOING,False,The Etruscan Kingdom is stained with blood whe...
4,7169,I Grabbed the Leash of the Blind Beast,Romance,"The greedy gatsby, PPANG, Kayeh",Monday,18,223629,8.72,4200000,329268,ONGOING,False,When an average twenty-something wakes up in t...


In [199]:
# Inspect 10 randomly chosen rows.
# NOTE(review): no random_state is passed, so the rows shown differ on every run.
webtoon_df.sample(10)

Unnamed: 0,title_id,title,genre,authors,weekdays,length,subscriber,rating,views,likes,status,daily_pass,synopsis
87,3476,Mage Again,Fantasy,"Myoung rang, Sulmo",Monday,104,169765,9.73,5700000,789418,HIATUS,False,"Rangderkessel, the greatest mage of the land, ..."
651,2574,Return to Player,Action,"UMKY, SEHON, INDEX",Sunday,191,820086,9.64,71100000,6100000,ONGOING,False,"Ten years ago, a group of gods turned everyone..."
823,2575,Dungeons & Artifacts,Action,"Zerowater, UMKY",Released,7,601980,9.52,46400000,4500000,COMPLETED,True,A dungeon-guiding gig goes sideways for explor...
1041,3795,Counting Sheep,Horror,A.Rasen,Released,5,90294,9.23,3400000,426153,COMPLETED,True,"A recurring dream haunts Caleb every night, bu..."
903,5435,The Guy with Pretty Lips,Romance,"Godot, Flowbee",Released,10,144765,9.01,6200000,595350,COMPLETED,True,"Roha, Hanbi’s younger brother’s friend and als..."
440,6528,I Didn't Sign Up to be a Nanny!,Fantasy,"Eun You, Kong Ja, Honey Bambi",Friday,39,172791,9.45,6000000,447575,ONGOING,False,While working the regular shift in a supermark...
240,2727,Lady Knight,Fantasy,Lion Illustration,Wednesday,59,339619,9.72,11700000,1800000,ONGOING,False,Aurora is a young orphan who lives in a peacef...
1269,3246,To Be Ordinary!,Drama,kkanaria,Released,100,68443,9.33,3400000,334719,COMPLETED,False,"Due to his prosopagnosia, Sang struggles with ..."
835,3176,Hell is Other People,Thriller,Yong-Ki Kim,Released,12,207662,9.57,8400000,852750,COMPLETED,True,Now a major adaptation on streaming.\nJongu Yu...
83,2964,Morgana and Oz,Fantasy,Miyuli,Monday,78,1500000,9.81,82300000,8600000,HIATUS,False,What happens when a struggling witch meets an ...


In [200]:
# (rows, columns) of the scraped frame.
webtoon_df.shape

(1492, 13)

In [201]:
# Column dtypes and non-null counts.
webtoon_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1492 entries, 0 to 1491
Data columns (total 13 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   title_id    1492 non-null   object 
 1   title       1492 non-null   object 
 2   genre       1492 non-null   object 
 3   authors     1492 non-null   object 
 4   weekdays    1492 non-null   object 
 5   length      1492 non-null   int64  
 6   subscriber  1492 non-null   int64  
 7   rating      1492 non-null   float64
 8   views       1492 non-null   int64  
 9   likes       1492 non-null   int64  
 10  status      1492 non-null   object 
 11  daily_pass  1492 non-null   bool   
 12  synopsis    1492 non-null   object 
dtypes: bool(1), float64(1), int64(4), object(7)
memory usage: 141.5+ KB


In [202]:
# Summary statistics for the numeric columns.
webtoon_df.describe()

Unnamed: 0,length,subscriber,rating,views,likes
count,1492.0,1492.0,1492.0,1492.0,1492.0
mean,68.351877,396670.2,9.211823,35461190.0,2802337.0
std,122.800379,627239.8,0.71419,105917100.0,7043593.0
min,1.0,6312.0,3.87,62311.0,5208.0
25%,8.0,90519.25,9.07,2200000.0,222089.0
50%,30.0,187902.0,9.41,6600000.0,649303.5
75%,82.0,419153.8,9.66,23450000.0,2200000.0
max,1410.0,7500000.0,9.93,1400000000.0,74400000.0


In [203]:
# Count, per column, the cells holding the scrapers' "missing value" sentinel.
# The misspelled "Not Avaliable" is checked too: it has been used as a fallback
# string and may be present in scraped data or previously saved CSVs.
webtoon_df.isin(["Not Available", "Not Avaliable"]).sum()

title_id      0
title         0
genre         0
authors       0
weekdays      0
length        0
subscriber    0
rating        0
views         0
likes         0
status        0
daily_pass    0
synopsis      0
dtype: int64

In [204]:
# Count missing (NaN/None) values per column.
webtoon_df.isna().sum()

title_id      0
title         0
genre         0
authors       0
weekdays      0
length        0
subscriber    0
rating        0
views         0
likes         0
status        0
daily_pass    0
synopsis      0
dtype: int64

In [205]:
# Number of distinct values per column.
webtoon_df.nunique()

title_id      1464
title         1464
genre           16
authors       1318
weekdays        16
length         251
subscriber    1357
rating         246
views          647
likes         1013
status           3
daily_pass       2
synopsis      1463
dtype: int64

In [218]:
# Count rows whose title_id repeats an earlier row.
webtoon_df['title_id'].duplicated().sum()

np.int64(28)

In [213]:
# Keep the first row for each title_id; the original index labels are retained.
df_unique = webtoon_df.drop_duplicates(subset='title_id', keep='first')

In [221]:
# Sanity check: no title_id should repeat after de-duplication (expect 0).
df_unique['title_id'].duplicated().sum()

np.int64(0)

In [222]:
# (rows, columns) after de-duplication.
df_unique.shape

(1464, 13)

In [223]:
# Preview the first five de-duplicated rows.
df_unique.head()

Unnamed: 0,title_id,title,genre,authors,weekdays,length,subscriber,rating,views,likes,status,daily_pass,synopsis
0,7109,Duchess in Ruins,Romance,"Candlebambi, Saedle, Lemon Frog",Monday,16,446741,9.21,6000000,580314,ONGOING,False,Edele Lancaster is the perfect noble lady who ...
1,6107,The Reborn Young Lord is an Assassin,Fantasy,"swingbat, CoffeeLime",Monday,52,902400,9.51,24800000,1700000,ONGOING,False,"Cyan Vert, illegitimate son of the Duke and th..."
2,7121,A Savage Proposal,Romance,"Lee yuna, team IRUKA",Monday,16,611079,7.87,6800000,379339,ONGOING,False,When the ruthless Lord Tiwakan seizes the king...
3,4886,I'm the Queen in This Life,Fantasy,"Themis, Omin, Lefaljinf",Monday,121,1200000,9.57,92700000,8500000,ONGOING,False,The Etruscan Kingdom is stained with blood whe...
4,7169,I Grabbed the Leash of the Blind Beast,Romance,"The greedy gatsby, PPANG, Kayeh",Monday,18,223629,8.72,4200000,329268,ONGOING,False,When an average twenty-something wakes up in t...


In [217]:
# Number of distinct values per column after de-duplication.
df_unique.nunique()

title_id      1464
title         1464
genre           16
authors       1318
weekdays        16
length         251
subscriber    1355
rating         246
views          646
likes         1013
status           3
daily_pass       2
synopsis      1463
dtype: int64

In [232]:
# Rows whose synopsis repeats an earlier row's synopsis. duplicated() defaults
# to keep='first', so the first occurrence itself is not listed here.
df_unique[df_unique['synopsis'].duplicated()]

Unnamed: 0,title_id,title,genre,authors,weekdays,length,subscriber,rating,views,likes,status,daily_pass,synopsis
1368,507,LESSA 2 - The Crimson Knight,Action,POGO,Released,110,180514,9.79,16300000,906278,COMPLETED,False,There are attacks everyday and people suddenly...


In [238]:
# Synopsis of the series whose title_id is '507' (ids are stored as strings).
df_unique.loc[df_unique['title_id'] == '507', 'synopsis']


1368    There are attacks everyday and people suddenly...
Name: synopsis, dtype: object

In [239]:
# Select every row whose synopsis begins with the duplicated opening sentence;
# na=False makes missing synopses count as "no match".
df_filtered = df_unique.loc[
    df_unique['synopsis'].str.startswith("There are attacks everyday", na=False)
]


In [240]:
# Display the rows selected by the synopsis filter.
df_filtered

Unnamed: 0,title_id,title,genre,authors,weekdays,length,subscriber,rating,views,likes,status,daily_pass,synopsis
1325,89,LESSA,Action,POGO,Released,58,228493,9.73,12400000,616230,COMPLETED,False,There are attacks everyday and people suddenly...
1368,507,LESSA 2 - The Crimson Knight,Action,POGO,Released,110,180514,9.79,16300000,906278,COMPLETED,False,There are attacks everyday and people suddenly...


In [241]:
# Write the de-duplicated frame to "<filename>_<YYYY-MM-DD>.csv" in the working directory.
filename = "webtoon_data_cleaned"  # base name of the output file; change as needed
today_date = f"{datetime.today():%Y-%m-%d}"  # today's date as YYYY-MM-DD
final_filename = "{}_{}.csv".format(filename, today_date)
df_unique.to_csv(final_filename, index=False, header=True)