In [72]:
import requests as re
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
from datetime import datetime


In [81]:
## Web scraping to get each Webtoon manga's views, rating, genre, chapters, and summary

# Function to extract title id
def get_title_id(link):
    """Return the numeric ``title_no`` value found at the end of a link.

    Args:
        link: URL string expected to end with ``?title_no=<digits>``.

    Returns:
        The id as an ``int``. An empty string is returned when ``link`` is
        not a string or the text after the marker is not an integer.
    """
    try:
        # Text after the last "?title_no=" marker (the whole link if absent).
        _, _, raw_id = link.rpartition("?title_no=")
        return int(raw_id)
    except (AttributeError, ValueError, TypeError):
        return ""

def get_released_date(soup):
    """Return the date shown in the last ``<span class="date">`` of a page.

    The last date span is treated as the first episode's release date.
    # NOTE(review): this assumes episodes are listed newest-first on the
    # page that was fetched -- confirm against the site layout.

    Args:
        soup: Parsed title page (BeautifulSoup object).

    Returns:
        A ``datetime.date`` parsed from text such as "Feb 25, 2024", or
        ``None`` when no date span exists or the text cannot be parsed.
    """
    try:
        date_spans = soup.find_all("span", {'class': 'date'})
        if not date_spans:
            return None  # Handle missing data

        oldest_label = date_spans[-1].text.strip()
        # Parse "Mon DD, YYYY" and keep only the calendar date part.
        return datetime.strptime(oldest_label, "%b %d, %Y").date()
    except (AttributeError, ValueError, TypeError):
        return None
    
# Function to extract manga title
def get_title(soup):
    """Return the cleaned title text of a title page.

    The text of ``<h1 class="subj">`` is read with single-space separators,
    "#" characters are removed and typographic apostrophes are replaced by
    plain ones.

    Args:
        soup: Parsed title page (BeautifulSoup object).

    Returns:
        The cleaned title, or "Not Available" when the heading is missing.
    """
    try:
        heading = soup.find('h1', class_='subj')
        title = heading.get_text(" ", strip=True)
        clean_title = title.replace("#", "").replace("’", "'").strip()

    except (AttributeError, ValueError, TypeError):
        # Same sentinel spelling as the other extractors, so that a single
        # isin(["Not Available"]) check finds every failed field.
        clean_title = "Not Available"

    return clean_title

# Function to extract manga genre
def get_genre(soup):
    """Return the stripped text of the page's ``<h2 class="genre">``.

    Args:
        soup: Parsed title page (BeautifulSoup object).

    Returns:
        The genre string, or "Not Available" when the element (or its
        single string child) is missing.
    """
    try:
        genre_tag = soup.find("h2", attrs={"class": 'genre'})
        return genre_tag.string.strip()
    except (AttributeError, ValueError, TypeError):
        return "Not Available"


# Function to extract manga authors
def get_authors(soup):
    """Return the page's author names joined by ", ".

    Names are the stripped texts of every ``<h3 class="title">`` element,
    in document order.

    Args:
        soup: Parsed title page (BeautifulSoup object).

    Returns:
        A comma-separated string (empty when no element matches), or
        "Not Available" when the lookup itself fails.
    """
    try:
        names = []
        for heading in soup.find_all("h3", class_="title"):
            names.append(heading.text.strip())
        return ", ".join(names)
    except (AttributeError, ValueError, TypeError):
        return "Not Available"


# Function to extract the released days
def get_weekdays(soup):
    """Return the release day(s) read from ``<p class="day_info">``.

    The text is split once on the first space. With two parts, the second
    part is split on commas and each piece is mapped to a full day name;
    with one part, that single word is mapped. Unknown words pass through
    unchanged.

    Args:
        soup: Parsed title page (BeautifulSoup object).

    Returns:
        Space-separated full day names, "Released" when the text is exactly
        "COMPLETED", or "Not Available" when the element is missing.
    """
    # Both the three-letter abbreviation and the upper-case full name map
    # to the title-case full name.
    day_mapping = {}
    for full_name in ("Monday", "Tuesday", "Wednesday", "Thursday",
                      "Friday", "Saturday", "Sunday"):
        day_mapping[full_name[:3].upper()] = full_name
        day_mapping[full_name.upper()] = full_name

    try:
        schedule = soup.find("p", attrs={"class": 'day_info'}).text
        if schedule == "COMPLETED":
            return "Released"

        parts = schedule.split(" ", 1)
        if len(parts) == 1:
            # No space in the text: treat it as a single day name.
            only_word = parts[0].strip()
            return day_mapping.get(only_word, only_word)

        mapped = []
        for token in parts[1].strip().split(","):
            # An unknown token is kept exactly as split (not stripped).
            mapped.append(day_mapping.get(token.strip(), token))
        return " ".join(mapped)

    except (AttributeError, ValueError, TypeError):
        return "Not Available"
    
# Function to extract chapters count
def get_length(soup):
    """Return the number shown in the first ``<span class="tx">``.

    The span's string is stripped, "#" characters are removed, and the
    remainder is converted to ``int`` when it consists only of digits.
    # NOTE(review): presumably this is the latest episode number, used as
    # the chapter count -- confirm against the page markup.

    Args:
        soup: Parsed title page (BeautifulSoup object).

    Returns:
        The integer value, or ``None`` when the span is missing, has no
        single string, or does not contain a plain number.
    """
    try:
        episode_tag = soup.find("span", attrs={"class": "tx"})
        if not (episode_tag and episode_tag.string):
            return None

        digits = episode_tag.string.strip().replace("#", "")
        if digits.isdigit():
            return int(digits)
        return None
    except (AttributeError, ValueError, TypeError):
        return None  # Handle missing elements and invalid conversion

# Function to extract subscriber count
def get_subscriber_count(soup):
    """Return the subscriber count of a title page as an integer.

    The value is read from the second ``<em class="cnt">`` element. A
    trailing "M" or "B" multiplies the number by one million or one
    billion; otherwise thousands separators are removed and the text is
    parsed directly.

    Args:
        soup: Parsed title page (BeautifulSoup object).

    Returns:
        The count as ``int``, or ``None`` when the element is missing
        (including pages with fewer than two counters) or unparsable.
    """
    multipliers = {"M": 1_000_000, "B": 1_000_000_000}
    try:
        counters = soup.find_all("em", attrs={"class": 'cnt'})
        raw = counters[1].string.strip()
        suffix = raw[-1:]
        if suffix in multipliers:
            # round() instead of truncation so a float product that lands
            # just below the integer is not reported one too low.
            subscriber = int(round(float(raw[:-1]) * multipliers[suffix]))
        else:
            subscriber = int(raw.replace(',', ''))

    # IndexError: the page exposes fewer than two counters.
    except (AttributeError, IndexError, ValueError, TypeError):
        subscriber = None

    return subscriber

# Function to extract rating
def get_rating(soup):
    """Return the star rating shown in ``<em id="_starScoreAverage">``.

    Args:
        soup: Parsed title page (BeautifulSoup object).

    Returns:
        The rating as ``float``, or ``None`` when the element is missing
        or its text is not a number.
    """
    try:
        score_tag = soup.find("em", attrs={"id": '_starScoreAverage'})
        # A missing element yields None rather than an exception.
        return float(score_tag.string.strip()) if score_tag else None
    except (AttributeError, ValueError, TypeError):
        return None

# Function to extract views count
def get_views_count(soup):
    """Return the view count of a title page as an integer.

    The value is read from the first ``<em class="cnt">`` element. A
    trailing "M" or "B" multiplies the number by one million or one
    billion; otherwise thousands separators are removed and the text is
    parsed directly.

    Args:
        soup: Parsed title page (BeautifulSoup object).

    Returns:
        The count as ``int``, or ``None`` when the page has no counter
        element or its text cannot be parsed.
    """
    multipliers = {"M": 1_000_000, "B": 1_000_000_000}
    try:
        counters = soup.find_all("em", attrs={"class": 'cnt'})
        raw = counters[0].string.strip()
        suffix = raw[-1:]
        if suffix in multipliers:
            # round() instead of truncation so a float product that lands
            # just below the integer is not reported one too low.
            views = int(round(float(raw[:-1]) * multipliers[suffix]))
        else:
            views = int(raw.replace(',', ''))

    # IndexError: the page exposes no counter element at all.
    except (AttributeError, IndexError, ValueError, TypeError):
        views = None

    return views

# Function to extract the likes count
def get_likes_count(main_soup,manga_link):
    """Return the like count shown on a title's card of the listing page.

    The card is the ``<a>`` element whose ``href`` equals ``manga_link``;
    the count is the text of its ``<em class="grade_num">`` child. A
    trailing "M" or "B" multiplies the number by one million or one
    billion; otherwise thousands separators are removed.

    Args:
        main_soup: Parsed listing page (BeautifulSoup object).
        manga_link: The title's URL exactly as it appears in the card href.

    Returns:
        The count as ``int``, or ``None`` when the card or counter is
        missing or the text cannot be parsed.
    """
    multipliers = {"M": 1_000_000, "B": 1_000_000_000}
    try:
        manga_block = main_soup.find("a", href=manga_link)
        raw = manga_block.find('em', {'class': 'grade_num'}).text.strip()
        suffix = raw[-1:]
        if suffix in multipliers:
            likes = int(round(float(raw[:-1]) * multipliers[suffix]))
        else:
            likes = int(raw.replace(',', ''))

    except (AttributeError, ValueError, TypeError):
        # Assign the variable that is actually returned; the previous code
        # set an unrelated name and then failed with UnboundLocalError.
        likes = None

    return likes

# Function to extract the status of manga
def get_status(main_soup,manga_link):
    """Return the publication status of a title from the listing page.

    The title's card is the ``<a>`` element whose ``href`` equals
    ``manga_link``. The status is decided in this order:

    1. "HIATUS" when the card's ``<p class="icon_area">`` text contains
       "HIATUS".
    2. "ONGOING" when the enclosing ``<div class="daily_lst">`` contains
       ``<h2 id="ongoing">``.
    3. "COMPLETED" when that div contains ``<h2 id="completed">``.

    Args:
        main_soup: Parsed listing page (BeautifulSoup object).
        manga_link: The title's URL exactly as it appears in the card href.

    Returns:
        One of "HIATUS", "ONGOING", "COMPLETED", or "Not Available" when
        the card cannot be found or none of the markers is present.
    """
    # Default result, so every path returns a defined value.
    status = "Not Available"
    try:
        manga_block = main_soup.find("a", href=manga_link)
        if manga_block is None:
            return status

        # The hiatus badge lives on the card itself, so check it before the
        # section lookup, which can fail independently.
        status_div = manga_block.find('p', {'class':'icon_area'})
        if status_div and "HIATUS" in status_div.get_text():
            return "HIATUS"

        # Otherwise classify by the section heading of the enclosing list.
        section = manga_block.find_parent('div', class_='daily_lst')
        if section.find("h2", {'id':'ongoing'}):
            return "ONGOING"
        if section.find("h2", {'id':'completed'}):
            return "COMPLETED"

    except (AttributeError, ValueError, TypeError):
        status = "Not Available"

    return status

# Function to extract the daily pass flag of manga
def get_daily_pass(soup):
    """Return whether the title page shows the "every day" app notice.

    The flag is "True" when ``<div class="detail_install_app">`` exists and
    its text contains "every day".
    # NOTE(review): presumably this notice marks Daily Pass titles --
    # confirm against the site.

    Args:
        soup: Parsed title page (BeautifulSoup object).

    Returns:
        The string "True" or "False" (strings, not booleans), or
        "Not Available" when the lookup itself fails.
    """
    try:
        # Read from the page passed in, not from a notebook-level global.
        daily_pass = soup.find("div",{'class':"detail_install_app"})
        if daily_pass and "every day" in daily_pass.get_text():
            daily_pass = "True"
        else:
            daily_pass = "False"

    except (AttributeError, ValueError, TypeError):
        daily_pass = "Not Available"

    return daily_pass

# Function to extract manga summary
def get_synopsis(soup):
    """Return the stripped text of the page's ``<p class="summary">``.

    Args:
        soup: Parsed title page (BeautifulSoup object).

    Returns:
        The summary string, or "Not Available" when the element (or its
        single string child) is missing.
    """
    try:
        summary = soup.find("p", attrs={"class":'summary'}).string.strip()

    except (AttributeError, ValueError, TypeError):
        # Same sentinel spelling as the other extractors, so that a single
        # isin(["Not Available"]) check finds every failed field.
        summary = "Not Available"

    return summary

In [None]:
if __name__ == '__main__':
    # Scrape the listing page, then every linked title page, and write the
    # combined records to "webtoon_data_<YYYY-MM-DD>.csv".
    # Note: `re` is this notebook's alias for the `requests` package.

    # add your user agent
    HEADERS = {'User-Agent':'Your web browser agent', 'Accept-Language': 'en-US, en;q=0.5'}

    # The webpage URL
    URL = "https://www.webtoons.com/en/originals"

    # Seconds to wait for a single HTTP response before giving up, so one
    # stalled connection cannot hang the whole run.
    REQUEST_TIMEOUT = 30

    # Output columns, in the order they are written to the CSV
    COLUMNS = ["title_id", "released_date", "title", "genre", "authors", "weekdays", "length",
               "subscriber", "rating", "views", "likes", "status", "daily_pass", "synopsis"]

    # One dict per scraped title
    records = []

    # A session reuses the underlying connection across all page requests.
    with re.Session() as session:
        session.headers.update(HEADERS)

        # HTTP Request for the listing page; fail loudly if it is unavailable,
        # because nothing else can be scraped without it.
        webpage = session.get(URL, timeout=REQUEST_TIMEOUT)
        webpage.raise_for_status()

        # Soup Object containing all data
        soup = BeautifulSoup(webpage.content, "html.parser")

        # Fetch links as List of Tag Objects
        links = soup.find_all("a", attrs={'class':'daily_card_item'})

        # Store the links, skipping cards that carry no href
        links_list = [link.get('href') for link in links if link.get('href')]

        # Loop for extracting product details from each link
        for link in links_list:
            try:
                new_webpage = session.get(link, timeout=REQUEST_TIMEOUT)
                new_webpage.raise_for_status()
                page_content = new_webpage.content
            except re.RequestException as exc:
                # Keep the row: with an empty page every extractor falls
                # back to its "missing" value instead of aborting the run.
                print(f"Warning: could not fetch {link}: {exc}")
                page_content = b""

            new_soup = BeautifulSoup(page_content, "html.parser")

            # Function calls to collect all necessary product information.
            # Likes and status come from the listing page (`soup`); every
            # other field comes from the title page (`new_soup`).
            records.append({
                "title_id": get_title_id(link),
                "released_date": get_released_date(new_soup),
                "title": get_title(new_soup),
                "genre": get_genre(new_soup),
                "authors": get_authors(new_soup),
                "weekdays": get_weekdays(new_soup),
                "length": get_length(new_soup),
                "subscriber": get_subscriber_count(new_soup),
                "rating": get_rating(new_soup),
                "views": get_views_count(new_soup),
                "likes": get_likes_count(soup, link),
                "status": get_status(soup, link),
                "daily_pass": get_daily_pass(new_soup),
                "synopsis": get_synopsis(new_soup),
            })

    # Build the frame once from all records; `columns` fixes the column
    # order and keeps the header even when nothing was scraped.
    webtoon_df = pd.DataFrame(records, columns=COLUMNS)
    filename = "webtoon_data"  # Replace with your desired filename
    today_date = datetime.today().strftime('%Y-%m-%d')  # Format today's date as YYYY-MM-DD
    final_filename = f"{filename}_{today_date}.csv"  # Create final filename
    webtoon_df.to_csv(final_filename, header=True, index=False)

In [83]:
# Preview the first rows of the scraped DataFrame.
webtoon_df.head()

Unnamed: 0,title_id,released_date,title,genre,authors,weekdays,length,subscriber,rating,views,likes,status,daily_pass,synopsis
0,7109,2024-12-22,Duchess in Ruins,Romance,"Candlebambi, Saedle, Lemon Frog",Monday,17,449713,9.21,6200000,601758,ONGOING,False,Edele Lancaster is the perfect noble lady who ...
1,6107,2024-12-22,The Reborn Young Lord is an Assassin,Fantasy,"swingbat, CoffeeLime",Monday,53,905522,9.51,25200000,1700000,ONGOING,False,"Cyan Vert, illegitimate son of the Duke and th..."
2,7121,2024-12-22,A Savage Proposal,Romance,"Lee yuna, team IRUKA",Monday,17,614151,7.89,7000000,392249,ONGOING,False,When the ruthless Lord Tiwakan seizes the king...
3,4886,2024-12-22,I'm the Queen in This Life,Fantasy,"Themis, Omin, Lefaljinf",Monday,122,1200000,9.57,92900000,8500000,ONGOING,False,The Etruscan Kingdom is stained with blood whe...
4,6756,2024-12-22,The Perfect Contract,Romance,"TTUSOL, TAEZA, Kim Sohee",Monday,29,306547,9.53,7700000,568944,ONGOING,False,"Junwon, a perfect man, and Ju-i, a woman who k..."


In [84]:
# Inspect 10 randomly chosen rows (no random_state is passed, so the
# selection differs between runs).
webtoon_df.sample(10)

Unnamed: 0,title_id,released_date,title,genre,authors,weekdays,length,subscriber,rating,views,likes,status,daily_pass,synopsis
999,3440,2021-10-14,Chasing Tails,Mystery,BASHI,Released,10,319300,9.73,15300000,1900000,COMPLETED,True,After nine college students are trapped under ...
1488,560,2016-01-28,Cyko-KO,Superhero,Robert Feldman,Released,26,8334,7.38,261238,9559,COMPLETED,False,With Earth rapidly deteriorating from pollutio...
1184,1823,2019-12-24,SOLEIL,Fantasy,YenKin,Released,14,238911,9.48,8500000,1300000,COMPLETED,True,"Ever since her parent's funeral, Florette seem..."
922,2647,2021-05-16,Nevertheless,Romance,Jeongseo,Released,4,575279,9.21,16500000,1300000,COMPLETED,True,Now a major adaptation on streaming.\nWhat hap...
961,4676,2022-10-30,Survival Diary,Thriller,SSurplus man,Released,7,124543,9.77,4000000,515267,COMPLETED,True,"Ghastly, red-eyed Lynn is a zombie… but unlike..."
506,4523,2024-07-04,SHOGUNNED,Action,Honesty O.,Friday,22,91460,9.66,1400000,134293,HIATUS,False,Teiko’s fearless motivation for cold hard cash...
386,7361,2025-01-29,Hellbound 2: The Resurrected,Thriller,"Yeon Sangho, Choi Gyuseok",Thursday,8,17473,9.24,63975,5426,ONGOING,False,Five years have passed since a blameless infan...
983,2506,2021-03-13,Taste of Illness,Drama,Ilkwon Ha,Released,3,172913,9.82,3700000,602803,COMPLETED,True,"Not having any friends is bad enough, but bein..."
992,1360,2018-04-19,Refund High School,Fantasy,LICO,Released,8,1300000,9.75,135900000,12100000,COMPLETED,True,High-schooler Aru's dream of becoming an idol ...
507,4524,2024-03-14,Dark Water,Supernatural,Punkcoa,Friday,20,90975,9.35,2100000,220927,HIATUS,False,"Venus is one of the most feared, hated and mis..."


In [85]:
# (rows, columns) of the raw scrape.
webtoon_df.shape

(1498, 14)

In [86]:
# Column dtypes and non-null counts of the raw scrape.
webtoon_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1498 entries, 0 to 1497
Data columns (total 14 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   title_id       1498 non-null   int64  
 1   released_date  1498 non-null   object 
 2   title          1498 non-null   object 
 3   genre          1498 non-null   object 
 4   authors        1498 non-null   object 
 5   weekdays       1498 non-null   object 
 6   length         1498 non-null   int64  
 7   subscriber     1498 non-null   int64  
 8   rating         1498 non-null   float64
 9   views          1498 non-null   int64  
 10  likes          1498 non-null   int64  
 11  status         1498 non-null   object 
 12  daily_pass     1498 non-null   object 
 13  synopsis       1498 non-null   object 
dtypes: float64(1), int64(5), object(8)
memory usage: 164.0+ KB


In [90]:
# Summary statistics of the numeric columns.
webtoon_df.describe()

Unnamed: 0,title_id,length,subscriber,rating,views,likes
count,1498.0,1498.0,1498.0,1498.0,1498.0,1498.0
mean,3886.493324,68.304406,395905.2,9.209633,35365370.0,2794450.0
std,2158.8694,122.653961,626562.0,0.714675,105760900.0,7032602.0
min,64.0,1.0,6311.0,3.87,63975.0,5426.0
25%,2113.25,8.0,90530.25,9.07,2200000.0,220399.5
50%,3917.5,30.0,187360.0,9.41,6600000.0,648389.0
75%,5831.75,82.0,418136.8,9.66,23350000.0,2200000.0
max,7480.0,1410.0,7500000.0,9.93,1400000000.0,74400000.0


In [91]:
# Count, per column, the cells that hold a "value could not be scraped"
# placeholder. Both spellings are checked because the title and synopsis
# extractors have emitted the misspelled "Not Avaliable" variant.
webtoon_df.isin(["Not Available", "Not Avaliable"]).sum()

title_id         0
released_date    0
title            0
genre            0
authors          0
weekdays         0
length           0
subscriber       0
rating           0
views            0
likes            0
status           0
daily_pass       0
synopsis         0
dtype: int64

In [92]:
# Count missing (NaN/None) values per column.
webtoon_df.isna().sum()

title_id         0
released_date    0
title            0
genre            0
authors          0
weekdays         0
length           0
subscriber       0
rating           0
views            0
likes            0
status           0
daily_pass       0
synopsis         0
dtype: int64

In [93]:
# Number of distinct values per column; fewer distinct title_id values than
# rows means some titles were scraped more than once.
webtoon_df.nunique()

title_id         1470
released_date     824
title            1470
genre              16
authors          1324
weekdays           16
length            248
subscriber       1360
rating            249
views             665
likes            1017
status              3
daily_pass          2
synopsis         1469
dtype: int64

In [94]:
# List the column names of the raw scrape.
webtoon_df.columns

Index(['title_id', 'released_date', 'title', 'genre', 'authors', 'weekdays',
       'length', 'subscriber', 'rating', 'views', 'likes', 'status',
       'daily_pass', 'synopsis'],
      dtype='object')

In [104]:
# Re-display dtypes and non-null counts (same call as the earlier info() cell).
webtoon_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1498 entries, 0 to 1497
Data columns (total 14 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   title_id       1498 non-null   int64  
 1   released_date  1498 non-null   object 
 2   title          1498 non-null   object 
 3   genre          1498 non-null   object 
 4   authors        1498 non-null   object 
 5   weekdays       1498 non-null   object 
 6   length         1498 non-null   int64  
 7   subscriber     1498 non-null   int64  
 8   rating         1498 non-null   float64
 9   views          1498 non-null   int64  
 10  likes          1498 non-null   int64  
 11  status         1498 non-null   object 
 12  daily_pass     1498 non-null   object 
 13  synopsis       1498 non-null   object 
dtypes: float64(1), int64(5), object(8)
memory usage: 164.0+ KB


In [107]:
# Count rows whose title_id already appeared in an earlier row.
webtoon_df['title_id'].duplicated().sum()

np.int64(28)

In [108]:
# Count rows that repeat an earlier row in every column.
webtoon_df.duplicated().sum()

np.int64(23)

In [98]:
# Keep one row per title_id: the first occurrence in scrape order
# (drop_duplicates keeps the first match by default).
df_unique = webtoon_df.drop_duplicates(subset=['title_id'])

In [99]:
# Verify the de-duplication: no title_id may repeat any more.
df_unique['title_id'].duplicated().sum()

np.int64(0)

In [100]:
# (rows, columns) after de-duplication.
df_unique.shape

(1470, 14)

In [101]:
# Preview the first rows of the de-duplicated DataFrame.
df_unique.head()

Unnamed: 0,title_id,released_date,title,genre,authors,weekdays,length,subscriber,rating,views,likes,status,daily_pass,synopsis
0,7109,2024-12-22,Duchess in Ruins,Romance,"Candlebambi, Saedle, Lemon Frog",Monday,17,449713,9.21,6200000,601758,ONGOING,False,Edele Lancaster is the perfect noble lady who ...
1,6107,2024-12-22,The Reborn Young Lord is an Assassin,Fantasy,"swingbat, CoffeeLime",Monday,53,905522,9.51,25200000,1700000,ONGOING,False,"Cyan Vert, illegitimate son of the Duke and th..."
2,7121,2024-12-22,A Savage Proposal,Romance,"Lee yuna, team IRUKA",Monday,17,614151,7.89,7000000,392249,ONGOING,False,When the ruthless Lord Tiwakan seizes the king...
3,4886,2024-12-22,I'm the Queen in This Life,Fantasy,"Themis, Omin, Lefaljinf",Monday,122,1200000,9.57,92900000,8500000,ONGOING,False,The Etruscan Kingdom is stained with blood whe...
4,6756,2024-12-22,The Perfect Contract,Romance,"TTUSOL, TAEZA, Kim Sohee",Monday,29,306547,9.53,7700000,568944,ONGOING,False,"Junwon, a perfect man, and Ju-i, a woman who k..."


In [102]:
# Number of distinct values per column after de-duplication.
df_unique.nunique()

title_id         1470
released_date     824
title            1470
genre              16
authors          1324
weekdays           16
length            248
subscriber       1355
rating            249
views             664
likes            1017
status              3
daily_pass          2
synopsis         1469
dtype: int64

In [103]:
# Export the de-duplicated data to "webtoon_data_cleaned_<YYYY-MM-DD>.csv"
# in the current working directory (index column omitted).
filename = "webtoon_data_cleaned"  # Base name of the output file; change as desired
today_date = datetime.today().strftime('%Y-%m-%d')  # Today's date formatted as YYYY-MM-DD
final_filename = f"{filename}_{today_date}.csv"  # Base name + date + extension
df_unique.to_csv(final_filename, header=True, index=False)

In [109]:
# Column dtypes and non-null counts after de-duplication.
df_unique.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1470 entries, 0 to 1497
Data columns (total 14 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   title_id       1470 non-null   int64  
 1   released_date  1470 non-null   object 
 2   title          1470 non-null   object 
 3   genre          1470 non-null   object 
 4   authors        1470 non-null   object 
 5   weekdays       1470 non-null   object 
 6   length         1470 non-null   int64  
 7   subscriber     1470 non-null   int64  
 8   rating         1470 non-null   float64
 9   views          1470 non-null   int64  
 10  likes          1470 non-null   int64  
 11  status         1470 non-null   object 
 12  daily_pass     1470 non-null   object 
 13  synopsis       1470 non-null   object 
dtypes: float64(1), int64(5), object(8)
memory usage: 172.3+ KB


In [110]:
# Work on an explicit copy so the column assignment below does not write
# into a view of webtoon_df.
df_unique = df_unique.copy()
# The values are year-month-day dates (e.g. 2024-12-22), so the format must
# be "%Y-%m-%d". The previous "%m-%d-%Y" only worked because the column held
# date objects; it would fail on the same dates stored as text, for example
# after reloading the exported CSV.
df_unique['released_date'] = pd.to_datetime(df_unique['released_date'], format="%Y-%m-%d")

In [111]:
# Confirm released_date is now a datetime column.
df_unique.dtypes

title_id                  int64
released_date    datetime64[ns]
title                    object
genre                    object
authors                  object
weekdays                 object
length                    int64
subscriber                int64
rating                  float64
views                     int64
likes                     int64
status                   object
daily_pass               object
synopsis                 object
dtype: object

In [None]:
import sqlalchemy as sql

# Create connection using SQLAlchemy.
# The URL targets the "WebtoonDB" database through the mssql dialect with
# ODBC Driver 17; "User_Name" is a placeholder to replace before running.
# NOTE(review): presumably this relies on Windows/trusted authentication since
# no password appears in the URL -- confirm for your environment.
engine = sql.create_engine('mssql://User_Name/WebtoonDB?driver=ODBC+DRIVER+17+FOR+SQL+SERVER')
# `conn` is used by the load cell below and is never closed in this notebook.
conn=engine.connect()

In [114]:
#load the data into sql server using append option
# Load the cleaned data into the SQL Server table "WebtoonData".
# if_exists='append' adds rows to an existing table, so re-running this cell
# inserts the same titles again.
df_unique.to_sql('WebtoonData', con=conn , index=False, if_exists = 'append')

129

In [113]:
# Re-display the column dtypes of the cleaned frame.
df_unique.dtypes

title_id                  int64
released_date    datetime64[ns]
title                    object
genre                    object
authors                  object
weekdays                 object
length                    int64
subscriber                int64
rating                  float64
views                     int64
likes                     int64
status                   object
daily_pass               object
synopsis                 object
dtype: object