In [13]:
def get_checkpoint_data(movie) -> tuple[int, str]:
    """Build the ``(movie id, latest episode text)`` pair for a listing item.

    The numeric id is parsed from the element's ``id`` attribute after
    dropping its first nine characters; the episode text is the ``.text`` of
    the item's ``div.episode-latest`` child.

    Args:
        movie: A listing element supporting ``movie['id']`` and
            ``movie.find(...)`` (a BeautifulSoup tag in this notebook).

    Returns:
        A ``(movie_id, latest_episode_text)`` tuple.
    """
    id_attribute = movie['id']
    latest_episode_div = movie.find('div', class_='episode-latest')
    return int(id_attribute[9:]), latest_episode_div.text


def save_checkpoint(checkpoint_set):
    """Pickle ``checkpoint_set`` to ``./data/checkpoint.pkl``.

    The parent directory is created when it is missing, so the first run on
    a fresh checkout does not fail with ``FileNotFoundError``.

    Args:
        checkpoint_set: Object to persist; the scraper passes a set of
            ``(movie_id, latest_episode_text)`` tuples.
    """
    import os
    import pickle

    CHECKPOINT_PATH = './data/checkpoint.pkl'

    os.makedirs(os.path.dirname(CHECKPOINT_PATH), exist_ok=True)
    with open(CHECKPOINT_PATH, 'wb') as f:
        pickle.dump(checkpoint_set, f)


def load_checkpoint() -> set[tuple[int, str]]:
    """Load the checkpoint stored at ``./data/checkpoint.pkl``.

    Returns:
        The unpickled object (the scraper stores a set of
        ``(movie_id, latest_episode_text)`` tuples). When the file does not
        exist yet, an empty set is returned so that a first run can start
        without a pre-existing checkpoint.
    """
    import os
    import pickle

    CHECKPOINT_PATH = './data/checkpoint.pkl'

    if not os.path.exists(CHECKPOINT_PATH):
        return set()

    # NOTE: pickle.load can execute arbitrary code; only load checkpoint
    # files that this notebook wrote itself.
    with open(CHECKPOINT_PATH, 'rb') as f:
        data = pickle.load(f)

    return data


def get_movie_items(page, domain='animehay.biz', timeout=10):
    """Fetch one "recently updated" listing page and return its movie items.

    Args:
        page: Page number interpolated into the listing URL.
        domain: Host name of the site. Defaults to the domain that was
            previously hard-coded here.
        timeout: Seconds to wait for the server before ``requests`` raises a
            timeout error. Without a timeout a stalled connection would
            block the scraper indefinitely.

    Returns:
        The result of ``soup.find_all('div', class_='movie-item')`` for the
        fetched page (empty when the page contains no such elements).
    """
    import requests
    from bs4 import BeautifulSoup

    url = f'https://{domain}/phim-moi-cap-nhap/trang-{page}.html'
    html_page = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(html_page.content, 'html.parser')

    return soup.find_all('div', class_='movie-item')



def extract_data(movie) -> tuple:
    """Fetch a movie's detail page and extract its metadata.

    Args:
        movie: Listing element; the detail link is read from the ``href`` of
            its first ``a:nth-child(2)`` match.

    Returns:
        ``(name, categories, movie_status, score, review, publish_year,
        duration, link)`` on success. An empty tuple is returned when the
        request fails, the response status is not 200, or the page has no
        ``div.last`` block. ``score``, ``review`` and ``publish_year`` are
        ``None`` when the page shows the literal text ``NaN`` for them.
    """
    import requests
    from bs4 import BeautifulSoup

    REQUEST_TIMEOUT = 10  # seconds; avoids hanging forever on a stalled server

    link = movie.select('a:nth-child(2)')[0]['href']
    try:
        response = requests.get(link, timeout=REQUEST_TIMEOUT)
    except requests.RequestException:
        # Treat network failures like any other "no data" case so that one
        # unreachable page does not abort the whole scraping run.
        return ()

    if response.status_code != 200:
        return ()

    soup = BeautifulSoup(response.content, 'html.parser')

    movie_data = soup.find('div', class_='last')

    if movie_data is None:
        return ()

    def field_text(css_class):
        """Return the raw text of the second child div inside ``div.<css_class>``."""
        return movie_data.find('div', class_=css_class) \
                         .select('div:nth-child(2)')[0].text

    name = soup.find('h1', class_='heading_movie').text.strip()

    # Every second line from index 4 up to (excluding) the last one is kept;
    # presumably this skips the label and blank separator lines — confirm
    # against the page markup.
    raw_categories = movie_data.find('div', class_='list_cate') \
                               .text.split('\n')[4:-1:2]
    categories = [category.strip() for category in raw_categories]

    movie_status = field_text('status').strip()

    # Whitespace-separated tokens: every second token is kept, dropping the
    # trailing two; index 0 is used as the score and index 1 as the review count.
    score_and_review = field_text('score').split()[:-2:2]
    score = float(score_and_review[0]) if score_and_review[0] != 'NaN' else None
    review = int(score_and_review[1]) if score_and_review[1] != 'NaN' else None

    publish_year_str = field_text('update_time').strip()
    publish_year = int(publish_year_str) if publish_year_str != 'NaN' else None

    duration = field_text('duration').strip()

    return (name, categories, movie_status, score, review, publish_year, duration, link)

def convert_to_dict(id, data) -> dict:
    """Turn a positional movie record into a keyed dictionary.

    Args:
        id: Value stored under the ``'id'`` key.
        data: Indexable record whose positions 0-7 hold name, genre, status,
            score, review, publish_year, duration and link, in that order.

    Returns:
        A dict with ``'id'`` first, followed by the eight fields above.
    """
    field_names = ('name', 'genre', 'status', 'score', 'review',
                   'publish_year', 'duration', 'link')

    record = {'id': id}
    for position, field in enumerate(field_names):
        record[field] = data[position]
    return record


def append_data(id, data, data_dict):
    """Append one row per genre of ``data`` to the column lists in ``data_dict``.

    Columns are addressed by their position in ``data_dict``: the first
    column receives ``id``, the third receives the current genre taken from
    ``data[1]``, and any other column at position ``p`` receives
    ``data[p - 1]``.

    Args:
        id: Identifier written to the first column of every appended row.
        data: Positional record; ``data[1]`` is the sequence of genres.
        data_dict: Mapping of column name to list; modified in place.

    Returns:
        The same ``data_dict`` object that was passed in.
    """
    for genre in data[1]:
        for position, column in enumerate(data_dict.values()):
            if position == 0:
                value = id
            elif position == 2:
                value = genre
            else:
                value = data[position - 1]
            column.append(value)

    return data_dict


def save_to_json(data, filename):
    """Write ``data`` as JSON to ``./data/json/<filename>.json``.

    The output directory is created when missing. Non-ASCII characters are
    written as-is (the file is UTF-8) instead of ``\\uXXXX`` escapes, which
    keeps titles readable; the parsed content is unchanged.

    Args:
        data: JSON-serialisable object.
        filename: File name without the ``.json`` extension.
    """
    import json
    import os

    output_dir = './data/json'
    os.makedirs(output_dir, exist_ok=True)

    with open(f'{output_dir}/{filename}.json', 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False)


def save_to_csv(data, filename):
    """Write ``data`` to ``./data/csv/<filename>.csv`` with one row per genre.

    The ``filename`` argument is honoured; it used to be overwritten with
    today's date inside the function. The output directory is created when
    missing, and input without a ``genre`` column (e.g. an empty list) is
    written without the explode step instead of raising ``KeyError``.

    Args:
        data: Records accepted by ``pandas.DataFrame(data=...)``; the scraper
            passes a list of dicts whose ``'genre'`` value is a list.
        filename: File name without the ``.csv`` extension.

    Returns:
        The DataFrame that was written (``genre`` exploded when present).
    """
    import os

    import pandas as pd

    output_dir = './data/csv'
    os.makedirs(output_dir, exist_ok=True)

    data_df = pd.DataFrame(data=data)
    if 'genre' in data_df.columns:
        data_df = data_df.explode('genre')
    data_df.to_csv(f'{output_dir}/{filename}.csv', index=False)

    return data_df


def save_to_parquet(data_df, filename):
    """Write ``data_df`` to ``./data/parquet/<filename>.parquet`` via pyarrow.

    Args:
        data_df: pandas DataFrame to convert with ``pa.Table.from_pandas``.
        filename: File name without the ``.parquet`` extension.
    """
    import pyarrow as pa
    import pyarrow.parquet as pq

    target_path = f'./data/parquet/{filename}.parquet'
    pq.write_table(pa.Table.from_pandas(data_df), target_path)


def save_data(data):
    """Persist ``data`` as JSON, CSV and Parquet under today's date.

    All three files share a ``YYYY_MM_DD`` base name. The DataFrame returned
    by ``save_to_csv`` is the one handed to ``save_to_parquet``.

    Args:
        data: Scraped records, passed unchanged to ``save_to_json`` and
            ``save_to_csv``.
    """
    from datetime import datetime

    date_stamp = datetime.today().strftime('%Y_%m_%d')

    save_to_json(data, date_stamp)
    save_to_parquet(save_to_csv(data, date_stamp), date_stamp)


In [14]:
# Incremental scraper: walk the "recently updated" listing pages and stop at
# the first item that a previous run already recorded in the checkpoint.
FIRST_PAGE = 2  # NOTE(review): page 1 is never visited — confirm this is intended
LAST_PAGE = 3   # hard upper bound so a single run cannot loop forever

proceed = True
page = FIRST_PAGE
new_checkpoint_data = set()
total_movie_data = []
current_checkpoint_data = load_checkpoint()
while proceed:

    print(page)
    movie_items = get_movie_items(page)
    if not movie_items:
        # Nothing on this page: presumably we are past the last listing
        # page, so later pages would be empty as well.
        break

    for movie in movie_items:

        checkpoint_data = get_checkpoint_data(movie)

        # Stop as soon as an item processed by a previous batch shows up
        if checkpoint_data in current_checkpoint_data:
            proceed = False
            break

        new_checkpoint_data.add(checkpoint_data)

        movie_data = extract_data(movie)
        if len(movie_data) == 0:  # Skip to next movie in case there is no data
            continue

        total_movie_data.append(convert_to_dict(id=checkpoint_data[0], data=movie_data))
        print(movie_data)

    page += 1
    if page > LAST_PAGE:
        proceed = False

# Persist the scraped records BEFORE the checkpoint: if saving fails, the
# checkpoint must not claim that these items were already processed.
if total_movie_data:
    save_data(total_movie_data)

# Merge with the previous checkpoint instead of replacing it. A run that
# finds nothing new would otherwise store an empty set, and the next run
# would no longer know where to stop.
save_checkpoint(set(current_checkpoint_data) | new_checkpoint_data)

2
('Nghịch Thiên Chí Tôn', ['CN Animation', 'Tiên hiệp', 'Trùng sinh'], 'Đang tiến hành', 7.5, 583, 2021, '380 Tập', 'https://animehay.biz/thong-tin-phim/nghich-thien-chi-ton-3300.html')
('Vô Thượng Thần Đế', ['CN Animation', 'Tiên hiệp', 'Trùng sinh', 'Huyền ảo'], 'Đang tiến hành', 8.0, 748, 2020, '464 Tập', 'https://animehay.biz/thong-tin-phim/vo-thuong-than-de-3051.html')
('Ore wa Subete wo [Parry] Suru: Gyaku Kanchigai no Sekai Saikyou wa Boukensha ni Naritai', ['Anime', 'Hành động', 'Siêu nhiên', 'Phiêu lưu'], 'Đang tiến hành', 8.2, 185, 2024, '?? Tập', 'https://animehay.biz/thong-tin-phim/ore-wa-subete-wo-parry-suru-gyaku-kanchigai-no-sekai-saikyou-wa-boukensha-ni-naritai-4095.html')
('Tasogare Outfocus', ['Anime', 'Học đường', 'Đời thường'], 'Đang tiến hành', 3.2, 116, 2024, '?? Tập', 'https://animehay.biz/thong-tin-phim/tasogare-outfocus-4097.html')
('Hazure Waku no [Joutai Ijou Skill] de Saikyou ni Natta Ore ga Subete wo Juurin Suru made', ['Anime', 'Hành động', 'Drama', 'Siêu

ModuleNotFoundError: No module named 'pyarrow'

In [17]:
import findspark
findspark.init()

In [18]:
from pyspark.sql import SparkSession

In [19]:
# Get (or create) the Spark session for this notebook, registered under the app name "Anime".
spark = SparkSession.builder.appName("Anime").getOrCreate()

In [20]:
# Load one dated CSV export; the first row is the header and column types are inferred.
df = spark.read.csv("data/csv/2024_09_23.csv",header=True,inferSchema=True)

In [21]:
# Convert the Spark DataFrame into a pandas DataFrame for the column-level exploration that follows.
df_pd = df.toPandas()

In [22]:
# Unit of the duration: the last space-separated token of the 'duration' text.
duration_tokens = df_pd['duration'].str.split(" ")
df_pd['duration_unit'] = duration_tokens.str[-1]

In [23]:
# Leading part of the duration: the first space-separated token of the 'duration' text.
duration_parts = df_pd['duration'].str.split(" ")
df_pd['duration_2'] = duration_parts.str[0]

In [24]:
# Keep the original columns, with 'duration' replaced by its two derived parts.
processed_columns = [
    'id', 'name', 'genre', 'status', 'score', 'review', 'publish_year',
    'duration_2', 'duration_unit', 'link',
]
df_pd_processed = df_pd[processed_columns]

In [25]:
# Count how many rows carry each distinct duration unit.
df_pd_processed['duration_unit'].value_counts()

duration_unit
Tập     9866
phút    2447
Name: count, dtype: int64

In [26]:
# Look for rows whose extracted unit is the stray token '(Nửa' (an empty result means there are none).
df_pd.loc[df_pd['duration_unit'] == '(Nửa']

Unnamed: 0,id,name,genre,status,score,review,publish_year,duration,link,duration_unit,duration_2


In [27]:
# Inspect every row of one title to see how its multi-word duration text was split.
df_pd[df_pd['name'] == 'Shingeki no Kyojin: The Final Season']

Unnamed: 0,id,name,genre,status,score,review,publish_year,duration,link,duration_unit,duration_2
3946,4025,Shingeki no Kyojin: The Final Season,Anime,Đang tiến hành,9.6,1048,2020,16 (Nửa đầu) Tập,https://animehay.in/thong-tin-phim/shingeki-no...,Tập,16
3947,4025,Shingeki no Kyojin: The Final Season,Hành động,Đang tiến hành,9.6,1048,2020,16 (Nửa đầu) Tập,https://animehay.in/thong-tin-phim/shingeki-no...,Tập,16
3948,4025,Shingeki no Kyojin: The Final Season,Giả tưởng,Đang tiến hành,9.6,1048,2020,16 (Nửa đầu) Tập,https://animehay.in/thong-tin-phim/shingeki-no...,Tập,16
3949,4025,Shingeki no Kyojin: The Final Season,Siêu nhiên,Đang tiến hành,9.6,1048,2020,16 (Nửa đầu) Tập,https://animehay.in/thong-tin-phim/shingeki-no...,Tập,16
3950,4025,Shingeki no Kyojin: The Final Season,Shounen,Đang tiến hành,9.6,1048,2020,16 (Nửa đầu) Tập,https://animehay.in/thong-tin-phim/shingeki-no...,Tập,16
3951,4025,Shingeki no Kyojin: The Final Season,Bí ẩn,Đang tiến hành,9.6,1048,2020,16 (Nửa đầu) Tập,https://animehay.in/thong-tin-phim/shingeki-no...,Tập,16
3952,4025,Shingeki no Kyojin: The Final Season,Drama,Đang tiến hành,9.6,1048,2020,16 (Nửa đầu) Tập,https://animehay.in/thong-tin-phim/shingeki-no...,Tập,16
3953,4025,Shingeki no Kyojin: The Final Season,Quân đội,Đang tiến hành,9.6,1048,2020,16 (Nửa đầu) Tập,https://animehay.in/thong-tin-phim/shingeki-no...,Tập,16


In [28]:
# Stop the Spark session; df_pd is a pandas object and is still used in later cells.
spark.stop()

In [29]:
# Total number of rows in the pandas frame.
len(df_pd)

12313

In [30]:
# Manual sanity check: sum of two hand-entered counts, to compare with len(df_pd).
# NOTE(review): the operands look copied from the value_counts() output — confirm if the data changes.
9866+2447

12313

In [1]:
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.edge.options import Options
from time import sleep

In [4]:
# Build the Edge options one flag at a time, then start the browser with them.
option = Options()
for edge_flag in (
    "--headless",  # Run Edge in headless mode
    '--ignore-certificate-errors',
    '--allow-insecure-localhost',
):
    option.add_argument(edge_flag)

driver = webdriver.Edge(options=option)

In [5]:
# Listing page to probe.
# NOTE(review): 999 is presumably chosen to be beyond the last real page — confirm.
page = 999

domain_name = 'animehay.biz'
url = f'https://{domain_name}/phim-moi-cap-nhap/trang-{page}.html'
driver.get(url)
# Fixed one-second pause before reading the page source (presumably to let the page finish rendering).
sleep(1)

page_html = driver.page_source

In [7]:
# Parse the browser-rendered HTML and collect every div.movie-item element.
soup = BeautifulSoup(page_html, 'html.parser')
movie_items = soup.find_all('div', class_='movie-item')

In [8]:
# Show the collected items; an empty list means the page had no div.movie-item elements.
movie_items

[]