In [3]:
import requests
from bs4 import BeautifulSoup

In [4]:
# Exploratory fetch: download page 1 of the listing at this URL.
# NOTE(review): requests.get is called without a timeout here, so this cell
# can block indefinitely if the server stops responding.
url = 'https://animehay.in/phim-moi-cap-nhap/trang-1.html'
response = requests.get(url)
# Parse the raw response bytes with Python's built-in 'html.parser' backend.
soup = BeautifulSoup(response.content, 'html.parser')

In [5]:
# Collect every <div class="movie-item"> element from the parsed listing page.
movie_items = soup.find_all('div', class_='movie-item')

In [28]:
def get_checkpoint_data(movie) -> tuple[int, str]:
    """Build the (movie id, latest episode label) pair for one listing item.

    Args:
        movie: A listing element that supports ``movie['id']`` and
            ``movie.find(...)`` (a BeautifulSoup tag in this notebook).

    Returns:
        ``(movie_id, latest_episode)`` where ``movie_id`` is the integer parsed
        from the ``id`` attribute after dropping its first 9 characters, and
        ``latest_episode`` is the unmodified text of the ``div.episode-latest``
        child.
    """
    # The numeric part of the id attribute starts after a 9-character prefix.
    id_prefix_length = 9
    numeric_id = int(movie['id'][id_prefix_length:])

    latest_episode_tag = movie.find('div', class_='episode-latest')
    return numeric_id, latest_episode_tag.text


def save_checkpoint(checkpoint_set, path='./data/checkpoint.pkl'):
    """Pickle the checkpoint collection to disk.

    Args:
        checkpoint_set: The object to persist (a set of
            ``(movie_id, latest_episode)`` tuples in this notebook).
        path: Destination file. Defaults to ``./data/checkpoint.pkl``, the
            location previously hard-coded in this function.

    The parent directory is created when missing so the first run on a fresh
    checkout does not fail with ``FileNotFoundError``.
    """
    import os
    import pickle

    parent_dir = os.path.dirname(path)
    if parent_dir:  # a bare filename has no directory component to create
        os.makedirs(parent_dir, exist_ok=True)

    with open(path, 'wb') as f:
        pickle.dump(checkpoint_set, f)


def load_checkpoint(path='./data/checkpoint.pkl') -> set[tuple[int, str]]:
    """Load the pickled checkpoint collection from disk.

    Args:
        path: File to read. Defaults to ``./data/checkpoint.pkl``, the
            location previously hard-coded in this function.

    Returns:
        The unpickled object (a set of ``(movie_id, latest_episode)`` tuples
        in this notebook), or an empty set when the file does not exist yet,
        so the very first scraping run does not need a hand-made seed file.
    """
    import os
    import pickle

    if not os.path.exists(path):
        return set()

    # SECURITY: pickle.load can execute arbitrary code embedded in the file.
    # Only load checkpoint files that this notebook itself produced.
    with open(path, 'rb') as f:
        return pickle.load(f)


def get_movie_items(page, timeout=30):
    """Download one listing page and return its movie item elements.

    Args:
        page: Page number interpolated into the listing URL.
        timeout: Seconds to wait for the server before ``requests`` raises a
            timeout error. Without it ``requests.get`` waits forever on a
            stalled connection, which would freeze the whole scraping loop.

    Returns:
        The result of ``soup.find_all('div', class_='movie-item')`` for the
        downloaded page (empty when no such element is present).
    """
    url = f'https://animehay.in/phim-moi-cap-nhap/trang-{page}.html'
    html_page = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(html_page.content, 'html.parser')

    return soup.find_all('div', class_='movie-item')



def _second_child_text(container, css_class):
    """Return the raw text of the first ``div:nth-child(2)`` match inside ``div.<css_class>``."""
    return container.find('div', class_=css_class) \
                    .select('div:nth-child(2)')[0].text


def extract_data(movie, timeout=30) -> tuple:
    """Fetch a movie's detail page and extract its fields.

    Args:
        movie: Listing element; the ``href`` of its first ``a:nth-child(2)``
            match is used as the detail page URL.
        timeout: Seconds to wait for the detail page before giving up, so a
            stalled connection cannot hang the scraping loop.

    Returns:
        ``(name, categories, movie_status, score, review, publish_year,
        duration, link)`` on success. An empty tuple when the request fails,
        the response status is not 200, or the page has no ``div.last``
        block. ``score``, ``review`` and ``publish_year`` are ``None`` when
        the page shows the literal text ``NaN`` for them.
    """
    link = movie.select('a:nth-child(2)')[0]['href']

    # A network failure is reported the same way as a non-200 response: the
    # caller skips items for which an empty tuple is returned.
    try:
        response = requests.get(link, timeout=timeout)
    except requests.RequestException:
        return ()

    if response.status_code != 200:
        return ()

    soup = BeautifulSoup(response.content, 'html.parser')

    movie_data = soup.find('div', class_='last')
    if movie_data is None:
        return ()

    name = soup.find('h1', class_='heading_movie').text.strip()

    # Keeps every second line of the block's text, starting at index 4 and
    # excluding the last line. NOTE(review): presumably this skips a label
    # and blank separator lines of the site's markup — confirm against a page.
    raw_categories = movie_data.find('div', class_='list_cate') \
                               .text.split('\n')[4:-1:2]
    categories = [category.strip() for category in raw_categories]

    movie_status = _second_child_text(movie_data, 'status').strip()

    # Keeps every second whitespace-separated token, excluding the last two;
    # the first kept token is used as the score, the second as the review count.
    score_and_review = _second_child_text(movie_data, 'score').split()[:-2:2]
    score = float(score_and_review[0]) if score_and_review[0] != 'NaN' else None
    review = int(score_and_review[1]) if score_and_review[1] != 'NaN' else None

    publish_year_str = _second_child_text(movie_data, 'update_time').strip()
    publish_year = int(publish_year_str) if publish_year_str != 'NaN' else None

    duration = _second_child_text(movie_data, 'duration').strip()

    return (name, categories, movie_status, score, review, publish_year, duration, link)

def convert_to_dict(id, data) -> dict:
    """Turn a positional movie record into a keyed dictionary.

    Args:
        id: Value stored under the ``'id'`` key.
        data: Indexable record whose positions 0-7 hold, in order: name,
            genre, status, score, review, publish_year, duration, link.

    Returns:
        A dict with ``'id'`` first, followed by the eight fields above in
        that order.

    Raises:
        IndexError: If ``data`` has fewer than eight positions.
    """
    field_names = (
        'name', 'genre', 'status', 'score',
        'review', 'publish_year', 'duration', 'link',
    )

    record = {'id': id}
    for position, field in enumerate(field_names):
        record[field] = data[position]

    return record


def append_data(id, data, data_dict):
    """Append one row per genre of a movie to a column-oriented dict.

    Args:
        id: Value appended to the first column of ``data_dict``.
        data: Positional movie record; ``data[1]`` is the list of genres.
        data_dict: Mapping of column name to list. Columns are addressed by
            iteration order: column 0 receives ``id``, column 2 receives the
            current genre, and every other column ``j`` receives
            ``data[j - 1]``.

    Returns:
        The same ``data_dict`` object, mutated in place.
    """
    for genre in data[1]:
        for position, column in enumerate(data_dict):
            if position == 0:
                value = id
            elif position == 2:
                value = genre
            else:
                value = data[position - 1]
            data_dict[column].append(value)

    return data_dict


def save_to_json(data, filename):
    """Write ``data`` as JSON to ``./data/json/<filename>.json``.

    Args:
        data: Any JSON-serialisable object.
        filename: Base name of the output file, without extension.

    The output directory is created when missing. Non-ASCII characters
    (for example Vietnamese titles) are written as real UTF-8 text instead of
    ``\\uXXXX`` escapes; the file is opened as UTF-8 for that purpose, and the
    result parses to the same data either way.
    """
    import json
    import os

    os.makedirs('./data/json', exist_ok=True)

    with open(f'./data/json/{filename}.json', 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False)


def save_to_csv(data, filename):
    """Write the records to ``./data/csv/<filename>.csv``, one row per genre.

    Args:
        data: Records accepted by ``pandas.DataFrame`` (a list of dicts in
            this notebook), each with a list-valued ``'genre'`` entry.
        filename: Base name of the output file, without extension. It is now
            honoured; it used to be overwritten with today's date, so the
            argument was silently ignored.

    Returns:
        The DataFrame that was written, with ``'genre'`` exploded so each
        genre gets its own row.

    The output directory is created when missing. When ``data`` yields no
    ``'genre'`` column (for example an empty list) the explode step is
    skipped instead of raising ``KeyError``.
    """
    import os

    import pandas as pd

    data_df = pd.DataFrame(data=data)
    if 'genre' in data_df.columns:
        data_df = data_df.explode('genre')

    os.makedirs('./data/csv', exist_ok=True)
    data_df.to_csv(f'./data/csv/{filename}.csv', index=False)

    return data_df


def save_to_parquet(data_df, filename):
    """Write a DataFrame to ``./data/parquet/<filename>.parquet`` via pyarrow.

    Args:
        data_df: pandas DataFrame to convert with ``pa.Table.from_pandas``.
        filename: Base name of the output file, without extension.

    The output directory is created when missing so the write does not fail
    on a fresh checkout.
    """
    import os

    import pyarrow as pa
    import pyarrow.parquet as pq

    os.makedirs('./data/parquet', exist_ok=True)

    data_table = pa.Table.from_pandas(data_df)
    pq.write_table(data_table, f'./data/parquet/{filename}.parquet')


def save_data(data):
    """Persist the scraped records as JSON, CSV and Parquet.

    Args:
        data: Records to save; handed unchanged to the JSON and CSV writers.

    A ``YYYY_MM_DD`` stamp of today's date is passed as the file name to all
    three writers. The DataFrame returned by the CSV writer is the one handed
    to the Parquet writer.
    """
    from datetime import datetime

    date_stamp = datetime.today().strftime('%Y_%m_%d')

    save_to_json(data, date_stamp)
    csv_frame = save_to_csv(data, date_stamp)
    save_to_parquet(csv_frame, date_stamp)


In [29]:
# Scraper driver: walks listing pages and collects the detail data of each movie.
# NOTE(review): despite the original "infinite" label, the loop is bounded —
# it starts at page 14 and stops as soon as page exceeds 14, so exactly one
# listing page is processed per run.
proceed = True
page = 14
new_checkpoint_data = set()
total_movie_data = []
# NOTE(review): loaded here but only referenced by the commented-out check below.
current_checkpoint_data = load_checkpoint()
while(proceed):

    print(page)
    movie_items = get_movie_items(page)
    for movie in movie_items:

        checkpoint_data = get_checkpoint_data(movie)

        # # Check whether this movie item was already processed in the previous batch
        # if checkpoint_data in current_checkpoint_data:
        #     # save_checkpoint(new_checkpoint_data)  # Save this batch as the checkpoint for future scraping batches
        #     proceed = False  # Stop processing this batch of movie items
        #     break  

        new_checkpoint_data.add(checkpoint_data)

        movie_data = extract_data(movie)
        if len(movie_data) == 0:  # Skip to the next movie when no data was extracted
            continue

        total_movie_data.append(convert_to_dict(id=checkpoint_data[0], data=movie_data))
        print(movie_data)

    page += 1
    if page > 14:
        proceed = False

# NOTE(review): new_checkpoint_data is collected above but never written to
# disk in this cell (the save_checkpoint call is commented out).
save_data(total_movie_data)

14
      id                                               name  \
0   3719                                    Đan Đạo Tông Sư   
1   3776        Mobile Suit Gundam: Suisei no Majo Season 2   
2   3781                                               Ở Rể   
3   3727                                         Jigokuraku   
4   3752                                             Mashle   
5   3734  Isekai de Cheat Skill wo Te ni Shita Ore wa, G...   
6   3775                               The Marginal Service   
7   3771                                         Oshi no Ko   
8   3770                                     World Dai Star   
9   3766               Kanojo ga Koushaku-tei ni Itta Riyuu   
10  3760                                   Kizuna no Allele   
11  3733        Jijou wo Shiranai Tenkousei ga Guigui Kuru.   
12  3761                    Mahou Shoujo Magical Destroyers   
13  3762                             Megami no Café Terrace   
14  3763                              Rokudou no Onn

In [26]:
import pickle
# Seed the checkpoint file with a single (movie id, latest episode) entry.
with open('./data/checkpoint.pkl', 'wb') as f:
    pickle.dump({(2713, '84/84')}, f)

In [28]:
# Read the checkpoint file back to inspect what was stored.
# SECURITY: pickle.load can execute arbitrary code embedded in the file;
# only load files written by this notebook.
with open('./data/checkpoint.pkl', 'rb') as f:
    # NOTE(review): this rebinds `movie_data`, the same global name the
    # scraper loop uses for per-movie tuples.
    movie_data = pickle.load(f)

# Bare expression so the notebook displays the loaded value.
movie_data

{(2713, '84/84')}

# scrape -> check -> turn page or filter

In [None]:
def get_movie_list