In [None]:
from typing import List, Dict
import os
import time
from file_organizers import read_links_from_file, iterate_on_lib, create_directory

# Reference timestamp: the elapsed-seconds figures printed in the log
# messages of this notebook are measured from this moment
start_time = time.time()

# Link-list file that is handed to read_links_from_file when the ripper runs
source_file = os.path.join(
    "file_lists", "done - Seminar - 2018 LA - Gogyo No Kata and Hanbo.txt"
)
# Root directory under which the per-folder directories are created and the
# downloaded .m3u8 files are saved
dist_root = "download"

In [None]:
# Parse site string. Returns video name and url

# Characters that must not end up in a file name, mapped to their stand-ins.
# Covers every character Windows reserves in file names (the slash, the
# backslash, the colon, the double quote, the pipe, the question mark, the
# asterisk and both angle brackets) plus "@" and "#".
_FILE_NAME_SUBSTITUTIONS = str.maketrans(
    {
        "/": "、",
        "\\": "、",
        ":": " -",
        "@": "_at_",
        "#": "_",
        "?": "_",
        "*": "_",
        ">": "_",
        "<": "_",
        "|": "、",
        '"': "'",
    }
)


def parse_site_string(site: str) -> list[str]:
    """Split one line of the link list into a video name and its url.

    A line has the form ``<url> - <video name>``. Only the first `` - ``
    separates the two parts, so the name itself may contain `` - ``.

    Args:
        site: One raw line of the link list.

    Returns:
        ``[video_name, url]``. ``video_name`` is ``""`` when the line holds no
        `` - `` separator; otherwise it is the name with every character from
        the substitution table replaced so it can be used in a file name.
    """
    url, _separator, raw_name = site.partition(" - ")

    # One pass over the name; none of the stand-ins contains a character that
    # is itself substituted, so this equals applying the replacements in turn
    video_name = raw_name.translate(_FILE_NAME_SUBSTITUTIONS)

    return [video_name, url]

In [None]:
import requests
import re


# Find and return path to the highest-resolution file
def get_media_playlist_file_path(video_name: str, url: str) -> tuple[int, int, str]:
    """Pick the widest variant listed in the playlist at ``url``.

    NOTE: We do not sort out files which partially match the video_name
    as we always want to know if a higher-resolution file has become available

    Args:
        video_name: Name of the video; used only in error messages.
        url: Url of the ``.m3u8`` playlist (a query string is allowed).

    Returns:
        ``(width, height, variant_url)`` of the variant with the largest
        width; on equal width the first one listed wins. A relative variant
        entry is resolved against ``url``. When the playlist lists no variant
        with a ``RESOLUTION`` attribute, ``(0, 0, url)`` is returned.

    Raises:
        Exception: If the url path does not end in ``.m3u8`` or the downloaded
            body does not start with ``#EXTM3U``.
        requests.HTTPError: If the server answers with an error status.
    """
    # Local import so this cell does not rely on imports made in other cells
    from urllib.parse import urljoin

    # Check if we have the correct file ending
    if not url.split("?")[0].endswith(".m3u8"):
        raise Exception(
            f"File {video_name}'s url does not end in m3u8 and so does not promise to be a utf-8 encoded m3u file. \nAt url {url}"
        )

    with requests.get(url) as response:
        # Report a 4xx/5xx answer as such instead of as a malformed playlist
        response.raise_for_status()
        body = response.content

    # Check first line indicates valid m3u
    if not body.startswith(b"#EXTM3U"):
        raise Exception(
            f"File {video_name} is not formatted as m3u file. First line is not '#EXTM3U' \n At url {url}"
        )

    playlist_lines = body.decode("utf-8").splitlines()

    # Store biggest resolution found in master playlist file
    biggest_res: tuple[int, int] = (0, 0)
    best_url: str = url

    # A variant is a tag line carrying RESOLUTION=WxH followed by its URI line
    for tag_line, uri_line in zip(playlist_lines, playlist_lines[1:]):
        match = re.search(r"RESOLUTION=(\d{3,5})x(\d{3,5})", tag_line)
        if match is None:
            continue

        # Tags such as EXT-X-I-FRAME-STREAM-INF also carry RESOLUTION but are
        # followed by another tag rather than by a URI line
        candidate = uri_line.strip()
        if not candidate or candidate.startswith("#"):
            continue

        width = int(match.group(1))
        height = int(match.group(2))

        if biggest_res[0] >= width:
            continue

        biggest_res = (width, height)
        # Variant URIs are usually relative to the master playlist
        best_url = urljoin(url, candidate)

    return (biggest_res[0], biggest_res[1], best_url)

In [None]:
import urllib.request
import urllib.parse
from pathlib import Path


def save_m3u8(url: str, folder_path: str, video_name: str):
    """Download the playlist at ``url`` and save it as a ``.m3u8`` file.

    The file is written to ``<dist_root>/<folder_path>/<video_name>.m3u8``;
    the directory must already exist. The playlist is downloaded and decoded
    before the destination is touched, so a failed download leaves a
    previously saved file in place.

    Args:
        url: Url of the playlist to download.
        folder_path: Directory below ``dist_root`` to save into.
        video_name: File name without the ``.m3u8`` extension.

    Raises:
        urllib.error.URLError: If the url cannot be opened.
        UnicodeDecodeError: If the downloaded body is not valid UTF-8.
        OSError: If the destination file cannot be written.
    """
    file_path = Path(os.path.join(dist_root, folder_path, f"{video_name}.m3u8"))

    with urllib.request.urlopen(url) as response:
        playlist_text = response.read().decode("utf-8")

    # Mode "w" truncates an existing file, so no separate delete is needed.
    # newline="" turns off newline translation: the line endings are written
    # exactly as served instead of "\r\n" becoming "\r\r\n" on Windows.
    with open(file_path, "w", encoding="utf-8", newline="") as f:
        f.write(playlist_text)

In [None]:
# Get file, parse data, find files and download m3us (sequential)
def pilfer_and_download(
    folder_sites: List[str],
    folder_path: str = "",
):
    """Download the best-resolution playlist for every entry of one folder.

    Creates ``<dist_root>/<folder_path>``, then handles the entries one after
    the other: each line is parsed into a video name and url, the widest
    variant of its playlist is looked up and that playlist is saved as
    ``<index>_<video name>_<width>x<height>``. Progress is printed with the
    seconds elapsed since ``start_time``.

    Args:
        folder_sites: Link-list lines belonging to this folder.
        folder_path: Directory below ``dist_root`` that receives the files.
    """

    def elapsed() -> float:
        # Seconds since the notebook started, as shown in the log lines
        return round(time.time() - start_time, 3)

    print(
        f"----------------------- Process path {folder_path} start: {elapsed()} seconds"
    )

    target_directory = os.path.join(dist_root, folder_path)
    create_directory(target_directory)

    print(f"--- Directory created {elapsed()} seconds")

    for position, entry in enumerate(folder_sites):
        # Parse name of video and playlist url from file line
        video_name, url = parse_site_string(entry)

        # A url of "_" marks an entry to skip; it still uses up its index so
        # the numbering of the remaining files is unaffected
        if url.strip() == "_":
            print(
                f"--------- Ignoring file {video_name} with ignore tag. Counting as index."
            )
            continue

        print(f"--------- Process site {video_name}:  {elapsed()} seconds")

        # Read file URL and get the biggest-resolution media file
        width, height, media_url = get_media_playlist_file_path(video_name, url)

        print(f"--- Found biggest file: {elapsed()} seconds")

        # Download and save m3u8 file to path, named from the gathered data
        save_m3u8(media_url, folder_path, f"{position}_{video_name}_{width}x{height}")

        print(f"--- Saved file: {elapsed()} seconds")

In [None]:
## Run ripper
# Load the link list from source_file; per the annotation the result maps a
# string key to a list of link-list lines
# NOTE(review): the key is presumably the folder path the lines belong to —
# confirm against file_organizers.read_links_from_file
siteLib: Dict[str, List[str]] = read_links_from_file(source_file)

print(f"--- Links read {round(time.time() - start_time, 3)} seconds")

# NOTE(review): iterate_on_lib presumably calls pilfer_and_download for each
# entry of siteLib — confirm the argument order it uses in file_organizers
iterate_on_lib(siteLib, pilfer_and_download)