# YouTube Playlist Extractor

A lot of threads on mafiascum.net contain links to YouTube videos distributed across multiple posts. Particularly when they share the same purpose or theme, it can be useful to extract these links and compile them into a single playlist for easy viewing or listening.

This example demonstrates how to use functions from the [donbot](https://github.com/Computational-Mafia/donbot) module to extract YouTube video links across a thread and compile them into a playlist. By changing the thread URL parameter and maybe adding your account credentials, you can use this example to extract a playlist from any thread on the forum.

The implementation here has so far been validated to work for threads with ~40 songs. It won't work for threads that require user privileges to access.

In [3]:
# @markdown ## Provide a thread url and playlist type. Press the play button the left to generate!
thread = "https://forum.mafiascum.net/viewtopic.php?t=92087" # @param {type:"string"}
playlist_type = "music" # @param ["music", "regular videos"]

import requests
from lxml import html
from math import floor

def count_posts(session: requests.Session, thread: str) -> int:
    """
    Counts the number of posts in the specified thread.

    Parameters
    ----------
    session : requests.Session
        The session object used for making HTTP requests.
    thread : str
        The thread to count posts in.

    Returns
    -------
    int
        The number of posts in the specified thread.
    """
    page = session.get(thread).content
    post_count_path = "(//div[@class='pagination'])[2]/text()"
    numberOfPosts = html.fromstring(page).xpath(post_count_path)[0]
    return int(numberOfPosts[: numberOfPosts.find(" ")].strip())

def get_posts(
    session: requests.Session, thread: str, start: int = 0, end: int = -1
) -> list[dict]:
    """
    Retrieve posts from a thread.

    Parameters
    ----------
    session : requests.Session
        The session object used for making HTTP requests.
    thread : str
        The thread to retrieve the posts of.
    start : int, optional
        The post number to start retrieving from. Default is 0.
    end : int, optional
        The post number to stop retrieving at. Default is infinity.

    Returns
    -------
    list[dict]
        Each post's data, including post `id`, `number`, `user, `time`, and `content`.
    """

    posts_per_page = 25
    post_body_path = "//div[@class='postbody']"
    post_number_path = ".//span[@class='post-number-bolded']//text()"
    post_user_path = ".//a[@class='username' or @class='username-coloured']/text()"
    post_content_path = ".//div[@class='content']"
    post_timestamp_path = ".//p[@class='author modified']/text()"
    post_id_path = ".//a/@href"
    end = end if end != -1 else count_posts(session, thread)

    # identify pages to visit
    start_page_id = floor(start / posts_per_page) * posts_per_page
    end_page_id = floor(end / posts_per_page) * posts_per_page

    # collect on each page key content from posts after current post
    posts = []
    for page_index in range(start_page_id, (end_page_id + 1), posts_per_page):
        page = session.get(f"{thread}&start={str(page_index)}").content
        for raw_post in html.fromstring(page).xpath(post_body_path):
            post_number = int(raw_post.xpath(post_number_path)[0][1:])
            if post_number < start or post_number > end:
                continue
            posts.append({"number": post_number})
            posts[-1]["id"] = raw_post.xpath(post_id_path)[0]
            posts[-1]["id"] = posts[-1]["id"][posts[-1]["id"].rfind("#") + 2 :]
            posts[-1]["user"] = raw_post.xpath(post_user_path)[0]
            posts[-1]["content"] = raw_post.xpath(post_content_path)[0]
            posts[-1]["content"] = html.tostring(raw_post.xpath(post_content_path)[0])
            posts[-1]["content"] = posts[-1]["content"].decode("UTF-8").strip()[21:-6]
            posts[-1]["time"] = raw_post.xpath(post_timestamp_path)[-1]
            posts[-1]["time"] = posts[-1]["time"][
                posts[-1]["time"].find("» ") + 2 :
            ].strip()
            #posts[-1]["time"] = dt.strptime(posts[-1]["time"], "%a %b %d, %Y %I:%M %p")

    return posts

def extract_youtube_links(post_content: str):
    "Return all links to youtube videos in the post content"

    clean_youtube_links = []
    for link in html.fromstring(post_content).xpath("//iframe/@src"):
        if "youtube" not in link:
            continue
        video_id = link.split("/")[-1]
        clean_youtube_links.append(f"https://www.youtube.com/watch?v={video_id}")
    return clean_youtube_links


def create_playlist_url(video_links: list[str]):
    "Return a youtube playlist url from a list of video links"

    video_ids = [link.split("v=")[1] for link in video_links]
    return f"http://www.youtube.com/watch_videos?video_ids={','.join(video_ids)}"


if __name__ == "__main__":
    session = requests.Session()
    posts = get_posts(session, thread)
    youtube_links = []
    for post in posts:
        if post["user"] != posts[0]["user"]:
            continue

        youtube_links.extend(extract_youtube_links(post["content"]))

    playlist_url = create_playlist_url(youtube_links)
    final_playlist_url = session.get(playlist_url).url
    
    print('Playlist URL:')
    if playlist_type == 'music':
        print(final_playlist_url.replace('www', 'music'))
    else:
        print(final_playlist_url)



Playlist URL:
http://www.youtube.com/watch_videos?video_ids=E0jrUBRlAvA,z87nhzCdYEU,0QaefDV7cb8,QApcyPKEwXI,D9o_XTzYoa8,Epj5A84mDp0,LILIDv0JzEM,9b25Balcsj4,seok6lO1n-8,bLaSOG3Vjbo,qlcfgoTRtUQ,RZxK1kDO7K8,b_CpWmkhwq0,3WpdCZC9q6w,E91pJYO_s7I,PEeRO4k40pM,zdU0qwZKLfU,3zb9X_unuqo,Qss4rfv7n88,n87C0iUyrCU,0ckoUgA7UZQ,iKBCVZqqooY,PVVK7HkdW9k,ekU1dQjMsOQ,NRiOvSqn8aw,xFlULnKuxtg,FnjjbqMjVe4,F9oCB6Rsnqg,fj1RPTj-188,tIxLU8WUK1Y,UtfkrGRK8wA,VYW4F5q7XBE,ds18Ozzp8h0,VC_jqzx4QXc,tGVRsIDNuKU,h2kUX_Fmj7k,3uZ9i4QYRLo,OMFAhvcPLrU,rTcF-tZwlXI,ji6wHH0Gf9I,VS6ixn2berk,E0jrUBRlAvA,z87nhzCdYEU,0QaefDV7cb8,QApcyPKEwXI,D9o_XTzYoa8,Epj5A84mDp0,LILIDv0JzEM,9b25Balcsj4,seok6lO1n-8,bLaSOG3Vjbo,qlcfgoTRtUQ,RZxK1kDO7K8,b_CpWmkhwq0,3WpdCZC9q6w,E91pJYO_s7I,PEeRO4k40pM,zdU0qwZKLfU,3zb9X_unuqo,Qss4rfv7n88,n87C0iUyrCU,0ckoUgA7UZQ,iKBCVZqqooY,PVVK7HkdW9k,ekU1dQjMsOQ,NRiOvSqn8aw,xFlULnKuxtg,FnjjbqMjVe4,F9oCB6Rsnqg,fj1RPTj-188,tIxLU8WUK1Y,UtfkrGRK8wA,VYW4F5q7XBE,ds18Ozzp8h0,VC_jqzx4QXc,tGVRsIDNuKU,h2kUX_Fmj7k,3uZ9i4QYRLo,OMFA