In [2]:
import time
from selenium.webdriver import Chrome
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
import pandas as pd
import numpy as np
from concurrent import futures

# Locations of the CSV datasets read and written by this notebook.
ARTIST_DATASET_PATH = 'datasets/artists.csv'
VIDEOS_DATASET_PATH = 'datasets/videos.csv'

# Column names. ID is used as the index column of the videos dataset and
# YOUTUBE is read from the artists dataset.
# NOTE(review): NAME and SPOTIFY are not referenced in the visible cells;
# presumably they are further columns of the artists dataset -- confirm.
ID = 'id'
NAME = 'name'
YOUTUBE = 'youtube_url'
SPOTIFY = 'spotify_uri'

# Columns of the videos dataset (alongside the ID index).
CHANNEL = 'channel'
TITLE = 'title'
URL = 'url'

# Template formatted with a channel URL to get the page that is scraped.
VIDEOS_URL = '%s/videos'
# CSS selectors: one element per video, then the thumbnail anchor (its href
# is the video URL) and the title element looked up inside each video element.
VIDEO_SELECTOR = '#content.ytd-rich-item-renderer'
ANCHOR_SELECTOR = 'a#thumbnail'
TITLE_SELECTOR = '#video-title'

In [12]:

def get_yt_videos(url, options=None):
    """Scrape the videos listed on a YouTube channel's ``/videos`` page.

    A Chrome session is opened for the call and closed when it returns. The
    page is scrolled by sending END to the body until a pass finds no video
    link that was not already collected.

    Args:
        url: Channel URL; it is formatted into ``VIDEOS_URL``.
        options: Optional Chrome ``Options`` passed to the driver.

    Returns:
        A list of ``(video_url, video_title)`` tuples in the order the videos
        were first seen, without duplicate URLs.
    """
    # Insertion-ordered mapping link -> title: gives O(1) duplicate checks and
    # keeps each URL and its title together (no parallel lists to drift apart).
    videos = {}
    with Chrome(options=options) as driver:
        wait = WebDriverWait(driver, 3)
        driver.get(VIDEOS_URL % url)

        # The consent dialog is not guaranteed to be present. find_elements
        # returns an empty list instead of raising NoSuchElementException, so
        # a missing dialog no longer aborts the whole scrape.
        reject_buttons = driver.find_elements(By.XPATH, "//button[@aria-label='Reject all']")
        if reject_buttons:
            reject_buttons[0].click()
            # Give the page time to settle after the dialog is dismissed.
            time.sleep(5)

        last_videos_len = None
        while len(videos) != last_videos_len:
            last_videos_len = len(videos)
            # Scroll to the bottom so the page loads the next batch of videos.
            wait.until(EC.presence_of_element_located((By.TAG_NAME, 'body'))).send_keys(Keys.END)

            for video in driver.find_elements(By.CSS_SELECTOR, VIDEO_SELECTOR):
                anchor_tag = video.find_element(By.CSS_SELECTOR, ANCHOR_SELECTOR)
                link = anchor_tag.get_attribute('href')
                # Skip tiles without an href and links that were already collected.
                if link is None or link in videos:
                    continue
                videos[link] = video.find_element(By.CSS_SELECTOR, TITLE_SELECTOR).text

            # Let newly requested items render before the next pass.
            time.sleep(1)

    return list(videos.items())


def get_all_yt_videos(yt_channels, videos_df, n=None, timeout=20, verbose=False):
    """Scrape several channels concurrently and merge new videos into ``videos_df``.

    One ``get_yt_videos`` task per channel is submitted to a thread pool, each
    running headless Chrome. Results are collected in submission order; after
    every channel the merged frame is written to ``VIDEOS_DATASET_PATH``.

    Args:
        yt_channels: Iterable of channel URLs.
        videos_df: Existing videos frame; must contain a ``URL`` column. It is
            not modified; a new frame is returned.
        n: Stop submitting once this many channels have been queued
            (``None`` queues every channel).
        timeout: Seconds to wait for each channel's result.
        verbose: Print a per-channel summary, and any exception raised.

    Returns:
        The videos frame with rows for previously unknown video URLs appended,
        re-indexed from 0 with the index named ``ID``.

    Raises:
        KeyError: If ``videos_df`` has no ``URL`` column (checked before any
            browser is started).
    """
    options = Options()
    options.add_argument('--headless=new')
    options.add_argument('--window-size=2560,1440')

    # Set of already stored URLs: O(1) membership instead of scanning the
    # whole URL column for every scraped video.
    known_urls = set(videos_df[URL])

    with futures.ThreadPoolExecutor() as executor:
        all_future_videos = []
        for url in yt_channels:
            all_future_videos.append((
                url,
                executor.submit(get_yt_videos, url, options)
                ))
            if len(all_future_videos) == n:
                break

        for (channel_url, future_videos) in all_future_videos:
            try:
                videos = future_videos.result(timeout=timeout)

                rows = [
                    {CHANNEL: channel_url, TITLE: video_title, URL: video_url}
                    for (video_url, video_title) in videos
                    if video_url not in known_urls
                ]

                # ignore_index=True renumbers the rows, so the new frame needs
                # no placeholder ID column/index of its own.
                new_videos_df = pd.DataFrame(rows, columns=[CHANNEL, URL, TITLE])
                videos_df = pd.concat([videos_df, new_videos_df], ignore_index=True)
                videos_df.index.name = ID
                # Only mark URLs as known once they are actually in the frame.
                known_urls.update(row[URL] for row in rows)

                if verbose:
                    print(f'found {len(rows)} videos for {channel_url}')
                # `index` is a boolean flag; the header of the index column is
                # set through `index_label`.
                videos_df.to_csv(VIDEOS_DATASET_PATH, index=True, index_label=ID)
            except Exception as exc:
                # Best effort: a failing or timed-out channel is skipped so the
                # remaining channels are still processed.
                if verbose:
                    print(f'{channel_url} generated an exception')
                    print(exc)
    return videos_df

In [13]:
# Load the stored videos dataset; on first run create an empty one (same
# columns, ID as index) and write it out so the file exists from then on.
try:
    videos = pd.read_csv(VIDEOS_DATASET_PATH, index_col=ID)
except FileNotFoundError:
    videos = pd.DataFrame(columns=[ID, CHANNEL, URL, TITLE]).set_index(ID)
    # `index` is a boolean flag; the index column header goes in `index_label`.
    videos.to_csv(VIDEOS_DATASET_PATH, index=True, index_label=ID)

artists = pd.read_csv(ARTIST_DATASET_PATH)
videos

Unnamed: 0_level_0,channel,url,title
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,https://www.youtube.com/@DrakeOfficial,https://www.youtube.com/watch?v=ntp9_iznQ-0,Drake - Search & Rescue (Official Visualizer)
1,https://www.youtube.com/@DrakeOfficial,https://www.youtube.com/watch?v=T8nbNQpRwNo,"Drake, 21 Savage - Spin Bout U"
2,https://www.youtube.com/@DrakeOfficial,https://www.youtube.com/watch?v=Iu9xmEaHwpU,Drake - Jumbotron Shit Poppin
3,https://www.youtube.com/@DrakeOfficial,https://www.youtube.com/watch?v=PrSBuEFdRFU,Drake and 21 Savage - Rich Flex Her Loss Recap
4,https://www.youtube.com/@DrakeOfficial,https://www.youtube.com/watch?v=WlaGHS_qs58,Drake & 21 Savage - Privileged Rappers | A COL...
...,...,...,...
2165,https://www.youtube.com/@ArianaGrande,https://www.youtube.com/watch?v=SaF-hAQA99Y,wannabe
2166,https://www.youtube.com/@ArianaGrande,https://www.youtube.com/watch?v=LBKoSOPUMFE,its not unusual
2167,https://www.youtube.com/@ArianaGrande,https://www.youtube.com/watch?v=qXj9Lw7Geg8,dancing grandparents
2168,https://www.youtube.com/@ArianaGrande,https://www.youtube.com/watch?v=toIjc7ywxc8,Shoes


In [9]:

# Write the videos frame back to disk. `index` is a boolean flag, so the
# header of the index column is passed through `index_label`.
videos.to_csv(VIDEOS_DATASET_PATH, index=True, index_label=ID)

In [14]:
# The positional slice [7:8] selects a single channel: the eighth non-null
# entry of the artists' YouTube URL column.
yt_channels = artists[YOUTUBE].dropna().values[7:8]
# Scrape it and merge the result into `videos`; the function also writes the
# merged frame to VIDEOS_DATASET_PATH. Wait up to 120 s for the channel.
videos_df = get_all_yt_videos(yt_channels, videos, verbose=True, timeout=120)

found 250 videos for https://www.youtube.com/@eminem
