In [None]:
import time
from concurrent import futures
from urllib.parse import quote_plus

import numpy as np
import pandas as pd
from selenium.webdriver import Chrome
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

# CSV file that load_csv() reads and save_csv() overwrites.
DATASET_PATH = 'datasets/artists2.csv'

# Column names in the dataset.
ID = 'id'                # used as the DataFrame index when loading
NAME = 'name'            # artist name; the text that is searched on YouTube
YOUTUBE = 'youtube_url'  # column filled in with the scraped link
SPOTIFY = 'spotify_uri'  # not referenced in the visible cells

# Search URL template; the single %s is filled with the search query.
SEARCH_URL = 'https://www.youtube.com/results?search_query=%s'
# CSS selector for the anchor whose href is returned as the artist's channel.
# NOTE(review): tied to YouTube's current results-page markup -- re-verify if
# lookups start timing out for every artist.
ANCHOR_SELECTOR = 'a.channel-link, .ytd-secondary-search-container-renderer a'

In [None]:
def load_csv():
    """Read the dataset at DATASET_PATH into a DataFrame indexed by the ID column."""
    artists = pd.read_csv(DATASET_PATH, index_col=ID)
    return artists

def save_csv(df):
    """Write ``df`` back to DATASET_PATH as UTF-8 CSV, replacing the file's contents."""
    df.to_csv(path_or_buf=DATASET_PATH, encoding='utf-8')


def get_yt_channel(artist_name, options=None):
    """Search YouTube for an artist and return the first channel link found.

    A fresh Chrome instance is started for the lookup and closed when the
    function returns or raises.

    Args:
        artist_name: Artist name (str). The word "music" is appended to it to
            form the search query.
        options: Optional Chrome ``Options`` passed to the driver.

    Returns:
        The ``href`` attribute of the first element matching ANCHOR_SELECTOR.

    Raises:
        TypeError: if ``artist_name`` is not a string (e.g. a NaN cell).
        selenium.common.exceptions.TimeoutException: if no matching element
            appears within 5 seconds.
    """
    # Percent-encode the whole query. Escaping only '&' leaves characters such
    # as '#', '+', '?', '=' and '%' free to corrupt or truncate the query string.
    query = quote_plus(artist_name + ' music')

    with Chrome(options=options) as driver:
        driver.get(SEARCH_URL % query)

        # until() returns the located element, so no second lookup is needed.
        anchor_tag = WebDriverWait(driver, 5).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, ANCHOR_SELECTOR))
        )
        return anchor_tag.get_attribute('href')

def get_yt_channels(df, n=None, timeout=20, verbose=False, max_workers=None):
    """Look up YouTube channel links for artists that do not have one yet.

    Every row whose YOUTUBE cell is missing is submitted to a thread pool; each
    task runs get_yt_channel() in its own headless Chrome. Results are written
    into ``df`` in place, and the dataset is saved after each successful lookup
    so an interrupted run keeps its progress.

    Args:
        df: DataFrame with NAME and YOUTUBE columns. Modified in place.
        n: Maximum number of artists to process in this call. ``None`` means
            all of them; zero or a negative value processes none.
        timeout: Seconds to wait for each individual result.
        verbose: If true, print one line per artist (success or failure).
        max_workers: Size of the thread pool, i.e. the number of concurrent
            Chrome instances. ``None`` keeps the executor's default.

    Returns:
        None. Failures are reported (when ``verbose``) and the affected rows
        are left missing so a later call can retry them.
    """
    options = Options()
    options.add_argument('--headless=new')
    options.add_argument('--window-size=2560,1440')

    # A column that is entirely missing is float64 (both when created with NaN
    # and when read back from CSV). Writing a URL string into it is an
    # incompatible-dtype assignment, deprecated since pandas 2.1 and slated to
    # become an error, which the broad except below would then swallow for
    # every row. Make it an object column up front.
    if pd.api.types.is_float_dtype(df[YOUTUBE]):
        df[YOUTUBE] = df[YOUTUBE].astype(object)

    pending = df.loc[df[YOUTUBE].isna(), NAME]
    if n is not None:
        # Slice before submitting: the previous `len(...) == n` check only ran
        # after a submit, so n=0 (or n<0) never matched and everything ran.
        pending = pending.head(max(int(n), 0))

    with futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        future_urls = [
            (index, name, executor.submit(get_yt_channel, name, options))
            for index, name in pending.items()
        ]

        for index, name, future_url in future_urls:
            try:
                url = future_url.result(timeout=timeout)
                df.at[index, YOUTUBE] = url
                if verbose:
                    print(f'{name} has channel {url}')
                save_csv(df)
            except Exception as exc:
                # Best effort: keep going with the remaining artists. If the
                # task has not started yet, cancel it so no browser is launched
                # for a result that would be discarded; cancel() is a no-op for
                # running or finished tasks.
                future_url.cancel()
                if verbose:
                    print(f'{name} generated an exception: {exc!r}')

In [None]:
df = load_csv()
if YOUTUBE not in df.columns:
    df[YOUTUBE] = np.nan
if pd.api.types.is_float_dtype(df[YOUTUBE]):
    # An all-missing column is float64; it has to hold URL strings.
    df[YOUTUBE] = df[YOUTUBE].astype(object)

# Retry in case of timeouts, but stop once several consecutive passes resolve
# nothing: an artist that can never be matched would otherwise keep this loop
# running forever.
MAX_STALLED_PASSES = 3

stalled_passes = 0
remaining = int(df[YOUTUBE].isna().sum())
while remaining > 0 and stalled_passes < MAX_STALLED_PASSES:
    get_yt_channels(df, verbose=True)
    left = int(df[YOUTUBE].isna().sum())
    stalled_passes = stalled_passes + 1 if left >= remaining else 0
    remaining = left
    print(f'{remaining} artists left')

if remaining > 0:
    print(f'Stopped with {remaining} artists unresolved after '
          f'{MAX_STALLED_PASSES} passes without progress')