In [None]:
import time
from selenium.webdriver import Chrome
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
import pandas as pd
import numpy as np
from concurrent import futures

# Path of the videos CSV that is loaded with pandas in a later cell.
VIDEOS_DATASET_PATH = 'datasets/videos.csv'

# Column-name constants.
# ID is passed as `index_col` when the videos CSV is read.
# NAME, YOUTUBE and SPOTIFY are not referenced in the visible cells --
# presumably columns of a related dataset; TODO confirm.
ID = 'id'
NAME = 'name'
YOUTUBE = 'youtube_url'
SPOTIFY = 'spotify_uri'

# URL selects the column whose values are handed to the comment scraper.
# CHANNEL and TITLE are not referenced in the visible cells --
# presumably other columns of the videos CSV; TODO confirm.
CHANNEL = 'channel'
TITLE = 'title'
URL = 'url'


In [None]:

def get_comments(url, options=None, max_comments=100, scrolls_per_round=10, scroll_pause=1):
    """Collect the text of comment elements from the page at ``url``.

    A Chrome session is opened, the page is loaded, and the function then
    alternates between scrolling to the bottom of the page (to trigger lazy
    loading) and harvesting every element matching the CSS selector
    ``#content-text``.  It stops when a round yields no new text or when
    ``max_comments`` texts have been collected.

    Args:
        url: Address of the page to open.
        options: Chrome options object forwarded to ``Chrome``; ``None`` uses
            the driver's defaults.
        max_comments: Upper bound on the number of texts returned.
        scrolls_per_round: How many times END is sent to the page body before
            each harvest.
        scroll_pause: Seconds to sleep after each scroll so new content can load.

    Returns:
        A list of unique comment texts in first-seen order, containing at most
        ``max_comments`` entries.  Comments with identical text are kept once.

    Raises:
        Whatever the WebDriver raises (for example when the body does not
        become visible within the 2-second wait).
    """
    comments = []
    seen = set()  # O(1) duplicate check; `comments` preserves first-seen order
    with Chrome(options=options) as driver:
        wait = WebDriverWait(driver, 2)
        driver.get(url)

        last_comments_len = None
        # Keep going while the previous round found something new and the cap is not reached.
        while len(comments) != last_comments_len and len(comments) < max_comments:
            last_comments_len = len(comments)
            for _ in range(scrolls_per_round):
                wait.until(EC.visibility_of_element_located((By.TAG_NAME, "body"))).send_keys(Keys.END)
                time.sleep(scroll_pause)

            # find_elements returns every match on the page, including ones
            # harvested in earlier rounds, hence the duplicate filter.
            for comment in driver.find_elements(By.CSS_SELECTOR, "#content-text"):
                if len(comments) >= max_comments:
                    break  # enforce the cap instead of overshooting within a round
                text = comment.text  # read once; avoids a second element lookup
                if text not in seen:
                    seen.add(text)
                    comments.append(text)
            print(f'{len(comments) - last_comments_len} new comments')

    return comments

def get_all_comments(urls, options=None, timeout=120):
    """Scrape comments for several URLs concurrently.

    Each URL is submitted to a thread pool that runs ``get_comments``; results
    are gathered in the order the URLs were given.  A URL whose worker raises
    or does not finish within ``timeout`` is reported on stdout and left out of
    the result.

    Args:
        urls: Iterable of page addresses.  Any iterable (including a generator
            or a NumPy array) is accepted.
        options: Chrome options shared by every worker.  When ``None`` a
            headless, muted configuration is created.
        timeout: Maximum seconds to wait for each individual result.

    Returns:
        A list of ``(url, comments)`` tuples for the URLs that succeeded.

    Note:
        A timed-out worker is not cancelled; leaving the executor's ``with``
        block still waits for all running workers to finish.
    """
    if options is None:
        # Default only when the caller supplied nothing, so explicit options are honoured.
        options = Options()
        options.add_argument('--headless=new')
        options.add_argument('--mute-audio')

    urls = list(urls)  # iterated twice below; a one-shot iterator would be exhausted
    all_comments = []
    with futures.ThreadPoolExecutor() as executor:
        future_comments = [executor.submit(get_comments, url, options) for url in urls]
        for future_comment, url in zip(future_comments, urls):
            try:
                all_comments.append((url, future_comment.result(timeout=timeout)))
            except Exception as exc:
                # Best effort: report the failing URL and continue with the rest.
                print(f'{url} generated an exception')
                print(exc)
    return all_comments


In [None]:
# Load the videos table, using its id column as the index.
videos = pd.read_csv(VIDEOS_DATASET_PATH, index_col=ID)

# Scrape comments for the first five video URLs only.
comments = get_all_comments(videos[URL].head(5).values)

In [None]:
# Show the scraped (url, comment texts) pairs as this cell's output.
comments