In [274]:
import pandas as pd
from pytube import Channel
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.firefox.options import Options
from webdriver_manager.firefox import GeckoDriverManager
from selenium.common import NoSuchElementException
import logging
import time
from typing import List
from dataclasses import dataclass
from typing import Dict

In [39]:
channels_df = pd.read_csv("channels.csv", sep=";")

In [40]:
channels_df["channel_videos_url"] = channels_df["channel"].apply(lambda c: f"https://www.youtube.com/@{c}/videos")

In [93]:
test_url = channels_df["channel_videos_url"].iloc[0]

In [94]:
test_url

'https://www.youtube.com/@UPA_Federal/videos'

In [290]:
def setup_driver():

    options = Options()
    options.headless = False
    return webdriver.Firefox(options=options)
    

In [291]:
driver = setup_driver()

In [113]:
def video_urls(channel_url:str) -> List[str]:
    # Go to video page
    driver.get(test_url)
    time.sleep(1)
    
    try:
        driver.find_element("xpath", "/html/body/c-wiz/div/div/div/div[2]/div[1]/div[3]/div[1]/form[1]/div/div/button/span").click()
    except NoSuchElementException:
        pass
    
    lastHeight = driver.execute_script("return document.documentElement.scrollHeight")

    while True:

        driver.execute_script(f"window.scrollTo(0, {lastHeight});")
        time.sleep(1)    
        newHeight = driver.execute_script("return document.documentElement.scrollHeight")

        if newHeight == lastHeight:
            break

        lastHeight = newHeight
        
    soup = BeautifulSoup(driver.page_source, "html")
    urls = []
    
    for video in soup.find_all("a", {"class":"yt-simple-endpoint focus-on-expand style-scope ytd-rich-grid-media"}):
        urls.append(f"youtube.com{video.get('href')}")
        
    
    return urls
    

In [126]:
test_urls = channels_df[["channel", "channel_videos_url"]].iloc[:10]
results = {}
for tup in test_urls.itertuples():
    print(f"Parsing urls from channel: {tup.channel}")
    results[tup.channel] = video_urls(tup.channel_videos_url)

Parsing urls from channel: UPA_Federal
Parsing urls from channel: CSIFNacional
Parsing urls from channel: confederaciongeneraldeltra5752
Parsing urls from channel: cgtoficial4015
Parsing urls from channel: CNTsindicato
Parsing urls from channel: avantarvideos
Parsing urls from channel: 1UGT
Parsing urls from channel: UGTPV
Parsing urls from channel: fspugt
Parsing urls from channel: FeSMCUGT


In [130]:
test_df = pd.DataFrame(results).melt().rename(columns={"variable":"channel", "value":"url"})

In [132]:
test_df.to_csv("test_video_urls.csv")

In [133]:
test_df

Unnamed: 0,channel,url
0,UPA_Federal,youtube.com/watch?v=grTRnHWDFeU
1,UPA_Federal,youtube.com/watch?v=cyhdsw2kkiA
2,UPA_Federal,youtube.com/watch?v=Gfe2IIHOe3g
3,UPA_Federal,youtube.com/watch?v=hOR8S5ZnhTM
4,UPA_Federal,youtube.com/watch?v=gAnD56tcIOQ
...,...,...
2605,FeSMCUGT,youtube.com/watch?v=OYS2pt9DVsY
2606,FeSMCUGT,youtube.com/watch?v=yq5LTOSht7M
2607,FeSMCUGT,youtube.com/watch?v=5Q9yXVlAtXA
2608,FeSMCUGT,youtube.com/watch?v=f0QNt-OsmgE


In [138]:
videos_df = pd.read_csv("video_urls.csv", index_col=0)

In [140]:
tst = pd.DataFrame({"col1":[1,2,3], "col2":[4,5,6]})

In [142]:
from collections import defaultdict

In [139]:
videos_df

Unnamed: 0,channel,url
0,UPA_Federal,youtube.com/watch?v=grTRnHWDFeU
1,UPA_Federal,youtube.com/watch?v=cyhdsw2kkiA
2,UPA_Federal,youtube.com/watch?v=Gfe2IIHOe3g
3,UPA_Federal,youtube.com/watch?v=hOR8S5ZnhTM
4,UPA_Federal,youtube.com/watch?v=gAnD56tcIOQ
...,...,...
46527,unionesindacaleitalianafin2917,youtube.com/watch?v=vZ82ph1y3hY
46528,unionesindacaleitalianafin2917,youtube.com/watch?v=znmzMU_QAtc
46529,unionesindacaleitalianafin2917,youtube.com/watch?v=rh0MJviDijI
46530,unionesindacaleitalianafin2917,youtube.com/watch?v=n2BEq5KGNQQ


## Channel metadata

In [273]:
channels

Unnamed: 0,channel,unionname,org,name of the channel if differs from unionname,country,channel_videos_url
0,UPA_Federal,UPA - Unión de Pequeños Agricultores y Ganaderos,UGT,,spain,https://www.youtube.com/@UPA_Federal/videos
1,CSIFNacional,Central Sindical Independiente y de Funcionarios,,CSIF Nacional,spain,https://www.youtube.com/@CSIFNacional/videos
2,confederaciongeneraldeltra5752,Confederación General del Trabajo (CGT),CGT,,spain,https://www.youtube.com/@confederaciongenerald...
3,cgtoficial4015,Confederación General del Trabajo (CGT),CGT,CGT Oficial,spain,https://www.youtube.com/@cgtoficial4015/videos
4,CNTsindicato,Confederación Nacional del Trabajo (CNT),CNT,,spain,https://www.youtube.com/@CNTsindicato/videos
...,...,...,...,...,...,...
271,uil-unioneitalianadellavor4538,UIL - Unione Italiana del Lavoro,UIL,,italy,https://www.youtube.com/@uil-unioneitalianadel...
272,cub-confederazioneunitaria1458,Confederazione del Comitati di Base (CUB),,CUB - Confederazione Unitaria di Base,italy,https://www.youtube.com/@cub-confederazioneuni...
273,confsalVideo,CONFSAL,,,italy,https://www.youtube.com/@confsalVideo/videos
274,unionesindacaleitalianacar4411,Unione Sindacale Italiana Carabinieri,,,italy,https://www.youtube.com/@unionesindacaleitalia...


In [280]:
from datetime import datetime

In [311]:
@dataclass
class ChannelMeta:
    
    channel: str
    channel_url: str
    created_at: datetime
    videos_count: int
    subs_count: int
    views_count: int
    description: str
    links: Dict[str, str]
    

In [312]:
import contextlib

In [318]:
def get_channel_meta(channel:str) -> ChannelMeta:
    
    channel_url = f"https://www.youtube.com/@{channel}"
    
    driver.get(f"{channel_url}/about")
    time.sleep(1)

    with contextlib.suppress(NoSuchElementException):
        driver.find_element(
            "xpath",
            "/html/body/c-wiz/div/div/div/div[2]/div[1]/div[3]/div[1]/form[1]/div/div/button/span",
        ).click()
    time.sleep(1)
    created_at = driver.find_element(
        "xpath", 
        "/html/body/ytd-app/div[1]/ytd-page-manager/ytd-browse/ytd-two-column-browse-results-renderer/div[1]/ytd-section-list-renderer/div[2]/ytd-item-section-renderer/div[3]/ytd-channel-about-metadata-renderer/div[2]/yt-formatted-string[2]/span[2]"
    ).text
    views_count = driver.find_element(
        "xpath", 
        "/html/body/ytd-app/div[1]/ytd-page-manager/ytd-browse/ytd-two-column-browse-results-renderer/div[1]/ytd-section-list-renderer/div[2]/ytd-item-section-renderer/div[3]/ytd-channel-about-metadata-renderer/div[2]/yt-formatted-string[3]"
    ).text
    
    soup = BeautifulSoup(driver.page_source, "html.parser")
    
    # Get links
    links_div = soup.find("div", {"id":"links-container"})
    urls = links_div.find_all("a", {"class":"yt-simple-endpoint style-scope ytd-channel-about-metadata-renderer"})
    names = links_div.find_all("yt-formatted-string", {"class":"info-text style-scope ytd-channel-about-metadata-renderer"})
    links = {n.text:u.get("href") for n, u in zip(names, urls)}
    
    videos_count =  soup.find(id="videos-count").text
    subs_count =  soup.find(id="subscriber-count").text
    description = soup.find("div", {"id":"description-container"}).find(id="description").text
    
    
    return ChannelMeta(
        channel = channel,
        channel_url = channel_url,
        created_at = created_at,
        videos_count= videos_count,
        subs_count=subs_count,
        views_count=views_count,
        description= description,
        links = links
    )
    
    
    

In [319]:
channels = pd.read_csv("channels.csv", index_col=0)

In [320]:
tst_channel = channels["channel"].iloc[1]#[:-7]

In [321]:
tst_channel

'CSIFNacional'

In [322]:
get_channel_meta(tst_channel)

ChannelMeta(channel='CSIFNacional', channel_url='https://www.youtube.com/@CSIFNacional', created_at='Oct 8, 2014', videos_count='2.9K videos', subs_count='3.23K subscribers', views_count='658,005 views', description='Vídeos con entrevistas e información del CSIF, sindicato profesional e independiente. Creciendo en la empresa privada y mayoritario en las administraciones. Somos tu alternativa, otra manera de hacer sindicalismo.', links={'CSIF': 'https://www.youtube.com/redirect?event=channel_description&redir_token=QUFFLUhqbTZYRWJzdXZvY2J1VW5PUWp3aTRra2ltQnJCQXxBQ3Jtc0trVVQyU1hEcHkwTkJ5cm9KSFVnMXc5THBRaXNVYUFLOHJwMDRZMHJjaEtfaWlJRnVBYVZSU29HVFhud2R6UzJRWGlYWWFYUmZRQWFrb3JJc2drbjFEYmtaUm9HNXp0UGhIcC0yTTRxZWdrM08ydVMyOA&q=https%3A%2F%2Fwww.csif.es%2F'})

In [168]:
def channel_metadata(channel_url:str) -> List[str]:
    # Go to video page
    driver.get(channel_url)
    time.sleep(1)
    
    try:
        driver.find_element("xpath", "/html/body/c-wiz/div/div/div/div[2]/div[1]/div[3]/div[1]/form[1]/div/div/button/span").click()
    except NoSuchElementException:
        pass
    
    return BeautifulSoup(driver.page_source, "html.parser")

In [244]:
soup = channel_metadata("https://www.youtube.com/@LinusTechTips/about")

In [170]:
tst_channel

'https://www.youtube.com/@CSIFNacional'

In [278]:
videos_count

'6K videos'

In [279]:
sub_count

'15M subscribers'

In [245]:
videos_count =  soup.find(id="videos-count").text

In [246]:
sub_count =  soup.find(id="subscriber-count").text

In [247]:
description = soup.find("div", {"id":"description-container"}).find(id="description").text

In [248]:
sub_count

'15M subscribers'

In [258]:
links_div = soup.find("div", {"id":"links-container"})

In [268]:
urls = links_div.find_all("a", {"class":"yt-simple-endpoint style-scope ytd-channel-about-metadata-renderer"})
names = links_div.find_all("yt-formatted-string", {"class":"info-text style-scope ytd-channel-about-metadata-renderer"})

In [270]:
{n.text:u.get("href") for n, u in zip(names, urls)}

{'Merch': 'https://www.youtube.com/redirect?event=channel_description&redir_token=QUFFLUhqbjRYdXg3RHlibzh0SDVRaDRNS3M1WEN5YVpfZ3xBQ3Jtc0tsZDFCQy1LMFNVckhyQ1ZQUVZnRjRKREZIdkM5Q182UDUwVTRkMk43U2hsVzIxaG5zVkp3OHRlbUZRSXYwNTlrMmJlZ19OM0dpQ0d5UDZIZkFneGZPSEI2MWstOUNWOEJQMHQ3cmc4YU0zZ2l5Nzk5aw&q=https%3A%2F%2Fwww.lttstore.com%2F',
 'Facebook': 'https://www.youtube.com/redirect?event=channel_description&redir_token=QUFFLUhqazdER0o4VFA1b3FvZUItV3RHc2V1X1FxUUZTd3xBQ3Jtc0ttSWtSVE9uM3FMdWtwX1NnVmZvVlVkdjBXSElsMW44MjVQUDBCRFJwRUxPRERYbENRWF9kWi12dHczUTFOMkxuSTZwUFFOY0pVbC0tZElMUE9YMEphZzNERDlOOTFpS3l1bXJmMWp5SFUyQTQ5eHlzSQ&q=http%3A%2F%2Fwww.facebook.com%2FLinusTech',
 'Twitter': 'https://www.youtube.com/redirect?event=channel_description&redir_token=QUFFLUhqbnlreVA2VU1kcUtKVl9YWlVabXdHYVRuS3VjUXxBQ3Jtc0tsRU5reGRYOXh3RjlTQTZyNE5VWG1KMnZwZW8tRXd1MDg4QlltWWdESGRwTEYzN20tZkRGeDVIOVVVTzVnc1FNeXh1QlkxZ1JsSDNNenpnUjhwbFZ2QXFHWGplMFAtZVpCN2plSFZyNVJCcDh6YzczWQ&q=http%3A%2F%2Ftwitter.com%2FLinusTech',
 'T

In [266]:
links_div

<div class="style-scope ytd-channel-about-metadata-renderer" id="links-container">
<yt-formatted-string class="subheadline style-scope ytd-channel-about-metadata-renderer">Links</yt-formatted-string>
<div class="style-scope ytd-channel-about-metadata-renderer" id="link-list-container">
<a class="yt-simple-endpoint style-scope ytd-channel-about-metadata-renderer" href="https://www.youtube.com/redirect?event=channel_description&amp;redir_token=QUFFLUhqbjRYdXg3RHlibzh0SDVRaDRNS3M1WEN5YVpfZ3xBQ3Jtc0tsZDFCQy1LMFNVckhyQ1ZQUVZnRjRKREZIdkM5Q182UDUwVTRkMk43U2hsVzIxaG5zVkp3OHRlbUZRSXYwNTlrMmJlZ19OM0dpQ0d5UDZIZkFneGZPSEI2MWstOUNWOEJQMHQ3cmc4YU0zZ2l5Nzk5aw&amp;q=https%3A%2F%2Fwww.lttstore.com%2F">
<yt-formatted-string class="info-text style-scope ytd-channel-about-metadata-renderer" no-endpoints="" no-style="">Merch</yt-formatted-string>
</a>
<a class="yt-simple-endpoint style-scope ytd-channel-about-metadata-renderer" href="https://www.youtube.com/redirect?event=channel_description&amp;redir_toke

In [251]:
[i.get("href") for i in soup.find_all("div", {"id":"links-container"})]

[None]

In [237]:
driver.find_element("xpath", "/html/body/ytd-app/div[1]/ytd-page-manager/ytd-browse/ytd-two-column-browse-results-renderer/div[1]/ytd-section-list-renderer/div[2]/ytd-item-section-renderer/div[3]/ytd-channel-about-metadata-renderer/div[2]/yt-formatted-string[2]/span[2]").text

'Oct 8, 2014'

In [239]:
driver.find_element("xpath", "/html/body/ytd-app/div[1]/ytd-page-manager/ytd-browse/ytd-two-column-browse-results-renderer/div[1]/ytd-section-list-renderer/div[2]/ytd-item-section-renderer/div[3]/ytd-channel-about-metadata-renderer/div[2]/yt-formatted-string[3]").text

'657,115 views'

## Youtube API V3

In [344]:
def get_channel_id(channel:str) -> ChannelMeta:
    
    
    driver.get(f"https://www.youtube.com/@{channel}")
    time.sleep(1)

    with contextlib.suppress(NoSuchElementException):
        driver.find_element(
            "xpath",
            "/html/body/c-wiz/div/div/div/div[2]/div[1]/div[3]/div[1]/form[1]/div/div/button/span",
        ).click()
    time.sleep(1)
    
    soup = BeautifulSoup(driver.page_source, "html.parser")
    
    return soup.find("link", {"rel":"canonical"}).get("href").split("/")[-1]

In [345]:
tst_id = get_channel_id(tst_channel)

In [346]:
tst_id

'UCp-W5wZk3dOWOPCoLdP4gTg'

In [329]:
api_key = "AIzaSyDf5gV7kqHmTsG5qnvG3cg6ZoPHuLD9xTY"

In [360]:
from googleapiclient.discovery import build

#api_key = '#YOURAPIKEY'

youtube = build('youtube', 'v3', developerKey=api_key)

request = youtube.channels().list(
        part=['statistics', 'snippet'],
        id=tst_id,
    )

response = request.execute()

print(response)

{'kind': 'youtube#channelListResponse', 'etag': 'BnoO8J__Pbc0-26YF3JUjnm2pkA', 'pageInfo': {'totalResults': 1, 'resultsPerPage': 5}, 'items': [{'kind': 'youtube#channel', 'etag': 'f0Id-1IoSsCehGUJNkP5FL9zAcU', 'id': 'UCp-W5wZk3dOWOPCoLdP4gTg', 'snippet': {'title': 'CSIF Nacional', 'description': 'Vídeos con entrevistas e información del CSIF, sindicato profesional e independiente. Creciendo en la empresa privada y mayoritario en las administraciones. Somos tu alternativa, otra manera de hacer sindicalismo.', 'customUrl': '@csifnacional', 'publishedAt': '2014-10-08T15:11:37Z', 'thumbnails': {'default': {'url': 'https://yt3.ggpht.com/ytc/AMLnZu8jQ7uOj8yoQd8GfqUMIIcr1OUZRcp5qVOOOhXxhQ=s88-c-k-c0x00ffffff-no-rj', 'width': 88, 'height': 88}, 'medium': {'url': 'https://yt3.ggpht.com/ytc/AMLnZu8jQ7uOj8yoQd8GfqUMIIcr1OUZRcp5qVOOOhXxhQ=s240-c-k-c0x00ffffff-no-rj', 'width': 240, 'height': 240}, 'high': {'url': 'https://yt3.ggpht.com/ytc/AMLnZu8jQ7uOj8yoQd8GfqUMIIcr1OUZRcp5qVOOOhXxhQ=s800-c-k-c0x

In [361]:
response['statistics']

KeyError: 'statistics'