In [46]:
import requests
import bs4 as bs
import selenium as sel
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
import pandas as pd
import tldextract as tld
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
import time
from concurrent.futures import ThreadPoolExecutor
from bs4 import BeautifulSoup
import csv
import os

In [47]:
# Selenium 4 receives the chromedriver location through a Service object;
# passing the path as the first positional argument of webdriver.Chrome is
# deprecated and is not accepted by recent Selenium releases.
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))

In [48]:
def get_sources(driver, video_link: str):
    """Load a video page, click its "Facts & Sources" button(s) and collect source links.

    Args:
        driver: Browser driver; only ``get`` and ``find_elements`` are called on it.
        video_link: URL of the video page to open.

    Returns:
        list: The ``href`` attribute of every anchor matched by the source-link
        XPath, in the order the driver returned the anchors.
    """
    button_xpath = "//button[contains(@class, 'box-border inline-flex flex-col items-center justify-center gap-1 rounded-full px-0 py-[.625rem] text-xs font-normal focus:outline-none focus-visible:ring-4 disabled:opacity-60 sm:text-sm xl:flex-row xl:gap-2')]"
    anchor_xpath = "//a[contains(@class, 'box-border cursor-pointer items-center justify-center gap-2 rounded-full font-bold focus:outline-none focus-visible:ring-4 py-[.625rem] text-primary hover:opacity-80 focus-visible:ring-neutral-1/25 inline-flex px-0 text-sm')]"

    driver.get(video_link)

    # The label is matched against the button's innerHTML, where the ampersand
    # is spelled as the entity "&amp;". Every matching button is clicked.
    for candidate in driver.find_elements('xpath', button_xpath):
        markup = candidate.get_attribute('innerHTML')
        if "Facts &amp; Sources" in markup:
            candidate.click()

    # The anchors are looked up only after the click(s) above.
    hrefs = []
    for anchor in driver.find_elements('xpath', anchor_xpath):
        hrefs.append(anchor.get_attribute('href'))
    return hrefs

In [49]:
def export_sources_csv_one_video(vid_link: str) -> None:
    """Scrape one video's source links and save them as ``<playlist>-playlist/<title>.csv``.

    ``title`` is the last path segment of ``vid_link`` with the query string
    removed; the directory name is the text after the last ``=`` of that
    segment's query string, suffixed with ``-playlist``. Every source URL is
    requested once and one row is written per URL:
    ``video, subdomain, domain, top_level, response``.

    Note: the page is scraped with the module-level ``driver`` (via
    ``get_sources``), not with a driver passed as an argument.

    Args:
        vid_link: Video URL, e.g. ``.../video/<title>?playlist=<name>``.

    Returns:
        None. The result is the CSV file written to disk.
    """
    # max number of threads to use
    MAX_THREADS = 12
    # Seconds before a request is abandoned, so a single unresponsive host
    # cannot stall the whole export.
    REQUEST_TIMEOUT = 10

    last_segment = vid_link.split('/')[-1]
    title = last_segment.split('?')[0]
    directory = f"{last_segment.split('?')[-1].split('=')[-1]}-playlist"
    # Make sure the target directory exists even when this function is
    # called on its own.
    os.makedirs(directory, exist_ok=True)

    def parse_page(url):
        """Request ``url`` and return the CSV row describing it."""
        try:
            response = requests.get(url, timeout=REQUEST_TIMEOUT).status_code
        except Exception:
            # Best effort: any failed request (connection error, timeout,
            # invalid URL, ...) is recorded with the placeholder '404'.
            response = '404'
        extracted = tld.extract(url)
        return [title, extracted.subdomain, extracted.domain, extracted.suffix, response]

    # Anchors without an href yield None; skip them instead of failing in a worker.
    urls = [url for url in get_sources(driver, vid_link) if url]

    # Workers only build rows. Consuming the map results here surfaces worker
    # exceptions and keeps the rows in the same order as ``urls``.
    with ThreadPoolExecutor(max_workers=MAX_THREADS) as executor:
        rows = list(executor.map(parse_page, urls))

    # The file is written from this thread only, after all requests finished.
    # newline='' is what the csv module requires to avoid blank lines on Windows.
    with open(os.path.join(directory, f'{title}.csv'), 'w', newline='', encoding='utf-8') as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow(['video', 'subdomain', 'domain', 'top_level', 'response'])
        writer.writerows(rows)

In [50]:
def find_links_in_playlist(driver, playlist_link: str, wait_seconds: float = 2):
    """Open a playlist page and return the elements of its video entries.

    Args:
        driver: Browser driver; only ``get`` and ``find_elements`` are called on it.
        playlist_link: URL of the page that shows the playlist.
        wait_seconds: Pause between loading the page and querying it. Defaults
            to the previously hard-coded 2 seconds (presumably to let the page
            finish rendering); pass 0 to skip the pause.

    Returns:
        Whatever ``driver.find_elements`` returns for the playlist-entry XPath:
        the matched elements themselves, not their URLs.
    """
    driver.get(playlist_link)
    if wait_seconds > 0:
        time.sleep(wait_seconds)
    return driver.find_elements(
        'xpath',
        "//a[contains(@class, 'grid grid-cols-[142px_1fr] items-center gap-4 overflow-hidden sm:grid-cols-[168px_1fr]')]",
    )

In [51]:
def get_sources_of_playlist(driver, play_list_link: str, close_driver: bool = True):
    """Export one sources CSV per video of a playlist.

    Creates the ``<name>-playlist`` directory in the current working directory,
    where ``<name>`` is the text after the last ``=`` in the final path segment
    of ``play_list_link``. It then calls ``export_sources_csv_one_video`` for
    every video link found on the playlist page.

    Args:
        driver: Browser driver used to open the playlist page.
        play_list_link: URL of a page showing the playlist,
            e.g. ``.../video/<title>?playlist=<name>``.
        close_driver: When True (the default, matching the previous behaviour)
            ``driver.close()`` is called when the function ends, also after an
            error. Pass False to keep using the same driver afterwards.

    Returns:
        None.
    """
    playlist_name = play_list_link.split('/')[-1].split('?')[-1].split('=')[-1]
    # exist_ok avoids the separate "is it already there?" check.
    os.makedirs(f"{playlist_name}-playlist", exist_ok=True)

    try:
        videos = find_links_in_playlist(driver, play_list_link)

        # All hrefs are read before any video is exported. Entries without an
        # href are skipped; they would fail later when the link is split.
        video_links = [
            href
            for href in (video.get_attribute('href') for video in videos)
            if href
        ]

        for video_link in tqdm(video_links):
            export_sources_csv_one_video(video_link)
    finally:
        if close_driver:
            driver.close()

In [52]:
def get_data_frame_for_playlist(directory: str) -> pd.DataFrame:
    """Concatenate every ``.csv`` file directly inside ``directory`` into one DataFrame.

    Files are read in sorted name order, so the row order does not depend on
    the arbitrary order returned by ``os.listdir``.

    Args:
        directory: Folder holding the per-video CSV files.

    Returns:
        A DataFrame with a fresh 0..n-1 index. When the folder contains no CSV
        file, an empty DataFrame with the columns written by the exporter
        (``video, subdomain, domain, top_level, response``) is returned instead
        of letting ``pd.concat`` fail on an empty list.
    """
    csv_names = sorted(name for name in os.listdir(directory) if name.endswith('.csv'))
    if not csv_names:
        return pd.DataFrame(columns=['video', 'subdomain', 'domain', 'top_level', 'response'])

    frames = [pd.read_csv(os.path.join(directory, name)) for name in csv_names]
    return pd.concat(frames, ignore_index=True)

In [22]:
# Scrape every video of the "all-about-immigration" playlist. One CSV per video
# is written into the "all-about-immigration-playlist" directory.
get_sources_of_playlist(
    driver=driver,
    play_list_link='https://www.prageru.com/video/how-to-stop-the-border-crisis?playlist=all-about-immigration',
)

100%|██████████| 23/23 [02:03<00:00,  5.37s/it]


In [26]:
# Combine the per-video CSVs of the immigration playlist into one DataFrame.
df = get_data_frame_for_playlist('all-about-immigration-playlist')

In [44]:
# Show the rows whose extracted suffix (top_level column) is 'co.uk'.
df.loc[df.top_level == 'co.uk']

Unnamed: 0,video,subdomain,domain,top_level,response
177,the-suicide-of-europe,www,independent,co.uk,200
180,the-suicide-of-europe,www,independent,co.uk,200
184,the-suicide-of-europe,www,express,co.uk,200
187,the-suicide-of-europe,www,spectator,co.uk,404
192,the-suicide-of-europe,www,standard,co.uk,200


In [53]:
# Scrape the "guarding-our-freedoms" playlist into "guarding-our-freedoms-playlist".
# NOTE(review): get_sources_of_playlist calls driver.close() when it finishes, so
# after an earlier playlist run the driver-creation cell presumably has to be
# re-run before this call — confirm when running top to bottom.
get_sources_of_playlist(
    driver=driver,
    play_list_link='https://www.prageru.com/video/the-dark-art-of-political-intimidation?playlist=guarding-our-freedoms',
)

100%|██████████| 20/20 [01:22<00:00,  4.13s/it]


In [54]:
# NOTE: this rebinds `df`. From here on it holds the guarding-our-freedoms
# playlist, not the immigration playlist loaded earlier.
df = get_data_frame_for_playlist('guarding-our-freedoms-playlist')

In [62]:
# Count how many source rows each domain has, then keep only the domains
# that appear more than three times.
domains = df[['domain']].value_counts()
domains[domains > 3]

domain            
heritage              13
congress              13
amazon                11
constitutioncenter    11
archives               9
nationalreview         8
fee                    7
prageru                7
yale                   5
washingtonpost         5
cornell                5
taxfoundation          4
cjr                    4
mercatus               4
britannica             4
Name: count, dtype: int64

In [63]:
# Number of source rows per extracted suffix (top_level column).
df.top_level.value_counts()

top_level
com       83
org       55
gov       31
edu       19
co.uk      1
edu.au     1
Name: count, dtype: int64

In [66]:
# Inspect the rows whose domain is 'mercatus', including their recorded responses.
df.loc[df['domain'] == 'mercatus']

Unnamed: 0,video,subdomain,domain,top_level,response
122,what-is-crony-capitalism,www,mercatus,org,403
123,what-is-crony-capitalism,www,mercatus,org,403
124,what-is-crony-capitalism,www,mercatus,org,403
129,what-is-crony-capitalism,www,mercatus,org,403
