# Scraping Video Information from Youtube Channel : https://www.youtube.com/@ExploreWithUs/videos

In [1]:
import requests
from bs4 import BeautifulSoup
from selenium.webdriver.common.keys import Keys
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
import time
import pandas as pd

In [2]:
def capture_whole_page(driver, sleep_time=4):
    while True:
        # Get the current scroll position
        current_scroll_position = driver.execute_script("return window.scrollY;")
        
        # Scroll to the bottom of the page
        driver.find_element(By.TAG_NAME, 'body').send_keys(Keys.END)

        # Wait for some time to allow content to load
        time.sleep(sleep_time)

        # Get the new scroll position
        new_scroll_position = driver.execute_script("return window.scrollY;")
        
        # If the scroll position did not increase, break the loop
        if new_scroll_position <= current_scroll_position:
            break

In [3]:
def store_video_data(videos):
    list_of_dicts = []
    for video in videos:
        data_dict = {}
        title_and_link = video.find("a", {"id" : "video-title-link"})
        meta = video.find("div", {"id" : "metadata-line"}).find_all("span")
        data_dict["views"] = meta[0].text.split()[0]
        data_dict["video_age"] = meta[1].text
        data_dict["title"] = title_and_link.text
        data_dict["url"] = "https://www.youtube.com" + title_and_link['href']
        list_of_dicts.append(data_dict)
    return list_of_dicts

In [4]:
def get_video_data_from_channel(driver,channel_url,channel_name=None, use_name = True, sleep_time=4):
    if use_name:
        channel_url =  f"https://www.youtube.com/@{channel_name}/videos"

    # Open the channel url    
    driver.get(channel_url)

    # Scroll down to the bottom of the page until all videos are loaded
    capture_whole_page(driver)

    # Get the page source
    html = driver.page_source

    # Parse the page source using BeautifulSoup
    soup = BeautifulSoup(html, 'html.parser')

    # Get all the video containers
    videos = soup.find_all('div', {'id' : 'dismissible'})

    # Store the video data in a list of dictionaries
    list_of_dicts = store_video_data(videos)
    
    # Create a dataframe from the list of dictionaries
    df = pd.DataFrame(list_of_dicts)
    return df

# Preprocessing the data into correct format

In [8]:
from datetime import datetime, timedelta
# Convert relative time to datetime object
def convert_relative_time(relative_time):
    if "hours" in relative_time or "hour" in relative_time:
        hours = int(relative_time.split()[0])
        return datetime.now() - timedelta(hours=hours)
    elif "minutes" in relative_time or "minute" in relative_time:
        minutes = int(relative_time.split()[0])
        return datetime.now() - timedelta(minutes=minutes)
    elif "seconds" in relative_time or "second" in relative_time:
        seconds = int(relative_time.split()[0])
        return datetime.now() - timedelta(seconds=seconds)
    elif "days" in relative_time or "day" in relative_time:
        days = int(relative_time.split()[0])
        return datetime.now() - timedelta(days=days)
    elif "weeks" in relative_time or "week" in relative_time:
        weeks = int(relative_time.split()[0])
        return datetime.now() - timedelta(weeks=weeks)
    elif "months" in relative_time or "month" in relative_time:
        months = int(relative_time.split()[0])
        return datetime.now() - timedelta(days=30 * months)  # Assuming a month is 30 days
    elif "years" in relative_time or "year" in relative_time:
        years = int(relative_time.split()[0])
        return datetime.now() - timedelta(days=365 * years)  # Assuming a year is 365 days
    else:
        raise ValueError("Unsupported time format")

In [9]:
def preprocess_and_save_csv(df, channel_name):
    # Remove "B", "M" and "K" from the views column and convert it to integer
    df["views"] = df["views"].str.replace("B", "000000000")
    df["views"] = df["views"].str.replace("M", "000000")
    df["views"] = df["views"].str.replace("K", "000")
    df["views"] = df["views"].str.replace(".", "")
    df["views"] = df["views"].astype(int)

    # Get the estimation of the upload date from video age
    df["upload_date(estimate)"] = df["video_age"].apply(convert_relative_time)
    df["upload_date(estimate)"] = df["upload_date(estimate)"].dt.date

    # Save the dataframe as a csv file
    df.to_csv(f"{channel_name}_videos.csv", index=False)
    return df

In [29]:
option = webdriver.ChromeOptions()
option.add_argument("start-maximized")
# Set the language to English
option.add_argument("--lang=en")


driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()),options=option)

# Type the channel name here to scrape the videos
channel_name = "ExploreWithUs"

# Customize the sleep time according to your internet speed
df = get_video_data_from_channel(driver,channel_url=None,channel_name=channel_name, use_name = True, sleep_time=4)

df = preprocess_and_save_csv(df, channel_name)

df.head()

  df["views"] = df["views"].str.replace(".", "")


Unnamed: 0,views,video_age,title,url,upload_date(estimate)
0,49000,58 minutes ago,When a Killer Vlogs His Murders,https://www.youtube.com/watch?v=-vorUEpbHNU,2023-11-24
1,34000000,5 days ago,When Evil Teens Think Murder is Hilarious,https://www.youtube.com/watch?v=KC0GtU_Dtdo,2023-11-20
2,38000000,3 weeks ago,The Disturbing Case of Daniel Marsh,https://www.youtube.com/watch?v=Fwvm3pQCJlg,2023-11-04
3,28000000,3 weeks ago,The Disturbing Case of Ethan Windom,https://www.youtube.com/watch?v=nInpsReK8Sk,2023-11-04
4,24000000,1 month ago,Killer LAUGHS About Mugshot After Murdering Ex,https://www.youtube.com/watch?v=OTE59yYMm0Q,2023-10-26
