### This notebook focuses on scraping the YouTube video titles, description, views, length, likes, dislikes, captions and comments

In [0]:
#!pip install selenium
#!pip install youtube_transcript_api
import requests
import bs4 as bs
import time
import json
import pprint
import os
from selenium import webdriver 
import pandas as pd 
from selenium.webdriver.common.by import By 
from selenium.webdriver.support.ui import WebDriverWait 
from selenium.webdriver.support import expected_conditions as EC
from youtube_transcript_api import YouTubeTranscriptApi

### Part 1
Getting the links on file

In [0]:
# Remember the directory the notebook was started in; later cells use it to
# locate the tasty_videos folder and to restore the working directory.
current_wkd = os.getcwd()

In [0]:
# Using Selenium to access the main Tasty videos page
# `header` is the User-agent sent with the plain `requests` downloads later on.
header = {'User-agent': 'Mozilla/5.0'}
video_url = "https://www.youtube.com/channel/UCJFp8uSYCjXOMnkUyb3CQ3Q/videos"

# The chromedriver location is machine-specific. It can be overridden through the
# CHROMEDRIVER_PATH environment variable; the original path stays as the default.
chromedriver_path = os.environ.get(
    'CHROMEDRIVER_PATH',
    '/Users/linyandai/anaconda3/lib/python3.7/site-packages/chromedriver',
)
driver = webdriver.Chrome(chromedriver_path)
driver.get(video_url)

In [0]:
# Saving the main tasty page
# NOTE(review): the HTML is captured at the moment this cell runs; a later cell
# parses video lengths from this file, so the page presumably has to be scrolled
# to the desired depth before saving - confirm.
main_page_html = driver.page_source
with open('tasty_page.htm', 'w') as page_file:
    page_file.write(main_page_html)

In [0]:
# Extracting and saving video links 
# We manually scroll through the page to get videos that are in the year 2019
# `find_elements(By.XPATH, ...)` is used because the `find_elements_by_xpath`
# shortcut is not available in newer Selenium releases.
video = driver.find_elements(By.XPATH, '//*[@id="video-title"]')

# Pages are written through an explicit path instead of changing the working
# directory on every iteration; the folder is created if it does not exist yet.
pages_dir = os.path.join(current_wkd, 'tasty_videos')
os.makedirs(pages_dir, exist_ok=True)

video_ids = []
videos = []
for count, vid in enumerate(video):
    # getting each url
    url = vid.get_attribute('href')

    # A timeout keeps a single stalled connection from hanging the whole crawl.
    response = requests.get(url, headers=header, timeout=30)
    vid_name = 'video' + str(count + 1)
    pagename = vid_name + ".htm"
    # Assumes the href ends with the 11-character video id - TODO confirm.
    vid_id = url[-11:]

    # The context manager closes the file even if the write fails.
    with open(os.path.join(pages_dir, pagename), 'w') as f:
        f.write(response.text)

    video_ids.append(vid_id)
    videos.append(url)

print(len(videos))

1379


In [0]:
# setting directory again
# (switches back to the directory recorded in `current_wkd`)
os.chdir(current_wkd) 

In [0]:
# Quitting the driver now that the links are collected
# (the author wants to avoid being blocked by YouTube)
driver.quit()

### Part 2
We won't be sending a request to the page again. We will be using the video links stored on file to scrape further.

In [0]:
# Number of video ids collected in Part 1
len(video_ids)

1379

In [0]:
# Saving IDs to file
# One id per line; the context manager closes the file even if a write fails.
with open('video_ids.txt', 'w') as f:
    f.writelines(vid_id + '\n' for vid_id in video_ids)

In [0]:
# Creating tasty video table, seeded with the video ids as its first column
tasty_table = pd.DataFrame({"ids": video_ids})

In [0]:
# Getting video titles
# Saved pages are opened through an explicit path, so the working directory is
# never changed by this cell.
pages_dir = os.path.join(current_wkd, 'tasty_videos')
tasty_titles = []
for index in range(len(video_ids)):
    page_path = os.path.join(pages_dir, "video" + str(index + 1) + ".htm")
    try:
        with open(page_path, 'r') as f:
            page = f.read()

        tasty_soup = bs.BeautifulSoup(page, 'html.parser')
        vid_title = tasty_soup.find("span", attrs={"class": "watch-title"}).text.strip()
    except Exception:
        # Any failure (unreadable file, element not found) is recorded as "N/A"
        # so the list stays aligned with video_ids. Unlike a bare `except`,
        # this still lets Ctrl-C interrupt the loop.
        vid_title = "N/A"
    tasty_titles.append(vid_title)

In [0]:
# Adding the scraped titles to the table as the 'titles' column
tasty_table['titles'] = tasty_titles

In [0]:
# Getting video descriptions
# Saved pages are opened through an explicit path, so the working directory is
# never changed by this cell.
pages_dir = os.path.join(current_wkd, 'tasty_videos')
tasty_desc = []
for index in range(len(video_ids)):
    page_path = os.path.join(pages_dir, "video" + str(index + 1) + ".htm")
    try:
        with open(page_path, 'r') as f:
            page = f.read()

        tasty_soup = bs.BeautifulSoup(page, 'html.parser')
        vid_desc = tasty_soup.find("p", attrs={"id": "eow-description"}).text
    except Exception:
        # Any failure (unreadable file, element not found) is recorded as "N/A"
        # so the list stays aligned with video_ids. Unlike a bare `except`,
        # this still lets Ctrl-C interrupt the loop.
        vid_desc = "N/A"
    tasty_desc.append(vid_desc)

In [0]:
# Adding the scraped descriptions to the table as the 'description' column
tasty_table['description'] = tasty_desc

In [0]:
# Getting video views
# Saved pages are opened through an explicit path, so the working directory is
# never changed by this cell.
pages_dir = os.path.join(current_wkd, 'tasty_videos')
tasty_views = []
for index in range(len(video_ids)):
    page_path = os.path.join(pages_dir, "video" + str(index + 1) + ".htm")
    try:
        with open(page_path, 'r') as f:
            page = f.read()

        tasty_soup = bs.BeautifulSoup(page, 'html.parser')
        view_text = tasty_soup.find("div", attrs={"class": "watch-view-count"}).text
        # Drop the last six characters (presumably the " views" suffix) and the
        # thousands separators before converting to an integer.
        vid_views = int(view_text[:-6].replace(",", ""))
    except Exception:
        # Any failure (unreadable file, element not found, non-numeric text) is
        # recorded as "N/A" so the list stays aligned with video_ids. Unlike a
        # bare `except`, this still lets Ctrl-C interrupt the loop.
        vid_views = "N/A"
    tasty_views.append(vid_views)

In [0]:
# Adding the view counts (ints, or "N/A" where parsing failed) as the 'views' column
tasty_table['views'] = tasty_views

In [0]:
# Getting date of video upload
# Saved pages are opened through an explicit path, so the working directory is
# never changed by this cell.
pages_dir = os.path.join(current_wkd, 'tasty_videos')
tasty_date = []
for index in range(len(video_ids)):
    page_path = os.path.join(pages_dir, "video" + str(index + 1) + ".htm")
    try:
        with open(page_path, 'r') as f:
            page = f.read()

        tasty_soup = bs.BeautifulSoup(page, 'html.parser')
        # The raw label text is kept as-is (no date parsing is done here).
        vid_date = tasty_soup.find("strong", attrs={"class": "watch-time-text"}).text
    except Exception:
        # Any failure (unreadable file, element not found) is recorded as "N/A"
        # so the list stays aligned with video_ids. Unlike a bare `except`,
        # this still lets Ctrl-C interrupt the loop.
        vid_date = "N/A"
    tasty_date.append(vid_date)

In [0]:
# Adding the upload-date text to the table as the 'date' column
tasty_table['date'] = tasty_date

In [0]:
# Getting likes and dislikes
# Saved pages are opened through an explicit path, so the working directory is
# never changed by this cell.
pages_dir = os.path.join(current_wkd, 'tasty_videos')
tasty_likes = []
tasty_dislikes = []
for index in range(len(video_ids)):
    page_path = os.path.join(pages_dir, "video" + str(index + 1) + ".htm")
    try:
        with open(page_path, 'r') as f:
            page = f.read()

        tasty_soup = bs.BeautifulSoup(page, 'html.parser')
        vid_likes = int(tasty_soup.find("button", attrs={"title": "I like this"}).text.replace(",", ""))
        vid_dislikes = int(tasty_soup.find("button", attrs={"title": "I dislike this"}).text.replace(",", ""))
    except Exception:
        # If either count cannot be read, both are recorded as "N/A" so the two
        # lists stay aligned with video_ids. Unlike a bare `except`, this still
        # lets Ctrl-C interrupt the loop.
        vid_likes = "N/A"
        vid_dislikes = "N/A"
    tasty_likes.append(vid_likes)
    tasty_dislikes.append(vid_dislikes)

In [0]:
# Adding the like and dislike counts (ints, or "N/A" where parsing failed) to the table
tasty_table['likes'] = tasty_likes
tasty_table['dislikes'] = tasty_dislikes

In [0]:
# Using the main page to get video lengths
# The context manager closes the file even if reading fails.
with open("tasty_page.htm", 'r') as f:
    main_page = f.read()
page_soup = bs.BeautifulSoup(main_page, 'html.parser')
# Collect every <span> carrying the thumbnail time-status overlay classes.
video_len = page_soup.find_all("span", attrs={"class": "style-scope ytd-thumbnail-overlay-time-status-renderer"})

In [0]:
# Strip the surrounding whitespace from each overlay's text to get the length string.
# The former bare `except` fallback is dropped: `.text.strip()` has no failure mode
# for the parsed tags in `video_len`, and the bare handler would only have masked
# a keyboard interrupt.
tasty_len = [vid_len.text.strip() for vid_len in video_len]

In [0]:
# Adding the video lengths to the table
# (their order is assumed to match video_ids - TODO confirm)
tasty_table['length'] = tasty_len

In [0]:
# Getting subtitles using API
tasty_captions = []

# The loop variable is named `vid_id` so it does not overwrite the `video`
# list of Selenium elements defined earlier in the notebook.
for vid_id in video_ids:
    try:
        sub = YouTubeTranscriptApi.get_transcript(vid_id)
        # Every segment is prefixed with a single space, so a non-empty
        # transcript starts with " " and an empty one yields ''.
        text = ''.join(" " + segment['text'] for segment in sub)
    except Exception:
        # Any failure while fetching or assembling the transcript is recorded
        # as "N/A". Unlike a bare `except`, this still lets Ctrl-C interrupt
        # the loop.
        text = "N/A"
    tasty_captions.append(text)

In [0]:
# Adding the caption text (or "N/A") to the table as the 'subtitle' column
tasty_table['subtitle'] = tasty_captions

In [0]:
# Spot-check: caption text collected for the second video
tasty_captions[1]

' [Music]'

In [0]:
# Preview the first rows of the assembled table
tasty_table.head()

Unnamed: 0,ids,titles,description,views,date,likes,dislikes,length,subtitle
0,8K7XiEOx3Fw,I Tested Amazon's Best-Selling Air Fryer • Alix,Alix is testing Amazon's best selling air frye...,104887,"Published on Mar 9, 2020",5108,307,12:30,three two one boop-boo-boo-boop I did it and ...
1,ysVnhqwlDbo,16 Ways To Up Your Breakfast Toast Game • Tasty,Toast in 2020 is far superior to anything you'...,128680,"Published on Mar 7, 2020",3932,88,5:42,[Music]
2,OoH1oGAJ7jI,Tasty Producers Swap Their Favorite Snacks • R...,Andrew and Rie are swapping their favorite sna...,832586,"Published on Mar 7, 2020",23600,341,12:54,it is a little weird now that I'm doing it in...
3,7xycoVXyFGE,Stuffed French Toast by Chef Andrea Drummer,"Inspired by bread pudding and crème brûlée, th...",105588,"Published on Mar 5, 2020",4421,73,3:34,I love french toast I'm not really a pancake ...
4,oUeanf1tg7U,5 Homemade Dumplings To Feast On • Tasty,The tastiest dumplings you ever did see. Shop ...,324847,"Published on Mar 5, 2020",8608,175,6:23,


In [0]:
# Write the assembled table to Tasty_Videos.csv in the current working directory.
# No `index` argument is passed, so pandas' default applies and the row index is
# written as the first, unnamed column.
tasty_table.to_csv(r'Tasty_Videos.csv')