# YouTube:
In this notebook we'll use selenium to crawl YouTube.  
For each YouTube video - we first get the URL and break it down in order to get the video ID.  
After we get the ID we'll use it to get the stats of that video from YouTube API.  

In [2]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import json

# >>>>--- imports for Google & YouTube API ---<<<<<
from googleapiclient.discovery import build
from google_auth_oauthlib.flow import InstalledAppFlow
from google.auth.transport.requests import Request

import urllib.parse as p
import re
import os
import pickle

SCOPES = ["https://www.googleapis.com/auth/youtube.force-ssl"]

In [3]:
#selenium imports to search in youtube using selenium
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

#chrome webdriver path
PATH = "C:\Program Files (x86)\chromedriver.exe"

In [3]:
# At this point the dataframe is already cleaned.
# NaN values and duplicates are already taken care of. 
games_df = pd.read_csv("steam_spy_clean.csv")

# Creating a lst of all video games in the dataframe
video_game_names = [name for name in games_df["name"]]

# YouTube API:
Setting up functions to interact with the API.  
    1. _youtube_authenticate_ - Authentication using the credentials. (credentials saved as Json file).  
    2. _get_video_id_by_url_ - The function gets a URL and breaks it down in order to get the ID of a video.  
    3. _get_video_details_ - The function gets the video details using the ID provided.  
    4. _get_video_stats_ - Returns video stattistics.
    
###### NOTE:
YouTube made the 'dislikes' count private so there's no way to get that info, even from the API.  

In [4]:
# YouTube API Authentication 

def youtube_authenticate():
    os.environ["OAUTHLIB_INSECURE_TRANSPORT"] = "1"
    api_service_name = "youtube"
    api_version = "v3"
    client_secrets_file = "credentials_omer.json"
    #client_secrets_file = "credentials.json"
    creds = None
    
    # the file token.pickle stores the user's access and refresh tokens, and is
    # created automatically when the authorization flow completes for the first time
    if os.path.exists("token_omer.pickle"):
        with open("token_omer.pickle", "rb") as token:
            creds = pickle.load(token)
   
    # if there are no (valid) credentials availablle, let the user log in.
    if not creds or not creds.valid:
        if creds and creds.expired and creds.refresh_token:
            creds.refresh(Request())
        else:
            flow = InstalledAppFlow.from_client_secrets_file(client_secrets_file, SCOPES)
            creds = flow.run_local_server(port=0)
            
        # save the credentials for the next run
        with open("token_omer.pickle", "wb") as token:
            pickle.dump(creds, token)

    return build(api_service_name, api_version, credentials=creds)

In [5]:
# Returns video id from the given url

def get_video_id_by_url(url):
    # split URL parts
    parsed_url = p.urlparse(url)
    # get the video ID by parsing the query of the URL
    video_id = p.parse_qs(parsed_url.query).get("v")
    try:
        return video_id[0]
    except:
        return 'X' 
    #selenium could not open some of the videos, so instead of raising exception and stopping the code, 
    #the func. returns X so we can handle it later

In [6]:
# Returns videos details using video ID that we got in the cell above

def get_video_details(youtube2, **kwargs):
    return youtube2.videos().list(part="statistics", **kwargs).execute()

In [7]:
# NOTE:
# YouTube made 'dislike count' private (on 10/11/2021).

def get_video_stats(video_response):
    try:
        items = video_response.get("items")[0]
    except IndexError:
        return 0,0,0
    
    statistics = items["statistics"]
    
    # get stats infos
    
    try:
        comment_count = statistics["commentCount"]
    except KeyError:
        comment_count = 0 #in some videos the comments section is disabled
   
    try:
        like_count = statistics["likeCount"]
    except KeyError:
        like_count = statistics["favoriteCount"]
    
    view_count = statistics["viewCount"]
    
    
    return comment_count, like_count, view_count

In [59]:
# authenticate to YouTube API
youtube = youtube_authenticate()

# ------>> Selenium set up <<------#
driver = webdriver.Chrome(PATH)
driver.maximize_window()
driver.get("https://www.youtube.com")
#--------------------------------------

vid_id = [] #list of videos ID

# Searching for the videos.

#had to manualy set range for each batch because i got errors at random points every time
for i in range(3951,len(video_game_names)): 
    searchbox = driver.find_element_by_name("search_query")
    
    # The addition of 'official trailer' guarantees that we get a legit and not some 'fan made' trailer.
    searchbox.send_keys(video_game_names[i]+" official trailer")
    driver.find_element_by_id("search-icon-legacy").click()
    time.sleep(2)
    
    #clicks on the first search result of the YouTUbe search
    driver.find_element(By.CLASS_NAME, "style-scope ytd-video-renderer").click()
    temp_url = driver.current_url
    v_id = get_video_id_by_url(temp_url)
    vid_id.append(v_id)
    
    driver.find_element_by_name("search_query").clear() #clears the search box 
    time.sleep(3)
    
driver.close() #closing the browser after collecting all the data of all games

  driver = webdriver.Chrome(PATH)
  searchbox = driver.find_element_by_name("search_query")
  driver.find_element_by_id("search-icon-legacy").click()
  driver.find_element_by_name("search_query").clear() #clears the search box


In [83]:
len(vid_id)

853

Selenium had some issues with YouTue so we had to acquire the IDs in batches.

In [22]:
first_batch = pd.DataFrame({"name":video_game_names[:1162], "vid_id":vid_id})
first_batch.to_csv("first_batch.csv")

In [25]:
first_batch

Unnamed: 0,name,vid_id
0,Counter-Strike: Global Offensive,edYCtaNueQY
1,Dota 2,-cSFPIwMEq4
2,Grand Theft Auto V,QkkoHAzjnUs
3,PUBG: BATTLEGROUNDS,fDLAFIhfFy4
4,Tom Clancy's Rainbow Six Siege,KlbLLRdg9u8
...,...,...
1157,Lucy -The Eternity She Wished For-,X
1158,Stronghold HD,PzohNEcbwqw
1159,Tannenberg,9RW4pfqNyeg
1160,The Messenger,qnm885kLggY


In [37]:
video_game_names[1162:1995]
second_batch = pd.DataFrame({"name":video_game_names[1162:1995], "vid_id":vid_id})
second_batch.to_csv("second_batch.csv")
second_batch

Unnamed: 0,name,vid_id
0,Styx: Master of Shadows,HpVuf0VZ-fs
1,Call of Cthulhu,Va7Zmu1dd1E
2,shapez.io,KyorY1uIqiQ
3,The Long Drive,Z_PrCO7HZFU
4,Cube World,xXyBVbryR9M
...,...,...
828,Red Faction: Armageddon,I_9UnBm40zA
829,Suzerain,hrdLTySs128
830,Shadow Arena,qvUPNKx5LZw
831,Evil Genius,Qpdb9zo_1t8


In [97]:
video_game_names[1995:3951]

['Ashes of the Singularity: Escalation',
 'Turbo Pug',
 'The Dark Pictures Anthology: House of Ashes',
 'Moon Hunters',
 'OshiRabu: Waifus Over Husbandos',
 'Oh...Sir!! The Insult Simulator',
 'X Rebirth',
 'SHENZHEN I/O',
 'Cube Escape Collection',
 'Thumper',
 'Rock of Ages',
 'Unreal Tournament 3 Black',
 'Never Split the Party',
 'Stoneshard: Prologue',
 'Pumpkin Jack',
 'Tomb Raider I',
 'Thea: The Awakening',
 'Terraforming Mars',
 'Hyperdimension Neptunia Re;Birth3 V Generation',
 'Dead Maze',
 'Gas Guzzlers Extreme',
 'ESEA',
 'Command & Conquer: Red Alert 3 - Uprising',
 'Electronic Super Joy',
 'Enemy Front',
 'Cthulhu Saves the World',
 'Space Hulk: Deathwing',
 'Rampage Knights',
 'HighFleet',
 'Iconoclasts',
 'Last Day of June',
 'Bloody Trapland',
 'Civilization IV: Beyond the Sword',
 'Hard West',
 'Cliff Empire',
 'Project Highrise',
 'Pinball FX3',
 'Crashlands',
 'Tanki Online',
 'Hell Yeah! Wrath of the Dead Rabbit',
 'Ratz Instagib',
 'Dead Realm',
 'Seek Girl:Fog Ⅰ

In [58]:
third_batch = pd.DataFrame({"name":video_game_names[1995:3951], "vid_id":vid_id})
third_batch.to_csv("third_batch.csv")
third_batch

Unnamed: 0,name,vid_id
0,Ashes of the Singularity: Escalation,zV0fao1rRUs
1,Turbo Pug,Hlcz79GEcfk
2,The Dark Pictures Anthology: House of Ashes,QBZgZdC989c
3,Moon Hunters,atrpfIj50fY
4,OshiRabu: Waifus Over Husbandos,ZvxdMtQz-Ts
...,...,...
1951,HeXen: Beyond Heretic,rvpdrvLA0s8
1952,Star Wolves 3: Civil War,_Uu1pKI5dJs
1953,Requiem: Rise of the Reaver,A8_R4xQhwEs
1954,Daikatana,X


In [68]:
video_game_names[3951:]
fourth_batch = pd.DataFrame({"name":video_game_names[3951:], "vid_id":vid_id})
fourth_batch.to_csv("fourth_batch.csv")
fourth_batch

Unnamed: 0,name,vid_id
0,RPG Maker 2003,vuloA4EgpnQ
1,Elmarion: Dragon time,sYQh_FNFHQ0
2,LuckCatchers,85WPEZRJplQ
3,Greed Corp,6dnmhI0W3vg
4,Wildfire,5uDf-sf8leo
...,...,...
848,ESCAPE FROM VOYNA: Dead Forest,EmjuGy0BX-E
849,RACE On,LQIe4JKjYn4
850,F.E.A.R.,p2AlffKozbg
851,F.E.A.R.,p2AlffKozbg


In [71]:
all_batches = [first_batch, second_batch, third_batch, fourth_batch]
vid_id_df = pd.concat(all_batches)

In [85]:
vid_id_df = vid_id_df.reset_index(drop=True)
vid_id_df.to_csv("video_id.csv")
vid_id_df

Unnamed: 0,name,vid_id
0,Counter-Strike: Global Offensive,edYCtaNueQY
1,Dota 2,-cSFPIwMEq4
2,Grand Theft Auto V,QkkoHAzjnUs
3,PUBG: BATTLEGROUNDS,fDLAFIhfFy4
4,Tom Clancy's Rainbow Six Siege,KlbLLRdg9u8
...,...,...
4799,ESCAPE FROM VOYNA: Dead Forest,EmjuGy0BX-E
4800,RACE On,LQIe4JKjYn4
4801,F.E.A.R.,p2AlffKozbg
4802,F.E.A.R.,p2AlffKozbg


In [9]:
v_id_df = pd.read_csv("video_id.csv")
v_id_df.drop(columns=["Unnamed: 0"], inplace=True)

In [43]:
v_id_df

Unnamed: 0,name,vid_id
0,Counter-Strike: Global Offensive,edYCtaNueQY
1,Dota 2,-cSFPIwMEq4
2,Grand Theft Auto V,QkkoHAzjnUs
3,PUBG: BATTLEGROUNDS,fDLAFIhfFy4
4,Tom Clancy's Rainbow Six Siege,KlbLLRdg9u8
...,...,...
4799,ESCAPE FROM VOYNA: Dead Forest,EmjuGy0BX-E
4800,RACE On,LQIe4JKjYn4
4801,F.E.A.R.,p2AlffKozbg
4802,F.E.A.R.,p2AlffKozbg


In [68]:
driver = webdriver.Chrome(PATH)
driver.maximize_window()
driver.get("https://www.youtube.com")

row_num = 0
#Handling 'X' id --> videos that selenium could not 'click' and open to get the relevant ID
for g_name, gid in zip(v_id_df["name"], v_id_df["vid_id"]):
    if gid == 'X':
        searchbox = driver.find_element_by_name("search_query")
        
        # The addition of 'official trailer' guarantees that we get a legit and not some 'fan made' trailer.
        searchbox.send_keys(g_name+" official trailer")
        driver.find_element_by_id("search-icon-legacy").click()
        time.sleep(2)
        
        #clicks on the first search result of the YouTUbe search
        #driver.find_element(By.CLASS_NAME, "style-scope ytd-video-renderer")
        driver.find_element(By.CSS_SELECTOR, 'div#contents ytd-item-section-renderer>div#contents a#thumbnail').click()
        temp_url = driver.current_url
        v_id = get_video_id_by_url(temp_url)
        v_id_df.at[row_num, "vid_id"] = v_id
        
        driver.find_element_by_name("search_query").clear() #clears the search box 
        time.sleep(3)
    row_num += 1
    
driver.close() #closing the browser after collecting all the data of all games

  driver = webdriver.Chrome(PATH)
  searchbox = driver.find_element_by_name("search_query")
  driver.find_element_by_id("search-icon-legacy").click()
  driver.find_element_by_name("search_query").clear() #clears the search box


In [73]:
#checking how many 'X' are left in the dataframe
counter = 0
for k in v_id_df["vid_id"]:
    if k == 'X':
        counter+=1
counter

0

In [86]:
v_id_df.dropna(axis=1, inplace=True)

In [87]:
v_id_df.to_csv("video_id.csv")

In [8]:
#last checkpoint
v_id_df = pd.read_csv("video_id.csv")

In [9]:
#daily quotas limit = 10K
youtube = youtube_authenticate()

titles = []
comments_amount = []
likes_amount = []
views_amount = []

# looping through list of video id to get the info using the API
for name, video_id in zip(v_id_df["name"], v_id_df["vid_id"]):
    temp_res = get_video_details(youtube, id=video_id)
    comments, likes, views = get_video_stats(temp_res)
    titles.append(name)
    comments_amount.append(comments)
    likes_amount.append(likes)
    views_amount.append(views)

Please visit this URL to authorize this application: https://accounts.google.com/o/oauth2/auth?response_type=code&client_id=760070128422-eklqre50jr1onl7vidujef4t0k0l5v7c.apps.googleusercontent.com&redirect_uri=http%3A%2F%2Flocalhost%3A64650%2F&scope=https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fyoutube.force-ssl&state=YdKU6aE630VQeCi5l3tSpUi1qA7WAq&access_type=offline


In [10]:
df_youtube_stats = pd.DataFrame({"title":titles, "views":views_amount, "likes":likes_amount, "comments":comments_amount})

In [11]:
df_youtube_stats.to_csv("youtube_stats.csv")

In [12]:
df_youtube_stats

Unnamed: 0,title,views,likes,comments
0,Counter-Strike: Global Offensive,20055366,426246,45340
1,Dota 2,11063025,77961,22805
2,Grand Theft Auto V,79933514,1591274,326264
3,PUBG: BATTLEGROUNDS,643933,3160,173
4,Tom Clancy's Rainbow Six Siege,2716326,21901,902
...,...,...,...,...
4799,ESCAPE FROM VOYNA: Dead Forest,23402,830,111
4800,RACE On,1949619,14513,1178
4801,F.E.A.R.,2841564,14184,788
4802,F.E.A.R.,2841564,14184,788
