# YouTube Comments Scrapping 
    
   **BY - AYUSH KUMAR MISHRA**

In [2]:
#importing the required Libraries
import pandas as pd
import numpy as np
from selenium import webdriver 
import urllib
import pathlib
import time
from datetime import datetime

In [3]:
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')
chrome_options.add_argument('--incognito')
driver = webdriver.Chrome( options=chrome_options)

search_item = ['Physical health']

Making a function for getting the links from the web

In [4]:
# Function to get links
def get_links(item) :    
    query = urllib.parse.quote(item)
    url = "https://www.youtube.com/results?search_query=" + query
    driver.get(url)
    driver.execute_script("window.scrollTo(0, 1000000)")
    driver.maximize_window()
    container=driver.find_elements_by_xpath('//*[(@id = "video-title")]')
    links=[]
    for page in container:
        url=page.get_attribute("href")
        links.append(url)
    return list(set(links))

In [5]:
dfs = []
for item in search_item:
    links = get_links(item)
    df = pd.DataFrame({'Item': [item] * len(links), 'Links': links})
    dfs.append(df)
final_df = pd.concat(dfs, ignore_index=True)  
final_df.to_csv('Links.csv', index=False)  
final_df

Unnamed: 0,Item,Links
0,Physical health,https://www.youtube.com/watch?v=q1Ss8sTbFBY&pp...
1,Physical health,https://www.youtube.com/watch?v=fPZxIbm1Wbw&pp...
2,Physical health,https://www.youtube.com/watch?v=REtcF5qjm7Q&pp...
3,Physical health,https://www.youtube.com/shorts/R3Deqa-f6yg
4,Physical health,https://www.youtube.com/watch?v=c3RNBmsLc-M&pp...
5,Physical health,https://www.youtube.com/watch?v=x-aqSUJA0i4&pp...
6,Physical health,https://www.youtube.com/watch?v=37UhELFvPec&pp...
7,Physical health,
8,Physical health,https://www.youtube.com/watch?v=Srvnee0ha3g&pp...
9,Physical health,https://www.youtube.com/watch?v=bgY0KnNGXZY&pp...


In [6]:
len(final_df)

28

In [7]:
null_counts = final_df.groupby('Item')['Links'].apply(lambda x: x.isnull().sum())
print(null_counts)

Item
Physical health    1
Name: Links, dtype: int64


In [8]:
df = final_df.dropna()
null_counts = df.groupby('Item')['Links'].apply(lambda x: x.isnull().sum())
print(null_counts)

Item
Physical health    0
Name: Links, dtype: int64


In [9]:
videos_dictionary = {}
videos_dictionary['Comments'] = {}
videos_dictionary['Video Link'] = {}

Defining the function to scrap the required information from the links

In [10]:
def scrap(url,i):
    print('Fetched date and time - ',datetime.now().strftime("%d/%m/%Y %H:%M:%S"))
    try :
        videos_dictionary['Video Link'].update({i : url})
    except:        
        videos_dictionary['Video Link'].update({i :''})
    driver.get(url)
    time.sleep(3)
    comments =  driver.find_element_by_xpath('//*[@id="comments"]')
    driver.execute_script("arguments[0].scrollIntoView();", comments)
    last_height = driver.execute_script("return document.documentElement.scrollHeight")
    while True:
        driver.execute_script("window.scrollTo(0, document.documentElement.scrollHeight);")
        time.sleep(1)
        new_height = driver.execute_script("return document.documentElement.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height
    username_elems = driver.find_elements_by_xpath('//*[@id="author-text"]')
    comment_elems = driver.find_elements_by_xpath('//*[@id="content-text"]')
    comment_string = ''
    for user, comment in zip(username_elems, comment_elems):
        comment_string = comment_string + user.text + ' - ' + comment.text + '\n'
    try :
        videos_dictionary['Comments'].update({i : comment_string})
    except :
        videos_dictionary['Comments'].update({i : ''})

First check the file exist to save the data that has been scrapped from the links. If the file does not exist, then create one. If it exist then check whether it is empty or not.

If not empty, then extract data drom the links not present in the file.

In [11]:
start = 0
name = 'Youtube_scrapping_comments.csv'
file_name = pathlib.Path(name)
if file_name.exists():
    print ("File exist.")
    try :
        print("Reading the file now")
        df_temp = pd.read_csv(name, index_col = 0)
        start = (len(df_temp) )
    except:
        print("File is empty")
else:
    print ("File does not exist\n", "Creating the file")
    file = open(name,"w+")
    print("File created successfully with filename - ",name)
print("Starting from position - ", start)

File exist.
Reading the file now
File is empty
Starting from position -  0


In [12]:
# Scrapping the comments
for num in range(start, len(df)):
    url = df['Links'].iloc[num]  
    print("Loop entered")
    try:
        print("getting link-", num)
        driver.get(url)
    except:
        print("Not getting")
        continue
    print("=" * 40) 
    print("Scraping " + url)
    scrap(url, num)
    print("=" * 40) 
print("Process ended successfully")


Loop entered
getting link- 0
Scraping https://www.youtube.com/watch?v=q1Ss8sTbFBY&pp=ygUPUGh5c2ljYWwgaGVhbHRo
Fetched date and time -  28/09/2023 19:34:35
Loop entered
getting link- 1
Scraping https://www.youtube.com/watch?v=fPZxIbm1Wbw&pp=ygUPUGh5c2ljYWwgaGVhbHRo
Fetched date and time -  28/09/2023 19:35:34
Loop entered
getting link- 2
Scraping https://www.youtube.com/watch?v=REtcF5qjm7Q&pp=ygUPUGh5c2ljYWwgaGVhbHRo
Fetched date and time -  28/09/2023 19:35:46
Loop entered
getting link- 3
Scraping https://www.youtube.com/shorts/R3Deqa-f6yg
Fetched date and time -  28/09/2023 19:36:15
Loop entered
getting link- 4
Scraping https://www.youtube.com/watch?v=c3RNBmsLc-M&pp=ygUPUGh5c2ljYWwgaGVhbHRo
Fetched date and time -  28/09/2023 19:36:22
Loop entered
getting link- 5
Scraping https://www.youtube.com/watch?v=x-aqSUJA0i4&pp=ygUPUGh5c2ljYWwgaGVhbHRo
Fetched date and time -  28/09/2023 19:36:30
Loop entered
getting link- 6
Scraping https://www.youtube.com/watch?v=37UhELFvPec&pp=ygUPUGh5c2ljYW

In [13]:
# Converting into table for better visuals
data = pd.DataFrame.from_dict(videos_dictionary)
try:
    result = pd.concat([df_temp,data], ignore_index = True)
    result
except :
    result = data
result

Unnamed: 0,Comments,Video Link
0,@michaelsoareverix5373 - Here are my notes on ...,https://www.youtube.com/watch?v=q1Ss8sTbFBY&pp...
1,"- Thank You! \n\nMore than 11,152 people look...",https://www.youtube.com/watch?v=fPZxIbm1Wbw&pp...
2,- What do you think about this video? This is...,https://www.youtube.com/watch?v=REtcF5qjm7Q&pp...
3,,https://www.youtube.com/shorts/R3Deqa-f6yg
4,,https://www.youtube.com/watch?v=c3RNBmsLc-M&pp...
5,,https://www.youtube.com/watch?v=x-aqSUJA0i4&pp...
6,@phanivarmak5794 - Important things we get for...,https://www.youtube.com/watch?v=37UhELFvPec&pp...
7,@calliedurling3292 - They do not cover a Gym M...,https://www.youtube.com/watch?v=Srvnee0ha3g&pp...
8,@priyankamehta6632 - Dear mam lots of respect ...,https://www.youtube.com/watch?v=bgY0KnNGXZY&pp...
9,@vatimati-kq4dh - I can’t believe Peter went i...,https://www.youtube.com/watch?v=ufsIA5NARIo&pp...


In [16]:
results = result.dropna()
videos_dictionary = {}
results

Unnamed: 0,Comments,Video Link
0,@michaelsoareverix5373 - Here are my notes on ...,https://www.youtube.com/watch?v=q1Ss8sTbFBY&pp...
1,"- Thank You! \n\nMore than 11,152 people look...",https://www.youtube.com/watch?v=fPZxIbm1Wbw&pp...
2,- What do you think about this video? This is...,https://www.youtube.com/watch?v=REtcF5qjm7Q&pp...
3,,https://www.youtube.com/shorts/R3Deqa-f6yg
4,,https://www.youtube.com/watch?v=c3RNBmsLc-M&pp...
5,,https://www.youtube.com/watch?v=x-aqSUJA0i4&pp...
6,@phanivarmak5794 - Important things we get for...,https://www.youtube.com/watch?v=37UhELFvPec&pp...
7,@calliedurling3292 - They do not cover a Gym M...,https://www.youtube.com/watch?v=Srvnee0ha3g&pp...
8,@priyankamehta6632 - Dear mam lots of respect ...,https://www.youtube.com/watch?v=bgY0KnNGXZY&pp...
9,@vatimati-kq4dh - I can’t believe Peter went i...,https://www.youtube.com/watch?v=ufsIA5NARIo&pp...


In [23]:
# results.iloc[[0]]['Comments']

# Print all comments for the first video link
for i in range(len(df)):
     vn = results.iloc[i]['Video Link']
     print(vn)
     print("=="*40)
     first_video_comments = results.iloc[i]['Comments']
     print(first_video_comments)
    

https://www.youtube.com/watch?v=q1Ss8sTbFBY&pp=ygUPUGh5c2ljYWwgaGVhbHRo
@michaelsoareverix5373 - Here are my notes on the protocol:

Sunday: Long endurance workout

Hiking, running, Zone 2 cardio for 60-75 minutes, maybe hiking for three hours or so
@rawsonband4582 - Thank you for your service! I am a 52 yo man that got sober after a lifetime of alcohol and drug abuse and I was a smoker. Came totally clean December 2020. Completely turned my life around. 2 months ago I started a simple workout routine after reading “Can’t hurt Me” by David Goggins. Your routine is one to aim for. The beautiful thing about the podcast is your choice of topics that are so beneficial to a guy like me. I had already read Dr Anna Lembke’s book, “Dopamine Nation” and consumed dozens of hours listening to Dr Jordan Peterson…and there you all are…top of the elite list of massively influential players driving me to be a better person spiritually, mentally, physically and emotionally. Thank you for putting this 

In [24]:
driver.close()

# END

**Made By - AYUSH KUMAR MISHRA**