# YouTube Comments Scrapping 
    
   **BY - AYUSH KUMAR MISHRA**

In [27]:
#importing the required Libraries
import pandas as pd
import numpy as np
from selenium import webdriver 
import urllib
import pathlib
import time
from datetime import datetime

In [28]:
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')
chrome_options.add_argument('--incognito')
driver = webdriver.Chrome( options=chrome_options)

search_item = ['Physical health']

Making a function for getting the links from the web

In [29]:
# Function to get links
def get_links(item) :    
    query = urllib.parse.quote(item)
    url = "https://www.youtube.com/results?search_query=" + query
    driver.get(url)
    driver.execute_script("window.scrollTo(0, 1000000)")
    driver.maximize_window()
    container=driver.find_elements_by_xpath('//*[(@id = "video-title")]')
    links=[]
    for page in container:
        url=page.get_attribute("href")
        links.append(url)
    return list(set(links))

In [30]:
dfs = []
for item in search_item:
    links = get_links(item)
    df = pd.DataFrame({'Item': [item] * len(links), 'Links': links})
    dfs.append(df)
final_df = pd.concat(dfs, ignore_index=True)  
final_df.to_csv('Links.csv', index=False)  
final_df

Unnamed: 0,Item,Links
0,Physical health,https://www.youtube.com/watch?v=ufsIA5NARIo&pp...
1,Physical health,https://www.youtube.com/watch?v=0MM5NwGbg7c&pp...
2,Physical health,https://www.youtube.com/watch?v=q1Ss8sTbFBY&pp...
3,Physical health,https://www.youtube.com/watch?v=AnL028p41M8&pp...
4,Physical health,https://www.youtube.com/watch?v=phLE3ibA7Io&pp...
5,Physical health,https://www.youtube.com/shorts/0CHunCV-Y4I
6,Physical health,https://www.youtube.com/watch?v=K60xHx836T0&pp...
7,Physical health,https://www.youtube.com/watch?v=hkvXRhZsfvY&pp...
8,Physical health,https://www.youtube.com/watch?v=REtcF5qjm7Q&pp...
9,Physical health,https://www.youtube.com/watch?v=EORrojq9CbE&pp...


In [31]:
len(final_df)

28

In [32]:
null_counts = final_df.groupby('Item')['Links'].apply(lambda x: x.isnull().sum())
print(null_counts)

Item
Physical health    1
Name: Links, dtype: int64


In [33]:
df = final_df.dropna()
null_counts = df.groupby('Item')['Links'].apply(lambda x: x.isnull().sum())
print(null_counts)

Item
Physical health    0
Name: Links, dtype: int64


In [34]:
videos_dictionary = {}
videos_dictionary['Comments'] = {}
videos_dictionary['Video Link'] = {}

Defining the function to scrap the required information from the links

In [35]:
def scrap(url,i):
    print('Fetched date and time - ',datetime.now().strftime("%d/%m/%Y %H:%M:%S"))
    try :
        videos_dictionary['Video Link'].update({i : url})
    except:        
        videos_dictionary['Video Link'].update({i :''})
    driver.get(url)
    time.sleep(3)
    comments =  driver.find_element_by_xpath('//*[@id="comments"]')
    driver.execute_script("arguments[0].scrollIntoView();", comments)
    last_height = driver.execute_script("return document.documentElement.scrollHeight")
    while True:
        driver.execute_script("window.scrollTo(0, document.documentElement.scrollHeight);")
        time.sleep(1)
        new_height = driver.execute_script("return document.documentElement.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height
    username_elems = driver.find_elements_by_xpath('//*[@id="author-text"]')
    comment_elems = driver.find_elements_by_xpath('//*[@id="content-text"]')
    comment_string = ''
    for user, comment in zip(username_elems, comment_elems):
        comment_string = comment_string + user.text + ' - ' + comment.text + '\n'
    try :
        videos_dictionary['Comments'].update({i : comment_string})
    except :
        videos_dictionary['Comments'].update({i : ''})

First check the file exist to save the data that has been scrapped from the links. If the file does not exist, then create one. If it exist then check whether it is empty or not.

If not empty, then extract data drom the links not present in the file.

In [37]:
start = 0
name = 'Youtube_scrapping_comments.csv'
file_name = pathlib.Path(name)
if file_name.exists():
    print ("File exist.")
    try :
        print("Reading the file now")
        df_temp = pd.read_csv(name, index_col = 0)
        start = (len(df_temp) )
    except:
        print("File is empty")
else:
    print ("File does not exist\n", "Creating the file")
    file = open(name,"w+")
    print("File created successfully with filename - ",name)
print("Starting from position - ", start)

File exist.
Reading the file now
File is empty
Starting from position -  0


In [39]:
for num in range(start, len(df)):
    url = df['Links'].iloc[num]  
    print("Loop entered")
    try:
        print("getting link-", num)
        driver.get(url)
    except:
        print("Not getting")
        continue
    print("=" * 40) 
    print("Scraping " + url)
    scrap(url, num)
    print("=" * 40) 
print("Process ended successfully")


Loop entered
getting link- 0
Scraping https://www.youtube.com/watch?v=ufsIA5NARIo&pp=ygUPUGh5c2ljYWwgaGVhbHRo
Fetched date and time -  29/09/2023 20:19:38
Loop entered
getting link- 1
Scraping https://www.youtube.com/watch?v=0MM5NwGbg7c&pp=ygUPUGh5c2ljYWwgaGVhbHRo
Fetched date and time -  29/09/2023 20:20:58
Process ended successfully


### Here i just run the loop to 0 to 2 so that full it takes low time

In [40]:
data = pd.DataFrame.from_dict(videos_dictionary)
try:
    result = pd.concat([df_temp,data], ignore_index = True)
    result
except :
    result = data
result
print(videos_dictionary)



In [46]:
results = result.dropna()
videos_dictionary = {}
results

Unnamed: 0,Comments,Video Link
0,@vatimati-kq4dh - I can’t believe Peter went i...,https://www.youtube.com/watch?v=ufsIA5NARIo&pp...
1,@joshwagnerfilms - We use this to help our kid...,https://www.youtube.com/watch?v=0MM5NwGbg7c&pp...
2,@michaelsoareverix5373 - Here are my notes on ...,https://www.youtube.com/watch?v=q1Ss8sTbFBY&pp...
3,@godsentjesustosetusallfree9859 - They never e...,https://www.youtube.com/watch?v=AnL028p41M8&pp...
4,,https://www.youtube.com/watch?v=phLE3ibA7Io&pp...
5,,https://www.youtube.com/shorts/0CHunCV-Y4I


In [55]:
#Removing URLs
import re
def remove_URL(text):
    return re.sub(r"https?://\S+|www\.\S+", "", text)

results['Cleaned_Comments'] = results['Comments'].apply(remove_URL)
results


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  results['Cleaned_Comments'] = results['Comments'].apply(remove_URL)


Unnamed: 0,Comments,Video Link,Cleaned_Comments
0,@vatimati-kq4dh - I can’t believe Peter went i...,https://www.youtube.com/watch?v=ufsIA5NARIo&pp...,@vatimati-kq4dh - I can’t believe Peter went i...
1,@joshwagnerfilms - We use this to help our kid...,https://www.youtube.com/watch?v=0MM5NwGbg7c&pp...,@joshwagnerfilms - We use this to help our kid...
2,@michaelsoareverix5373 - Here are my notes on ...,https://www.youtube.com/watch?v=q1Ss8sTbFBY&pp...,@michaelsoareverix5373 - Here are my notes on ...
3,@godsentjesustosetusallfree9859 - They never e...,https://www.youtube.com/watch?v=AnL028p41M8&pp...,@godsentjesustosetusallfree9859 - They never e...
4,,https://www.youtube.com/watch?v=phLE3ibA7Io&pp...,
5,,https://www.youtube.com/shorts/0CHunCV-Y4I,


## I do not prefer to remove the smileys as they also show emotion so instead of removing them we can change to unicode-8 such that out model can also understand our the commenter wants to say.

In [56]:
# def remove_special_characters(text):
#     emoji_pattern = re.compile(
#         '['
#         u'\U0001F600-\U0001F64F'  # emoticons
#         u'\U0001F300-\U0001F5FF'  # symbols & pictographs
#         u'\U0001F680-\U0001F6FF'  # transport & map symbols
#         u'\U0001F1E0-\U0001F1FF'  # flags (iOS)
#         u'\U00002702-\U000027B0'
#         u'\U000024C2-\U0001F251'
#         ']+',
#         flags=re.UNICODE)
#     return emoji_pattern.sub(r'', text)
# results['Cleaned_Comments'] = results['Cleaned_Comments'].apply(remove_special_characters)
# results
csv_file_path = 'Youtube_scrapping_comments.csv'
df.to_csv(csv_file_path, encoding='utf-8', index=False)


In [57]:
driver.close()