# YouTube Comments Scraping
    
   **BY - AYUSH KUMAR MISHRA**

In [None]:
#importing the required Libraries
import pandas as pd
import numpy as np
from selenium import webdriver 
import urllib
import pathlib
import time
from datetime import datetime

In [None]:
# Start one shared Chrome session configured with the command-line flags
# listed below; every later cell reuses this `driver`.
chrome_options = webdriver.ChromeOptions()
for chrome_flag in (
    '--headless',
    '--no-sandbox',
    '--disable-dev-shm-usage',
    '--incognito',
):
    chrome_options.add_argument(chrome_flag)
driver = webdriver.Chrome(options=chrome_options)

Define a function that collects video titles and links from the YouTube search results.

In [None]:
def get_links(item):
    """Search YouTube for ``item`` and collect the titles and links of the results.

    Parameters
    ----------
    item : str
        Search phrase; it is percent-encoded before being appended to the
        search URL.

    Returns
    -------
    list of tuple
        One ``(title, href)`` pair per element whose id is ``video-title``.
        ``href`` is whatever ``get_attribute("href")`` returns for that
        element (the notebook later drops rows where it is null).

    Notes
    -----
    Uses the notebook-level ``driver`` and navigates it to the search page.
    """
    # Import the submodule explicitly: a bare ``import urllib`` does not
    # guarantee that ``urllib.parse`` has been loaded.
    import urllib.parse

    query = urllib.parse.quote(item)
    url = "https://www.youtube.com/results?search_query=" + query
    driver.get(url)
    # Jump far down the page before collecting the result elements.
    driver.execute_script("window.scrollTo(0, 1000000)")
    driver.maximize_window()
    # ``find_elements_by_xpath`` was removed in Selenium 4.3; the generic
    # ``find_elements(by, value)`` form exists in Selenium 3 and 4 alike.
    container = driver.find_elements("xpath", '//*[(@id = "video-title")]')
    return [(element.text, element.get_attribute("href")) for element in container]
# Search terms to run through YouTube's search page.
search_item = [
    "Physical Health",
    "Mental health",
    "Social health",
    "Emotional health",
    "Spiritual health",
    "Environmental health",
    "Intellectual health",
    "Occupational health",
    "Financial health",
    "Interpersonal health",
    "Cultural health",
    "Sexual health",
    "Reproductive health",
    "Personal health",
    "Community health",
    "Global health",
    "Public health",
    "Population health",
    "Personal hygiene",
    "Hygiene",
    "Hygiene practices",
]

# One frame per search term, each row tagged with the term that produced it.
dfs = []
for item in search_item:
    data = get_links(item)
    video_titles = [title for title, _ in data]
    video_links = [link for _, link in data]
    df = pd.DataFrame({
        'Item': [item] * len(data),
        'Video Title': video_titles,
        'Links': video_links,
    })
    dfs.append(df)

# Stack everything into one table and keep a copy of the raw links on disk.
final_df = pd.concat(dfs, ignore_index=True)
final_df.to_csv('Links.csv', index=False)
final_df.shape

In [None]:
# Show ten randomly chosen rows of the collected links as a quick sanity check.
final_df.sample(10)

In [None]:
# Count, for each search term, how many results came back without a link.
missing_link = final_df['Links'].isnull()
null_counts = missing_link.groupby(final_df['Item']).sum()
print(null_counts)

In [None]:
# Discard every row that contains a missing value, then report the new size.
final_df = final_df.dropna(axis=0, how='any')
final_df.shape

Define the function that scrapes the required information from each link.

In [None]:
# Result store filled by scrap(): one inner dict per output column, each
# mapping a row index to the value scraped for that video.
videos_dictionary = {
    column: {}
    for column in ('Comments', 'Video Link', 'Video Title', 'Item')
}

def scrap(url, i, item=None):
    """Scrape the title and the loaded comments of one video into ``videos_dictionary``.

    Parameters
    ----------
    url : str
        Address of the video page to open.
    i : int
        Key under which the results are stored in every column of
        ``videos_dictionary``. The notebook passes the row position of the
        video in ``final_df``.
    item : str, optional
        Search term the video belongs to. When omitted, it is looked up in
        the notebook-level ``final_df`` at row position ``i`` (previously every
        video was labelled "Physical Health"). If that lookup fails, ``None``
        is stored.

    Returns
    -------
    None
        Results are written to ``videos_dictionary``: 'Video Link' and 'Item'
        always, 'Video Title' when the title element is found, and 'Comments'
        as "author - text" lines (an empty string when the page has no
        comments section or no comments were loaded).

    Notes
    -----
    Uses the notebook-level ``driver``; errors raised by ``driver.get`` are
    not caught here.
    """
    print('Fetched date and time - ', datetime.now().strftime("%d/%m/%Y %H:%M:%S"))
    videos_dictionary['Video Link'][i] = url

    # Resolve the search term from the links table instead of hard-coding it.
    if item is None:
        try:
            item = final_df['Item'].iloc[i]
        except (NameError, KeyError, IndexError) as e:
            print(f"Error updating Video Title or Item for index {i}: {e}")
    videos_dictionary['Item'][i] = item

    driver.get(url)
    time.sleep(3)  # give the page time to render before querying it

    try:
        # find_element_by_xpath was removed in Selenium 4.3; find_element(by, value)
        # is available in Selenium 3 and 4.
        # NOTE(review): this takes the first element with id "video-title" on the
        # page - confirm it is the title of the video being watched.
        videos_dictionary['Video Title'][i] = driver.find_element(
            "xpath", '//*[@id="video-title"]'
        ).text
    except Exception as e:
        print(f"Error updating Video Title or Item for index {i}: {e}")

    # Scroll down to load comments
    try:
        comments = driver.find_element("xpath", '//*[@id="comments"]')
    except Exception as e:
        # No comments section: record an empty string so the row is dropped by
        # the later "Comments != ''" filter instead of aborting the whole run.
        print(f"Error updating Comments for index {i}: {e}")
        videos_dictionary['Comments'][i] = ''
        return
    driver.execute_script("arguments[0].scrollIntoView();", comments)
    last_height = driver.execute_script("return document.documentElement.scrollHeight")

    # Keep scrolling to the bottom until the page height stops growing,
    # i.e. no further content was added by the last scroll.
    while True:
        driver.execute_script("window.scrollTo(0, document.documentElement.scrollHeight);")
        time.sleep(1)
        new_height = driver.execute_script("return document.documentElement.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height

    username_elems = driver.find_elements("xpath", '//*[@id="author-text"]')
    comment_elems = driver.find_elements("xpath", '//*[@id="content-text"]')

    # One "author - text" line per comment; zip() stops at the shorter list.
    videos_dictionary['Comments'][i] = ''.join(
        user.text + ' - ' + comment.text + '\n'
        for user, comment in zip(username_elems, comment_elems)
    )
    


First, check whether the file used to save the scraped data exists. If it does not exist, create it. If it exists, check whether it is empty.

If it is not empty, extract data only from the links that are not already present in the file.

In [None]:
# Work out where scraping should start: if the output CSV already exists and
# can be read, start after the number of rows it holds; otherwise start at 0.
start = 0
name = 'Youtube_scrapping_comments.csv'
file_name = pathlib.Path(name)
if file_name.exists():
    print ("File exist.")
    try:
        print("Reading the file now")
        df_temp = pd.read_csv(name, index_col = 0)
        start = len(df_temp)
    except pd.errors.EmptyDataError:
        # read_csv raises this for a file with no content at all
        print("File is empty")
    except Exception as e:
        # Any other read problem: still start from 0, but say what went wrong
        print("File could not be read - ", e)
else:
    print ("File does not exist\n", "Creating the file")
    # The with-block closes the handle immediately; it used to be left open
    with open(name, "w+") as file:
        pass
    print("File created successfully with filename - ",name)
print("Starting from position - ", start)

In [None]:
# Scrape the links from the resume position up to (not including) row 309.
# 309 is a hard-coded split point between this cell and the next one; the
# bound is clamped to the table length so a shorter final_df cannot raise
# an IndexError.
for num in range(start, min(309, len(final_df))):
    url = final_df['Links'].iloc[num]
    print("Loop entered")
    print("getting link-", num)
    print("=" * 40)
    print("Scraping " + url)
    try:
        # scrap() opens the page itself, so it is not loaded separately here
        scrap(url, num)
    except Exception as e:
        print("Not getting", e)
        # Store an empty comment string so this row is removed later together
        # with the other videos that have no comments.
        videos_dictionary['Comments'].setdefault(num, '')
        continue
    print("=" * 40)
print("Process ended successfully")


In [None]:
# Scrape the remaining links. The previous cell stops before row 309, so this
# one continues from 309 (it used to start at 310, which skipped row 309) or
# from the resume position, whichever is later.
for num in range(max(start, 309), len(final_df)):
    url = final_df['Links'].iloc[num]
    print("Loop entered")
    print("getting link-", num)
    print("=" * 40)
    print("Scraping " + url)
    try:
        # scrap() opens the page itself, so it is not loaded separately here
        scrap(url, num)
    except Exception as e:
        print("Not getting", e)
        # Store an empty comment string so this row is removed later together
        # with the other videos that have no comments.
        videos_dictionary['Comments'].setdefault(num, '')
        continue
    print("=" * 40)
print("Process ended successfully")

In [None]:
# Turn the per-column dictionaries into a table (columns are the outer keys,
# rows the scraped indices) and show ten random rows.
data = pd.DataFrame(videos_dictionary)
data.sample(n=10)

In [None]:
# (rows, columns) of the scraped table before any filtering.
data.shape

In [None]:
# Keep only videos for which comment text was collected. Missing values are
# excluded as well: NaN passes a plain "!= ''" comparison and would later make
# remove_URL fail, because it expects a string.
has_comments = data['Comments'].notna() & (data['Comments'] != '')
filtered_data = data[has_comments]
data = filtered_data.reset_index(drop=True)

In [None]:
# Preview the first rows after removing videos without comments.
data.head()

In [None]:
# Print the comments of the first video. The 'Cleaned_Comments' column is only
# created in a later cell, so the raw 'Comments' column is read here; reading
# 'Cleaned_Comments' at this point raised a KeyError on a fresh run.
first_row = data.iloc[0]
comment = first_row['Comments']
print(comment)

In [None]:
#Removing URLs
import re
def remove_URL(text):
    """Return ``text`` with every ``http://``/``https://`` or ``www.`` token removed.

    A token runs from the prefix up to the next whitespace character.
    """
    url_pattern = re.compile(r"https?://\S+|www\.\S+")
    return url_pattern.sub("", text)

# Add a 'Cleaned_Comments' column: the scraped comments with URLs stripped out.
data['Cleaned_Comments'] = data['Comments'].map(remove_URL)
data.head()

In [None]:
# Preview the table again, now including the 'Cleaned_Comments' column.
data.head()

In [None]:
# Remove the raw 'Comments' column, leaving 'Cleaned_Comments' in its place.
data = data.drop(columns=['Comments'])
data.head()

#### I prefer not to remove the smileys, as they also convey emotion. Instead of removing them, we can keep them encoded as UTF-8 so that our model can also understand what the commenter wants to say.

In [None]:
# def remove_special_characters(text):
#     emoji_pattern = re.compile(
#         '['
#         u'\U0001F600-\U0001F64F'  # emoticons
#         u'\U0001F300-\U0001F5FF'  # symbols & pictographs
#         u'\U0001F680-\U0001F6FF'  # transport & map symbols
#         u'\U0001F1E0-\U0001F1FF'  # flags (iOS)
#         u'\U00002702-\U000027B0'
#         u'\U000024C2-\U0001F251'
#         ']+',
#         flags=re.UNICODE)
#     return emoji_pattern.sub(r'', text)
# results['Cleaned_Comments'] = results['Cleaned_Comments'].apply(remove_special_characters)
# results
# Write the cleaned table to disk as UTF-8 CSV, without the row index.
csv_file_path = 'Youtube_scrapping_comments.csv'
data.to_csv(path_or_buf=csv_file_path, index=False, encoding='utf-8')


In [None]:
# End the whole browser session. close() only closes the current window and
# leaves the driver process running; quit() shuts both down.
driver.quit()