# Selenium-based Web Scraping of The Times of India (TOI)

### AUTOMATED COMMENTS EXTRACTION

In [1]:
# Import required libraries
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import WebDriverException, TimeoutException, NoSuchElementException
import time
import os
import wget
import json
import csv

In [2]:
# Set up ChromeDriver
# NOTE: the chromedriver path below is hard-coded for one machine; point it at
# the local chromedriver executable before running.
service = Service('E:/Programs/Chrome Driver/chromedriver.exe')
# One browser session, shared by the cells below through the `driver` name.
driver = webdriver.Chrome(service=service)

In [3]:
# Open the Times of India headlines page
# (the article URLs scraped later are listed explicitly, so this page load only
# matters if the commented-out headline-link discovery is re-enabled)
driver.get("https://timesofindia.indiatimes.com/news")

In [4]:
# Locate all headline links using the provided XPaths
# (automatic discovery is currently disabled; uncomment the lines below to
# collect the de-duplicated headline links from the page opened by `driver`)
# headline_links = driver.find_elements(By.XPATH, "//li//a[contains(@class, 'nmRcl')]")
# links = [link.get_attribute('href') for link in headline_links]
# links = list(set(links))

# print(f"Found {len(links)} headline links.")

# Articles to scrape: a single hard-coded article URL is used for now.
links = ["https://timesofindia.indiatimes.com/india/disturbing-priyanka-gandhi-voices-concern-over-bangladeshs-religious-violence-seeks-urgent-action/articleshow/112469272.cms"]

# Echo the numbered list of article URLs that will be processed.
for idx, link in enumerate(links, start=1):
    print(f"{idx}: {link}")

1: https://timesofindia.indiatimes.com/india/disturbing-priyanka-gandhi-voices-concern-over-bangladeshs-religious-violence-seeks-urgent-action/articleshow/112469272.cms


In [13]:
def scroll_into_view(driver, element):
    """Scroll the page to *element* via JavaScript, then pause for one second.

    Args:
        driver: WebDriver used to run the script.
        element: element handed to the script as ``arguments[0]``.
    """
    reveal_script = "arguments[0].scrollIntoView(true);"
    driver.execute_script(reveal_script, element)
    # Short fixed pause so the page can settle before the caller interacts.
    time.sleep(1)
# Upper bound on repeated "load more" style clicks, so a button that never
# disappears cannot keep the scraper looping forever.
_MAX_EXPAND_CLICKS = 200

# Vote counters are located the same way for comments and for replies.
_UPVOTES_XPATH = ".//*[contains(concat(' ', @class, ' '), ' LmcfZ ')]//span"
_DOWNVOTES_XPATH = ".//*[contains(concat(' ', @class, ' '), ' Qh8bj ')]//span"


def _article_id_from_url(article_url):
    """Return the numeric id that ends a TOI article URL, or None.

    Example: ".../articleshow/112469272.cms" -> "112469272".
    """
    # Ignore any query string / fragment before inspecting the last segment.
    path = article_url.split('?', 1)[0].split('#', 1)[0]
    last_segment = path.rstrip('/').rsplit('/', 1)[-1]
    candidate = last_segment.split('.', 1)[0]
    return candidate if candidate.isdigit() else None


def _comment_items_xpath(article_url):
    """Build the XPath selecting the top-level comment <li> items of an article."""
    article_id = _article_id_from_url(article_url)
    if article_id is not None:
        # The container id carries the article id taken from the URL.
        container = f"//*[@id='comment-container-{article_id}']"
    else:
        # Unrecognised URL shape: fall back to any comment container on the page.
        container = "//*[starts-with(@id, 'comment-container-')]"
    return container + "/div/div[1]/div[2]/div[1]/div/div/ul/li"


def _load_all_comments(driver):
    """Click "View more comments" until it stops appearing (or clicking fails)."""
    view_more_xpath = "//div[contains(concat(' ', @class, ' '), ' hduJ6 ') and contains(text(), 'View more comments')]"
    for _ in range(_MAX_EXPAND_CLICKS):
        try:
            # Wait for the "View more comments" button to become clickable
            view_more_button = WebDriverWait(driver, 5).until(
                EC.element_to_be_clickable((By.XPATH, view_more_xpath))
            )
            driver.execute_script("arguments[0].scrollIntoView(true);", view_more_button)
            view_more_button.click()
        except TimeoutException:
            # No button became clickable within the wait: everything is loaded.
            print("Finished loading all the comments")
            return
        except NoSuchElementException:
            print("No 'View more comments' button found.")
            return
        except WebDriverException as e:
            # e.g. the button went stale or was covered by another element.
            # Keep the comments already loaded instead of aborting the article.
            print(f"Stopped loading more comments: {e}")
            return
    print(f"Stopped after {_MAX_EXPAND_CLICKS} 'View more comments' clicks.")


def _extract_replies(comment, article_url):
    """Return the list of reply dicts found under one comment element."""
    replies = []
    for reply in comment.find_elements(By.XPATH, ".//div[5]/ul/li"):
        try:
            replies.append({
                'user': reply.find_element(By.XPATH, "./div[1]/h3").text,
                'comment_text': reply.find_element(By.XPATH, "./div[3]").text,
                'upvotes': reply.find_element(By.XPATH, _UPVOTES_XPATH).text,
                'downvotes': reply.find_element(By.XPATH, _DOWNVOTES_XPATH).text,
                'article_link': article_url
            })
        except Exception as e:
            # A malformed reply is reported and skipped; the others are kept.
            print(f"Error extracting reply comment: {e}")
    return replies


def _extract_comment(driver, comment, article_url):
    """Return the dict for one top-level comment element, including its replies."""
    user = comment.find_element(By.CLASS_NAME, "ZJ4ae").text
    comment_text = comment.find_element(By.CLASS_NAME, "mxnGH").text
    upvotes = comment.find_element(By.XPATH, _UPVOTES_XPATH).text
    downvotes = comment.find_element(By.XPATH, _DOWNVOTES_XPATH).text

    # Check and click "Show responses" link to load nested replies
    try:
        show_responses_link = comment.find_element(By.XPATH, "./div[4]/a")
        scroll_into_view(driver, show_responses_link)
        show_responses_link.click()
        time.sleep(2)
    except NoSuchElementException:
        # Comment has no responses link: nothing to expand.
        pass

    # Keep expanding while a "Show all responses" button is present (bounded).
    show_all_xpath = ".//div[contains(concat(' ', @class, ' '), ' evC4f ') and contains(text(), 'Show all responses')]"
    for _ in range(_MAX_EXPAND_CLICKS):
        try:
            comment.find_element(By.XPATH, show_all_xpath).click()
            time.sleep(2)
        except NoSuchElementException:
            break  # No more "Show all responses" button is found

    return {
        'user': user,
        'comment_text': comment_text,
        'upvotes': upvotes,
        'downvotes': downvotes,
        'replies': _extract_replies(comment, article_url),
        'article_link': article_url
    }


def extract_comments(driver, article_url):
    """Scrape the reader comments (and their replies) of one article.

    The page is loaded and scrolled so the "End of Article" marker sits in the
    middle of the viewport, the comment button is clicked, "View more
    comments" is clicked until it no longer appears, and every comment item
    is read. The comment list is located through the article id embedded in
    ``article_url``, so the function works for any article, not just one.

    Args:
        driver: Selenium WebDriver used to drive the browser.
        article_url: URL of the article to scrape.

    Returns:
        A list of dicts with keys ``user``, ``comment_text``, ``upvotes``,
        ``downvotes``, ``replies`` and ``article_link``. Vote counts are the
        element text (strings). ``replies`` is a list of dicts with the same
        keys minus ``replies``. An empty list is returned when the page cannot
        be loaded, the end-of-article marker is missing, or no comment button
        is found.
    """
    try:
        driver.get(article_url)

        # Locate the "End of Article" element
        end_of_article_div = driver.find_element(By.XPATH, "//*[contains(concat(' ', @class, ' '), ' mj2wg ')]//span[contains(text(), 'End of Article')]")

        # Scroll so that the marker ends up vertically centred in the viewport.
        element_position = end_of_article_div.location['y']
        viewport_height = driver.execute_script("return window.innerHeight")
        scroll_position = element_position - (viewport_height / 2)
        driver.execute_script(f"window.scrollTo(0, {scroll_position});")
        time.sleep(2)
    except WebDriverException as e:
        print(f"Failed to load page {article_url}: {e}")
        return []  # Return an empty list to skip this article

    # Prepare a list to store extracted comments data
    comments_data = []

    try:
        # Open the comment section.
        view_comment_btn = driver.find_element(By.XPATH, "//*[contains(concat(' ', @class, ' '), ' GzGIQ ') and (((count(preceding-sibling::*) + 1) = 1) and parent::*)]")
        view_comment_btn.click()
        time.sleep(2)

        _load_all_comments(driver)

        # Locate and extract all comments
        comments = driver.find_elements(By.XPATH, _comment_items_xpath(article_url))
        for comment in comments:
            try:
                comments_data.append(_extract_comment(driver, comment, article_url))
            except Exception as e:
                # One broken comment must not discard the rest of the article.
                print(f"Error extracting comment: {e}")

    except NoSuchElementException:
        print("No comment button found, skipping this article.")
    except Exception as e:
        print(f"Error clicking comment button: {e}")

    return comments_data


# Scrape every article in `links`, pooling the per-article results.
all_comments = []
for link in links:
    print(f"Processing article: {link}")
    comments = extract_comments(driver, link)
    all_comments += comments
    print(f"Extracted {len(comments)} comments from {link}")

# Grand total across all processed articles.
print(f"Total comments extracted: {len(all_comments)}")


Processing article: https://timesofindia.indiatimes.com/india/disturbing-priyanka-gandhi-voices-concern-over-bangladeshs-religious-violence-seeks-urgent-action/articleshow/112469272.cms
Finished loading all the comments
Extracted 51 comments from https://timesofindia.indiatimes.com/india/disturbing-priyanka-gandhi-voices-concern-over-bangladeshs-religious-violence-seeks-urgent-action/articleshow/112469272.cms
Total comments extracted: 51


In [11]:
# Dump the `comments` list for a quick visual check of the scraped records.
print(comments)

[{'user': 'Ashish Kumar', 'comment_text': "Priyanka Gandhi, your originality is minority in India. don't try to fool Hindus. you have dragons tear", 'upvotes': '31', 'downvotes': '5', 'replies': [], 'article_link': 'https://timesofindia.indiatimes.com/india/disturbing-priyanka-gandhi-voices-concern-over-bangladeshs-religious-violence-seeks-urgent-action/articleshow/112469272.cms'}, {'user': 'Nation First', 'comment_text': "Priyanka Vadra is either ashamed or scared of calling the minorities as 'Hindus'. This is the mentality of the Vadra Khangress, a torchbearer of fake secularism. Shame on this so-called national party.", 'upvotes': '27', 'downvotes': '2', 'replies': [], 'article_link': 'https://timesofindia.indiatimes.com/india/disturbing-priyanka-gandhi-voices-concern-over-bangladeshs-religious-violence-seeks-urgent-action/articleshow/112469272.cms'}, {'user': 'S. S. Rana', 'comment_text': 'Bachhiya (बछिया ) ko to hosh aa gaya. Bachhda (बछड़ा) to abhi bi behosh hai.', 'upvotes': '23'

In [None]:
# def scroll_into_view(driver, element):
#     driver.execute_script("arguments[0].scrollIntoView(true);", element)
#     time.sleep(1)
    
# def extract_comments(driver, article_url):
#     try:
#         driver.get(article_url)
        
#         # Locate the "End of Article" element
#         end_of_article_div = driver.find_element(By.XPATH, "//*[contains(concat(' ', @class, ' '), ' mj2wg ')]//span[contains(text(), 'End of Article')]")
        
#         # Get the position of the element
#         element_position = end_of_article_div.location['y']
        
#         # Get the viewport height
#         viewport_height = driver.execute_script("return window.innerHeight")
        
#         # Calculate the scroll position to center the element in the viewport
#         scroll_position = element_position - (viewport_height / 2)
        
#         # Scroll to the calculated position
#         driver.execute_script(f"window.scrollTo(0, {scroll_position});")
#         time.sleep(2)
#     except WebDriverException as e:
#         print(f"Failed to load page {article_url}: {e}")
#         return []  # Return an empty list to skip this article
    
#     # Locate comment buttons using the corrected XPath
#     comment_button = driver.find_elements(By.XPATH, "//*[contains(concat(' ', @class, ' '), ' GzGIQ ') and @data-ga='click|readComments_click']")
    
#     # Prepare a list to store extracted comments data
#     comments_data = []

#     if not comment_button:
#             print(f"No comment buttons found on {article_url}")
#             return comments_data

#     for btn in comment_button:
#         try:
#             btn.click()
#             time.sleep(2)
            
#             # Keep clicking "VIEW MORE COMMENTS" until it is no longer available
#             while True:
#                 try:
#                     view_more_button = WebDriverWait(driver, 5).until(
#                         EC.element_to_be_clickable((By.XPATH, "//div[contains(concat(' ', @class, ' '), ' hduJ6 ') and contains(text(), 'View more comments')]"))
#                     )
#                     driver.execute_script("arguments[0].scrollIntoView(true);", view_more_button)
#                     view_more_button.click()
#                     time.sleep(5)
#                 except NoSuchElementException:
#                     break  # Exit loop if no more "VIEW MORE COMMENTS" button is found
            
#             # Locate and extract all comments
#             comments = driver.find_elements(By.XPATH, "//li[@class='Nuk1p']")
#             print(f"comments: {comments}")
#             if not comments:
#                     print("No comments found even after loading the comment section.")
            
#             for comment in comments:
#                 try:
#                     # Extract details from each comment
#                     user = comment.find_element(By.CLASS_NAME, "ZJ4ae").text
#                     comment_text = comment.find_element(By.CLASS_NAME, "mxnGH").text
                    
#                     # Extract upvotes and downvotes using XPath
#                     upvotes = comment.find_element(By.XPATH, ".//*[contains(concat(' ', @class, ' '), ' LmcfZ ')]//span").text
#                     downvotes = comment.find_element(By.XPATH, ".//*[contains(concat(' ', @class, ' '), ' Qh8bj ')]//span").text
                    
#                     # # Check and click "Show responses" link to load nested replies
#                     # try:
#                     #     show_responses_link = comment.find_element(By.XPATH, ".//a[contains(concat(' ', @class, ' '), ' hwy4z ') and contains(text(), 'Show responses')]")
#                     #     scroll_into_view(driver, show_responses_link)
#                     #     show_responses_link.click()
#                     #     time.sleep(2)
#                     # except NoSuchElementException:
#                     #     pass
                    
#                     # # Locate nested replies after they are loaded
#                     # replies = []
#                     # while True:
#                     #     try:
#                     #         show_all_responses_button = comment.find_element(By.XPATH, ".//div[contains(concat(' ', @class, ' '), ' evC4f ') and contains(text(), 'Show all responses')]")
#                     #         show_all_responses_button.click()
#                     #         time.sleep(2)  # Wait for more replies to load
#                     #     except NoSuchElementException:
#                     #         break  # Exit loop if no more "Show all responses" button is found
                    
#                     # reply_elements = comment.find_elements(By.XPATH, ".//ul[contains(@class, 'reply-comments')]//li[@class='Nuk1p   ']")
                    
#                     # for reply in reply_elements:
#                     #     try:
#                     #         reply_user = reply.find_element(By.CLASS_NAME, "ZJ4ae").text
#                     #         reply_text = reply.find_element(By.CLASS_NAME, "mxnGH").text
#                     #         reply_upvotes = reply.find_element(By.XPATH, ".//*[contains(concat(' ', @class, ' '), ' LmcfZ ')]//span").text
#                     #         reply_downvotes = reply.find_element(By.XPATH, ".//*[contains(concat(' ', @class, ' '), ' Qh8bj ')]//span").text
                            
#                     #         replies.append({
#                     #             'user': reply_user,
#                     #             'comment_text': reply_text,
#                     #             'upvotes': reply_upvotes,
#                     #             'downvotes': reply_downvotes,
#                     #             'article_link': article_url
#                     #         })
#                     #     except Exception as e:
#                     #         print(f"Error extracting reply comment: {e}")
                    
#                     # Append the extracted details to comments_data with nested replies
#                     comments_data.append({
#                         'user': user,
#                         'comment_text': comment_text,
#                         'upvotes': upvotes,
#                         'downvotes': downvotes,
#                         'article_link': article_url,
#                         # 'replies': replies
#                     })
#                 except Exception as e:
#                     print(f"Error extracting comment: {e}")
#         except Exception as e:
#             print(f"Error clicking comment button: {e}")
    
#     return comments_data


# all_comments = []
# for link in links:
#     print(f"Processing article: {link}")
#     comments = extract_comments(driver, link)
#     all_comments.extend(comments)
#     print(f"Extracted {len(comments)} comments from {link}")

# print(f"Total comments extracted: {len(all_comments)}")


In [14]:
import csv
import json
import os

# Save to JSON file
json_output_path = os.path.join(os.getcwd(), "times_of_india_comments.json")
# Write UTF-8 explicitly (matching the CSV below) and keep non-ASCII comment
# text readable in the file instead of emitting \uXXXX escapes.
with open(json_output_path, 'w', encoding='utf-8') as json_file:
    json.dump(all_comments, json_file, indent=4, ensure_ascii=False)

print(f"Comments saved to JSON file: {json_output_path}")

# Save to CSV file
csv_output_path = os.path.join(os.getcwd(), "times_of_india_comments.csv")
with open(csv_output_path, 'w', newline='', encoding='utf-8') as csv_file:
    # Define the fieldnames, including for replies
    fieldnames = ['user', 'comment_text', 'upvotes', 'downvotes', 'article_link', 'reply_user', 'reply_text', 'reply_upvotes', 'reply_downvotes']
    # Columns missing from a row dict are written as '' (DictWriter's default
    # restval), so each row only names the columns it actually fills.
    writer = csv.DictWriter(csv_file, fieldnames=fieldnames)

    writer.writeheader()

    for comment in all_comments:
        # Main comment row: the reply_* columns stay empty.
        writer.writerow({
            'user': comment['user'],
            'comment_text': comment['comment_text'],
            'upvotes': comment['upvotes'],
            'downvotes': comment['downvotes'],
            'article_link': comment['article_link']
        })

        # Each reply gets its own row directly below its parent comment, with
        # the main-comment columns left empty.
        for reply in comment['replies']:
            writer.writerow({
                'article_link': reply['article_link'],
                'reply_user': reply['user'],
                'reply_text': reply['comment_text'],
                'reply_upvotes': reply['upvotes'],
                'reply_downvotes': reply['downvotes']
            })

print(f"Comments saved to CSV file: {csv_output_path}")


Comments saved to JSON file: E:\BTP\Selenium Scrapping\TOI_bot\times_of_india_comments.json
Comments saved to CSV file: E:\BTP\Selenium Scrapping\TOI_bot\times_of_india_comments.csv
