# Selenium-based Web Scraping of The Times of India (TOI)

### AUTOMATED COMMENTS EXTRACTION

In [41]:
# Import required libraries
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.common.exceptions import WebDriverException, NoSuchElementException
import time
import os
import wget
import json
import csv

In [42]:
# Set up ChromeDriver.
# The chromedriver location is machine-specific, so it can be overridden with the
# CHROMEDRIVER_PATH environment variable; when unset, the original path is used.
CHROMEDRIVER_PATH = os.environ.get('CHROMEDRIVER_PATH', 'E:/Programs/Chrome Driver/chromedriver.exe')
service = Service(CHROMEDRIVER_PATH)
driver = webdriver.Chrome(service=service)

In [46]:
# Open the Times of India news listing page in the browser session created above.
# The headline anchors are read from whatever page the driver currently shows.
driver.get("https://timesofindia.indiatimes.com/news")

In [47]:
# Locate all headline anchors on the listing page.
headline_links = driver.find_elements(By.XPATH, "//li//a[contains(@class, 'nmRcl')]")

# get_attribute('href') returns None for anchors without an href; drop those so
# they are never treated as article URLs. dict.fromkeys removes duplicates while
# keeping the order in which links appear on the page, so repeated runs visit
# articles in a stable order (set() ordering is arbitrary between runs).
links = list(dict.fromkeys(
    href
    for href in (anchor.get_attribute('href') for anchor in headline_links)
    if href
))

print(f"Found {len(links)} headline links.")

# Debugging aid: uncomment to run the rest of the notebook against one known article.
# links = ["https://timesofindia.indiatimes.com/world/south-asia/i-could-have-stayed-in-power-if-sheikh-hasina-claims-us-role-in-her-ouster-from-bangladesh/articleshow/112441112.cms"]

for idx, link in enumerate(links, start=1):
    print(f"{idx}: {link}")

Found 30 headline links.
1: https://timesofindia.indiatimes.com/india/disturbing-priyanka-gandhi-voices-concern-over-bangladeshs-religious-violence-seeks-urgent-action/articleshow/112469272.cms
2: https://timesofindia.indiatimes.com/india/kolkata-doctor-rape-murder-case-accused-was-addicted-to-violent-porn-known-as-womaniser/articleshow/112475334.cms
3: https://timesofindia.indiatimes.com/world/europe/how-billions-in-dollar-euro-notes-are-entering-russia-despite-strict-global-sanctions-report/articleshow/112469939.cms
4: https://timesofindia.indiatimes.com/sports/cricket/news/rohit-sharma-virat-kohli-can-play-for-harbhajan-singh-makes-a-big-statement-on-star-indian-duos-future/articleshow/112470145.cms
5: https://timesofindia.indiatimes.com/sports/paris-olympics-2024/indiaparis/making-excuses-is-where-our-country-will-win-gold-medals-gavaskar-backs-prakash-padukone-over-lakshya-sen-criticism/articleshow/112470389.cms
6: https://timesofindia.indiatimes.com/india/umpire-compromised-vs-un

In [48]:
# Comment extraction: scrape the reader comments of one article, then run it over every headline link

def _parse_comment(comment, article_url):
    """Build one comment record from a comment ``<li>`` element.

    Raises whatever the element lookups raise (e.g. ``NoSuchElementException``)
    when one of the expected sub-elements is missing.
    """
    return {
        'user': comment.find_element(By.CLASS_NAME, "ZJ4ae").text,
        'comment_text': comment.find_element(By.CLASS_NAME, "mxnGH").text,
        'upvotes': comment.find_element(By.XPATH, ".//*[contains(concat(' ', @class, ' '), ' LmcfZ ')]//span").text,
        'downvotes': comment.find_element(By.XPATH, ".//*[contains(concat(' ', @class, ' '), ' Qh8bj ')]//span").text,
        'article_link': article_url
    }


def _load_all_comments(driver, max_clicks):
    """Click "View more comments" until nothing more can be loaded.

    Stops when the button is no longer found, when finding or clicking it fails
    for any other WebDriver reason, or after ``max_clicks`` clicks (``None``
    disables the cap). It never raises for those conditions, so the caller can
    still harvest the comments that are already on the page.
    """
    clicks = 0
    while max_clicks is None or clicks < max_clicks:
        try:
            view_more_button = driver.find_element(By.XPATH, "//div[contains(concat(' ', @class, ' '), ' hduJ6 ') and contains(text(), 'View more comments')]")
            view_more_button.click()
        except NoSuchElementException:
            return  # Button is gone: every comment has been loaded
        except WebDriverException as e:
            # e.g. click intercepted or stale element: stop paging, keep what is loaded
            print(f"Stopped loading more comments: {e.msg}")
            return
        clicks += 1
        time.sleep(2)
    print(f"Reached the limit of {max_clicks} 'View more comments' clicks; remaining comments were not loaded.")


def extract_comments(driver, article_url: str, max_view_more_clicks=500) -> list:
    """Open an article and scrape the reader comments shown on it.

    Args:
        driver: Selenium WebDriver used to load and interact with the page.
        article_url: URL of the article to open.
        max_view_more_clicks: Upper bound on "View more comments" clicks per
            comment button, so a button that never disappears cannot hang the
            run. Pass ``None`` for no limit.

    Returns:
        A list of dicts with the keys ``user``, ``comment_text``, ``upvotes``,
        ``downvotes`` and ``article_link``. Each comment element contributes at
        most one record. The list is empty when the page cannot be loaded or
        has no "End of Article" marker.
    """
    try:
        driver.get(article_url)

        # Locate the "End of Article" element
        end_of_article_div = driver.find_element(By.XPATH, "//*[contains(concat(' ', @class, ' '), ' mj2wg ')]//span[contains(text(), 'End of Article')]")

        # Scroll so that the marker sits in the middle of the viewport
        element_position = end_of_article_div.location['y']
        viewport_height = driver.execute_script("return window.innerHeight")
        scroll_position = element_position - (viewport_height / 2)
        driver.execute_script(f"window.scrollTo(0, {scroll_position});")
        time.sleep(2)
    except NoSuchElementException:
        # Must come before WebDriverException (its base class); otherwise a missing
        # marker is reported as a page-load failure.
        print(f"No 'End of Article' marker found on {article_url}; skipping article.")
        return []
    except WebDriverException as e:
        print(f"Failed to load page {article_url}: {e}")
        return []  # Return an empty list to skip this article

    # Locate comment buttons; a page may contain more than one
    comment_buttons = driver.find_elements(By.XPATH, "//*[contains(concat(' ', @class, ' '), ' GzGIQ ') and @data-ga='click|readComments_click']")

    comments_data = []
    # The comment query below is page-wide, so every additional button that opens
    # the panel would yield the same elements again. Selenium WebElements compare
    # and hash by their element reference, so a set filters the repeats.
    seen_comments = set()

    for btn in comment_buttons:
        try:
            btn.click()
            time.sleep(2)

            _load_all_comments(driver, max_view_more_clicks)

            # NOTE(review): exact-match on the class string including trailing spaces,
            # unlike the token matching used elsewhere; presumably intentional - confirm.
            comments = driver.find_elements(By.XPATH, "//li[@class='Nuk1p   ']")

            for comment in comments:
                if comment in seen_comments:
                    continue
                try:
                    record = _parse_comment(comment, article_url)
                except Exception as e:
                    print(f"Error extracting comment: {e}")
                    continue
                seen_comments.add(comment)
                comments_data.append(record)
        except Exception as e:
            print(f"Error clicking comment button: {e}")

    return comments_data


# Visit every headline in turn and pool the scraped comments into one list.
all_comments = []
for link in links:
    print(f"Processing article: {link}")
    comments = extract_comments(driver, link)
    all_comments += comments  # in-place list extension
    print(f"Extracted {len(comments)} comments from {link}")

print(f"Total comments extracted: {len(all_comments)}")

Processing article: https://timesofindia.indiatimes.com/india/disturbing-priyanka-gandhi-voices-concern-over-bangladeshs-religious-violence-seeks-urgent-action/articleshow/112469272.cms
Error clicking comment button: Message: element not interactable
  (Session info: chrome=127.0.6533.100)
Stacktrace:
	GetHandleVerifier [0x00007FF7693B9632+30946]
	(No symbol) [0x00007FF76936E3C9]
	(No symbol) [0x00007FF769266E09]
	(No symbol) [0x00007FF7692B9712]
	(No symbol) [0x00007FF7692AC271]
	(No symbol) [0x00007FF7692DCA6A]
	(No symbol) [0x00007FF7692ABBB6]
	(No symbol) [0x00007FF7692DCC80]
	(No symbol) [0x00007FF7692FB041]
	(No symbol) [0x00007FF7692DC813]
	(No symbol) [0x00007FF7692AA6E5]
	(No symbol) [0x00007FF7692AB021]
	GetHandleVerifier [0x00007FF7694EF83D+1301229]
	GetHandleVerifier [0x00007FF7694FBDB7+1351783]
	GetHandleVerifier [0x00007FF7694F2A03+1313971]
	GetHandleVerifier [0x00007FF7693EDD06+245686]
	(No symbol) [0x00007FF76937758F]
	(No symbol) [0x00007FF769373804]
	(No symbol) [0x00

In [56]:
import csv
import json
import os

# Both exports are written to the current working directory.
json_output_path = os.path.join(os.getcwd(), "times_of_india_comments.json")
csv_output_path = os.path.join(os.getcwd(), "times_of_india_comments.csv")

# JSON export: the full list of comment records, pretty-printed.
with open(json_output_path, 'w') as json_file:
    json.dump(all_comments, json_file, indent=4)

print(f"Comments saved to JSON file: {json_output_path}")

# CSV export: one row per comment, columns in a fixed order.
with open(csv_output_path, 'w', newline='', encoding='utf-8') as csv_file:
    fieldnames = ['user', 'comment_text', 'upvotes', 'downvotes', 'article_link']
    writer = csv.DictWriter(csv_file, fieldnames=fieldnames)

    writer.writeheader()
    writer.writerows(all_comments)

print(f"Comments saved to CSV file: {csv_output_path}")


Comments saved to JSON file: E:\BTP\Selenium Scrapping\TOI_bot\times_of_india_comments.json
Comments saved to CSV file: E:\BTP\Selenium Scrapping\TOI_bot\times_of_india_comments.csv
