# Selenium based Web Scraping of TOI

### AUTOMATED COMMENTS EXTRACTION

In [30]:
# Import required libraries
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import WebDriverException, TimeoutException, NoSuchElementException
import time
import os
import wget
import json
import csv

In [41]:
# Set up ChromeDriver
service = Service('E:/Programs/Chrome Driver/chromedriver.exe')
driver = webdriver.Chrome(service=service)

In [42]:
# Open the Times of India page
driver.get("https://timesofindia.indiatimes.com/")

In [4]:
# Sleep for a specific duration (e.g., 5 seconds)
time.sleep(5)

# Locate the element and click it
target_div = driver.find_element(By.XPATH, '//*[@id="app"]/div/div[3]/div/div[3]/div/div/div/div[3]')
target_div.click()

# Wait for the new div to be visible
new_div = driver.find_element(By.XPATH, '//*[@id="app"]/div/div[3]/div/div[3]/div/div/div/div[4]')
# Initialize an empty list to store all hrefs
hrefs_with_categories = []

# Define the maximum number of vertical divs for each column
max_j_values = [2, 4, 5, 5, 3]

# Iterate over each column (div[i])
for i in range(1, 6):  # i goes from 1 to 5
    for j in range(1, max_j_values[i - 1] + 1):  # j goes from 1 to max value for that column
        try:
            # Construct the XPath for the ul element inside div[i]/div[j]
            ul_xpath = f'//*[@id="app"]/div/div[3]/div/div[3]/div/div/div/div[4]/div[{i}]/div[{j}]/ul'
            # Construct the XPath for the category name inside div[i]/div[j]
            category_xpath_a = f'//*[@id="app"]/div/div[3]/div/div[3]/div/div/div/div[4]/div[{i}]/div[{j}]/h3/a'
            category_xpath_h3 = f'//*[@id="app"]/div/div[3]/div/div[3]/div/div/div/div[4]/div[{i}]/div[{j}]/h3'

            # Find the ul element
            ul_element = driver.find_element(By.XPATH, ul_xpath)

            # Try to find the category name within an <a> tag
            try:
                category_element = driver.find_element(By.XPATH, category_xpath_a)
                category_name = category_element.text
            except:
                # If <a> tag is not found, try to find the category name directly within <h3>
                category_element = driver.find_element(By.XPATH, category_xpath_h3)
                category_name = category_element.text
            
            # Find all a tags inside li elements within this ul
            a_tags = ul_element.find_elements(By.XPATH, './/li/a')
            
            # Extract href attribute from each a tag and add it to the list
            for a in a_tags:
                href = a.get_attribute('href')
                hrefs_with_categories.append((href, category_name))
        
        except Exception as e:
            print(f"Error extracting from div[{i}]/div[{j}]: {e}")
            continue

print(len(hrefs_with_categories))

# for item in hrefs_with_categories:
#     print(item)

88


In [7]:
# Initialize a list to store href-category pairs (assuming you already have hrefs_with_categories list)
filtered_pairs = [
    (href, category) 
    for href, category in hrefs_with_categories 
    if href.startswith("https://timesofindia.indiatimes.com/") and not href.endswith(".cms")
]

# Extract tags from the filtered pairs
tags = [category for _, category in filtered_pairs]

# Convert tags list to a set to remove duplicates and then sort
tags_set = set(tags)
sorted_tags = sorted(tags_set)

# Print the number of filtered hrefs and the sorted tags
print("Number of filtered hrefs:", len(filtered_pairs))
print("Number of tags:", len(sorted_tags))

for item in filtered_pairs:
    print(item)

Number of filtered hrefs: 62
Number of tags: 17
('https://timesofindia.indiatimes.com/city', 'NEWS HOME')
('https://timesofindia.indiatimes.com/live-breaking-news', 'NEWS HOME')
('https://timesofindia.indiatimes.com/india', 'NEWS HOME')
('https://timesofindia.indiatimes.com/elections', 'NEWS HOME')
('https://timesofindia.indiatimes.com/politics', 'NEWS HOME')
('https://timesofindia.indiatimes.com/world', 'NEWS HOME')
('https://timesofindia.indiatimes.com/home/headlines', 'NEWS HOME')
('https://timesofindia.indiatimes.com/podcasts', 'NEWS HOME')
('https://timesofindia.indiatimes.com/specials', 'NEWS HOME')
('https://timesofindia.indiatimes.com/times-fact-check', 'NEWS HOME')
('https://timesofindia.indiatimes.com/india/the-times-of-a-better-india', 'NEWS HOME')
('https://timesofindia.indiatimes.com/south-pole-by-chennai-times', 'NEWS HOME')
('https://timesofindia.indiatimes.com/blogs/toi-editorials/', 'EDITORIALS')
('https://timesofindia.indiatimes.com/entertainment/movie-reviews', 'ENTE

In [37]:
test_pair = []
test_pair.append(('https://timesofindia.indiatimes.com/auto/motorsports/', 'AUTO'))
test_pair.append(('https://timesofindia.indiatimes.com/technology/mobiles-tabs', 'TECHNOLOGY'))

xpaths = [
    "//figcaption",
    "//*[contains(concat( ' ', @class, ' ' ), concat( ' ', 'sNF1c', ' ' ))] | //h5",
    "//*[contains(concat( ' ', @class, ' ' ), concat( ' ', 'top-newslist', ' ' ))]//a",
    "//*[contains(concat( ' ', @class, ' ' ), concat( ' ', 'title', ' ' ))]//span",
    "//*[contains(concat( ' ', @class, ' ' ), concat( ' ', 'desc', ' ' ))]//a"
]

all_links = []

for item in filtered_pairs:
    webpage_link = item[0]
    webpage_tag = item[1]
    
    # Navigate to the webpage
    driver.get(webpage_link)
    
    captions = []  # Reset captions for each webpage
    
    for xpath in xpaths:
        try:
            # Find elements for the current XPath
            elements = driver.find_elements(By.XPATH, xpath)
            
            # If elements are found, extend the captions list
            if elements:
                captions.extend(elements)
        except Exception as e:
            print(f"Error occurred for XPath {xpath}: {e}")  # Log the error for debugging
    
    # Print captions, if any
    if captions:
        hrefs = []

        for caption in captions:
            try:
                parent = caption
                anchor_found = False
                
                # Traverse up to 5 levels in the hierarchy to search for the <a> tag
                for i in range(5):
                    try:
                        # Try finding the <a> tag in the current parent level
                        parent_a_tag = parent.find_element(By.XPATH, "./parent::a")
                        
                        # If found, extract the href and break the loop
                        href = parent_a_tag.get_attribute("href")
                        if href:
                            hrefs.append(href)
                            anchor_found = True
                            break
                    except:
                        # Move to the next level in the hierarchy
                        parent = parent.find_element(By.XPATH, "./parent::*")
                
                # If no <a> tag is found after 5 levels, log the information
                if not anchor_found:
                    print(f"No <a> tag found within 5 levels for category: {webpage_tag}")
        
            except Exception as e:
                # Handle other unexpected exceptions
                print(f"Error processing caption: {caption.text} - {e}")

        # Now create the pairs of webpage_tag and hrefs, and add them to the all_links list
        for href in hrefs:
            all_links.append((webpage_tag, href))

    else:
        print(f"No captions found for {webpage_tag} ({webpage_link})")

for pair in all_links:
    print(f"Category: {pair[0]}, Link: {pair[1]}")

No <a> tag found within 5 levels for category: NEWS HOME
No <a> tag found within 5 levels for category: NEWS HOME
No <a> tag found within 5 levels for category: NEWS HOME
No <a> tag found within 5 levels for category: NEWS HOME
No <a> tag found within 5 levels for category: NEWS HOME
No <a> tag found within 5 levels for category: NEWS HOME
No <a> tag found within 5 levels for category: NEWS HOME
No <a> tag found within 5 levels for category: NEWS HOME
No <a> tag found within 5 levels for category: NEWS HOME
No <a> tag found within 5 levels for category: NEWS HOME
No <a> tag found within 5 levels for category: NEWS HOME
No <a> tag found within 5 levels for category: NEWS HOME
No <a> tag found within 5 levels for category: NEWS HOME
No <a> tag found within 5 levels for category: NEWS HOME
No <a> tag found within 5 levels for category: NEWS HOME
No <a> tag found within 5 levels for category: NEWS HOME
No <a> tag found within 5 levels for category: NEWS HOME
No <a> tag found within 5 level

In [43]:
def scroll_into_view(driver, element):
    driver.execute_script("arguments[0].scrollIntoView(true);", element)
    time.sleep(1)
def extract_comments(driver, article_url, cat_tag):
    try:
        driver.get(article_url)
        
        # Locate the "End of Article" element
        end_of_article_div = driver.find_element(By.XPATH, "//*[contains(concat(' ', @class, ' '), ' mj2wg ')]//span[contains(text(), 'End of Article')]")
        
        # Get the position of the element
        element_position = end_of_article_div.location['y']
        
        # Get the viewport height
        viewport_height = driver.execute_script("return window.innerHeight")
        
        # Calculate the scroll position to center the element in the viewport
        scroll_position = element_position - (viewport_height / 2)
        
        # Scroll to the calculated position
        driver.execute_script(f"window.scrollTo(0, {scroll_position});")
        time.sleep(2)
    except WebDriverException as e:
        print(f"Failed to load page {article_url}: {e}")
        return []  # Return an empty list to skip this article
    
    # Prepare a list to store extracted comments data
    comments_data = []
    
    try:
        # Locate the comment button using the corrected XPath
        view_comment_btn = driver.find_element(By.XPATH, "//*[contains(concat(' ', @class, ' '), ' GzGIQ ') and (((count(preceding-sibling::*) + 1) = 1) and parent::*)]")
        view_comment_btn.click()
        time.sleep(2)
        
        # Try clicking "VIEW MORE COMMENTS" buttons if available
        while True:
            try:
                # Wait for the "View more comments" button to become clickable
                view_more_button = WebDriverWait(driver, 5).until(
                    EC.element_to_be_clickable((By.XPATH, "//div[contains(concat(' ', @class, ' '), ' hduJ6 ') and contains(text(), 'View more comments')]"))
                )
                driver.execute_script("arguments[0].scrollIntoView(true);", view_more_button)
                view_more_button.click()
            except TimeoutException:
                # No more "View more comments" button found, exit loop
                print("Finished loading all the comments")
                break
            except NoSuchElementException:
                # Button not found, break the loop
                print("No 'View more comments' button found.")
                break
        
        # Locate and extract all comments
        comments = driver.find_elements(By.XPATH, "//*[contains(concat(' ', @class, ' '), concat(' ', 'Nuk1p', ' '))]")

        for comment in comments:
            try:
                try:
                    # Check if the "Read More" link exists 
                    read_more_link = comment.find_element(By.XPATH, ".//span[contains(text(), 'Read More')]")
                    driver.execute_script("arguments[0].click();", read_more_link)
                except NoSuchElementException:
                    # If no "Read More" link is found
                    pass
                    
                # Extract details from each comment
                user = comment.find_element(By.CLASS_NAME, "ZJ4ae").text
                comment_text = comment.find_element(By.CLASS_NAME, "mxnGH").text
                
                # Extract upvotes and downvotes using XPath
                upvotes = comment.find_element(By.XPATH, ".//*[contains(concat(' ', @class, ' '), ' LmcfZ ')]//span").text
                downvotes = comment.find_element(By.XPATH, ".//*[contains(concat(' ', @class, ' '), ' Qh8bj ')]//span").text
        
                # Initialize replies list before processing
                replies = []
        
                # Check and click "Show responses" link to load nested replies
                try:
                    show_responses_link = comment.find_element(By.XPATH, "./div[4]/a")
                    scroll_into_view(driver, show_responses_link)
                    show_responses_link.click()
                    time.sleep(2)
                except NoSuchElementException:
                    pass
                
                # Locate nested replies after they are loaded
                while True:
                    try:
                        show_all_responses_button = comment.find_element(By.XPATH, ".//div[contains(concat(' ', @class, ' '), ' evC4f ') and contains(text(), 'Show all responses')]")
                        show_all_responses_button.click()
                        time.sleep(2)
                    except NoSuchElementException:
                        break  # Exit loop if no more "Show all responses" button is found
                
                reply_elements = comment.find_elements(By.XPATH, ".//div[5]/ul/li")
                
                for reply in reply_elements:
                    try:
                        try:
                            # Check if the "Read More" link exists 
                            read_more_link = reply.find_element(By.XPATH, ".//span[contains(text(), 'Read More')]")
                            driver.execute_script("arguments[0].click();", read_more_link)
                        except NoSuchElementException:
                            # If no "Read More" link is found
                            pass
                            
                        reply_user = reply.find_element(By.XPATH, "./div[1]/h3").text
                        reply_to = reply.find_element(By.XPATH, "./div[2]/span[1]").text
                        reply_text = reply.find_element(By.XPATH, "./div[3]").text
                        reply_upvotes = reply.find_element(By.XPATH, ".//*[contains(concat(' ', @class, ' '), ' LmcfZ ')]//span").text
                        reply_downvotes = reply.find_element(By.XPATH, ".//*[contains(concat(' ', @class, ' '), ' Qh8bj ')]//span").text
                        
                        replies.append({
                            'user': reply_user,
                            'reply_to': reply_to,
                            'comment_text': reply_text,
                            'upvotes': reply_upvotes,
                            'downvotes': reply_downvotes,
                        })
                    except Exception as e:
                        print(f"Error extracting reply comment: {e}")
                
                # Append the extracted details to comments_data with article link
                comments_data.append({
                    'tag': cat_tag, 
                    'user': user,
                    'comment_text': comment_text,
                    'upvotes': upvotes,
                    'downvotes': downvotes,
                    'replies': replies,
                    'article_link': article_url
                })
            except Exception as e:
                print(f"Error extracting comment: {e}")

    except NoSuchElementException:
        print("No comment button found, skipping this article.")
    except Exception as e:
        print(f"Error clicking comment button: {e}")
    
    return comments_data


all_comments = []
for link in all_links:
    print(f"Processing article: {link[1]}")
    comments = extract_comments(driver, link[1], link[0])
    all_comments.extend(comments)
    print(f"Extracted {len(comments)} comments from {link[1]}")


Processing article: https://timesofindia.indiatimes.com/city/kochi/kerala-actor-mla-m-mukesh-named-in-rape-case-gets-court-shield-till-september-3/articleshow/112912562.cms
Finished loading all the comments
Extracted 4 comments from https://timesofindia.indiatimes.com/city/kochi/kerala-actor-mla-m-mukesh-named-in-rape-case-gets-court-shield-till-september-3/articleshow/112912562.cms
Processing article: https://timesofindia.indiatimes.com/city/mumbai/govt-nod-for-6ft-only-but-shivaji-statue-that-fell-stood-35ft-tall/articleshow/112909597.cms
Finished loading all the comments
Error extracting comment: Message: element click intercepted: Element <a data-ga="comment-show_responses-comment_section|commentid_2675773231" class="hwy4z">...</a> is not clickable at point (1005, 61). Other element would receive the click: <iframe id="google_ads_iframe_/7176/TOI/TOI_ROS/TOI_ROS_Comment_ATF_300_1" name="google_ads_iframe_/7176/TOI/TOI_ROS/TOI_ROS_Comment_ATF_300_1" title="3rd party ad content" widt

In [44]:
import csv
import json
import os

# Save to JSON file
json_output_path = os.path.join(os.getcwd(), "times_of_india_comments.json")
with open(json_output_path, 'w') as json_file:
    json.dump(all_comments, json_file, indent=4)

print(f"Comments saved to JSON file: {json_output_path}")

# Save to CSV file
csv_output_path = os.path.join(os.getcwd(), "times_of_india_comments.csv")
with open(csv_output_path, 'w', newline='', encoding='utf-8') as csv_file:
    # Define the fieldnames, including for replies
    fieldnames = ['tag', 'user', 'comment_text', 'upvotes', 'downvotes', 'article_link', 'reply_user', 'reply_to', 'reply_text', 'reply_upvotes', 'reply_downvotes']
    writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
    
    writer.writeheader()
    
    for comment in all_comments:
        # Write the main comment row
        writer.writerow({
            'tag': comment['tag'],
            'user': comment['user'],
            'comment_text': comment['comment_text'],
            'upvotes': comment['upvotes'],
            'downvotes': comment['downvotes'],
            # No reply for the main comment
            'reply_user': '',
            'reply_to': '',
            'reply_text': '',
            'reply_upvotes': '',
            'reply_downvotes': '',
            'article_link': comment['article_link']
        })
        
        # Write each reply as a separate row
        for reply in comment['replies']:
            writer.writerow({
                'tag': '',
                'user': '',  # No main comment info for replies
                'comment_text': '',
                'upvotes': '',
                'downvotes': '',
                'reply_user': reply['user'],
                'reply_to': reply['reply_to'],
                'reply_text': reply['comment_text'],
                'reply_upvotes': reply['upvotes'],
                'reply_downvotes': reply['downvotes'],
                'article_link': ''
            })

print(f"Comments saved to CSV file: {csv_output_path}")


Comments saved to JSON file: E:\BTP\Selenium Automation\TOI_bot\times_of_india_comments.json
Comments saved to CSV file: E:\BTP\Selenium Automation\TOI_bot\times_of_india_comments.csv
