# Selenium based Web Scraping of TOI

### AUTOMATED COMMENTS EXTRACTION

In [1]:
# Import required libraries
from selenium import webdriver
from selenium.webdriver import ActionChains
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import WebDriverException, TimeoutException, NoSuchElementException, StaleElementReferenceException, ElementClickInterceptedException
from datetime import datetime
import time
import os
import wget
import json
import csv

In [2]:
# Set up ChromeDriver
service = Service('E:/Programs/Chrome Driver/chromedriver.exe')
driver = webdriver.Chrome(service=service)

In [3]:
# Open the Times of India page
driver.get("https://timesofindia.indiatimes.com/")

In [4]:
# Sleep for a specific duration (e.g., 5 seconds)
time.sleep(5)

# Locate the element and click it
target_div = driver.find_element(By.XPATH, '//*[@id="app"]/div/div[3]/div/div[3]/div/div/div/div[3]')
target_div.click()

In [5]:
from selenium.common.exceptions import NoSuchElementException

# Wait for the new div to be visible
new_div = driver.find_element(By.XPATH, '//*[@id="app"]/div/div[3]/div/div[3]/div/div/div/div[4]')

# Initialize an empty list to store all hrefs
hrefs_with_categories = []

# Define the maximum number of vertical divs for each column
max_j_values = [2, 4, 5, 5, 3]

# Iterate over each column (div[i])
for i in range(1, 6):  # i goes from 1 to 5 - vertical columns 
    for j in range(1, max_j_values[i - 1] + 1):  # j goes from 1 to max value for that column
        try:
            # XPaths for category and <a> tags
            ul_xpath = f'//*[@id="app"]/div/div[3]/div/div[3]/div/div/div/div[4]/div[{i}]/div[{j}]/ul' 
            category_xpath_a = f'//*[@id="app"]/div/div[3]/div/div[3]/div/div/div/div[4]/div[{i}]/div[{j}]/h3/a'
            category_xpath_h3 = f'//*[@id="app"]/div/div[3]/div/div[3]/div/div/div/div[4]/div[{i}]/div[{j}]/h3'

            # Try to find the category name within an <a> tag or h3 tag
            try:
                category_element = driver.find_element(By.XPATH, category_xpath_a)
                category_name = category_element.text.strip('"').capitalize()
            except NoSuchElementException:
                category_element = driver.find_element(By.XPATH, category_xpath_h3)
                category_name = category_element.text.strip('"').capitalize()

            # Skip the extraction of specific categories under "Sports" and get only the parent href
            if category_name == "Sports":
                parent_href = category_element.get_attribute('href')
                hrefs_with_categories.append((parent_href, category_name))
                continue  # Skip further processing for this parent category

            # Find the ul element for categories
            ul_element = driver.find_element(By.XPATH, ul_xpath)

            # Find all <a> tags inside the bold category elements
            a_tags = ul_element.find_elements(By.XPATH, './/li/a')

            valid_categories = ["Politics", "Good News", "India", "Tech News"]
            valid_parent_categories = ["Education"]

            for a in a_tags:
                a_tag_text = a.text.strip()  # Strip whitespace
                if a_tag_text in valid_categories:
                    href = a.get_attribute('href')
                    hrefs_with_categories.append((href, a_tag_text))
                elif category_name in valid_parent_categories:
                    href = a.get_attribute('href')
                    hrefs_with_categories.append((href, category_name))

        except NoSuchElementException as e:
            print(f"Error extracting from div[{i}]/div[{j}]: {e}")
            continue
        except Exception as e:
            print(f"Unexpected error in div[{i}]/div[{j}]: {e}")
            continue

# Print the number of hrefs collected and the collected data
print(f"Total hrefs collected: {len(hrefs_with_categories)}")

for item in hrefs_with_categories:
    print(item)


Total hrefs collected: 9
('https://timesofindia.indiatimes.com/india', 'India')
('https://timesofindia.indiatimes.com/politics', 'Politics')
('https://timesofindia.indiatimes.com/good-news/goodnews.cms', 'Good News')
('https://timesofindia.indiatimes.com/sports', 'Sports')
('https://www.bennett.edu.in/', 'Education')
('https://toistudent.timesofindia.indiatimes.com/', 'Education')
('https://timesofindia.indiatimes.com/education/times-study-abroad', 'Education')
('https://timesofindia.indiatimes.com/education/jobs', 'Education')
('https://timesofindia.indiatimes.com/technology/tech-news', 'Tech News')


In [6]:
import pickle

# Define the file path to save the data (e.g., hrefs_with_categories.pkl)
file_path = 'hrefs_with_categories.pkl'

# Save the data using pickle
with open(file_path, 'wb') as file:
    pickle.dump(hrefs_with_categories, file)

In [7]:
driver.quit()

In [13]:
# FOR EACH CATEGORY, I NEED TO MAKE ANOTHER FILE FOR CRAWLING, TOTAL OF 9 FILES FOR THE FOLLOWING LINKS:
# https://timesofindia.indiatimes.com/technology/tech-news
# https://timesofindia.indiatimes.com/education/jobs
# https://timesofindia.indiatimes.com/education/times-study-abroad
# https://timesofindia.indiatimes.com/sports/football
# https://timesofindia.indiatimes.com/sports/cricket/ipl
# https://timesofindia.indiatimes.com/sports/cricket
# https://timesofindia.indiatimes.com/politics
# https://timesofindia.indiatimes.com/india
# https://timesofindia.indiatimes.com/good-news/goodnews.cms
# REST LEFT ARE NOT TOI PORTALS

In [5]:
# Initialize a list to store href-category pairs (assuming you already have hrefs_with_categories list)
filtered_pairs = [
    (href, category) 
    for href, category in hrefs_with_categories 
    if href.startswith("https://timesofindia.indiatimes.com/") and not href.endswith(".cms")
]

# Extract tags from the filtered pairs
tags = [category for _, category in filtered_pairs]

# Convert tags list to a set to remove duplicates and then sort
tags_set = set(tags)
sorted_tags = sorted(tags_set)

# Print the number of filtered hrefs and the sorted tags
print("Number of filtered hrefs:", len(filtered_pairs))
print("Number of tags:", len(sorted_tags))

for item in filtered_pairs:
    print(item)

Number of filtered hrefs: 62
Number of tags: 17
('https://timesofindia.indiatimes.com/city', 'NEWS HOME')
('https://timesofindia.indiatimes.com/live-breaking-news', 'NEWS HOME')
('https://timesofindia.indiatimes.com/india', 'NEWS HOME')
('https://timesofindia.indiatimes.com/elections', 'NEWS HOME')
('https://timesofindia.indiatimes.com/politics', 'NEWS HOME')
('https://timesofindia.indiatimes.com/world', 'NEWS HOME')
('https://timesofindia.indiatimes.com/home/headlines', 'NEWS HOME')
('https://timesofindia.indiatimes.com/podcasts', 'NEWS HOME')
('https://timesofindia.indiatimes.com/specials', 'NEWS HOME')
('https://timesofindia.indiatimes.com/times-fact-check', 'NEWS HOME')
('https://timesofindia.indiatimes.com/india/the-times-of-a-better-india', 'NEWS HOME')
('https://timesofindia.indiatimes.com/south-pole-by-chennai-times', 'NEWS HOME')
('https://timesofindia.indiatimes.com/blogs/toi-editorials/', 'EDITORIALS')
('https://timesofindia.indiatimes.com/entertainment/movie-reviews', 'ENTE

In [40]:
test_pair = []
test_pair.append(('https://timesofindia.indiatimes.com/auto/motorsports/', 'AUTO'))
test_pair.append(('https://timesofindia.indiatimes.com/life-style/health-fitness', 'LIFE STYLE'))
# print("1 passed")
xpaths = [
    "//figcaption",
    "//*[contains(concat(' ', @class, ' '), ' sNF1c ')] | //h5",
    "//*[contains(concat(' ', @class, ' '), ' top-newslist ')]//a",
    "//*[contains(concat(' ', @class, ' '), ' top-newslist ')]//*[contains(concat(' ', @class, ' '), ' w_tle ')]//a",
    "//*[contains(concat(' ', @class, ' '), ' title ')]//span",
    "//*[contains(concat(' ', @class, ' '), ' WavNE ')]",
    "//*[contains(concat(' ', @class, ' '), ' desc ')]//a",
    "//*[contains(concat(' ', @class, ' '), ' leadstorties ')]//span",
    "//*[contains(concat(' ', @class, ' '), ' I4QgS ')]",
    "//*[contains(concat(' ', @class, ' '), ' null ')]//span | //figcaption | //*[contains(concat(' ', @class, ' '), ' linktype1 ')]//span",
    "//*[contains(concat(' ', @class, ' '), ' clearfix ')]//div//div//li//a",
    "//*[contains(concat(' ', @class, ' '), ' news-card-img ')]//*[contains(concat(' ', @class, ' '), ' w_tle ')]//a",
    "//div[(count(preceding-sibling::*) + 1 = 8)]//*[contains(concat(' ', @class, ' '), ' M194D ')]//figcaption | //div[(count(preceding-sibling::*) + 1 = 8)]//*[contains(concat(' ', @class, ' '), ' hoid1 ')]//figcaption | //div[(count(preceding-sibling::*) + 1 = 9)]//figcaption | //div[(count(preceding-sibling::*) + 1 = 6)]//figcaption | //*[contains(concat(' ', @class, ' '), ' zxvyz ')]//figcaption | //*[contains(concat(' ', @class, ' '), ' sNF1c ')] | //div[(count(preceding-sibling::*) + 1 = 7)]//figcaption",
    "//*[@id = 'c_headlines_wdt_1']//*[contains(concat(' ', @class, ' '), ' w_tle ')]//a",
    "//*[@id = 'content']//*[contains(concat(' ', @class, ' '), ' w_tle ')]//a",
    "//*[contains(concat(' ', @class, ' '), ' chng_lfttxt ')]//h3 | //*[contains(concat(' ', @class, ' '), ' mrB20 ')]//h3",
]

# print("2 passed")

all_links = []

for item in filtered_pairs:
    webpage_link = item[0]
    webpage_tag = item[1]
    
    # Navigate to the webpage
    driver.get(webpage_link)
    time.sleep(2)
    
    captions = []  # Reset captions for each webpage
    not_found_count = 0
    
    for xpath in xpaths:
        try:
            # Find elements for the current XPath
            elements = driver.find_elements(By.XPATH, xpath)
            
            # If elements are found, extend the captions list
            if elements:
                captions.extend(elements)
        except Exception as e:
            print(f"Error occurred for XPath {xpath}: {e}")  # Log the error for debugging
    print(len(captions))
    # Process the captions if any are found
    if captions:
        hrefs = []

        for caption in captions:
                try:
                    parent = caption
                    anchor_found = False
                    
                    # Traverse up to 5 levels in the hierarchy to search for the <a> tag
                    for i in range(5):
                        try:
                            # Try finding the <a> tag in the current parent level
                            parent_a_tag = parent.find_element(By.XPATH, "./parent::a")
                            
                            # If found, extract the href and break the loop
                            href = parent_a_tag.get_attribute("href")
                            if href:
                                hrefs.append(href)
                                print(f"Current href count: {len(hrefs)}")
                                anchor_found = True
                            # print("pass")
                        except:
                            # Move to the next level in the hierarchy
                            parent = parent.find_element(By.XPATH, "./parent::*")
                    
                    # Move to the next caption if anchor is not found
                    if not anchor_found:
                        not_found_count += 1

                except StaleElementReferenceException:
                    print(f"StaleElementReferenceException encountered. Skipping this caption.")
                    break  # Exit loop after max retries
                
                except Exception as e:
                    # Handle other unexpected exceptions
                    print(f"Error processing caption: {caption.text} - {e}")

        # Create pairs of webpage_tag and hrefs, and add them to the all_links list
        for href in hrefs:
            # print(f"{href}")
            all_links.append((webpage_tag, href))

    else:
        print(f"No captions found for {webpage_tag} ({webpage_link})")

    print(f"Count of 'No <a> tag found within 5 levels' for category {webpage_tag}: {not_found_count}")

all_links = list(set(all_links))
print(len(all_links))
for pair in all_links:
    print(f"Category: {pair[0]}, Link: {pair[1]}")

90
Current href count: 1
Current href count: 2
Current href count: 3
Current href count: 4
Current href count: 5
Current href count: 6
Current href count: 7
Current href count: 8
Current href count: 9
Current href count: 10
Current href count: 11
Current href count: 12
Current href count: 13
Current href count: 14
Current href count: 15
Current href count: 16
Current href count: 17
Current href count: 18
Current href count: 19
Current href count: 20
Current href count: 21
Current href count: 22
Current href count: 23
Current href count: 24
Current href count: 25
Current href count: 26
Current href count: 27
Current href count: 28
Current href count: 29
Current href count: 30
Current href count: 31
Current href count: 32
Current href count: 33
Current href count: 34
Current href count: 35
Current href count: 36
Current href count: 37
Current href count: 38
Current href count: 39
Current href count: 40
StaleElementReferenceException encountered. Skipping this caption.
Count of 'No <a> ta

In [41]:
# print(all_links)

In [42]:
def scroll_into_view(driver, element):
    driver.execute_script("arguments[0].scrollIntoView(true);", element)
    time.sleep(1)

def click_element(driver, element):
    try:
        actions = ActionChains(driver)
        actions.move_to_element(element).click().perform()
    except Exception as e:
        print(f"Error interacting with element: {e}")

def click_show_more_comments(driver):
    while True:
        try:
            show_more_button = WebDriverWait(driver, 5).until(
                EC.element_to_be_clickable((By.XPATH, "//*[@id='comment-container-112968501']/div/div[1]/div[2]/div[1]/div/div/div[2]"))
            )
            scroll_into_view(driver, show_more_button)
            show_more_button.click()
            time.sleep(2)
        except (TimeoutException, ElementClickInterceptedException, StaleElementReferenceException) as e:
            print(f"Error with 'Show More Comments' button: {e}")
            break

def extract_comments(driver, article_url, cat_tag):
    try:
        driver.get(article_url)

        # Get the timestamp and author of the article
        timestamp_element = driver.find_element(By.XPATH, "//*[contains(concat(' ', @class, ' '), concat(' ', 'byline', ' '))]//span")
        timestamp = timestamp_element.text.strip()
        try:
            author_element = driver.find_element(By.XPATH, "//*[contains(concat(' ', @class, ' '), ' mMwSH ')]")
            author = author_element.text
        except NoSuchElementException:
            author = "TOI"
        article_div = driver.find_element(By.XPATH, "//div[contains(@class, 'clearfix') and contains(@class, '_s30J')]")
        article_text = article_div.text
        
        # Remove common prefixes like 'Updated:' if present
        if "Updated:" in timestamp:
            timestamp = timestamp.replace("Updated:", "").strip()
        
        # Split the timestamp to remove the time part and clean up the string
        try:
            date_part = timestamp.split(",")[0] + " " + timestamp.split(",")[1].strip()
            date_object = datetime.strptime(date_part, "%b %d %Y")
            formatted_date = date_object.strftime("%d.%m.%Y")
            timestamp = formatted_date
        
        except (IndexError, ValueError) as e:
            print(f"Error processing timestamp '{timestamp}': {e}")
            timestamp = None
        
        # Locate the "End of Article" element
        end_of_article_div = driver.find_element(By.XPATH, "//*[contains(concat(' ', @class, ' '), ' mj2wg ')]//span[contains(text(), 'End of Article')]")
        element_position = end_of_article_div.location['y']
        viewport_height = driver.execute_script("return window.innerHeight")
        scroll_position = element_position - (viewport_height / 2)
        driver.execute_script(f"window.scrollTo(0, {scroll_position});")
        time.sleep(2)
    except WebDriverException as e:
        print(f"Failed to load page {article_url}: {e}")
        return {}  # Return an empty list to skip this article
    
    # Prepare a list to store extracted comments data
    comments_data = []
    
    try:
        # Locate and click the comment button
        try:
            view_comment_btn = WebDriverWait(driver, 10).until(
                EC.element_to_be_clickable((By.XPATH, "//*[contains(concat(' ', @class, ' '), ' GzGIQ ') and (((count(preceding-sibling::*) + 1) = 1) and parent::*)]"))
            )
            view_comment_btn.click()
            time.sleep(2)
        except TimeoutException as e:
            print(f"Timeout waiting for comment button: {e}")
            return {}
        except ElementClickInterceptedException as e:
            print(f"Comment button click intercepted: {e}")
            return {}
        
        # Continuously check for "Show More Comments" button and click if found
        while True:
            try:
                # Find the "Show More Comments" button
                show_more_button = WebDriverWait(driver, 15).until(
                    EC.element_to_be_clickable((By.XPATH, "//div[contains(concat(' ', @class, ' '), ' hduJ6 ') and contains(text(), 'View more comments')]"))
                )
                driver.execute_script("arguments[0].scrollIntoView(true);", show_more_button)
                show_more_button.click()
                time.sleep(2)  # Wait for new comments to load
            except TimeoutException:
                # No more "Show More Comments" button found, exit loop
                print("Finished loading all the comments")
                break
            except NoSuchElementException:
                # Button not found, break the loop
                print("No 'Show More Comments' button found.")
                break
            except ElementClickInterceptedException:
                # Handle cases where the button is covered or not clickable
                print("Failed to click 'Show More Comments' button due to an interception.")
                break
        
        # Locate and extract all comments
        comments = driver.find_elements(By.XPATH, "//*[contains(concat(' ', @class, ' '), concat(' ', 'Nuk1p', ' '))]")
        # Filter out any elements that contain an iframe (ads) or advertisement-specific divs
        comments = [comment for comment in comments
                    if len(comment.find_elements(By.XPATH, ".//div[contains(@class, 'paisa-wrapper')]")) == 0]

        for comment in comments:
            try:
                try:
                    # Check if the "Read More" link exists 
                    read_more_link = comment.find_element(By.XPATH, ".//span[contains(text(), 'Read More')]")
                    driver.execute_script("arguments[0].click();", read_more_link)
                except NoSuchElementException:
                    # If no "Read More" link is found
                    pass
                    
                # Extract user and comment details
                user = comment.find_element(By.CLASS_NAME, "ZJ4ae").text
                comment_text = comment.find_element(By.CLASS_NAME, "mxnGH").text
                upvotes = comment.find_element(By.XPATH, ".//*[contains(concat(' ', @class, ' '), ' LmcfZ ')]//span").text
                downvotes = comment.find_element(By.XPATH, ".//*[contains(concat(' ', @class, ' '), ' Qh8bj ')]//span").text
        
                # Initialize replies list before processing
                replies = []
        
                # Check and click "Show responses" link to load nested replies
                try:
                    show_responses_link = comment.find_element(By.XPATH, "./div[4]/a")
                    scroll_into_view(driver, show_responses_link)
                    show_responses_link.click()
                    time.sleep(2)
                except NoSuchElementException:
                    pass
                
                # Locate nested replies after they are loaded
                while True:
                    try:
                        show_all_responses_button = comment.find_element(By.XPATH, ".//div[contains(concat(' ', @class, ' '), ' evC4f ') and contains(text(), 'Show all responses')]")
                        show_all_responses_button.click()
                        time.sleep(2)
                    except NoSuchElementException:
                        break  # Exit loop if no more "Show all responses" button is found
                
                reply_elements = comment.find_elements(By.XPATH, ".//div[5]/ul/li")
                reply_elements = [reply for reply in reply_elements
                  if len(reply.find_elements(By.XPATH, ".//iframe")) == 0 and
                     len(reply.find_elements(By.XPATH, ".//div[contains(@class, 'paisa-wrapper')]")) == 0]
                
                for reply in reply_elements:
                    try:
                        try:
                            # Check if the "Read More" link exists 
                            read_more_link = reply.find_element(By.XPATH, ".//span[contains(text(), 'Read More')]")
                            driver.execute_script("arguments[0].click();", read_more_link)
                        except NoSuchElementException:
                            # If no "Read More" link is found
                            pass
                            
                        reply_user = reply.find_element(By.XPATH, "./div[1]/h3").text
                        reply_to = reply.find_element(By.XPATH, "./div[2]/span[1]").text
                        reply_text = reply.find_element(By.XPATH, "./div[3]").text
                        reply_upvotes = reply.find_element(By.XPATH, ".//*[contains(concat(' ', @class, ' '), ' LmcfZ ')]//span").text
                        reply_downvotes = reply.find_element(By.XPATH, ".//*[contains(concat(' ', @class, ' '), ' Qh8bj ')]//span").text
                        
                        replies.append({
                            'user': reply_user,
                            'reply_to': reply_to,
                            'comment_text': reply_text,
                            'upvotes': reply_upvotes,
                            'downvotes': reply_downvotes,
                        })
                    except Exception as e:
                        print(f"Error extracting reply comment: {e}")
                
                # Append the extracted details to comments_data with article link
                if user and upvotes.isdigit() and downvotes.isdigit():
                    comments_data.append({
                        'user': user,
                        'comment_text': comment_text,
                        'upvotes': upvotes,
                        'downvotes': downvotes,
                        'replies': replies
                    })
                else:
                    print(f"Skipped an empty comment")

            except Exception as e:
                print(f"Error extracting comment: {e}")

    except NoSuchElementException:
        print("No comment button found, skipping this article.")
    except Exception as e:
        print(f"Error clicking comment button: {e}")
    
    article_data = {
        'author': author,
        'tag': cat_tag,
        'timestamp': timestamp,
        'article_content': article_text,
        'comments': comments_data,
        'article_link': article_url
    }
    
    return article_data


all_articles = []
test_href_pair = []
test_href_pair.append(('BUSINESS', 'https://timesofindia.indiatimes.com/business/financial-literacy/investing/new-mutual-fund-kyc-rules-is-your-kyc-validated-verified-registered-or-on-hold-find-out/articleshow/109568327.cms'))
test_href_pair.append(('SPEAKING_TREE', 'https://timesofindia.indiatimes.com/india/silence-much-worse-vp-jagdeep-dhankhar-fires-salvo-at-certain-stray-voices-and-ngos-over-kolkata-rape-murder-case-humanity-shamed-kapil-sibal/articleshow/112968501.cms'))

for link in all_links:
    print(f"Processing article: {link[1]}")
    article_data = extract_comments(driver, link[1], link[0])
    # Check if 'comments' field is not empty before adding to all_articles
    if 'comments' in article_data and article_data['comments']:
        all_articles.append(article_data)
        print(f"Extracted {len(article_data['comments'])} comments from {link[1]}")
    else:
        print(f"No comments extracted from {link[1]}, skipping this article.")

Processing article: https://timesofindia.indiatimes.com/business/financial-literacy/savings/which-corporate-fixed-deposits-offer-highest-interest-rates-check-list-of-10-corporate-fds/articleshow/110337327.cms
Finished loading all the comments
Extracted 1 comments from https://timesofindia.indiatimes.com/business/financial-literacy/savings/which-corporate-fixed-deposits-offer-highest-interest-rates-check-list-of-10-corporate-fds/articleshow/110337327.cms
Processing article: https://timesofindia.indiatimes.com/business/financial-literacy/debt-management/rbi-new-loan-rules-is-your-bank-overcharging-you-on-interest-4-ways-in-which-customers-may-be-paying-extra/articleshow/109781728.cms
Finished loading all the comments
Skipped an empty comment
Extracted 5 comments from https://timesofindia.indiatimes.com/business/financial-literacy/debt-management/rbi-new-loan-rules-is-your-bank-overcharging-you-on-interest-4-ways-in-which-customers-may-be-paying-extra/articleshow/109781728.cms
Processing 

In [43]:
print(all_articles)



In [44]:
import csv
import json
import os

# Save to JSON file
json_output_path = os.path.join(os.getcwd(), "times_of_india_comments.json")
with open(json_output_path, 'w') as json_file:
    json.dump(all_articles, json_file, indent=4)

print(f"Comments saved to JSON file: {json_output_path}")

# Save to CSV file
csv_output_path = os.path.join(os.getcwd(), "times_of_india_comments.csv")
with open(csv_output_path, 'w', newline='', encoding='utf-8') as csv_file:
    # Define the fieldnames, including for replies
    fieldnames = ['author', 'tag', 'timestamp', 'article_content', 'article_link', 'user', 'comment_text', 'upvotes', 'downvotes', 'reply_user', 'reply_to', 'reply_text', 'reply_upvotes', 'reply_downvotes']
    writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
    
    writer.writeheader()
    
    for article in all_articles:
        writer.writerow({
            'author': article['author'],
            'tag': article['tag'],
            'timestamp': article['timestamp'],
            'article_content': article['article_content'],
            'article_link': article['article_link'],
            'user': '',  # No main comment info for replies
            'comment_text': '',
            'upvotes': '',
            'downvotes': '',
            'reply_user': '',
            'reply_to': '',
            'reply_text': '',
            'reply_upvotes': '',
            'reply_downvotes': '',
        })
        for comment in article['comments']:
            writer.writerow({
                'author': '',
                'tag': '',
                'timestamp': '',
                'article_content': '',
                'article_link': '',
                'user': comment['user'],
                'comment_text': comment['comment_text'],
                'upvotes': comment['upvotes'],
                'downvotes': comment['downvotes'],
                'reply_user': '',  # No reply info for the main comment
                'reply_to': '',
                'reply_text': '',
                'reply_upvotes': '',
                'reply_downvotes': '',
            })
            for reply in comment['replies']:
                writer.writerow({
                    'author': '',  # No article info for replies
                    'tag': '',
                    'timestamp': '',
                    'article_content': '',
                    'article_link': '',  # No article link for replies
                    'user': '',  # No main comment info for replies
                    'comment_text': '',
                    'upvotes': '',
                    'downvotes': '',
                    'reply_user': reply['user'],
                    'reply_to': reply['reply_to'],
                    'reply_text': reply['comment_text'],
                    'reply_upvotes': reply['upvotes'],
                    'reply_downvotes': reply['downvotes'],
                })

print(f"Comments saved to CSV file: {csv_output_path}")


Comments saved to JSON file: E:\BTP\Selenium Automation\TOI_bot\times_of_india_comments.json
Comments saved to CSV file: E:\BTP\Selenium Automation\TOI_bot\times_of_india_comments.csv


In [45]:
driver.quit()