In [1]:
# Import required libraries
from selenium import webdriver
from selenium.webdriver import ActionChains
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import WebDriverException, TimeoutException, NoSuchElementException, StaleElementReferenceException, ElementClickInterceptedException
from datetime import datetime, timedelta
import time
import os
import wget
import json
import csv

In [2]:
# Set up ChromeDriver
service = Service('E:/Programs/Chrome Driver/chromedriver.exe')
driver = webdriver.Chrome(service=service)

In [3]:
def scroll_into_view(driver, element):
    driver.execute_script("arguments[0].scrollIntoView(true);", element)
    time.sleep(2)

def click_element(driver, element):
    try:
        actions = ActionChains(driver)
        actions.move_to_element(element).click().perform()
    except Exception as e:
        print(f"Error interacting with element: {e}")

def click_show_more_comments(driver):
    while True:
        try:
            show_more_button = WebDriverWait(driver, 5).until(
                EC.element_to_be_clickable((By.XPATH, "//*[@id='comment-container-112968501']/div/div[1]/div[2]/div[1]/div/div/div[2]"))
            )
            scroll_into_view(driver, show_more_button)
            show_more_button.click()
            time.sleep(2)
        except (TimeoutException, ElementClickInterceptedException, StaleElementReferenceException) as e:
            print(f"Error with 'Show More Comments' button: {e}")
            break

def extract_comments(driver, article_url, cat_tag):
    try:
        driver.get(article_url)

        # Get the timestamp and author of the article
        timestamp_element = driver.find_element(By.XPATH, "//*[contains(concat(' ', @class, ' '), concat(' ', 'byline', ' '))]//span")
        timestamp = timestamp_element.text.strip()
        try:
            author_element = driver.find_element(By.XPATH, "//*[contains(concat(' ', @class, ' '), ' mMwSH ')]")
            author = author_element.text
        except NoSuchElementException:
            author = "TOI"
        article_div = driver.find_element(By.XPATH, "//div[contains(@class, 'clearfix') and contains(@class, '_s30J')]")
        article_text = article_div.text.replace("\n", " ").replace("\'", "")
        
        # Remove common prefixes like 'Updated:' if present
        if "Updated:" in timestamp:
            timestamp = timestamp.replace("Updated:", "").strip()
        
        # Split the timestamp to remove the time part and clean up the string
        try:
            date_part = timestamp.split(",")[0] + " " + timestamp.split(",")[1].strip()  # Example: 'Sep 29 2024'
            time_part = timestamp.split(",")[2].strip()  # Example: '04:58 IST'
        
            # Convert the date part to 'dd.mm.yyyy' format
            date_object = datetime.strptime(date_part, "%b %d %Y")
            formatted_date = date_object.strftime("%d.%m.%Y")
        
            # Store the date and time separately
            timestamp_date = formatted_date
            timestamp_time = time_part
        
        except (IndexError, ValueError) as e:
            print(f"Error processing timestamp '{timestamp}': {e}")
            timestamp_date = None
            timestamp_time = None
        
        # Locate the "End of Article" element
        end_of_article_div = driver.find_element(By.XPATH, "//*[contains(concat(' ', @class, ' '), ' mj2wg ')]//span[contains(text(), 'End of Article')]")
        element_position = end_of_article_div.location['y']
        viewport_height = driver.execute_script("return window.innerHeight")
        scroll_position = element_position - (viewport_height / 2)
        driver.execute_script(f"window.scrollTo(0, {scroll_position});")
        time.sleep(2)
    except WebDriverException as e:
        print(f"Failed to load page {article_url}: {e}")
        return {}  # Return an empty list to skip this article
    
    # Prepare a list to store extracted comments data
    comments_data = []
    
    try:
        # Locate and click the comment button
        try:
            view_comment_btn = WebDriverWait(driver, 10).until(
                EC.element_to_be_clickable((By.XPATH, "//*[contains(concat(' ', @class, ' '), ' GzGIQ ') and (((count(preceding-sibling::*) + 1) = 1) and parent::*)]"))
            )
            time.sleep(2)
            view_comment_btn.click()
        except TimeoutException as e:
            print(f"Timeout waiting for comment button: {e}")
            return {}
        except ElementClickInterceptedException as e:
            print(f"Comment button click intercepted: {e}")
            return {}
        
        # Continuously check for "Show More Comments" button and click if found
        while True:
            try:
                # Find the "Show More Comments" button
                show_more_button = WebDriverWait(driver, 15).until(
                    EC.element_to_be_clickable((By.XPATH, "//div[contains(concat(' ', @class, ' '), ' hduJ6 ') and contains(text(), 'View more comments')]"))
                )
                driver.execute_script("arguments[0].scrollIntoView(true);", show_more_button)
                show_more_button.click()
                time.sleep(2)  # Wait for new comments to load
            except TimeoutException:
                # No more "Show More Comments" button found, exit loop
                print("Finished loading all the comments")
                break
            except NoSuchElementException:
                # Button not found, break the loop
                print("No 'Show More Comments' button found.")
                break
            except ElementClickInterceptedException:
                # Handle cases where the button is covered or not clickable
                print("Failed to click 'Show More Comments' button due to an interception.")
                break
        
        # Locate and extract all comments
        comments = driver.find_elements(By.XPATH, "//*[contains(concat(' ', @class, ' '), concat(' ', 'Nuk1p', ' '))]")
        # Filter out any elements that contain an iframe (ads) or advertisement-specific divs
        comments = [comment for comment in comments
                    if len(comment.find_elements(By.XPATH, ".//div[contains(@class, 'paisa-wrapper')]")) == 0]

        for comment in comments:
            try:
                try:
                    # Check if the "Read More" link exists 
                    read_more_link = comment.find_element(By.XPATH, ".//span[contains(text(), 'Read More')]")
                    driver.execute_script("arguments[0].click();", read_more_link)
                except NoSuchElementException:
                    # If no "Read More" link is found
                    pass
                    
                # Extract user and comment details
                user = comment.find_element(By.CLASS_NAME, "ZJ4ae").text
                comment_text = comment.find_element(By.CLASS_NAME, "mxnGH").text
                upvotes = comment.find_element(By.XPATH, ".//*[contains(concat(' ', @class, ' '), ' LmcfZ ')]//span").text
                downvotes = comment.find_element(By.XPATH, ".//*[contains(concat(' ', @class, ' '), ' Qh8bj ')]//span").text
        
                # Initialize replies list before processing
                replies = []
        
                # Check and click "Show responses" link to load nested replies
                try:
                    show_responses_link = comment.find_element(By.XPATH, "./div[4]/a")
                    scroll_into_view(driver, show_responses_link)
                    show_responses_link.click()
                    time.sleep(2)
                except NoSuchElementException:
                    pass
                
                # Locate nested replies after they are loaded
                while True:
                    try:
                        show_all_responses_button = comment.find_element(By.XPATH, ".//div[contains(concat(' ', @class, ' '), ' evC4f ') and contains(text(), 'Show all responses')]")
                        show_all_responses_button.click()
                        time.sleep(2)
                    except NoSuchElementException:
                        break  # Exit loop if no more "Show all responses" button is found
                
                reply_elements = comment.find_elements(By.XPATH, ".//div[5]/ul/li")
                reply_elements = [reply for reply in reply_elements
                  if len(reply.find_elements(By.XPATH, ".//iframe")) == 0 and
                     len(reply.find_elements(By.XPATH, ".//div[contains(@class, 'paisa-wrapper')]")) == 0]
                
                for reply in reply_elements:
                    try:
                        try:
                            # Check if the "Read More" link exists 
                            read_more_link = reply.find_element(By.XPATH, ".//span[contains(text(), 'Read More')]")
                            driver.execute_script("arguments[0].click();", read_more_link)
                        except NoSuchElementException:
                            # If no "Read More" link is found
                            pass
                            
                        reply_user = reply.find_element(By.XPATH, "./div[1]/h3").text
                        reply_to = reply.find_element(By.XPATH, "./div[2]/span[1]").text
                        reply_text = reply.find_element(By.XPATH, "./div[3]").text
                        reply_upvotes = reply.find_element(By.XPATH, ".//*[contains(concat(' ', @class, ' '), ' LmcfZ ')]//span").text
                        reply_downvotes = reply.find_element(By.XPATH, ".//*[contains(concat(' ', @class, ' '), ' Qh8bj ')]//span").text
                        
                        replies.append({
                            'user': reply_user,
                            'reply_to': reply_to,
                            'comment_text': reply_text,
                            'upvotes': reply_upvotes,
                            'downvotes': reply_downvotes,
                        })
                    except Exception as e:
                        print(f"Error extracting reply comment: {e}")
                
                # Append the extracted details to comments_data with article link
                if user and upvotes.isdigit() and downvotes.isdigit():
                    comments_data.append({
                        'user': user,
                        'comment_text': comment_text,
                        'upvotes': upvotes,
                        'downvotes': downvotes,
                        'replies': replies
                    })
                else:
                    print(f"Skipped an empty comment")

            except Exception as e:
                print(f"Error extracting comment: {e}")

    except NoSuchElementException:
        print("No comment button found, skipping this article.")
    except Exception as e:
        print(f"Error clicking comment button: {e}")
    
    article_data = {
        'article_link': article_url,
        'author': author,
        'tag': cat_tag,
        'date': timestamp_date,
        'time': timestamp_time,
        'article_content': article_text,
        'comments': comments_data,
    }
    
    return article_data

In [4]:
import pickle

# Load the data from the combined_articles.pkl file
with open('combined_articles.pkl', 'rb') as file:
    combined_articles = pickle.load(file)

# Initialize the all_article_hrefs list
all_article_hrefs = []

# Assuming combined_articles is a list of pairs, append them to all_article_hrefs
for article in combined_articles:
    all_article_hrefs.append(article)


In [5]:
# Initialize lists to hold articles based on the date criteria
recent_articles = []
six_month_plus = []

# Get today's date
today = datetime.now()

# Define a date threshold for 6 months back
six_months_ago = today - timedelta(days=6 * 30)  # Approximately 6 months

# Assuming all_article_hrefs is a list of article data with their respective dates
for link in all_article_hrefs:
    print(f"Processing article: {link[0]}")
    article_data = extract_comments(driver, link[0], link[1])
    
    # Convert the article date string to a datetime object
    article_date_str = article_data['date']  # Assuming 'date' is in "dd.mm.yyyy" format
    article_date = datetime.strptime(article_date_str, "%d.%m.%Y")

    # Check if the article date is within the last 6 months
    if article_date >= six_months_ago:
        recent_articles.append(article_data)
        print(f"Extracted {len(article_data['comments'])} comments")
    else:
        six_month_plus.append(article_data)

Processing article: https://timesofindia.indiatimes.com/india/rs-45-lakh-seized-bjp-picks-team-says-his-dads-money/articleshow/113776837.cms
Finished loading all the comments
Extracted 2 comments
Processing article: https://timesofindia.indiatimes.com/india/bjp-slams-rahul-for-ayodhya-event-remark/articleshow/113776781.cms
Finished loading all the comments
Skipped an empty comment
Error extracting comment: Message: element not interactable
  (Session info: chrome=129.0.6668.72)
Stacktrace:
	GetHandleVerifier [0x00007FF67118B125+29573]
	(No symbol) [0x00007FF6710FFF50]
	(No symbol) [0x00007FF670FBB519]
	(No symbol) [0x00007FF671010C8F]
	(No symbol) [0x00007FF6710036BE]
	(No symbol) [0x00007FF6710372FA]
	(No symbol) [0x00007FF671002FF6]
	(No symbol) [0x00007FF671037510]
	(No symbol) [0x00007FF6710586BC]
	(No symbol) [0x00007FF6710370A3]
	(No symbol) [0x00007FF6710012DF]
	(No symbol) [0x00007FF671002441]
	GetHandleVerifier [0x00007FF6714BC76D+3377613]
	GetHandleVerifier [0x00007FF671507B6

KeyError: 'date'

In [None]:
# Now you have two lists: recent_articles and six_month_plus
print(f"Total articles in the last 6 months: {len(recent_articles)}")
print(f"Total articles older than 6 months: {len(six_month_plus)}")

In [None]:
# # TEST
# today = datetime.now()
# six_months_ago = today - timedelta(days=6 * 30)
# print(six_months_ago)
# date = recent_articles[3]['date']
# article_date = datetime.strptime(date, "%d.%m.%Y")
# print(article_date)
# if article_date >= six_months_ago: 
#     print("1")
# else:
#     print("0")

In [6]:
# TEST
test = []
test.append(['https://timesofindia.indiatimes.com/india/cockroach-in-ai-del-ny-flight-meal-kid-falls-ill/articleshow/113776272.cms', 'INDIA'])
t = datetime.now()
six_ago = t - timedelta(days=6 * 30)
for link in test:
    a = extract_comments(driver, link[0], link[1])
    a_date_str = a['date']
    a_date = datetime.strptime(a_date_str, "%d.%m.%Y")

    # Check if the article date is within the last 6 months
    if a_date >= six_ago:
        print("outside 6")
    else:
        print("in 6")


KeyboardInterrupt



In [None]:
driver.quit()