In [1]:
# Import required libraries
from selenium import webdriver
from selenium.webdriver import ActionChains
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import WebDriverException, TimeoutException, NoSuchElementException, StaleElementReferenceException, ElementClickInterceptedException
from datetime import datetime, timedelta
import numpy as np
import time
import os
import wget
import json
import csv

In [2]:
# Set up ChromeDriver
# The chromedriver binary location can be overridden with the CHROMEDRIVER_PATH
# environment variable so the notebook is not tied to one machine's folder layout;
# when the variable is unset, the original local install path is used.
chromedriver_path = os.environ.get('CHROMEDRIVER_PATH', 'E:/Programs/Chrome Driver/chromedriver.exe')
service = Service(chromedriver_path)
driver = webdriver.Chrome(service=service)

In [3]:
def scroll_into_view(driver, element):
    """Scroll the page so that ``element`` is in view, then pause for two seconds.

    Args:
        driver: Object exposing ``execute_script`` (the Selenium driver in this notebook).
        element: Element handed to the script as ``arguments[0]``.
    """
    scroll_js = "arguments[0].scrollIntoView(true);"
    driver.execute_script(scroll_js, element)
    # Fixed pause after scrolling before control returns to the caller
    settle_seconds = 2
    time.sleep(settle_seconds)

def click_element(driver, element):
    """Move the pointer onto ``element`` and click it through an ActionChains sequence.

    Any exception raised while building or performing the sequence is caught and
    reported on stdout; nothing is re-raised and nothing is returned.
    """
    try:
        chain = ActionChains(driver)
        # Queue the two steps one by one, then run them together
        chain.move_to_element(element)
        chain.click()
        chain.perform()
    except Exception as exc:
        print(f"Error interacting with element: {exc}")

def click_show_more_comments(
    driver,
    xpath="//*[@id='comment-container-112968501']/div/div[1]/div[2]/div[1]/div/div/div[2]",
    timeout=5,
):
    """Keep clicking a "show more comments" button until it can no longer be clicked.

    Args:
        driver: Selenium WebDriver showing the page.
        xpath: XPath of the button to click. The default is the original hard-coded
            locator, which targets one specific comment container id; pass a different
            XPath to use this helper on other pages.
        timeout: Seconds to wait for the button to become clickable on each pass.

    The loop ends when waiting for the button times out, or when the click is
    intercepted or hits a stale element; the exception is printed, not raised.
    """
    while True:
        try:
            show_more_button = WebDriverWait(driver, timeout).until(
                EC.element_to_be_clickable((By.XPATH, xpath))
            )
            scroll_into_view(driver, show_more_button)
            show_more_button.click()
            time.sleep(2)  # Pause before looking for the button again
        except (TimeoutException, ElementClickInterceptedException, StaleElementReferenceException) as e:
            print(f"Error with 'Show More Comments' button: {e}")
            break

# Upper bound on "Show all responses" clicks per comment, so a button that never
# leaves the DOM cannot stall the scrape forever.
_MAX_SHOW_ALL_RESPONSES_CLICKS = 20
# Consecutive stale "View more comments" lookups tolerated before giving up.
_MAX_STALE_RETRIES = 3


def _parse_byline_timestamp(timestamp):
    """Split a byline such as 'Updated: Sep 29, 2024, 04:58 IST' into (date, time).

    Returns ('dd.mm.yyyy', time string), or (None, None) when the text does not
    have the expected comma-separated 'Mon DD, YYYY, time' shape.
    """
    # Remove common prefixes like 'Updated:' if present
    if "Updated:" in timestamp:
        timestamp = timestamp.replace("Updated:", "").strip()

    try:
        parts = timestamp.split(",")
        date_part = parts[0] + " " + parts[1].strip()  # Example: 'Sep 29 2024'
        time_part = parts[2].strip()  # Example: '04:58 IST'

        # Convert the date part to 'dd.mm.yyyy' format
        formatted_date = datetime.strptime(date_part, "%b %d %Y").strftime("%d.%m.%Y")
    except (IndexError, ValueError) as e:
        print(f"Error processing timestamp '{timestamp}': {e}")
        return None, None

    return formatted_date, time_part


def _expand_read_more(driver, element):
    """Click the 'Read More' span inside ``element`` via JavaScript, if there is one."""
    try:
        read_more_link = element.find_element(By.XPATH, ".//span[contains(text(), 'Read More')]")
        driver.execute_script("arguments[0].click();", read_more_link)
    except NoSuchElementException:
        # No "Read More" link: the text is already fully shown
        pass


def extract_comments(driver, article_url, cat_tag):
    """Scrape one article page: byline metadata, body text and the comment thread.

    Args:
        driver: Selenium WebDriver used to load and interact with the page.
        article_url: URL of the article to open.
        cat_tag: Category tag, stored unchanged under the 'tag' key.

    Returns:
        dict: ``{}`` when the page or its metadata could not be read, or when the
        comment button timed out / was intercepted. Otherwise a dict with the keys
        'article_link', 'author', 'tag', 'date' ('dd.mm.yyyy' or None), 'time'
        (or None), 'article_content' and 'comments'. Each comment is a dict with
        'user', 'comment_text', 'upvotes', 'downvotes' and 'replies'; each reply
        has 'user', 'reply_to', 'comment_text', 'upvotes' and 'downvotes'.
    """
    try:
        driver.get(article_url)

        # Get the timestamp and author of the article
        timestamp_element = driver.find_element(By.XPATH, "//*[contains(concat(' ', @class, ' '), concat(' ', 'byline', ' '))]//span")
        timestamp = timestamp_element.text.strip()
        try:
            author_element = driver.find_element(By.XPATH, "//*[contains(concat(' ', @class, ' '), ' mMwSH ')]")
            author = author_element.text
        except NoSuchElementException:
            # No author element on the page: fall back to the default label
            author = "TOI"
        article_div = driver.find_element(By.XPATH, "//div[contains(@class, 'clearfix') and contains(@class, '_s30J')]")
        # Flatten the body to a single line and drop apostrophes
        article_text = article_div.text.replace("\n", " ").replace("'", "")

        # Store the date and time separately (both None if the byline cannot be parsed)
        timestamp_date, timestamp_time = _parse_byline_timestamp(timestamp)

        # Locate the "End of Article" element and scroll so it sits mid-viewport
        end_of_article_div = driver.find_element(By.XPATH, "//*[contains(concat(' ', @class, ' '), ' mj2wg ')]//span[contains(text(), 'End of Article')]")
        element_position = end_of_article_div.location['y']
        viewport_height = driver.execute_script("return window.innerHeight")
        scroll_position = element_position - (viewport_height / 2)
        driver.execute_script(f"window.scrollTo(0, {scroll_position});")
        time.sleep(2)
    except WebDriverException as e:
        print(f"Failed to load page {article_url}: {e}")
        return {}  # Empty dict tells the caller to skip this article

    # Prepare a list to store extracted comments data
    comments_data = []

    try:
        # Locate and click the comment button
        try:
            view_comment_btn = WebDriverWait(driver, 10).until(
                EC.element_to_be_clickable((By.XPATH, "//*[contains(concat(' ', @class, ' '), ' GzGIQ ') and (((count(preceding-sibling::*) + 1) = 1) and parent::*)]"))
            )
            time.sleep(2)
            view_comment_btn.click()
        except TimeoutException as e:
            print(f"Timeout waiting for comment button: {e}")
            return {}
        except ElementClickInterceptedException as e:
            print(f"Comment button click intercepted: {e}")
            return {}

        # Continuously check for "Show More Comments" button and click if found
        stale_retries = 0
        while True:
            try:
                # Find the "Show More Comments" button
                show_more_button = WebDriverWait(driver, 15).until(
                    EC.element_to_be_clickable((By.XPATH, "//div[contains(concat(' ', @class, ' '), ' hduJ6 ') and contains(text(), 'View more comments')]"))
                )
                driver.execute_script("arguments[0].scrollIntoView(true);", show_more_button)
                show_more_button.click()
                stale_retries = 0
                time.sleep(2)  # Wait for new comments to load
            except TimeoutException:
                # No more "Show More Comments" button found, exit loop
                print("Finished loading all the comments")
                break
            except NoSuchElementException:
                # Button not found, break the loop
                print("No 'Show More Comments' button found.")
                break
            except ElementClickInterceptedException:
                # Handle cases where the button is covered or not clickable
                print("Failed to click 'Show More Comments' button due to an interception.")
                break
            except StaleElementReferenceException:
                # The button was re-rendered between the wait and the click. Look it
                # up again instead of letting the error abort comment extraction.
                stale_retries += 1
                if stale_retries > _MAX_STALE_RETRIES:
                    print("'Show More Comments' button kept going stale; continuing with the comments loaded so far.")
                    break

        # Locate and extract all comments
        comments = driver.find_elements(By.XPATH, "//*[contains(concat(' ', @class, ' '), concat(' ', 'Nuk1p', ' '))]")
        # Filter out ad slots: elements that contain a 'paisa-wrapper' div
        comments = [comment for comment in comments
                    if len(comment.find_elements(By.XPATH, ".//div[contains(@class, 'paisa-wrapper')]")) == 0]

        for comment in comments:
            try:
                _expand_read_more(driver, comment)

                # Extract user and comment details
                user = comment.find_element(By.CLASS_NAME, "ZJ4ae").text
                comment_text = comment.find_element(By.CLASS_NAME, "mxnGH").text
                upvotes = comment.find_element(By.XPATH, ".//*[contains(concat(' ', @class, ' '), ' LmcfZ ')]//span").text
                downvotes = comment.find_element(By.XPATH, ".//*[contains(concat(' ', @class, ' '), ' Qh8bj ')]//span").text

                # Initialize replies list before processing
                replies = []

                # Check and click "Show responses" link to load nested replies
                try:
                    show_responses_link = comment.find_element(By.XPATH, "./div[4]/a")
                    scroll_into_view(driver, show_responses_link)
                    show_responses_link.click()
                    time.sleep(2)
                except NoSuchElementException:
                    pass

                # Expand the nested replies. The loop is bounded so that a button
                # which stays in the DOM after being clicked cannot spin forever.
                for _ in range(_MAX_SHOW_ALL_RESPONSES_CLICKS):
                    try:
                        show_all_responses_button = comment.find_element(By.XPATH, ".//div[contains(concat(' ', @class, ' '), ' evC4f ') and contains(text(), 'Show all responses')]")
                        show_all_responses_button.click()
                        time.sleep(2)
                    except NoSuchElementException:
                        break  # Exit loop if no more "Show all responses" button is found
                else:
                    print(f"'Show all responses' still present after {_MAX_SHOW_ALL_RESPONSES_CLICKS} clicks; moving on.")

                # Collect reply items, skipping ones that hold an iframe or an ad wrapper
                reply_elements = comment.find_elements(By.XPATH, ".//div[5]/ul/li")
                reply_elements = [reply for reply in reply_elements
                                  if len(reply.find_elements(By.XPATH, ".//iframe")) == 0 and
                                  len(reply.find_elements(By.XPATH, ".//div[contains(@class, 'paisa-wrapper')]")) == 0]

                for reply in reply_elements:
                    try:
                        _expand_read_more(driver, reply)

                        reply_user = reply.find_element(By.XPATH, "./div[1]/h3").text
                        reply_to = reply.find_element(By.XPATH, "./div[2]/span[1]").text
                        reply_text = reply.find_element(By.XPATH, "./div[3]").text
                        reply_upvotes = reply.find_element(By.XPATH, ".//*[contains(concat(' ', @class, ' '), ' LmcfZ ')]//span").text
                        reply_downvotes = reply.find_element(By.XPATH, ".//*[contains(concat(' ', @class, ' '), ' Qh8bj ')]//span").text

                        replies.append({
                            'user': reply_user,
                            'reply_to': reply_to,
                            'comment_text': reply_text,
                            'upvotes': reply_upvotes,
                            'downvotes': reply_downvotes,
                        })
                    except Exception as e:
                        # One broken reply should not discard the rest of the thread
                        print(f"Error extracting reply comment: {e}")

                # Keep the comment only when it has a user and numeric vote counts
                if user and upvotes.isdigit() and downvotes.isdigit():
                    comments_data.append({
                        'user': user,
                        'comment_text': comment_text,
                        'upvotes': upvotes,
                        'downvotes': downvotes,
                        'replies': replies
                    })
                else:
                    print("Skipped an empty comment")

            except Exception as e:
                # One broken comment should not discard the others
                print(f"Error extracting comment: {e}")

    except NoSuchElementException:
        print("No comment button found, skipping this article.")
    except Exception as e:
        # Catch-all for the whole comment phase; the article is still returned below
        # with whatever comments were collected before the failure.
        print(f"Error clicking comment button: {e}")

    article_data = {
        'article_link': article_url,
        'author': author,
        'tag': cat_tag,
        'date': timestamp_date,
        'time': timestamp_time,
        'article_content': article_text,
        'comments': comments_data,
    }

    return article_data

In [4]:
# Cutoff separating "recent" from "older" articles: 180 days (roughly 6 months) before now
six_months_ago = datetime.now() - timedelta(days=180)

# Accumulators that process_chunk appends scraped articles to, split by that cutoff
recent_articles, six_month_plus = [], []


In [5]:
def process_chunk(chunk):
    """Scrape every article in ``chunk`` and bucket the results by article date.

    Args:
        chunk: Iterable of indexable items where ``item[0]`` is the article URL and
            ``item[1]`` is the category tag handed to ``extract_comments``.

    Uses the notebook-level ``driver`` and ``six_months_ago``, and appends to the
    notebook-level ``recent_articles`` / ``six_month_plus`` lists. An article is
    skipped (with a message) when ``extract_comments`` returned no 'date' key or
    when its date is None / not in 'dd.mm.yyyy' form, so a single bad article no
    longer aborts the rest of the chunk.
    """
    for link in chunk:
        article_url = link[0]
        print(f"Processing article: {article_url}")
        article_data = extract_comments(driver, article_url, link[1])

        # extract_comments returns {} when the page could not be scraped
        try:
            article_date_str = article_data['date']
        except KeyError:
            print(f"KeyError: 'date' not found in article data for {article_url}, skipping this article.")
            continue

        # 'date' is None when the byline could not be parsed; strptime would raise
        # TypeError on it, so treat that the same way as a malformed date string.
        try:
            article_date = datetime.strptime(article_date_str, "%d.%m.%Y")
        except (TypeError, ValueError) as e:
            print(f"Unusable date {article_date_str!r} for {article_url}, skipping this article: {e}")
            continue

        # Check if the article date is within the last 6 months
        if article_date >= six_months_ago:
            recent_articles.append(article_data)
            print(f"Extracted {len(article_data['comments'])} comments")
        else:
            six_month_plus.append(article_data)


In [None]:
import os
import pickle

# Location of the pickled article links for chunk 9
folder_name = 'article_chunks'
chunk_9_file = os.path.join(folder_name, 'article_hrefs_chunk_9.pkl')

# Load chunk 9 only when its pickle file is present; otherwise just report it.
# NOTE: pickle.load can execute arbitrary code, so only load files you created yourself.
if not os.path.exists(chunk_9_file):
    print(f"File {chunk_9_file} not found.")
else:
    with open(chunk_9_file, 'rb') as file:
        chunk_9_data = pickle.load(file)

    # Echo the loaded chunk for a quick visual check
    print("Chunk 9 data:", chunk_9_data)


In [None]:
# Scrape every article link loaded for chunk 9. process_chunk appends the articles
# it can date to recent_articles or six_month_plus.
process_chunk(chunk_9_data)

In [None]:
# Summarize how many scraped articles landed in each bucket
print("Total recent articles: {}".format(len(recent_articles)))
print("Total articles older than 6 months: {}".format(len(six_month_plus)))

In [None]:
# Write each article bucket for chunk 9 to its own pickle file
# (recent articles -> recent9.pkl, older articles -> six_plus9.pkl).
for pkl_name, bucket, done_message in (
    ('recent9.pkl', recent_articles, "Saved recent articles to recent9.pkl"),
    ('six_plus9.pkl', six_month_plus, "Saved older articles to six_plus9.pkl"),
):
    with open(pkl_name, 'wb') as pkl_file:
        pickle.dump(bucket, pkl_file)
        print(done_message)

In [10]:
# Quit the WebDriver that was created in the ChromeDriver setup cell.
driver.quit()