In [1]:
# Import required libraries
from selenium import webdriver
from selenium.webdriver import ActionChains
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import WebDriverException, TimeoutException, NoSuchElementException, StaleElementReferenceException, ElementClickInterceptedException
from datetime import datetime
import time
import os
import wget
import json
import csv

In [2]:
# Set up ChromeDriver.
# The driver binary location can be overridden with the CHROMEDRIVER_PATH
# environment variable so the notebook is not tied to a single machine; when
# the variable is unset the original hard-coded location is used.
chromedriver_path = os.environ.get('CHROMEDRIVER_PATH', 'E:/Programs/Chrome Driver/chromedriver.exe')
service = Service(chromedriver_path)
driver = webdriver.Chrome(service=service)

In [3]:
import pickle

# Pickle file holding the (href, category) pairs consumed by the next cell.
file_path = 'hrefs_with_categories.pkl'

# NOTE: unpickling can execute arbitrary code, so only load files you created
# yourself or otherwise trust.
with open(file_path, 'rb') as pickle_in:
    hrefs_with_categories = pickle.load(pickle_in)

In [4]:
# Open the first link whose category is "politics" (compared case-insensitively).
politics_href = next(
    (href for href, category in hrefs_with_categories if category.lower() == "politics"),
    None,
)

if politics_href is None:
    # Say so explicitly: otherwise the browser stays on whatever page it was
    # showing and the following cell fails with an unrelated lookup error.
    print('No "politics" category found in hrefs_with_categories; browser was not navigated.')
else:
    driver.get(politics_href)

In [5]:
# Find the "more top stories" element and click it
more_button_xpath = '//*[contains(concat( " ", @class, " " ), concat( " ", "top-newslist", " " ))]//*[contains(concat( " ", @class, " " ), concat( " ", "more-btn", " " ))]//span'

# Seconds to wait for the button before giving up with a TimeoutException.
MORE_BUTTON_TIMEOUT_S = 10

# Wait until the button is present, visible and enabled instead of looking it
# up immediately: the list may be rendered after the initial page load, in
# which case an immediate find_element/click would fail.
more_button = WebDriverWait(driver, MORE_BUTTON_TIMEOUT_S).until(
    EC.element_to_be_clickable((By.XPATH, more_button_xpath))
)
more_button.click()

In [12]:
# Accumulates (href, "<n> DAYS AGO") tuples for every article kept by the crawl.
hrefs_with_lastupd = list()

# Listing URL of the politics section; page 1 is served at exactly this address.
base_url = "https://timesofindia.indiatimes.com/politics/news"

# Current page number. Page 1 uses base_url as-is; later pages append "/<n>".
suffix = 1

def calculate_days_ago(date_str, now=None):
    """Return how many whole days lie between an absolute timestamp and now.

    Args:
        date_str: Timestamp text in the form '12 Oct 2023, 18:56'
            (strptime format '%d %b %Y, %H:%M').
        now: Optional reference datetime to measure from. Defaults to
            datetime.now(); pass a fixed value for reproducible results.

    Returns:
        The number of whole days from the parsed timestamp to the reference
        time (negative if the timestamp is in the future), or None when
        date_str is not a string in the expected format.
    """
    # Only the parsing step is guarded, so an unexpected error in the
    # subtraction below is not silently turned into None.
    try:
        date_obj = datetime.strptime(date_str, '%d %b %Y, %H:%M')
    except (ValueError, TypeError):
        # ValueError: text does not match the format; TypeError: not a string.
        return None

    reference = datetime.now() if now is None else now
    return (reference - date_obj).days

def extract_days_from_string(last_updated_raw):
    """Convert a "last updated" label into a whole number of days.

    Relative labels of the form '<count> <unit> AGO' are recognised
    case-insensitively, with singular or plural units: '324 DAYS AGO',
    '1 DAY AGO', '1 YEAR AGO', '2 YEARS AGO', '5 HOURS AGO', '3 WEEKS AGO'.
    A month counts as 30 days and a year as 365 days; units shorter than a
    day are rounded down, so '5 HOURS AGO' yields 0.

    Any other text is passed to calculate_days_ago, which treats it as an
    absolute timestamp.

    Args:
        last_updated_raw: Label text as scraped from the page.

    Returns:
        The age in whole days, or None when the label is neither a
        recognised relative label nor a parseable absolute timestamp.
    """
    seconds_per_day = 86400
    # Unit names with any trailing "S" removed, mapped to their length in seconds.
    seconds_per_unit = {
        "SEC": 1,
        "SECOND": 1,
        "MIN": 60,
        "MINUTE": 60,
        "HR": 3600,
        "HOUR": 3600,
        "DAY": seconds_per_day,
        "WEEK": 7 * seconds_per_day,
        "MONTH": 30 * seconds_per_day,
        "YEAR": 365 * seconds_per_day,
    }

    tokens = last_updated_raw.upper().split()
    if "AGO" in tokens:
        ago_index = tokens.index("AGO")
        # A relative label needs a count and a unit directly before "AGO".
        if ago_index >= 2:
            unit = tokens[ago_index - 1].rstrip("S")
            try:
                count = int(tokens[ago_index - 2])
            except ValueError:
                # Non-numeric count (e.g. "A YEAR AGO"): fall through to the
                # absolute-date parser instead of raising.
                count = None
            if count is not None and unit in seconds_per_unit:
                return count * seconds_per_unit[unit] // seconds_per_day

    return calculate_days_ago(last_updated_raw)

# Articles older than this many days end the crawl.
MAX_AGE_DAYS = 364

# Locators, hoisted out of the loop because they never change.
news_item_xpath = '//*[contains(concat( " ", @class, " " ), concat( " ", "top-newslist", " " ))]//li'
last_updated_xpath = './/*[contains(concat( " ", @class, " " ), concat( " ", "strlastupd", " " ))]'

# Walk the listing pages one by one, collecting (href, age) pairs until an
# article older than MAX_AGE_DAYS is met, a page yields no list items, or a
# page cannot be loaded.
while True:
    # Page 1 is the bare base URL; later pages append "/<page number>".
    url = base_url if suffix == 1 else f"{base_url}/{suffix}"
    print(f"Visiting: {url}")

    try:
        driver.get(url)
        li_elements = driver.find_elements(By.XPATH, news_item_xpath)
    except WebDriverException as exc:
        # Keep what has been collected so far instead of losing it to a traceback.
        print(f"Could not load page {suffix} ({exc.__class__.__name__}). Stopping.")
        break

    # find_elements returns an empty list rather than raising when nothing
    # matches, so running past the last page has to be detected here;
    # otherwise the loop would keep requesting ever-higher page numbers.
    if not li_elements:
        print(f"No articles found on page {suffix}. Stopping.")
        break

    stop_navigating = False  # Set once an article older than the cutoff is seen

    for li in li_elements:
        try:
            href = li.find_element(By.XPATH, './/a').get_attribute('href')
            last_updated_raw = li.find_element(By.XPATH, last_updated_xpath).text.strip()
        except (NoSuchElementException, StaleElementReferenceException):
            # List items without a link or timestamp, or that went stale
            # while being read, are skipped.
            print("Skipping li element due to missing href or last updated value.")
            continue

        days_ago = extract_days_from_string(last_updated_raw)

        if days_ago is None:
            # Timestamp text could not be interpreted; ignore this item.
            continue

        if days_ago > MAX_AGE_DAYS:
            print(f"Reached more than {MAX_AGE_DAYS} days ago. Stopping further appending and navigation.")
            stop_navigating = True
            break

        hrefs_with_lastupd.append((href, f"{days_ago} DAYS AGO"))

    if stop_navigating:
        break  # Leave the paging loop as well

    suffix += 1
    time.sleep(1)  # Short pause between page requests

# Output all collected hrefs with last updated values
print("Collected hrefs with last updated values:")
for href, last_updated in hrefs_with_lastupd:
    print(f"Href: {href}, Last Updated: {last_updated}")

Visiting: https://timesofindia.indiatimes.com/politics/news
Visiting: https://timesofindia.indiatimes.com/politics/news/2
Reached more than 364 days ago. Stopping further appending and navigation.
Collected hrefs with last updated values:
Href: https://timesofindia.indiatimes.com/politics/news/tipra-motha-chief-meets-congress-leaders-as-bjp-eyes-pact/articleshow/107980204.cms, Last Updated: 208 DAYS AGO
Href: https://timesofindia.indiatimes.com/politics/news/ahmed-patels-legacy-wont-go-in-vain-mumtaz-reacts-after-congress-announces-giving-late-fathers-seat-to-aap/articleshow/107966185.cms, Last Updated: 209 DAYS AGO
Href: https://timesofindia.indiatimes.com/politics/news/buzz-over-navjot-singh-sidhu-return-to-bjp-gurdaspur-ticket-for-yuvraj-singh/articleshow/107882378.cms, Last Updated: 211 DAYS AGO
Href: https://timesofindia.indiatimes.com/politics/news/victory-of-voters-sharad-pawar-after-supreme-court-ruling-on-his-factions-name-symbol/articleshow/107829961.cms, Last Updated: 214 DA

In [14]:
import pickle

# Output file for the collected politics article links.
file_path = 'articles_from_politics.pkl'

# Serialize the list of (href, "<n> DAYS AGO") tuples gathered by the crawl.
with open(file_path, 'wb') as pickle_out:
    pickle.dump(hrefs_with_lastupd, pickle_out)

In [15]:
# End the WebDriver session and close the browser it controls.
driver.quit()