In [1]:
# Selenium 4:
from selenium import webdriver

# Starting/Stopping Driver: can specify ports or location but not remote access
from selenium.webdriver.chrome.service import Service as ChromeService

# Manages Binaries needed for WebDriver without installing anything directly
from webdriver_manager.chrome import ChromeDriverManager

from selenium.webdriver.chrome.service import Service

# Allows searchs similar to beautiful soup: find_all
from selenium.webdriver.common.by import By

# Try to establish wait times for the page to load
from selenium.webdriver.support.ui import WebDriverWait

# Wait for specific condition based on defined task: web elements, boolean are examples
from selenium.webdriver.support import expected_conditions as EC

# Used for keyboard movements, up/down, left/right,delete, etc
from selenium.webdriver.common.keys import Keys

# Locate elements on page and throw error if they do not exist
from selenium.common.exceptions import NoSuchElementException

# In general to use with timing our function calls to Indeed
import time

# Assist with creating incremental timing for our scraping to seem more human
from time import sleep

from selenium.webdriver.chrome.options import Options
import requests 
import random

# Random integer for more realistic timing for clicks, buttons and searches during scraping
from random import randint

# For webscraping
from bs4 import BeautifulSoup

# Parsing and creating xml data
# from lxml import etree as et

# Store data as a csv file written out
from csv import writer

# Dataframe stuff
import pandas as pd

# Multi Threading
import threading

# Threading:
from concurrent.futures import ThreadPoolExecutor, wait

import chromedriver_autoinstaller

In [2]:
# Setting random UA Agent so as not get detected by the anti-bot system
url = 'https://httpbin.org/headers' 

user_agent_list = [ 
	'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36', 
	'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36', 
	'Mozilla/5.0 (Macintosh; Intel Mac OS X 13_1) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.1 Safari/605.1.15',
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36'
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36' 
    ]
    # Returns a random user agent from the user_agent_list.
    # return random.choice(user_agent_list)

for i in range (1,6): 
	user_agent = random.choice(user_agent_list)
	# Checking if the response is getting blocked
	headers = {'User-Agent': user_agent} 
	response = requests.get(url, headers=headers) 
	received_ua = response.json()['headers']['User-Agent'] 
	print("Request #%d\nUser-Agent Sent: %s\nUser-Agent Received: %s\n" % (i, user_agent, received_ua)) 

Request #1
User-Agent Sent: Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36
User-Agent Received: Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36

Request #2
User-Agent Sent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36
User-Agent Received: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36

Request #3
User-Agent Sent: Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36
User-Agen

In [3]:
def get_random_user_agent():
    # Define a custom user agents for ua rotation
    user_agent_list = [ 
	'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36', 
	'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36', 
	'Mozilla/5.0 (Macintosh; Intel Mac OS X 13_1) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.1 Safari/605.1.15',
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36'
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36' 
    ]
    # Returns a random user agent from the user_agent_list.
    return random.choice(user_agent_list)

def calling_options():
    # Allows you to cusotmize: ingonito mode, maximize window size, headless browser, disable certain features, etc
    option= webdriver.ChromeOptions()

    # Going undercover:
    option.add_argument("--incognito")

    # Consider this if the application works and you know how it works for speed ups and rendering!

    option.add_argument('--headless')

    option.add_argument("--no-sandbox")  # Bypass OS security model
    option.add_argument("--disable-dev-shm-usage")  # Overcome limited resource problems

    user_agent = get_random_user_agent()
    option.add_argument(f"user-agent={user_agent}")
    return option

def get_random_feed(feed):
    return random.choice(feed)

# chromedriver_autoinstaller.install()

In [4]:
def get_url(selected_feed):
    if selected_feed == 'international':
        url_feed = ['https://www.theguardian.com/international', 'https://www.bbc.com/news']
    elif selected_feed == 'tech':
        url_feed = ['https://www.ndtv.com/', 'https://timesofindia.indiatimes.com/', 'https://www.thehindu.com/']
    elif selected_feed == 'national':
        url_feed = ['https://www.ndtv.com/', 'https://timesofindia.indiatimes.com/', 'https://www.thehindu.com/']
    elif selected_feed == 'current-affairs':
        url_feed = ['https://www.ndtv.com/', 'https://timesofindia.indiatimes.com/', 'https://www.thehindu.com/']
    elif selected_feed == 'entertainment':
        url_feed = ['https://www.ndtv.com/', 'https://www.pinkvilla.com/']
    elif selected_feed == 'fashion':
        url_feed = ['https://www.pinkvilla.com/', 'https://timesofindia.indiatimes.com/']
    elif selected_feed == 'finance':
        url_feed = ['https://www.financialexpress.com/', 'https://www.thehindu.com/']
    elif selected_feed == 'sports':
        url_feed = ['https://www.ndtv.com/', 'https://timesofindia.indiatimes.com/', 'https://www.thehindu.com/']
    elif selected_feed == 'politics':
        url_feed = ['https://www.ndtv.com/', 'https://timesofindia.indiatimes.com/', 'https://www.thehindu.com/']
    elif selected_feed == 'education':
        url_feed = ['https://www.ndtv.com/', 'https://timesofindia.indiatimes.com/', 'https://www.thehindu.com/']
    elif selected_feed == 'miscellaneous':
        url_feed = ['https://www.ndtv.com/', 'https://timesofindia.indiatimes.com/', 'https://www.thehindu.com/']
    return url_feed


In [None]:
# Loop through the URLs
for url in urls:
    start = time.time()
    options = calling_options()
    driver = webdriver.Chrome(options=options)
    driver.get(url)
    # Add a sleep to simulate human-like interactions
    sleep(randint(2, 6))
    # Add your scraping logic here
    print(url)
    selected_feed = random.choice(feed)

    url = 'https://indianexpress.com/'

# Function to get the desired URL
def get_desired_url(url):
    # Set up Selenium WebDriver
    driver = webdriver.Chrome()
    driver.get(url)

    try:
        # Wait for the navbar to be visible
        navbar = WebDriverWait(driver, 10).until(
            EC.visibility_of_element_located((By.ID, "navbar"))
        )

        # Parse the HTML using BeautifulSoup
        soup = BeautifulSoup(driver.page_source, 'html.parser')

        # Find the 'business' link under the navbar
        business_link = navbar.find('a', text='Business')

        if business_link:
            # Click on the 'business' link if found
            business_link.click()

            # Wait for the new page to load (you may need to adjust the waiting time)
            WebDriverWait(driver, 10).until(
                EC.url_changes(url)
            )

            # Print the URL after clicking on 'business'
            print("URL after clicking on 'Business':", driver.current_url)
        else:
            print("Business link not found in the navbar.")

    finally:
        # Close the WebDriver session
        driver.quit()

# Main program
get_desired_url(url)


In [5]:
urls = ['https://www.ndtv.com/', 'https://timesofindia.indiatimes.com/', 'https://www.thehindu.com/', 'https://www.financialexpress.com/', 
        'https://www.pinkvilla.com/', 'https://www.theguardian.com/international', 'https://www.bbc.com/news']

feeds = [ 
        'international', 'tech', 'national', 
        'current-affairs', 'entertainment', 'fashion', 
        'finance', 'sports', 'politics', 
        'education', 'miscellaneous' ]

# 
# ENTER THE RANGE OF LENGTH OF THE NEWS ARTCLES
# 
min_limit = 50 
max_limit = 60


# 
# ENTER THE FEED USER WANTS
# 
feed = [ 
        'international', 'tech', 'national', 
        'current-affairs', 'entertainment', 'fashion', 
        'finance', 'sports', 'politics', 
        'education', 'miscellaneous' ]

selected_feed = get_random_feed(feed)

url_feed = get_url(selected_feed)



In [6]:
# Loop through the URLs
for url in urls:
    start = time.time()
    options = calling_options()
    driver = webdriver.Chrome(options=options)
    driver.get(url)
    # Add a sleep to simulate human-like interactions
    sleep(randint(2, 6))
    # Add your scraping logic here
    print(url)
    selected_feed = random.choice(feed)
    
    
    
    


https://www.ndtv.com/
https://timesofindia.indiatimes.com/
https://www.thehindu.com/
https://www.financialexpress.com/
https://www.pinkvilla.com/
https://www.theguardian.com/international
https://www.bbc.com/news


In [14]:
start = time.time()
options = calling_options()
driver = webdriver.Chrome(options=options)
driver.get('https://www.bbc.com/news')
# Add a sleep to simulate human-like interactions
sleep(random.randint(2, 6))

already_read = set()
data_list = []

while True :
    # Find all elements with the specific class 'ssrcss-1amy2cn-ListItem e1gp961v0'
    elements = driver.find_elements(By.CSS_SELECTOR, '[class^="ssrcss-13h7haz-ListItem"], [class^="ssrcss-7uxr49-RichTextContainer"], [class^="ssrcss-11r1m41-RichTextComponentWrapper"], [class^= "li.ssrcss-1amy2cn-ListItem.e1gp961v0"]')

    # Extract text and links from each element
    for element in elements:
        link = element.find_element(By.TAG_NAME, 'a')
        link_text = link.text
        link_href = link.get_attribute('href')
        
        # Check if link has already been visited
        if link_href not in already_read:
            # Click on the link
            link.click()
            already_read.add(link_href)
            print(f"Visited link: {link_text}")
            # Wait for the page to load
            sleep(random.randint(5, 10))
            # Find all elements with the specified class on the visited page
            article_elements = driver.find_elements(By.CSS_SELECTOR, '.ssrcss-pv1rh6-ArticleWrapper [class*="RichTextComponentWrapper"][data-component="text-block"]')
            # Extract text from each article element and append to data list
            article_data = [article.text for article in article_elements]
            # print("Article Data:", article_data)  # Debugging: Print article data
            data_list.append(article_data)
            # Go back to the previous page
            driver.back()
            # Re-find the elements after navigating back
            break
        else:
            print(f"Link already visited: {link_text}")

    # Break the loop if all links have been visited
    if len(already_read) == len(elements):
        break

# Close the WebDriver
driver.quit()

# Print or use the data list as needed
# print("Data List:", data_list)


In [8]:
print(already_read)
print(len(already_read))

set()
0


In [9]:
start = time.time()
options = calling_options()
driver = webdriver.Chrome(options=options)
driver.get('https://www.bbc.com/news')
# Add a sleep to simulate human-like interactions
sleep(random.randint(2, 6))

already_read = set()
data_list = []

while True :
    # Find all elements with either class name 'ssrcss-13h7haz-ListItem', 'ssrcss-7uxr49-RichTextContainer', or 'ssrcss-11r1m41-RichTextComponentWrapper'
    elements = driver.find_elements(By.CSS_SELECTOR, '[class^="ssrcss-13h7haz-ListItem"], [class^="ssrcss-7uxr49-RichTextContainer"], [class^="ssrcss-11r1m41-RichTextComponentWrapper"]')

    # Extract text and links from each element
    for element in elements:
        link = element.find_element(By.TAG_NAME, 'a')
        link_text = link.text
        link_href = link.get_attribute('href')
        
        # Check if link has already been visited
        if link_href not in already_read:
            # Click on the link
            link.click()
            already_read.add(link_href)
            print(f"Visited link: {link_text}")
            # Wait for the page to load
            sleep(random.randint(5, 10))
            # Find all elements with the specified class on the visited page
            article_elements = driver.find_elements(By.CSS_SELECTOR, '.ssrcss-pv1rh6-ArticleWrapper [class*="RichTextComponentWrapper"][data-component="text-block"]')
            # Extract text from each article element and append to data list
            article_data = [article.text for article in article_elements]
            # print("Article Data:", article_data)  # Debugging: Print article data
            data_list.append(article_data)
            # Go back to the previous page
            driver.back()
            # Re-find the elements after navigating back
            break
        else:
            print(f"Link already visited: {link_text}")

    # Break the loop if all links have been visited
    if len(already_read) == len(elements):
        break

# Close the WebDriver
driver.quit()

# Print or use the data list as needed
# print("Data List:", data_list)


In [10]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from time import sleep
import random

start = time.time()
options = calling_options()
driver = webdriver.Chrome(options=options)
driver.get('https://www.bbc.com/news')
# Add a sleep to simulate human-like interactions
sleep(random.randint(2, 6))

already_read = set()
data_list = []

while True:
    # Find all elements with either class name 'ssrcss-13h7haz-ListItem', 'ssrcss-7uxr49-RichTextContainer', or 'ssrcss-11r1m41-RichTextComponentWrapper'
    elements = driver.find_elements(By.CSS_SELECTOR, '[class^="ssrcss-13h7haz-ListItem"], [class^="ssrcss-7uxr49-RichTextContainer"], [class^="ssrcss-11r1m41-RichTextComponentWrapper"]')

    # Extract text and links from each element
    for element in elements:
        link = element.find_element(By.TAG_NAME, 'a')
        link_text = link.text
        link_href = link.get_attribute('href')
        
        # Check if link has already been visited
        if link_href not in already_read:
            # Click on the link
            link.click()
            already_read.add(link_href)
            print(f"Visited link: {link_text}")
            # Wait for the page to load
            sleep(random.randint(5, 10))
            # Find all paragraph elements within the specified class on the visited page
            article_elements = driver.find_elements(By.CSS_SELECTOR, '.ssrcss-7uxr49-RichTextContainer .ssrcss-1q0x1qg-Paragraph')
            
            # Debugging: Print the number of article elements found
            print("Number of article elements found:", len(article_elements))
            
            # Extract text from each article element and append to data list
            article_data = [article.text for article in article_elements]
            print("Article Data:", article_data)  # Debugging: Print article data
            data_list.append(article_data)
            # Go back to the previous page
            driver.back()
            # Re-find the elements after navigating back
            break
        else:
            print(f"Link already visited: {link_text}")

    # Break the loop if all links have been visited
    if len(already_read) == len(elements):
        break

# Close the WebDriver
driver.quit()

# Print or use the data list as needed
# print("Data List:", data_list)


In [11]:
print(data_list)

[]


In [12]:
# start = time.time()
# options = calling_options()
# driver = webdriver.Chrome(options=options)
# driver.get(random.choice(url_feed))
# # Add a sleep to simulate human-like interactions
# sleep(randint(2, 6))
# # Add your scraping logic here
# print(url_feed)

In [13]:
# mongodb+srv://aviraltanwar:AviralTanwar@cluster0.6xjhtom.mongodb.net/

# c:\\Users\\Aviral Tanwar\\AppData\\Local\\Programs\\Python\\Python310\\lib\\site-packages\\chromedriver_autoinstaller\\121\\chromedriver.exe