# Selenium

In [1]:
# Import packages
from selenium import webdriver
from chromedriver_autoinstaller import install
install(True)
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import ElementClickInterceptedException
from selenium.common.exceptions import ElementNotInteractableException

import time, re
import pandas as pd

In [2]:
# Open the browser
driver = webdriver.Chrome()

# Open Google News
driver.get("https://news.google.com/")

# Set default waiting time (implicit wait applied to every element lookup)
driver.implicitly_wait(0.5)

# Reject cookies
# The consent dialog is not always shown, so a missing or unclickable element
# is expected and must not abort the notebook. Only the Selenium lookup/click
# errors are tolerated; anything else (e.g. KeyboardInterrupt, a crashed
# browser session) still surfaces instead of being silently swallowed.
try:
    proceed_to_webpage_button = driver.find_element(By.XPATH, "//*[@id='yDmH0d']/c-wiz/div/div/div/div[2]/div[1]/div[3]/div[1]/div[1]/form[1]")
    proceed_to_webpage_button.click()
except (
    NoSuchElementException,
    ElementClickInterceptedException,
    ElementNotInteractableException,
) as consent_error:
    print(f"Cookie dialog not dismissed ({type(consent_error).__name__}); continuing.")

In [3]:
# Gather every anchor inside the navigation bar container and, when at least
# one was found, report the count followed by each anchor's label and target.
links = driver.find_elements(By.CSS_SELECTOR, ".EctEBd a")
if len(links) > 0:
    print(f"Selenium found: {len(links)} links:")
    for anchor in links:
        # Entries without an href attribute are printed with None as the target
        print(anchor.text, anchor.get_attribute("href"))

Selenium found: 13 links:
Home https://news.google.com/home?hl=en-US&gl=US&ceid=US%3Aen
For you None
Following None
News Showcase https://news.google.com/showcase?hl=en-US&gl=US&ceid=US%3Aen
U.S. https://news.google.com/topics/CAAqIggKIhxDQkFTRHdvSkwyMHZNRGxqTjNjd0VnSmxiaWdBUAE?hl=en-US&gl=US&ceid=US%3Aen
World https://news.google.com/topics/CAAqJggKIiBDQkFTRWdvSUwyMHZNRGx1YlY4U0FtVnVHZ0pWVXlnQVAB?hl=en-US&gl=US&ceid=US%3Aen
Local https://news.google.com/topics/CAAqHAgKIhZDQklTQ2pvSWJHOWpZV3hmZGpJb0FBUAE?hl=en-US&gl=US&ceid=US%3Aen
Business https://news.google.com/topics/CAAqJggKIiBDQkFTRWdvSUwyMHZNRGx6TVdZU0FtVnVHZ0pWVXlnQVAB?hl=en-US&gl=US&ceid=US%3Aen
Technology https://news.google.com/topics/CAAqJggKIiBDQkFTRWdvSUwyMHZNRGRqTVhZU0FtVnVHZ0pWVXlnQVAB?hl=en-US&gl=US&ceid=US%3Aen
Entertainment https://news.google.com/topics/CAAqJggKIiBDQkFTRWdvSUwyMHZNREpxYW5RU0FtVnVHZ0pWVXlnQVAB?hl=en-US&gl=US&ceid=US%3Aen
Sports https://news.google.com/topics/CAAqJggKIiBDQkFTRWdvSUwyMHZNRFp1ZEdvU0FtVn

In [4]:
# Navigate to U.S. topic
# First attempt: a regular click on the navigation link.
try:
    topic = driver.find_element(By.XPATH, "//*[@id='gb']/div[3]/div/c-wiz/div[1]/div[6]/a")
    topic.click()
except (
    NoSuchElementException,
    ElementClickInterceptedException,
    ElementNotInteractableException,
):
    # Fallback: scroll the entry into view and click it through JavaScript,
    # which bypasses overlays that intercept a native click.
    # NOTE(review): the fallback targets div[5] while the primary path targets
    # div[6]/a — confirm this is the intended element.
    topic = driver.find_element(By.XPATH, "//*[@id='gb']/div[3]/div/c-wiz/div[1]/div[5]")
    driver.execute_script("arguments[0].scrollIntoView({block: 'center'});", topic)
    driver.execute_script("arguments[0].click();", topic)

# Wait up to 10 seconds for U.S. topic url to load
wait = WebDriverWait(driver, 10)
wait.until(EC.url_contains("/topics/CAAqIggKIhxDQkFTRHdvSkwyMHZNRGxqTjNjd0VnSmxiaWdBUAE?hl=en-US&gl=US&ceid=US%3Aen"))
print(driver.current_url)

# Wait for 10 seconds (presumably to let the topic page finish rendering
# before it is scraped)
time.sleep(10)

https://news.google.com/topics/CAAqIggKIhxDQkFTRHdvSkwyMHZNRGxqTjNjd0VnSmxiaWdBUAE?hl=en-US&gl=US&ceid=US%3Aen


In [5]:
# Count the amount of articles on the webpage
article_count = len(driver.find_elements(By.CSS_SELECTOR, ".IBr9hb"))

# Create list to store articles (one dict per article)
data = []
# Loop through each article on the page
for i in range(article_count):
    # Re-locate the article list on every pass and pick the i-th entry
    # (presumably to avoid stale element references if the page re-renders).
    articles = driver.find_elements(By.CSS_SELECTOR, ".IBr9hb")
    if i >= len(articles):
        # The page now holds fewer articles than were counted; stop cleanly
        # instead of raising IndexError.
        break
    article = articles[i]

    # Locate article url
    link = article.find_element(By.TAG_NAME, "a")
    # Extract article url address
    href = link.get_attribute("href")

    # Wait for 2 seconds
    time.sleep(2)

    # Extract the news source; some cards have none, so fall back to "".
    try:
        news_source = article.find_element(By.CSS_SELECTOR, ".vr1PYe").text
    except NoSuchElementException:
        news_source = ""

    # Extract the article title. The fallback is set explicitly so a card
    # without a title never inherits the title of the previous article.
    try:
        title_text = article.find_element(By.CSS_SELECTOR, ".IBr9hb .gPFEn").text
    except NoSuchElementException:
        title_text = ""

    # Append the extracted data to the data list
    data.append({
        "title": title_text,
        "link": href,
        "news_source": news_source
    })

# Convert data list to pandas DataFrame; the explicit column list keeps the
# schema stable even when no articles were found.
df = pd.DataFrame(data, columns=["title", "link", "news_source"])
# Show pandas DataFrame
df

Unnamed: 0,title,link,news_source
0,,https://news.google.com/read/CBMikgFBVV95cUxPV...,
1,,https://news.google.com/read/CBMigAFBVV95cUxNd...,
2,Judges lash out at Justice Department for stil...,https://news.google.com/read/CBMiqgFBVV95cUxNT...,CNN
3,Supreme Court allows Texas GOP to use new redi...,https://news.google.com/read/CBMic0FVX3lxTE00V...,CBS News
4,Second Strike Scrutiny Obscures Larger Questio...,https://news.google.com/read/CBMihgFBVV95cUxPN...,The New York Times
5,"Who is the Jan. 6 pipe bomb suspect? And, lawm...",https://news.google.com/read/CBMiwwFBVV95cUxOW...,NPR
6,"911 calls from Kerrville, Texas Hill Country f...",https://news.google.com/read/CBMinAFBVV95cUxNT...,NBC News
7,Trump ventures deeper into anti-immigrant lang...,https://news.google.com/read/CBMirAFBVV95cUxQW...,AP News
8,‘Cultivate resistance’: policy paper lays bare...,https://news.google.com/read/CBMi5AFBVV95cUxOc...,The Guardian
9,Triple polar vortex sends temperatures plungin...,https://news.google.com/read/CBMi_wFBVV95cUxOQ...,Yahoo


In [6]:
# Remove rows which contain neither a title nor a news source.
# Filtering on content instead of dropping the first two rows by position
# keeps this cell idempotent (re-running it does not discard further rows)
# and does not depend on where the blank rows happen to appear.
if not df.empty:
    blank_title = df["title"].fillna("").astype(str).str.strip().eq("")
    blank_source = df["news_source"].fillna("").astype(str).str.strip().eq("")
    df = df.loc[~(blank_title & blank_source)]
# Show pandas Dataframe
df

Unnamed: 0,title,link,news_source
2,Judges lash out at Justice Department for stil...,https://news.google.com/read/CBMiqgFBVV95cUxNT...,CNN
3,Supreme Court allows Texas GOP to use new redi...,https://news.google.com/read/CBMic0FVX3lxTE00V...,CBS News
4,Second Strike Scrutiny Obscures Larger Questio...,https://news.google.com/read/CBMihgFBVV95cUxPN...,The New York Times
5,"Who is the Jan. 6 pipe bomb suspect? And, lawm...",https://news.google.com/read/CBMiwwFBVV95cUxOW...,NPR
6,"911 calls from Kerrville, Texas Hill Country f...",https://news.google.com/read/CBMinAFBVV95cUxNT...,NBC News
7,Trump ventures deeper into anti-immigrant lang...,https://news.google.com/read/CBMirAFBVV95cUxQW...,AP News
8,‘Cultivate resistance’: policy paper lays bare...,https://news.google.com/read/CBMi5AFBVV95cUxOc...,The Guardian
9,Triple polar vortex sends temperatures plungin...,https://news.google.com/read/CBMi_wFBVV95cUxOQ...,Yahoo
10,Speaker Johnson under attack from his own as m...,https://news.google.com/read/CBMijgFBVV95cUxOM...,CNN
11,‘Signalgate’ report contradicts Hegseth’s clai...,https://news.google.com/read/CBMinwFBVV95cUxQa...,The Washington Post


In [7]:
# Export the article table to disk as a UTF-8 encoded CSV, omitting the index column
df.to_csv(path_or_buf='google_news_articles.csv', encoding='utf-8', index=False)