# In [1]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import csv
import time
import re
from datetime import datetime

# Location of the ChromeDriver executable on the machine running this script.
CHROMEDRIVER_PATH = 'C:\\Users\\Avula Jhansy\\Downloads\\chromedriver_win32\\chromedriver.exe'

# Browser options: "--headless" is the only flag passed to Chrome.
chrome_options = Options()
chrome_options.add_argument("--headless")

# Shared browser session used by the scraping code below.
service = Service(CHROMEDRIVER_PATH)
driver = webdriver.Chrome(service=service, options=chrome_options)

def parse_date(date_str):
    """Convert a timestamp string into a naive ``datetime``.

    Args:
        date_str: Text in the form ``"Jun 15, 2024 10:30"`` (abbreviated
            month name, day, four-digit year, 24-hour ``HH:MM``).

    Returns:
        The parsed ``datetime`` (no timezone attached).

    Raises:
        ValueError: If ``date_str`` does not match the expected format.
    """
    timestamp_format = "%b %d, %Y %H:%M"
    parsed = datetime.strptime(date_str, timestamp_format)
    return parsed

def scrape_articles(url, start_date=datetime(2024, 6, 15), end_date=datetime(2024, 8, 2)):
    """Collect articles from a listing page whose timestamps fall in a window.

    Opens ``url`` in the module-level ``driver``, reads every
    ``.newsItem___wZtKx`` element currently on the page, then clicks the
    ``.footer___PvIjk`` control to load more and repeats. Each distinct
    article is handled once, even though every pass re-reads the whole page.

    Scraping stops when:
      * an article older than ``start_date`` is met (this assumes the listing
        is ordered newest first -- TODO confirm against the site),
      * a pass over the page yields no article that was not already seen,
      * the "more news" control cannot be found or clicked, or
      * waiting for articles fails.

    Args:
        url: Address of the listing page to open.
        start_date: Inclusive lower bound for an article's timestamp.
        end_date: Inclusive upper bound for an article's timestamp. The
            default is midnight at the start of 2024-08-02, so articles
            published later on that day are outside the window.

    Returns:
        A list of ``[title, description, link, date_time_text]`` rows, in the
        order the articles were encountered.
    """
    driver.get(url)
    all_articles = []
    # Rows already handled on an earlier pass. Each pass re-queries every
    # article element on the page, so without this the same article would be
    # appended again after each "more news" click.
    seen_rows = set()

    while True:
        try:
            # Wait for articles to be present
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, '.newsItem___wZtKx'))
            )

            new_rows = 0
            articles = driver.find_elements(By.CSS_SELECTOR, '.newsItem___wZtKx')
            for article in articles:
                try:
                    title_element = article.find_element(By.CSS_SELECTOR, '.title___1baLV')
                    title = title_element.text
                    description = article.find_element(By.CSS_SELECTOR, '.description___z7ktb').text
                    # The link is read from the title's parent element.
                    link = title_element.find_element(By.XPATH, '..').get_attribute('href')
                    date_time_text = article.find_element(By.CSS_SELECTOR, '.date___3dzkE').text

                    row = (title, description, link, date_time_text)
                    if row in seen_rows:
                        continue
                    # Mark as seen before parsing so a row with an unparsable
                    # date is reported once rather than on every pass.
                    seen_rows.add(row)
                    new_rows += 1

                    date_time = parse_date(date_time_text)

                    if start_date <= date_time <= end_date:
                        all_articles.append(list(row))
                    elif date_time < start_date:
                        # If we have gone before the start date, stop scraping
                        return all_articles

                except Exception as e:
                    # Best effort: a single broken article must not end the run.
                    print(f"Error extracting data from article: {e}")

            if new_rows == 0:
                # The last click loaded nothing new; clicking again would
                # loop forever without ever reaching start_date.
                print("No new articles were loaded; stopping.")
                break

            try:
                more_news_button = WebDriverWait(driver, 10).until(
                    EC.element_to_be_clickable((By.CSS_SELECTOR, '.footer___PvIjk'))
                )
                ActionChains(driver).move_to_element(more_news_button).click().perform()
                time.sleep(2)  # Wait for more news to load
            except Exception as e:
                print(f"Error finding or clicking 'more news' button: {e}")
                break

        except Exception as e:
            print(f"Error during scraping: {e}")
            break

    return all_articles

def save_to_csv(data, filename):
    """Write article rows to a UTF-8 CSV file, replacing any existing file.

    Args:
        data: Iterable of rows; each row is a sequence of values matching the
            header columns Title, Description, Link, Date and Time.
        filename: Path of the CSV file to create or overwrite.
    """
    header = ['Title', 'Description', 'Link', 'Date and Time']
    # newline='' leaves line endings to the csv module.
    with open(filename, 'w', newline='', encoding='utf-8') as handle:
        out = csv.writer(handle)
        out.writerow(header)
        for row in data:
            out.writerow(row)


# Scrape the listing page and store the matching articles as CSV.
url = 'https://news.metal.com/list/industry/aluminium'
try:
    articles = scrape_articles(url)
    save_to_csv(articles, 'articles_data.csv')
finally:
    # Close the browser even when scraping or saving raises, so no Chrome
    # process is left running.
    driver.quit()
