# In [41]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
from datetime import datetime, timedelta
import pandas as pd
import time

# Listing page whose articles are scraped
url = "https://news.metal.com/list/industry/aluminium"

# Set up Selenium WebDriver (you may need to specify the path to your chromedriver)
driver = webdriver.Chrome()

# Everything that uses the browser runs inside try/finally so the Chrome
# process is always shut down, even if loading or scrolling raises.
try:
    # Open the URL
    driver.get(url)

    # Scroll to the bottom repeatedly so the page loads more articles; stop
    # once a scroll no longer increases the document height.
    # NOTE(review): there is no upper bound on the number of iterations, so a
    # feed that keeps growing would keep this loop running.
    last_height = driver.execute_script("return document.body.scrollHeight")
    while True:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(2)  # Wait for the page to load
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height

    # Parse the loaded content with BeautifulSoup while the browser is still open
    soup = BeautifulSoup(driver.page_source, 'html.parser')
finally:
    driver.quit()  # Close the browser

# Collect every article container found in the parsed page
articles = soup.find_all('div', class_='news___3BmE1')

# Cut-off timestamp: articles dated before this moment are left out
date_45_days_ago = datetime.now() - timedelta(days=45)

# One dict (title, summary, date) per retained article
news_data = []

for article in articles:
    # Look up the three elements each record is built from
    title_tag = article.find('h2', class_="titleText___1IQOs")
    summary_tag = article.find('p', class_='timeDsc___1i4u2')
    date_tag = article.find('p', class_='timeTxt___2YE-p')

    # An article missing any of the three elements is ignored
    if not title_tag or not summary_tag or not date_tag:
        continue

    headline = title_tag.get_text(strip=True)
    blurb = summary_tag.get_text(strip=True)
    raw_date = date_tag.get_text(strip=True)

    # Articles whose date text does not match the expected format are skipped
    try:
        published = datetime.strptime(raw_date, '%b %d, %Y %H:%M')
    except ValueError:
        continue

    # Too old: outside the 45-day window
    if published < date_45_days_ago:
        continue

    record = {
        'title': headline,
        'summary': blurb,
        'date': published.strftime('%b %d, %Y %H:%M'),
    }
    news_data.append(record)

# Convert the list to a DataFrame. The columns are named explicitly so that an
# empty news_data still yields a CSV with a header row; without it the file
# would be blank and a later pd.read_csv on it would raise EmptyDataError.
news_df = pd.DataFrame(news_data, columns=['title', 'summary', 'date'])

# Save the DataFrame to a CSV file (row index omitted)
news_df.to_csv('aluminium_news.csv', index=False)

print("Data extraction completed and saved to 'aluminium_news.csv'")


# Output: Data extraction completed and saved to 'aluminium_news.csv'


# In [43]:
import openai
import pandas as pd
import numpy as np
import requests
import os
from dotenv import load_dotenv

# Load variables from a .env file into the process environment.
# NOTE(review): nothing visible in this cell reads an environment variable
# afterwards (the API key is fetched over HTTP below) — confirm this call is
# still needed.
load_dotenv()

def fetch_api_key(url="http://52.66.239.27:8504/get_keys",
                  email="aditi.baggu_2025@woxsen.edu.in",
                  timeout=30):
    """Fetch the OpenAI API key from the key-distribution service.

    Sends a POST request with a JSON body ``{"email": <email>}`` and returns
    the value stored under ``'key'`` in the JSON response.

    Args:
        url: Endpoint of the key service.
        email: Address sent to the service in the request body.
        timeout: Seconds to wait for the service before giving up, so an
            unresponsive server cannot hang the notebook indefinitely.

    Returns:
        The API key string.

    Raises:
        RuntimeError: If the service answers with a non-200 status, or the
            response carries no ``'key'`` value.
        requests.RequestException: If the request fails or times out.

    NOTE(review): the default URL uses plain ``http``, so the key is returned
    over an unencrypted connection — consider an HTTPS endpoint.
    """
    response = requests.post(url, json={"email": email}, timeout=timeout)
    if response.status_code != 200:
        raise RuntimeError(
            f"Failed to fetch API key (HTTP {response.status_code})"
        )

    key = response.json().get('key')
    if not key:
        # Fail here rather than let a None key cause an authentication error
        # later, far from the actual cause.
        raise RuntimeError("Failed to fetch API key: no 'key' in response")
    return key

# Set the OpenAI API key
openai.api_key = fetch_api_key()

# Load the scraped data
news_df = pd.read_csv('aluminium_news.csv')

# A blank summary field in the CSV is read back as NaN, which cannot be sent
# to the embedding endpoint. Drop such rows up front instead of failing
# part-way through the loop after earlier requests have already been made.
news_df = news_df.dropna(subset=['summary'])

# Generate one embedding per news summary (one API request each)
embeddings = [
    openai.Embedding.create(
        model="text-embedding-ada-002",
        input=str(summary),
    )['data'][0]['embedding']
    for summary in news_df['summary']
]

# Add embeddings to the DataFrame (list is in the same row order as news_df)
news_df['embedding'] = embeddings

# Save the DataFrame, including the embedding column, to a pickle file
news_df.to_pickle('aluminium_news_embeddings.pkl')
