## Actalyst AI Engineer Task

#### Step 01: Extract the relevant data for the aluminium industry (specifically the Title, Summary and Date of each news article)
#### Step 02: Convert the scraped data to vector embeddings using text-embedding-ada-002
#### Step 03: Create a Streamlit app that loads the vector embeddings
#### Step 04: Deployment

## This code extracts the news from the last 45 days, as well as the latest updates

In [32]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
import time
import csv
from datetime import datetime

# Start a Chrome session: webdriver_manager supplies the chromedriver that
# the Selenium Service wraps, then the browser is pointed at the news listing.
chromedriver_path = ChromeDriverManager().install()
service = Service(chromedriver_path)
driver = webdriver.Chrome(service=service)

# Open the aluminium industry news page that the rest of the cell scrapes
driver.get("https://news.metal.com/list/industry/aluminium")

# Scraping window: both bounds are written in the same "%b %d, %Y %H:%M"
# layout that parse_date expects, and are parsed once into datetime objects.
start_date_str = "Jun 18, 2024 07:00"
end_date_str = "Aug 2, 2024 14:41"
start_date, end_date = (
    datetime.strptime(stamp, "%b %d, %Y %H:%M")
    for stamp in (start_date_str, end_date_str)
)

# Function to load news by clicking the "Load More" button until a certain date
def load_news(driver, stop_date, *, wait_seconds=2, max_stalled_clicks=3):
    """Click "Load More" until the oldest loaded article predates ``stop_date``.

    After every click the function waits, re-reads the title and date
    elements from the page and tracks the earliest article date seen so far.
    Loading stops when that date is earlier than ``stop_date``, when the page
    shows no date elements at all, when several consecutive clicks add no new
    articles, or when any WebDriver error occurs (best effort: whatever has
    been loaded so far stays on the page for the caller to extract).

    Args:
        driver: Selenium WebDriver with the news listing already open.
        stop_date: ``datetime``; loading stops once an article older than
            this has been loaded.
        wait_seconds: Pause after each click so new content can render.
        max_stalled_clicks: Number of consecutive clicks that may add no new
            date elements before the loop gives up. This prevents an endless
            loop when the button is still present but the feed is exhausted.

    Returns:
        The number of title elements found on the last pass that loaded new
        articles (0 if no pass did).
    """
    articles_loaded = 0
    last_article_date = datetime.max  # Sentinel later than any real article
    dates_parsed = 0  # Date elements already folded into last_article_date
    stalled_clicks = 0

    while True:
        # Broad catch is deliberate: a missing button, a stale element or any
        # other WebDriver failure simply ends the loading phase.
        try:
            # Find and click the "Load More" button
            load_more_button = driver.find_element(By.CSS_SELECTOR, ".footer___PvIjk")
            load_more_button.click()
            # Wait for the new content to load
            time.sleep(wait_seconds)

            titles = driver.find_elements(By.CSS_SELECTOR, ".title___1baLV")
            dates = driver.find_elements(By.CSS_SELECTOR, ".date___3dzkE")
            if not dates:
                break

            # No growth since the previous pass: retry a few times, then stop
            # instead of clicking forever.
            if len(dates) <= dates_parsed:
                stalled_clicks += 1
                if stalled_clicks >= max_stalled_clicks:
                    print("No new articles were loaded; stopping.")
                    break
                continue
            stalled_clicks = 0

            # Only read the elements added by this click. Each ``.text`` is a
            # WebDriver round trip, so re-reading every element on every pass
            # grows quadratically with the number of articles.
            # NOTE(review): assumes new articles are appended after the ones
            # already on the page - confirm against the site.
            for date_element in dates[dates_parsed:]:
                article_date = parse_date(date_element.text)
                if article_date is not None:
                    last_article_date = min(last_article_date, article_date)
            dates_parsed = len(dates)
            articles_loaded = len(titles)

            # Stop loading more if the last article date is before the stop_date
            if last_article_date < stop_date:
                break

            print(f"Articles loaded: {articles_loaded}, Last article date: {last_article_date}")

        except Exception as e:
            print(f"An error occurred while loading more articles: {e}")
            break

    return articles_loaded

# Function to parse date from the string
def parse_date(date_str):
    """Parse a date such as ``"Jun 18, 2024 07:00"`` into a ``datetime``.

    Leading and trailing whitespace is ignored, so text scraped with stray
    spaces or newlines still parses.

    Args:
        date_str: Date text in the ``"%b %d, %Y %H:%M"`` layout.

    Returns:
        The parsed ``datetime``, or ``None`` when ``date_str`` is not a
        string or does not match the expected layout.
    """
    # Anything that is not text (e.g. None) would make strptime raise
    # TypeError, which callers do not expect; treat it as "unparseable".
    if not isinstance(date_str, str):
        return None
    try:
        # Adjust the format according to how the date is represented on the website
        return datetime.strptime(date_str.strip(), "%b %d, %Y %H:%M")
    except ValueError:
        return None

# Load, extract and save the news articles. Everything runs inside try/finally
# so the browser is closed even when a step raises.
try:
    # Load all news articles back to the start of the date range
    load_news(driver, start_date)

    # Extract the news articles
    titles = driver.find_elements(By.CSS_SELECTOR, ".title___1baLV")
    summaries = driver.find_elements(By.CSS_SELECTOR, ".description___z7ktb.descriptionspec___lj3uG")
    dates = driver.find_elements(By.CSS_SELECTOR, ".date___3dzkE")

    # Print lengths for debugging
    print(f"Titles length: {len(titles)}")
    print(f"Summaries length: {len(summaries)}")
    print(f"Dates length: {len(dates)}")

    # Rows are paired purely by position, so differing counts mean at least
    # one article lacks a field and later rows may be shifted.
    if not (len(titles) == len(summaries) == len(dates)):
        print("Warning: title/summary/date counts differ; rows may be misaligned.")

    # Filter and prepare the data for CSV
    news_data = []
    for title_element, summary_element, date_element in zip(titles, summaries, dates):
        date_str = date_element.text
        article_date = parse_date(date_str)

        # Keep only articles whose date lies within [start_date, end_date];
        # articles with an unparseable date are skipped.
        if article_date is not None and start_date <= article_date <= end_date:
            news_data.append([title_element.text, summary_element.text, date_str])

    # Save the data to a CSV file
    csv_file = "news_data_filtered.csv"
    if news_data:
        with open(csv_file, mode='w', newline='', encoding='utf-8') as file:
            writer = csv.writer(file)
            writer.writerow(["Title", "Summary", "Date"])
            writer.writerows(news_data)
        print(f"Data saved to {csv_file}")
    else:
        print("No data found for the specified date range.")
finally:
    # Close the driver
    driver.quit()


Articles loaded: 20, Last article date: 2024-07-30 13:28:00
Articles loaded: 30, Last article date: 2024-07-26 16:54:00
Articles loaded: 40, Last article date: 2024-07-25 06:56:00
Articles loaded: 50, Last article date: 2024-07-23 07:14:00
Articles loaded: 60, Last article date: 2024-07-22 07:04:00
Articles loaded: 70, Last article date: 2024-07-18 13:44:00
Articles loaded: 80, Last article date: 2024-07-16 07:26:00
Articles loaded: 90, Last article date: 2024-07-12 07:11:00
Articles loaded: 100, Last article date: 2024-07-10 07:11:00
Articles loaded: 110, Last article date: 2024-07-05 14:30:00
Articles loaded: 120, Last article date: 2024-07-03 11:36:00
Articles loaded: 130, Last article date: 2024-07-02 07:18:00
Articles loaded: 140, Last article date: 2024-06-28 07:17:00
Articles loaded: 150, Last article date: 2024-06-26 08:54:00
Articles loaded: 160, Last article date: 2024-06-24 07:31:00
Articles loaded: 170, Last article date: 2024-06-20 11:07:00
Articles loaded: 180, Last artic

### Code to generate Data Embeddings

In [40]:
import openai
import json
import os
from tqdm import tqdm
import numpy as np
from dotenv import load_dotenv

# Pull variables from a local .env file (if any) into the process environment
load_dotenv()

# The OpenAI client is configured from the OPENAI_API_KEY environment variable
openai.api_key = os.getenv('OPENAI_API_KEY')

# Read the scraped records that will receive embeddings
with open('metadata.json', 'r') as f:
    metadata = json.loads(f.read())

# Function to generate embeddings
def get_embedding(text):
    """Request an embedding for ``text`` from the text-embedding-ada-002 model.

    Args:
        text: Input passed unchanged as the ``input`` of the embedding request.

    Returns:
        The ``embedding`` field of the first entry in the response's ``data``
        list (presumably a list of floats - confirm against the API).
    """
    request = {"input": text, "model": "text-embedding-ada-002"}
    response = openai.Embedding.create(**request)
    first_result = response['data'][0]
    return first_result['embedding']

# Attach an embedding of each record's summary; tqdm shows the progress bar
for item in tqdm(metadata):
    item.update(embedding=get_embedding(item['summary']))

# Write the records, now carrying their embeddings, back out as JSON
with open('embeddings_with_metadata.json', 'w') as f:
    f.write(json.dumps(metadata))


100%|████████████████████████████████████████████████████████████████████████████████| 187/187 [01:07<00:00,  2.75it/s]


#### The Streamlit code is in the file app.py in the project directory