In [1]:
import copy
import os
import time
from datetime import timedelta
from urllib.parse import urlencode, urljoin
#!pip install beautifulsoup4 pandas selenium webdriver_manager
# -------------------------------------------------------------------------
#  LangChain Imports
# -------------------------------------------------------------------------
from langchain.chains import SequentialChain, LLMChain
from langchain.prompts import PromptTemplate
from langchain.llms import OpenAI  # Replace with any LLM provider
from langchain.output_parsers import RegexParser
# -------------------------------------------------------------------------
#  Web Scraping Imports
# -------------------------------------------------------------------------
from bs4 import BeautifulSoup
import requests
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
# -------------------------------------------------------------------------
# -------------------------------------------------------------------------

In [44]:
def get_latest_bullionvault_articles(URL="https://www.bullionvault.com/gold-news", timeout=30):
    """Scrape the BullionVault news listing and the text of every listed article.

    Parameters
    ----------
    URL : str
        Listing page to scrape.
    timeout : float
        Seconds ``requests`` waits for each HTTP response before giving up.

    Returns
    -------
    pandas.DataFrame
        Columns ``Date`` (``datetime.date``; ``NaT`` where the listed date
        could not be parsed) and ``Content`` (``"<title>:<body text>"``).
        When nothing could be scraped the frame is empty but still has both
        columns.
    """
    response = requests.get(URL, timeout=timeout)
    soup = BeautifulSoup(response.content, 'html.parser')
    grid = soup.find(id='views-bootstrap-grid-1')
    # find() returns None when the grid is absent; treat that as "no articles"
    # instead of failing on None.find_all.
    latest = grid.find_all(class_='field-content') if grid is not None else []

    list_data = []
    for item in latest:
        date = item.find(class_='views-field-created')
        if date is None:
            continue
        title_field = item.find(class_='views-field-title')
        anchor = title_field.find('a', href=True) if title_field is not None else None
        if anchor is None:
            # No usable link for this entry, so there is no article page to fetch.
            continue
        # urljoin leaves absolute hrefs untouched and resolves relative ones
        # against the listing URL.
        link = urljoin(URL, anchor['href'])
        page_response = requests.get(link, timeout=timeout)
        page_soup = BeautifulSoup(page_response.content, 'html.parser')
        content = page_soup.find('div', class_='field field-name-body field-type-text-with-summary field-label-hidden')
        heading = page_soup.find('h1')
        # Fall back to the listing's link text when the article page has no <h1>.
        title = heading.text.strip() if heading is not None else anchor.get_text(strip=True)
        content_text = content.text.strip() if content else ''
        list_data.append({'Date': date.text.strip(), 'Content': title + ':' + content_text})

    # Naming the columns keeps 'Date' present even when list_data is empty.
    list_df = pd.DataFrame(list_data, columns=['Date', 'Content'])
    if not list_df.empty:
        list_df['Date'] = pd.to_datetime(list_df['Date'], errors='coerce').dt.date
    return list_df

def yf_extract_info(item):
    """Load one Yahoo Finance story in headless Chrome and return its date and text.

    Parameters
    ----------
    item : bs4 element
        Listing entry that contains an ``<a class="subtle-link">`` carrying
        ``href`` and ``title`` attributes.

    Returns
    -------
    dict
        ``{'Date': ..., 'Content': '<title>:<body text>'}``. ``Date`` is the
        stripped text of the ``<time>`` tag inside the byline block, or
        ``'N/A'`` when the byline or the ``<time>`` tag is missing.

    Raises
    ------
    TypeError
        If the entry has no ``subtle-link`` anchor (``find`` returns ``None``).
    KeyError
        If the anchor lacks an ``href`` or ``title`` attribute.
    """
    anchor = item.find('a', class_='subtle-link')
    link = anchor['href']
    title = anchor['title']

    chrome_options = Options()
    chrome_options.add_argument("--headless")  # Run in headless mode
    chrome_options.add_argument("--disable-gpu")  # Disable GPU acceleration

    page_driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options)
    try:
        page_driver.get(link)
        page_source = page_driver.page_source
    finally:
        # Always shut the browser down, even when loading the page fails;
        # otherwise every failed article leaves a Chrome process behind.
        page_driver.quit()

    page_soup = BeautifulSoup(page_source, 'html.parser')
    content = page_soup.find('div', class_='body')
    content_text = content.text.strip() if content else ''
    byline = page_soup.find('div', class_=lambda c: c and c.startswith("byline"))
    date = byline.find('time') if byline is not None else None
    return {'Date': date.text.strip() if date else 'N/A', 'Content': title + ':' + content_text}

def get_latest_yf_articles(URL="https://finance.yahoo.com/news/", max_scrolls=2):
    """Scrape the Yahoo Finance news listing and each story it links to.

    Parameters
    ----------
    URL : str
        Listing page to open in headless Chrome.
    max_scrolls : int
        Upper bound on scroll-to-bottom steps used to trigger lazy loading.
        Scrolling also stops as soon as the page height stops growing.

    Returns
    -------
    pandas.DataFrame
        Columns ``Date`` and ``Content``, one row per story that
        ``yf_extract_info`` handled without raising. ``Date`` is left as the
        raw text returned by ``yf_extract_info``; rows whose ``Date`` is
        missing are dropped. The frame is empty (but keeps both columns) when
        no story could be extracted.
    """
    chrome_options = Options()
    chrome_options.add_argument("--headless")  # Run in headless mode
    chrome_options.add_argument("--disable-gpu")  # Disable GPU acceleration
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options)
    try:
        driver.get(URL)
        time.sleep(2)
        last_height = driver.execute_script("return document.body.scrollHeight")
        for _ in range(max_scrolls):
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(2)  # Wait for new content to load
            new_height = driver.execute_script("return document.body.scrollHeight")
            if new_height == last_height:
                break
            last_height = new_height
        listing_html = driver.page_source
    finally:
        # The listing browser is no longer needed once the HTML is captured;
        # closing it here also covers failures during load/scroll.
        driver.quit()

    soup = BeautifulSoup(listing_html, 'html.parser')
    articles = soup.find_all('li', class_='story-item')
    list_data = []
    for article in articles:
        try:
            list_data.append(yf_extract_info(article))
        except Exception as e:
            # Best effort: one broken story must not abort the whole scrape,
            # but report it so systematic failures are visible.
            print(f"Skipping Yahoo Finance article: {e}")

    # Naming the columns keeps 'Date' present even when list_data is empty.
    list_df = pd.DataFrame(list_data, columns=['Date', 'Content'])
    # dropna on the frame (not on the column Series) is what actually removes rows.
    return list_df.dropna(subset=['Date'])

def get_reuters_article_text(item, base_URL="https://www.reuters.com"):
    """Open one Reuters search hit in Chrome and return its date and text.

    Parameters
    ----------
    item : bs4 element
        Headline element from the search results; its text is used as the
        title and its first ``<a href=...>`` as the article link.
    base_URL : str
        Base used to resolve links that are not already absolute.

    Returns
    -------
    dict
        ``{'Date': <date-line text>, 'Content': '<title>:<paragraphs>'}`` with
        the article paragraphs joined by single spaces.

    Raises
    ------
    TypeError
        If the headline element contains no link.
    AttributeError
        If the article page has no ``date-line__date`` span.
    """
    title = item.get_text(strip=True)
    # urljoin keeps absolute links as they are and resolves relative ones,
    # including those without a leading slash, against base_URL.
    link = urljoin(base_URL, item.find('a', href=True)['href'])

    chrome_options = Options()
    driver = webdriver.Chrome(options=chrome_options)
    try:
        driver.get(link)
        time.sleep(2)
        page_source = driver.page_source
    finally:
        # Close the browser even when the page load raises.
        driver.quit()

    page_soup = BeautifulSoup(page_source, 'html.parser')
    paragraphs = page_soup.find_all("div", class_=lambda c: c and c.startswith("article-body__paragraph"))
    # Join with spaces so the text does not start with a stray "." and
    # paragraphs are not glued together.
    article_text = " ".join(para.get_text(strip=True) for para in paragraphs)
    date = page_soup.find("span", class_=lambda c: c and c.startswith("date-line__date")).get_text(strip=True)
    return {'Date': date, 'Content': title + ':' + article_text}

def get_reuters_articles_list(URL):
    """Scrape one Reuters search-results page and every article it lists.

    Parameters
    ----------
    URL : str
        Search-results page to open in Chrome.

    Returns
    -------
    pandas.DataFrame
        Columns ``Date`` (``datetime.date``; ``NaT`` where parsing failed) and
        ``Content``, one row per article that ``get_reuters_article_text``
        handled without raising. Empty (but with both columns) when the page
        yields no usable article.
    """
    chrome_options = Options()
    driver = webdriver.Chrome(options=chrome_options)
    try:
        driver.get(URL)
        time.sleep(2)
        listing_html = driver.page_source
    finally:
        # Close the browser even when the page load raises.
        driver.quit()

    soup = BeautifulSoup(listing_html, 'html.parser')
    articles = soup.find_all("div", class_=lambda c: c and c.startswith("story-card__area-headline"))
    list_data = []
    for article in articles:
        try:
            list_data.append(get_reuters_article_text(article))
        except Exception as e:
            # Best effort: skip the article but say why, so repeated failures
            # do not go unnoticed.
            print(f"Skipping Reuters article: {e}")

    # Naming the columns keeps 'Date' present even when list_data is empty,
    # so an empty results page no longer surfaces as a KeyError.
    list_df = pd.DataFrame(list_data, columns=['Date', 'Content'])
    if not list_df.empty:
        list_df['Date'] = pd.to_datetime(list_df['Date'], errors='coerce').dt.date
    return list_df

def get_reuters_articles(query="gold", sections=("all",), max_results=40, page_size=20):
    """Collect Reuters site-search results across sections and result pages.

    Parameters
    ----------
    query : str
        Search term; URL-encoded before being placed in the query string.
    sections : iterable of str
        Values for the ``section`` query parameter, one search per value.
    max_results : int
        Exclusive upper bound for the ``offset`` query parameter.
    page_size : int
        Step between successive ``offset`` values.

    Returns
    -------
    pandas.DataFrame
        Concatenation of every page returned by ``get_reuters_articles_list``,
        with a fresh integer index. Pages that raise are reported with
        ``print`` and skipped. An empty frame with columns ``Date`` and
        ``Content`` is returned when no page produced rows.
    """
    base_URL = "https://www.reuters.com"
    frames = []
    for section_val in sections:
        for offset_nb in range(0, max_results, page_size):
            params = urlencode({'query': query, 'offset': offset_nb, 'section': section_val})
            URL = f"{base_URL}/site-search/?{params}"
            try:
                frames.append(get_reuters_articles_list(URL))
            except Exception as e:
                print(f"Error fetching articles from {URL}: {e}")

    # Concatenate once at the end instead of growing a frame inside the loop;
    # empty pages are left out so they cannot influence the result dtypes.
    frames = [frame for frame in frames if not frame.empty]
    if not frames:
        return pd.DataFrame(columns=['Date', 'Content'])
    return pd.concat(frames, ignore_index=True)

In [45]:
# Step 1: Collect recent news from BullionVault, Yahoo Finance and Reuters
def extract_news_data(days_back=3):
    """Scrape all three news sources and keep only recent articles.

    Parameters
    ----------
    days_back : int
        Articles dated earlier than today minus this many days are dropped.

    Returns
    -------
    pandas.DataFrame
        Rows from the three scrapers combined, sorted by ``Date`` and
        filtered to ``Date >= today - days_back``. Index labels come from the
        combined frame and are not renumbered after sorting and filtering.
    """
    bullion_df = get_latest_bullionvault_articles()
    yf_df = get_latest_yf_articles()
    # The Yahoo scraper returns Date as raw text; convert it here so all three
    # frames carry comparable date values.
    yf_df['Date'] = pd.to_datetime(yf_df['Date'], errors='coerce').dt.date
    reuters_df = get_reuters_articles()
    cutoff = pd.to_datetime('today').date() - timedelta(days=days_back)

    df_combined = pd.concat([bullion_df, yf_df, reuters_df], ignore_index=True)
    df_combined = df_combined.sort_values(by='Date')
    return df_combined[df_combined['Date'] >= cutoff]
# Placeholder inputs: one hand-written sentence for the news path and one for
# the price path.
news_data, price_data = (
    "Oil prices surged due to Middle East tensions.",
    "RSI indicates strong momentum in gold stocks.",
)

# Step 2: Sentence pre-processing
# Each text is wrapped in a one-element list; in practice it would be split
# into sentences here.
news_sentences, price_sentences = [news_data], [price_data]

In [11]:
# Inspect the Yahoo Finance scrape on its own. `yf_df` otherwise exists only
# as a local variable inside extract_news_data(), so it is built here before
# being displayed. NOTE: this launches the Selenium scraper.
yf_df = get_latest_yf_articles()
yf_df.head()

In [None]:
# Step 3: Topic & sentiment extraction
# The prompt takes a single `sentence` variable and asks for its main subject
# plus a sentiment score.
prompt = PromptTemplate(
    template=(
        "Extract the main subject and sentiment score (between -1 to 1) "
        "from this sentence:\n{sentence}"
    ),
    input_variables=["sentence"],
)
# Chain the prompt to an OpenAI LLM created with temperature=0.
llm_chain = LLMChain(prompt=prompt, llm=OpenAI(temperature=0))

In [None]:
# Mocked stand-in for the LLM-based topic/sentiment extractor
def extract_topic_sentiment(sentences):
    """Return a canned ``{topic: sentiment}`` dict for ``sentences``.

    No model is called. The argument is only compared with the module-level
    ``news_sentences``: an equal value yields the news fixture, anything else
    yields the price fixture.
    """
    is_news_path = sentences == news_sentences
    if not is_news_path:
        return {"Gold": 0.85, "RSI": 0.6}
    return {"Oil": 0.92, "Middle East": 0.7}

# Step 4: Build one {topic: sentiment} dictionary per path (news first, then price)
news_topic_sentiment, price_topic_sentiment = map(
    extract_topic_sentiment, (news_sentences, price_sentences)
)

In [None]:
# Step 4 (LLM variant): run the extraction chain over every input sentence.
# NOTE(review): this cell previously used `topic_sentiment_chain` and
# `sentences`, neither of which is defined in the visible cells, so it raised
# NameError. `llm_chain` (built in Step 3) and the two sentence lists from
# Step 2 are assumed to be what was meant -- confirm.
sentences = news_sentences + price_sentences
results = [llm_chain.run(sentence=s) for s in sentences]
# Hand-written example of the {topic: sentiment} shape; it is not derived
# from `results`.
topic_sentiment_dict = {
    "Oil": 0.92,
    "Harvest": 0.02,
    "GDP": 0.45,
    "Boxoffice": 0.58
}

In [None]:
# Step 5: Custom model placeholder (simulated transformation)
def custom_model_output(ts_dict):
    """Return a new dict with every score in ``ts_dict`` multiplied by 1.1."""
    scaled = {}
    for topic, score in ts_dict.items():
        scaled[topic] = score * 1.1
    return scaled

# Step 6: Base model placeholder (simulated transformation)
def base_model_output(ts_dict):
    """Return a new dict with every score in ``ts_dict`` multiplied by 0.95."""
    return dict((topic, score * 0.95) for topic, score in ts_dict.items())

# Step 7: Meta-model using an ensemble strategy (simple averaging)
def meta_model(custom_out, base_out):
    """Average the two model outputs topic by topic.

    Only topics present in ``custom_out`` appear in the result; a topic
    missing from ``base_out`` contributes 0 to its average.
    """
    return {
        topic: (custom_score + base_out.get(topic, 0)) / 2
        for topic, custom_score in custom_out.items()
    }

In [None]:
# --- News path: custom model, base model, then their ensemble ---
news_custom, news_base = (
    custom_model_output(news_topic_sentiment),
    base_model_output(news_topic_sentiment),
)
news_ensemble = meta_model(news_custom, news_base)

# --- Price path: same three steps on the price-derived sentiments ---
price_custom, price_base = (
    custom_model_output(price_topic_sentiment),
    base_model_output(price_topic_sentiment),
)
price_ensemble = meta_model(price_custom, price_base)

In [None]:
# --- Final meta-model: combine the news and price ensembles ---
# Start from the news topics, then add the price topics; a topic present in
# both keeps the price value.
final_input = dict(news_ensemble)
final_input.update(price_ensemble)
# Optionally, another meta_model pass or further logic could run here.

for label, payload in (
    ("News Ensemble Output:", news_ensemble),
    ("Price Ensemble Output:", price_ensemble),
    ("Final Combined Output:", final_input),
):
    print(label, payload)