## **MODULAR REAL TIME MODELING FOR FINANCIAL INSIGHT GENERATION**

In [None]:
import requests
import os
from dotenv import load_dotenv
import pandas as pd
import numpy as np
from datetime import datetime
import time

import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize

import cohere
import faiss
from tqdm import tqdm
import random

import praw
import re
from datasets import Dataset
import sys
from src.utils import clean_corpus, refine_corpus
from src.scraping_module import *

import logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger()

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/emmanueladeleye/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
2026-01-15 02:20:20,807 - INFO - Loading faiss.
2026-01-15 02:20:20,846 - INFO - Successfully loaded faiss.
2026-01-15 02:20:20,850 - INFO - Failed to load GPU Faiss: name 'GpuIndexIVFFlat' is not defined. Will not load constructor refs for GPU indexes.
2026-01-15 02:20:20,981 - INFO - TensorFlow version 2.17.0 available.


Pulling all news source

In [2]:
logger.info("Starting the news pull")

scraper = UnifiedFinancialScraper(query="stocks")
result_df = scraper.fetch_all()
logger.info(f"Scraping complete | Total articles fetched: {len(result_df)}")
logger.info("Commencing cleaning and refining the corpus")
result_df['full_response'] = " Source: " + result_df['source'] + "\n" + result_df['title'] + '\n' + result_df['content']

texts_list = result_df['title'] + '\n' + result_df['content']

# Stack them all into a single string
all_text = "\n".join(texts_list)     
logger.info(f"News pull complete | Total articles fetched: {len(result_df)}")

2026-01-15 02:20:22,585 - INFO - Starting the news pull
2026-01-15 02:20:34,261 - ERROR - Error fetching from https://finance.yahoo.com/news/rssindex: object has no attribute 'summary'
2026-01-15 02:20:34,577 - INFO - Fetched 14 RSS articles
2026-01-15 02:20:35,424 - INFO - Fetched 99 Google News articles
2026-01-15 02:20:36,237 - INFO - Fetched 10 Finnhub articles
2026-01-15 02:20:36,565 - INFO - Fetched 0 Alpha Vantage articles
2026-01-15 02:20:37,108 - INFO - Fetched 100 NewsAPI articles
2026-01-15 02:20:42,557 - INFO - Fetched 149 Reddit posts
2026-01-15 02:20:42,565 - INFO - Total articles before deduplication: 372
2026-01-15 02:20:42,602 - INFO - Total articles after deduplication: 305
2026-01-15 02:20:42,604 - INFO - Scraping complete | Total articles fetched: 305
2026-01-15 02:20:42,605 - INFO - Commencing cleaning and refining the corpus
2026-01-15 02:20:42,606 - INFO - News pull complete | Total articles fetched: 305


                                                 title   source  quality_score
186  Cerebras in discussions to raise about $1B at ...  NewsAPI           0.75
178  Mangoceuticals files to sell 2.64M shares of c...  NewsAPI           0.75
151           Citi initiated with bullish view at CICC  NewsAPI           0.75
152                        Magpul Introdces New Colors  NewsAPI           0.75
153  Bytes Technology just downgraded at Jefferies,...  NewsAPI           0.75
154  RBI's $10-billion forex swap gets thrice the bids  NewsAPI           0.75
155  Closing Bell Movers: Glaukos slips 13% after p...  NewsAPI           0.75
156  Stock market today: Dow, S&P 500, Nasdaq futur...  NewsAPI           0.75
157  Stock market today: Dow, S&P 500, Nasdaq futur...  NewsAPI           0.75
158  Stock market today: Dow, S&P 500, Nasdaq slide...  NewsAPI           0.75
source
NewsAPI                      99
Reddit r/investing           30
Reddit r/stocks              29
Reddit r/options            

News source distribution

In [3]:
pd.Series(result_df["source"].value_counts())

source
NewsAPI                      99
Reddit r/investing           30
Reddit r/stocks              29
Reddit r/options             27
Reddit r/StockMarket         24
Reddit r/algotrading         15
CNBC                         12
Reddit r/fatFIRE             11
The Motley Fool               8
Reddit r/economics            7
Yahoo Finance                 6
MarketWatch                   6
Bloomberg                     3
wsj_markets                   2
Seeking Alpha                 2
Nasdaq                        2
bloomberg_markets             2
cnbc                          2
ft_markets                    2
Barron's                      2
The Globe and Mail            1
Investor's Business Daily     1
bloomberg_economics           1
bloomberg_bview               1
Business Insider              1
Financial Times               1
Morningstar Canada            1
Investopedia                  1
South China Morning Post      1
Politico                      1
ft_companies                  1
E

In [5]:
cleaned_corpus = refine_corpus(all_text)
logger.info(
    "Corpus refined | original=%d | cleaned=%d",
    len(all_text),
    len(cleaned_corpus)
)

2026-01-15 02:22:16,528 - INFO - Corpus refined | original=214213 | cleaned=1035


#### **Search Option 1 - Cohere Embeddings**

In [6]:
load_dotenv()
cohere_api = os.getenv("cohere_api")
co = cohere.Client(cohere_api)
openai_api = os.getenv("openai_api")

In [7]:
def embed_with_backoff(texts, batch_size=90, sleep_time=1):
    embeddings = []
    total_texts = len(texts)
    logger.info("Starting embedding with backoff | total_texts=%d", total_texts)
    for i in tqdm(range(0, total_texts, batch_size)):
        batch_texts = texts[i:i+batch_size]
        success = False
        while not success:
            try:
                response = co.embed(texts=batch_texts, input_type='search_document')
                embeddings.extend(response.embeddings)
                success = True
                logger.info(f"Successfully embedded batch starting from {i} to {i+len(batch_texts)}")
                time.sleep(2) # to respect rate limiting
            except Exception as e:
                logger.warning("Embedding failed for batch starting at index %d: %s. Retrying after %d seconds.", i, str(e), sleep_time)
                time.sleep(sleep_time)
    return embeddings

def search(s_query, number_of_results):
    query_embed = embed_with_backoff([s_query])[0]
    query_vc_np = np.array([query_embed], dtype='float32')
    dimension = query_vc_np.shape[1]
    index = faiss.IndexFlatL2(dimension)
    corpus_embeddings = np.array(embed_with_backoff(cleaned_corpus), dtype='float32')
    index.add(corpus_embeddings)
    distances, similar_items_ids = index.search(query_vc_np, number_of_results)
    results_idx = similar_items_ids[0]
    results_dist = distances[0]

    results = pd.DataFrame({
        "text": np.array(cleaned_corpus)[results_idx],
        "distance": results_dist
    })
    print(f"Search complete | query='{s_query}' | results_returned={len(results)}")
    return results

In [8]:
search('Attack on Putin', 5)

2026-01-15 02:22:18,233 - INFO - Starting embedding with backoff | total_texts=1
  0%|          | 0/1 [00:00<?, ?it/s]2026-01-15 02:22:18,510 - INFO - HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
2026-01-15 02:22:18,568 - INFO - Successfully embedded batch starting from 0 to 1
100%|██████████| 1/1 [00:02<00:00,  2.34s/it]
2026-01-15 02:22:20,577 - INFO - Starting embedding with backoff | total_texts=1035
  0%|          | 0/12 [00:00<?, ?it/s]2026-01-15 02:22:21,436 - INFO - HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
2026-01-15 02:22:23,705 - INFO - Successfully embedded batch starting from 0 to 90
  8%|▊         | 1/12 [00:05<00:56,  5.13s/it]2026-01-15 02:22:26,284 - INFO - HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
2026-01-15 02:22:27,873 - INFO - Successfully embedded batch starting from 90 to 180
 17%|█▋        | 2/12 [00:09<00:45,  4.57s/it]2026-01-15 02:22:30,706 - INFO - HTTP Request: POST https://api.co

Search complete | query='Attack on Putin' | results_returned=5


Unnamed: 0,text,distance
0,"With the US hitting Venezuela today, is anyone...",8731.922852
1,Source: I think stocks tomorrow will drop and...,9288.166992
2,Trump has also deployed U.S. troops to police ...,9742.404297
3,After spending heavily to propel Trump back to...,9899.087891
4,Flooding the global oil supply will lower pric...,9928.169922


#### **Search Option 2 - BM25 + Reranking**

In [9]:
from rank_bm25 import BM25Okapi
from sklearn.feature_extraction import _stop_words
import string

from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.retrievers import BM25Retriever, EnsembleRetriever

In [10]:
# similarity + keyword match
def bm25_tokenizer(text):
    tokenizer_doc = []
    for token in text.lower().split():
        token = token.strip(string.punctuation)

        if len(token)>0 and token not in _stop_words.ENGLISH_STOP_WORDS:
            tokenizer_doc.append(token)

    return tokenizer_doc

def keyword_and_reranking_search(s_query, top_k=5, num_candidates=10, bm25=None):
    print(f'Input Query: {s_query}')

    #### BM25 search lexical search
    bm25_scores = bm25.get_scores(bm25_tokenizer(s_query))
    top_n = np.argpartition(bm25_scores, -num_candidates)[-num_candidates:]
    bm25_hits = [{'corpus_id':idx, 'score':bm25_scores[idx]} for idx in top_n]
    bm25_hits = sorted(bm25_hits, key=lambda x: x['score'], reverse=True)

    print(f'Top-{top_k} Lexical Search (BM25) Hits')

    for hit in bm25_hits[0:top_k]:
        print('\t{:.3f}\t{}'.format(hit['score'], cleaned_corpus[hit['corpus_id']].replace('\n',' ')))

    # Adding reranking
    docs = [cleaned_corpus[hit['corpus_id']] for hit in bm25_hits]

    print(f'\nTop-{top_k} Hits By Rank-API ({len(bm25_hits)} BM25 Hits Re-Ranked)')
    results = co.rerank(query=s_query, documents=docs, top_n=top_k, return_documents=True)

    for hit in results.results:
        print('\t{:.3f}\t{}'.format(hit.relevance_score, hit.document.text.replace('\n','')))

In [11]:
tokenized_corpus = []
for passage in tqdm(cleaned_corpus):
    tokenized_corpus.append(bm25_tokenizer(passage))

bm25 = BM25Okapi(tokenized_corpus)    

100%|██████████| 1035/1035 [00:00<00:00, 192135.29it/s]


In [12]:
keyword_and_reranking_search(s_query='China\'s position on US tarrifs', bm25=bm25)

Input Query: China's position on US tarrifs
Top-5 Lexical Search (BM25) Hits
	6.625	If you allocate 10k to BTC and 10k to SOL, your Solana position contributes 62 more risk to your portfolio than your Bitcoin position.
	5.625	I realized I was taking on more risk than I fully understood, especially with position sizing.
	5.485	China's ban on cybersecurity software roils three of our stocks  here's our view Chinese authorities have told domestic companies to stop using software from a handful of U.S. and Israeli firms.
	5.485	China's ban on cybersecurity software roils three of our stocks  here's our view Chinese authorities have told domestic companies to stop using software from a handful of U.S. and Israeli firms.
	5.251	Position: Lots of VGLT to bet on rate cuts and QE pushing up bond pricesyields down.

Top-5 Hits By Rank-API (10 BM25 Hits Re-Ranked)


2026-01-15 02:23:15,991 - INFO - HTTP Request: POST https://api.cohere.com/v1/rerank "HTTP/1.1 200 OK"


	0.061	Position: Lots of VGLT to bet on rate cuts and QE pushing up bond pricesyields down.
	0.053	Given the strong backlog and upbeat forward guidance, do you think this is a good time to start a position?
	0.050	I realized I was taking on more risk than I fully understood, especially with position sizing.
	0.038	China's ban on cybersecurity software roils three of our stocks  here's our view Chinese authorities have told domestic companies to stop using software from a handful of U.S. and Israeli firms.
	0.038	China's ban on cybersecurity software roils three of our stocks  here's our view Chinese authorities have told domestic companies to stop using software from a handful of U.S. and Israeli firms.


#### **Search Option 3 - Implementing Langchain**

In [13]:
from langchain.output_parsers import StructuredOutputParser, ResponseSchema
from langchain.prompts import ChatPromptTemplate
from langchain.chat_models import ChatOpenAI

import openai

In [14]:
response_schemas = [
    ResponseSchema(name="key_insights", description='3-5 bullet points summarizing key insights/outlook on the topic'),
    ResponseSchema(name="key_drivers", description='Main economic/politcal indicators driving the topic'),
    ResponseSchema(name='risks', description='Potential risks associated with the topic'),
    ResponseSchema(name='sentiment', description='Overall social sentiment (positive/negative/neutral with evidence) and degree of sentiment in percentage')    
]
output_parser = StructuredOutputParser.from_response_schemas(response_schemas)
format_instructions = output_parser.get_format_instructions()

In [15]:
def analyze_text(s_query, relevant_text):
    
    # initiate llm model
    prompt = ChatPromptTemplate.from_template(
    """
    Analyze the following news corpus regarding {query} and extract:
    {format_instructions}
    
    Corpus:
    {text}
    
    """)
    #client = openai.OpenAI(api_key=openai_key)
    llm = ChatOpenAI(
        model="gpt-3.5-turbo",
        temperature=0.3,
        openai_api_key=openai_api  # Pass key directly or use environment variable
    )

    messages = prompt.format_messages(
        query = s_query,
        text = relevant_text,
        format_instructions=format_instructions
    )

    response = llm(messages)
    return output_parser.parse(response.content)

In [16]:
def create_search_index(full_text):
    # Split text into chunks
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=200
    )
    chunks = text_splitter.split_text(full_text)

    # creating searchable index
    embeddings = OpenAIEmbeddings(openai_api_key=openai_api)
    return FAISS.from_texts(chunks, embeddings)    

def analyze_with_semantic_search(s_query, text_list, n_results=10):
    full_texts = " ".join(text_list) if isinstance(text_list, list) else text_list
    # creating vector index on full corpus
    index = create_search_index(full_texts)

    # retreiving relevant chunks
    bm25_retriever = BM25Retriever.from_texts(full_texts)
    faiss_retriever = index.as_retriever()
    ensemble_retriever = EnsembleRetriever(
        retrievers=[bm25_retriever, faiss_retriever], 
        weights=[0.4,0.6]
    )
    
    relevant_text = ensemble_retriever.get_relevant_documents(s_query)
    
    return analyze_text(s_query, relevant_text), relevant_text

#### **Testing**

Including social media

In [17]:
## reruning semantic search on data inclusive of reddit posts
s_query = 'Greenland and the global economy'
full_texts = cleaned_corpus
result, context = analyze_with_semantic_search(s_query, full_texts, n_results=5)

  embeddings = OpenAIEmbeddings(openai_api_key=openai_api)
2026-01-15 02:23:56,640 - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
  relevant_text = ensemble_retriever.get_relevant_documents(s_query)
2026-01-15 02:23:59,312 - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
  llm = ChatOpenAI(
  response = llm(messages)
2026-01-15 02:24:02,446 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


In [18]:
result

{'key_insights': '1. Global economy expected to grow by 3.2% despite US trade war 2. Small businesses are leading indicators of labor market trends 3. Potential rise in USAR due to geopolitical tensions 4. Speculation on crashing gold prices to reinforce dollar safety 5. Grid capacity shortages being addressed with diesel and gas generators',
 'key_drivers': "Tariffs imposed by Trump on countries doing business with Iran, renewable energy grid upgrades, China's rental of Venezuelan resources, potential digital currency backed by tangible assets",
 'risks': "Potential reduction in employment at mid to large private sector firms, impact on China's economy from US actions, volatility in gold and oil prices due to geopolitical tensions",
 'sentiment': 'Neutral. The corpus contains a mix of positive and negative sentiments, with discussions on potential economic growth and challenges ahead. Overall sentiment leans towards caution and uncertainty.'}

In [None]:
contextt = [doc.page_content for doc in context]


["spending by the U.S. tech industry - The Economist: Were we wrong about Trump's tariffs? - CNN: Were in a windchill economy, where things feel worse than they are - OECD: Despite US trade war, OECD expects global economy will grow 3.2 this year - IMF: Global Economic Outlook Shows Modest Change Amid Policy Shifts and Complex Forces - Bloomberg Chief Economist: Yeah, I am surprised as well. But that is the data speaking. - The Washington Post: Why you may not want lower prices as much as you think you do Misc charts  Fed projections goldilocks. Domestic-to-international performance ratio round tripped, the actual anomaly was the lame duck session outperformance. Inflation adjusted 6040 portfolio normie allocation outside Reddit finally broke out of the COVID range, recovering from the bond carnage. my watchlist is just a museum of stocks i never bought i remember adding this stock to my watchlist at 120. the rest are just reminders of hesitation, overthinking, and pretending patience"

Excluding social media

In [19]:
pd.DataFrame(result_df["source"].value_counts())

Unnamed: 0_level_0,count
source,Unnamed: 1_level_1
NewsAPI,99
Reddit r/investing,30
Reddit r/stocks,29
Reddit r/options,27
Reddit r/StockMarket,24
Reddit r/algotrading,15
CNBC,12
Reddit r/fatFIRE,11
The Motley Fool,8
Reddit r/economics,7


In [20]:
len(result_df)

305

In [25]:
full_response_no_reddit = result_df[result_df['source'].str.lower().str[:6] != 'reddit']
texts_list_no_reddit = full_response_no_reddit['title'] + '\n' + full_response_no_reddit['content']
all_text_no_reddit = "\n".join(texts_list_no_reddit)

In [26]:
## rerunning semantic search on data exclusive of reddit posts
cleaned_texts_no_reddit = refine_corpus(all_text_no_reddit)
result_2, context_2 = analyze_with_semantic_search(s_query, cleaned_texts_no_reddit, n_results=5)

2026-01-15 02:29:00,351 - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2026-01-15 02:29:01,227 - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2026-01-15 02:29:04,210 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


In [27]:
result_2

{'key_insights': "1. Greenland Technologies stock rises amidst geopolitical tensions 2. Silver mining stocks jump as metal holds above $90 milestone 3. Venezuela stocks soar to record highs post Maduro's ouster 4. US stock market experiences back-to-back losses in 2026 5. Low crude oil prices negatively impact Big Oil stocks",
 'key_drivers': "Geopolitical tensions, silver prices, political changes in Venezuela, economic data, Federal Reserve's interest-rate path, tech sector performance, bank earnings",
 'risks': "Potential risks include further escalation of geopolitical tensions, volatility in silver prices, uncertainty in Venezuela's economic stability, impact of economic data on market expectations, Federal Reserve's interest-rate decisions, tech sector performance, and bank earnings",
 'sentiment': 'Neutral sentiment overall with evidence of mixed reactions in the market due to various factors such as geopolitical tensions, economic data, and stock performance. The sentiment degr

In [32]:
context_2 = [doc.page_content for doc in context_2]
context_2 = [content for content in context_2 if len(content.strip()) > 1]  
context_2

['That May Be Undervalued In January 2026 Three Stocks That May Be Undervalued In January 2026nbsp;nbsp;Yahoo Finance Greenland Technologies Stock Rises. There Are Better Ways to Play the Geopolitical Tug-of-War.nbsp;nbsp;Barron\'s These 2 "Magnificent Seven" Stocks May Announce Stock Splits in 2026 These 2 "Magnificent Seven" Stocks May Announce Stock Splits in 2026nbsp;nbsp;The Motley Fool Stock Market Today: Dow Closes Lower, But Nasdaq Is The Big Loser; This Robotics Name Slides Live Coverage Stock Market Today: Dow Closes Lower, But Nasdaq Is The Big Loser; This Robotics Name Slides Live Coveragenbsp;nbsp;Investor\'s Business Daily Stock Market News, Jan. 14, 2026: Dow, SP 500 end lower, Nasdaq logs worst day since late-December on geopolitical tensions and lack of tariff decision from Supreme Court; China bans U.S. cybersecurity software; silver hits new record high Stock Market News, Jan. 14, 2026: Dow, Samp;P 500 end lower, Nasdaq logs worst day since late-December on geopoliti