<a href="https://colab.research.google.com/github/BHARATH077/Semantic-Analysis-of-Financial-News-via-RAG/blob/main/Semantic_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Project Title: Semantic Analysis of Financial News via RAG

## 🧠 Data Collection & Preprocessing

In [1]:
# 🧩 Step 1: Install Dependencies

!pip install pandas numpy requests beautifulsoup4 sentence-transformers faiss-cpu transformers feedparser -q




  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.4/31.4 MB[0m [31m69.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.5/81.5 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for sgmllib3k (setup.py) ... [?25l[?25hdone


In [2]:
# 📰 Step 2: Collect Financial News Data

import requests
import pandas as pd

# Sample: Financial news using Yahoo Finance RSS
rss_url = "https://feeds.finance.yahoo.com/rss/2.0/headline?s=AAPL,MSFT,GOOG&region=US&lang=en-US"

import feedparser
feed = feedparser.parse(rss_url)

data = []
for entry in feed.entries:
    data.append({
        "title": entry.title,
        "link": entry.link,
        "summary": entry.summary
    })

df_news = pd.DataFrame(data)
df_news.head()


Unnamed: 0,title,link,summary
0,Top 10 Cybersecurity Tips Can Save Your Busine...,https://finance.yahoo.com/news/top-10-cybersec...,"With the holiday season approaching, cyberatta..."
1,Apple could make $133 billion a year on humano...,https://finance.yahoo.com/news/apple-could-mak...,Morgan Stanley predicts Apple could generated ...
2,Microsoft President Sold $20 Million in Shares...,https://finance.yahoo.com/m/0a1e5d8a-a6e3-367d...,"Vice Chair and President Brad Smith sold 38,50..."
3,"Magnificent Seven Stocks: Nvidia, Tesla, Meta,...",https://finance.yahoo.com/m/4205eaa9-f620-3a0b...,"Magnificent Seven stocks, including Nvidia and..."
4,"Duolingo: Speaking Growth Fluently, Despite th...",https://finance.yahoo.com/news/duolingo-speaki...,"Duolingo is a strong growth story, as AI fears..."


In [3]:
# 🧾 Step 3: Load Sample SEC Filing Texts (Demo Data)

filings = [
    {"company": "Apple Inc", "text": "Apple reported Q2 revenue of $117B, down 5% YoY. Strong performance in services and wearables segments."},
    {"company": "Microsoft Corp", "text": "Microsoft cloud revenue grew 24%, driven by Azure and Office 365 demand. Net income rose 12%."},
    {"company": "Alphabet Inc", "text": "Google parent Alphabet posted advertising revenue of $58.9B with growth in YouTube and Cloud."}
]

df_filings = pd.DataFrame(filings)
df_filings


Unnamed: 0,company,text
0,Apple Inc,"Apple reported Q2 revenue of $117B, down 5% Yo..."
1,Microsoft Corp,"Microsoft cloud revenue grew 24%, driven by Az..."
2,Alphabet Inc,Google parent Alphabet posted advertising reve...


In [4]:
# 🧹 Step 4: Clean and Merge Text Sources

df_news["source"] = "News"
df_filings["source"] = "Filing"

df_news = df_news.rename(columns={"summary": "text"})
df_all = pd.concat([
    df_news[["title", "text", "link", "source"]],
    df_filings.assign(link=None)[["company", "text", "source"]].rename(columns={"company":"title"})
])

df_all.reset_index(drop=True, inplace=True)
df_all.head(5)


Unnamed: 0,title,text,link,source
0,Top 10 Cybersecurity Tips Can Save Your Busine...,"With the holiday season approaching, cyberatta...",https://finance.yahoo.com/news/top-10-cybersec...,News
1,Apple could make $133 billion a year on humano...,Morgan Stanley predicts Apple could generated ...,https://finance.yahoo.com/news/apple-could-mak...,News
2,Microsoft President Sold $20 Million in Shares...,"Vice Chair and President Brad Smith sold 38,50...",https://finance.yahoo.com/m/0a1e5d8a-a6e3-367d...,News
3,"Magnificent Seven Stocks: Nvidia, Tesla, Meta,...","Magnificent Seven stocks, including Nvidia and...",https://finance.yahoo.com/m/4205eaa9-f620-3a0b...,News
4,"Duolingo: Speaking Growth Fluently, Despite th...","Duolingo is a strong growth story, as AI fears...",https://finance.yahoo.com/news/duolingo-speaki...,News


In [5]:
# 📂 Step 5: Save Preprocessed Data

df_all.to_csv("financial_documents.csv", index=False)
print("✅ Saved cleaned financial text data (financial_documents.csv)")


✅ Saved cleaned financial text data (financial_documents.csv)


## 🧠 Document Embedding & Vector Indexing (FAISS)


In [6]:
import pandas as pd

# 🧾 Step 1: Load Preprocessed Data

# Load data
df = pd.read_csv("financial_documents.csv")
print("✅ Loaded dataset with", len(df), "documents")
df.head(3)


✅ Loaded dataset with 23 documents


Unnamed: 0,title,text,link,source
0,Top 10 Cybersecurity Tips Can Save Your Busine...,"With the holiday season approaching, cyberatta...",https://finance.yahoo.com/news/top-10-cybersec...,News
1,Apple could make $133 billion a year on humano...,Morgan Stanley predicts Apple could generated ...,https://finance.yahoo.com/news/apple-could-mak...,News
2,Microsoft President Sold $20 Million in Shares...,"Vice Chair and President Brad Smith sold 38,50...",https://finance.yahoo.com/m/0a1e5d8a-a6e3-367d...,News


In [7]:
# 🧩 Step 2: Generate Embeddings

from sentence_transformers import SentenceTransformer

# Load pre-trained embedding model
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

# Encode text into dense embeddings
texts = df['text'].astype(str).tolist()
embeddings = embedding_model.encode(texts, show_progress_bar=True)

print("✅ Generated embeddings of shape:", embeddings.shape)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

✅ Generated embeddings of shape: (23, 384)


In [8]:
# 💾 Step 3: Store Embeddings in FAISS

import faiss
import numpy as np

# Initialize FAISS index (L2 distance)
embedding_dim = embeddings.shape[1]
index = faiss.IndexFlatL2(embedding_dim)

# Convert to float32 (FAISS requirement)
embeddings = np.array(embeddings).astype('float32')

# Add embeddings to index
index.add(embeddings)
print("✅ Added", index.ntotal, "vectors to FAISS index")


✅ Added 23 vectors to FAISS index


In [9]:
# 🔍 Step 4: Test Vector Search

# Convert query to vector
query = "Apple quarterly earnings report"
query_vector = embedding_model.encode([query]).astype('float32')

# Retrieve top 3 similar documents
k = 3
distances, indices = index.search(query_vector, k)

print("\n🔍 Query:", query)
print("\nTop Results:")
for i, idx in enumerate(indices[0]):
    print(f"{i+1}. {df.iloc[idx]['title']} — Source: {df.iloc[idx]['source']}")



🔍 Query: Apple quarterly earnings report

Top Results:
1. Apple Inc — Source: Filing
2. Apple could make $133 billion a year on humanoid robots by 2040: Morgan Stanley — Source: News
3. Apple could reboot Siri to finally please users — Source: News


In [10]:
# 💾 Step 5: Save Vector Index and Mappings

faiss.write_index(index, "financial_news_index.faiss")
df.to_csv("financial_documents_with_embeddings.csv", index=False)

print("✅ Saved FAISS index and metadata")


✅ Saved FAISS index and metadata


# 🧠 Build the RAG (Retrieval-Augmented Generation) Pipeline

In [11]:
# 🧩 Step 1: Install Required Libraries

!pip install langchain openai -q


In [12]:
# 🧾 Step 2: Load FAISS Index and Metadata

import faiss
import pandas as pd
import numpy as np

# Load FAISS index
index = faiss.read_index("financial_news_index.faiss")

# Load document metadata
df = pd.read_csv("financial_documents_with_embeddings.csv")
print("✅ Loaded FAISS index and document metadata")


✅ Loaded FAISS index and document metadata


In [13]:
# 🧩 Step 3: Setup Embeddings for Retrieval

from sentence_transformers import SentenceTransformer

# Use the same embedding model as Day 2
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')


In [14]:
# 🧩 Step 4: Create a Simple Retriever

class FAISS_Retriever:
    def __init__(self, index, df, embedding_model, k=3):
        self.index = index
        self.df = df
        self.embedding_model = embedding_model
        self.k = k

    def retrieve(self, query):
        query_vector = self.embedding_model.encode([query]).astype('float32')
        distances, indices = self.index.search(query_vector, self.k)
        results = []
        for idx in indices[0]:
            results.append({
                "title": self.df.iloc[idx]["title"],
                "text": self.df.iloc[idx]["text"],
                "source": self.df.iloc[idx]["source"],
                "link": self.df.iloc[idx]["link"] if "link" in self.df.columns else None
            })
        return results

# Initialize retriever
retriever = FAISS_Retriever(index, df, embedding_model, k=3)


In [15]:
# 🔍 Step 5: Test Retrieval

query = "Apple quarterly revenue performance"
results = retriever.retrieve(query)

print(f"Query: {query}\n")
for i, doc in enumerate(results):
    print(f"{i+1}. Title: {doc['title']} — Source: {doc['source']}")
    print(f"Text: {doc['text'][:150]}...\n")


Query: Apple quarterly revenue performance

1. Title: Apple Inc — Source: Filing
Text: Apple reported Q2 revenue of $117B, down 5% YoY. Strong performance in services and wearables segments....

2. Title: Apple could make $133 billion a year on humanoid robots by 2040: Morgan Stanley — Source: News
Text: Morgan Stanley predicts Apple could generated $133 billion a year from humanoid robots by the year 2040....

3. Title: Microsoft Corp — Source: Filing
Text: Microsoft cloud revenue grew 24%, driven by Azure and Office 365 demand. Net income rose 12%....



In [17]:
# 🧩 Step 6: Integrate LLM for RAG

from transformers import pipeline

# Load a small summarization model (for free usage in Colab)
llm = pipeline("text2text-generation", model="google/flan-t5-small")

def generate_answer(query, retrieved_docs):
    context = "\n\n".join([f"{doc['text']}" for doc in retrieved_docs])
    prompt = f"Answer the following question using the context below. Cite sources if possible.\n\nContext:\n{context}\n\nQuestion: {query}"
    result = llm(prompt, max_length=200)[0]['generated_text']
    return result

# Test RAG pipeline
answer = generate_answer(query, results)
print("Generated Answer:\n", answer)


config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/308M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

Device set to use cpu
Both `max_new_tokens` (=256) and `max_length`(=200) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


Generated Answer:
 Q2 revenue of $117B, down 5% YoY


# 🧠 Semantic Query Answering with Citations

In [18]:
# 🧩 Step 1: Define a Function to Format Context with Citations

def format_context_with_citations(retrieved_docs):
    """
    Concatenate retrieved document texts with citations.
    """
    formatted_context = ""
    for i, doc in enumerate(retrieved_docs, 1):
        citation = f"Source: {doc['title']}" if doc['link'] is None else f"Source: {doc['title']} ({doc['link']})"
        formatted_context += f"{i}. {doc['text']} [{citation}]\n\n"
    return formatted_context


In [19]:
# 🧩 Step 2: Modify RAG Function to Include Citations

def generate_answer_with_citations(query, retrieved_docs):
    """
    Generate an answer to a financial query using context from retrieved documents with citations.
    """
    context = format_context_with_citations(retrieved_docs)
    prompt = f"Answer the following financial query using the context below. Include citations to sources.\n\nContext:\n{context}\n\nQuestion: {query}"

    # Use the small free LLM (FLAN-T5)
    result = llm(prompt, max_length=250)[0]['generated_text']
    return result


In [20]:
# 🔍 Step 3: Test Semantic Query Answering

query = "Microsoft cloud revenue growth in 2023"
retrieved_docs = retriever.retrieve(query)

answer = generate_answer_with_citations(query, retrieved_docs)
print("Query:", query)
print("\nGenerated Answer:\n", answer)


Both `max_new_tokens` (=256) and `max_length`(=250) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


Query: Microsoft cloud revenue growth in 2023

Generated Answer:
 1. Microsoft cloud revenue grew 24%, driven by Azure and Office 365 demand. Net income rose 12%. [Source: Microsoft Corp (nan)] 2. Google parent Alphabet posted advertising revenue of $58.9B with growth in YouTube and Cloud. [Source: Alphabet Inc (nan)] 3. Morgan Stanley predicts Apple could generate $133 billion a year from humanoid robots by 2040. [Source: Google parent Alphabet posted advertising revenue of $58.9B with growth in YouTube and Cloud. [Source: Alphabet Inc (nan)] 4. Morgan Stanley predicts Apple could generate $133 billion a year from humanoid robots by 2040. [Source: Alphabet Inc (nan)] 5. Morgan Stanley predicts Apple could generate $133 billion a year from humanoid robots by 2040. [Source: Alphabet Inc (nan)] 6. Morgan Stanley predicts Apple could generate $133 billion a year from humanoid robots by 2040. [Source: Alphabet Inc (nan)] 7. Morgan Stanley predicts Apple could generate $133 billion a year f

In [21]:
# 🧩 Step 4: Optional — Top-K Citation Highlight

def show_cited_sources(retrieved_docs):
    print("Cited Sources:")
    for doc in retrieved_docs:
        title = doc['title']
        source = doc['source']
        link = doc['link'] if doc['link'] else "N/A"
        print(f"- {title} | Source: {source} | Link: {link}")

show_cited_sources(retrieved_docs)


Cited Sources:
- Microsoft Corp | Source: Filing | Link: nan
- Alphabet Inc | Source: Filing | Link: nan
- Apple could make $133 billion a year on humanoid robots by 2040: Morgan Stanley | Source: News | Link: https://finance.yahoo.com/news/apple-could-make-133-billion-a-year-on-humanoid-robots-by-2040-morgan-stanley-194419260.html?.tsrc=rss


# 🧠 Entity Extraction & Relationship Modeling


In [22]:
# 🧩 Step 1: Install NLP Libraries

!pip install spacy transformers -q
!python -m spacy download en_core_web_sm


Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m74.2 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [23]:
# 🧩 Step 2: Load spaCy Model

import spacy

nlp = spacy.load("en_core_web_sm")


In [24]:
# 🧩 Step 3: Define Entity Extraction Function

def extract_entities(text):
    """
    Extract named entities from financial text.
    Focus on ORG (companies), MONEY, PERCENT, DATE, and events (as nouns/verbs).
    """
    doc = nlp(text)
    entities = []
    for ent in doc.ents:
        if ent.label_ in ["ORG", "MONEY", "PERCENT", "DATE"]:
            entities.append({"text": ent.text, "label": ent.label_})
    return entities


In [25]:
# 🔍 Step 4: Test Entity Extraction on Sample Documents

sample_doc = retrieved_docs[0]["text"]
entities = extract_entities(sample_doc)
print("Sample Document:\n", sample_doc, "\n")
print("Extracted Entities:")
for e in entities:
    print(f"- {e['text']} ({e['label']})")


Sample Document:
 Microsoft cloud revenue grew 24%, driven by Azure and Office 365 demand. Net income rose 12%. 

Extracted Entities:
- Microsoft (ORG)
- 24% (PERCENT)
- 12% (PERCENT)


In [26]:
# 🧩 Step 5: Build Simple Relationship Extraction

def extract_relationships(text):
    """
    Extract simple relationships: Company -> Event -> Value
    """
    doc = nlp(text)
    relationships = []

    for sent in doc.sents:
        orgs = [ent.text for ent in sent.ents if ent.label_ == "ORG"]
        moneys = [ent.text for ent in sent.ents if ent.label_ == "MONEY"]
        percents = [ent.text for ent in sent.ents if ent.label_ == "PERCENT"]
        if orgs and (moneys or percents):
            for org in orgs:
                for value in moneys + percents:
                    relationships.append({"company": org, "value": value, "sentence": sent.text})
    return relationships


In [27]:
# 🔍 Step 6: Test Relationship Extraction

relationships = extract_relationships(sample_doc)
print("Extracted Relationships:")
for rel in relationships:
    print(f"- {rel['company']} -> {rel['value']}: {rel['sentence'][:80]}...")


Extracted Relationships:
- Microsoft -> 24%: Microsoft cloud revenue grew 24%, driven by Azure and Office 365 demand....


In [28]:
# 🧩 Step 7: Integrate with RAG Pipeline

def generate_answer_with_entities(query, retriever):
    docs = retriever.retrieve(query)
    answer = generate_answer_with_citations(query, docs)

    # Extract entities & relationships from top docs
    entities = []
    relationships = []
    for doc in docs:
        entities.extend(extract_entities(doc['text']))
        relationships.extend(extract_relationships(doc['text']))

    return {
        "answer": answer,
        "entities": entities,
        "relationships": relationships
    }

# Test
result = generate_answer_with_entities("Apple quarterly revenue", retriever)
print("Answer:\n", result['answer'])
print("\nEntities:", result['entities'])
print("\nRelationships:", result['relationships'])


Both `max_new_tokens` (=256) and `max_length`(=250) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


Answer:
 1. Apple reported Q2 revenue of $117B, down 5% YoY. Strong performance in services and wearables segments. [Source: Apple Inc (nan)] 2. Morgan Stanley predicts Apple could generate $133 billion a year from humanoid robots by the year 2040. [Source: Apple Inc (nan)] 3. Microsoft cloud revenue grew 24%, driven by Azure and Office 365 demand. Net income rose 12%. [Source: Microsoft Corp (nan)]

Entities: [{'text': 'Apple', 'label': 'ORG'}, {'text': 'Q2', 'label': 'DATE'}, {'text': '117B', 'label': 'MONEY'}, {'text': '5%', 'label': 'PERCENT'}, {'text': 'Morgan Stanley', 'label': 'ORG'}, {'text': 'Apple', 'label': 'ORG'}, {'text': '$133 billion', 'label': 'MONEY'}, {'text': 'the year 2040', 'label': 'DATE'}, {'text': 'Microsoft', 'label': 'ORG'}, {'text': '24%', 'label': 'PERCENT'}, {'text': '12%', 'label': 'PERCENT'}]

Relationships: [{'company': 'Apple', 'value': '117B', 'sentence': 'Apple reported Q2 revenue of $117B, down 5% YoY.'}, {'company': 'Apple', 'value': '5%', 'sentence

# 🧠 Streamlit UI & Deployment

In [29]:
# 🧩 Step 1: Install Streamlit

!pip install streamlit -q


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.2/10.2 MB[0m [31m32.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m34.6 MB/s[0m eta [36m0:00:00[0m
[?25h

In [30]:
# 🧩 Step 2: Build Streamlit App

%%writefile app.py
import streamlit as st
from transformers import pipeline
import pandas as pd
import faiss
from sentence_transformers import SentenceTransformer

# Load model & data
df = pd.read_csv("financial_documents_with_embeddings.csv")
index = faiss.read_index("financial_news_index.faiss")
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
llm = pipeline("text2text-generation", model="google/flan-t5-small")

# Custom retriever
class FAISS_Retriever:
    def __init__(self, index, df, embedding_model, k=3):
        self.index = index
        self.df = df
        self.embedding_model = embedding_model
        self.k = k

    def retrieve(self, query):
        query_vector = self.embedding_model.encode([query]).astype('float32')
        distances, indices = self.index.search(query_vector, self.k)
        results = []
        for idx in indices[0]:
            results.append({
                "title": self.df.iloc[idx]["title"],
                "text": self.df.iloc[idx]["text"],
                "source": self.df.iloc[idx]["source"],
                "link": self.df.iloc[idx].get("link", None)
            })
        return results

retriever = FAISS_Retriever(index, df, embedding_model, k=3)

# Entity extraction
import spacy
nlp = spacy.load("en_core_web_sm")

def extract_entities(text):
    doc = nlp(text)
    entities = []
    for ent in doc.ents:
        if ent.label_ in ["ORG","MONEY","PERCENT","DATE"]:
            entities.append({"text": ent.text, "label": ent.label_})
    return entities

def extract_relationships(text):
    doc = nlp(text)
    relationships = []
    for sent in doc.sents:
        orgs = [ent.text for ent in sent.ents if ent.label_=="ORG"]
        moneys = [ent.text for ent in sent.ents if ent.label_=="MONEY"]
        percents = [ent.text for ent in sent.ents if ent.label_=="PERCENT"]
        if orgs and (moneys or percents):
            for org in orgs:
                for value in moneys + percents:
                    relationships.append({"company": org, "value": value, "sentence": sent.text})
    return relationships

# RAG + Citations
def format_context_with_citations(retrieved_docs):
    formatted = ""
    for i, doc in enumerate(retrieved_docs, 1):
        citation = f"{doc['title']}" if doc['link'] is None else f"{doc['title']} ({doc['link']})"
        formatted += f"{i}. {doc['text']} [{citation}]\n\n"
    return formatted

def generate_answer_with_entities(query):
    docs = retriever.retrieve(query)
    context = format_context_with_citations(docs)
    prompt = f"Answer the following question using the context below and include citations.\n\nContext:\n{context}\n\nQuestion: {query}"
    answer = llm(prompt, max_length=250)[0]['generated_text']

    # Extract entities & relationships
    entities = []
    relationships = []
    for doc in docs:
        entities.extend(extract_entities(doc['text']))
        relationships.extend(extract_relationships(doc['text']))

    return answer, entities, relationships

# Streamlit UI
st.title("Financial News Semantic Analysis (RAG)")
query = st.text_input("Enter your financial query:")

if query:
    answer, entities, relationships = generate_answer_with_entities(query)

    st.subheader("Generated Answer")
    st.write(answer)

    st.subheader("Extracted Entities")
    st.json(entities)

    st.subheader("Extracted Relationships")
    st.json(relationships)


Writing app.py


In [31]:
# 🧩 Step 3: Run Streamlit in Colab

!pip install pyngrok -q
from pyngrok import ngrok

!streamlit run app.py &>/dev/null&
public_url = ngrok.connect(port='8501')
print("Streamlit app live at:", public_url)
