# Setup and dependencies

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
from pathlib import Path
import json
from typing import List, Dict, Any

# PDF processing
import PyPDF2
import pdfplumber
from io import BytesIO

# Vector database and embeddings
import chromadb
from chromadb.config import Settings
from sentence_transformers import SentenceTransformer

# LLM integration (using OpenAI as example)
from openai import OpenAI
import tiktoken

# Text processing
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords

import warnings
warnings.filterwarnings('ignore') # Filter out warnings for cleaner output

# Download required NLTK data
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt')

try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords')

# PDF Parsing & Text Extraction

In this section, I implemented a `Parser` class for PDF financial reports, extracting raw text, metadata and tables from the input PDF document for later use.

I used [Nvidia's 2024 annual report from StockLight](https://stocklight.com/stocks/us/nasdaq-nvda/nvidia/annual-reports/nasdaq-nvda-2024-10K-24660316.pdf) for demo.

In [2]:
class FinancialPDFParser:
    """Parse and extract structured information from financial PDF statements"""
    
    def __init__(self):
        self.stop_words = set(stopwords.words('english'))
        
    def extract_text_from_pdf(self, pdf_path: str) -> Dict[str, Any]:
        """Extract text and metadata from PDF file"""
        
        # Define structure for extracted data
        extracted_data = {
            'raw_text': '',
            'pages': [],
            'tables': [],
            'metadata': {}
        }
        
        try:
            with pdfplumber.open(pdf_path) as pdf:
                # Extract metadata
                extracted_data['metadata'] = {
                    'total_pages': len(pdf.pages),
                    'title': getattr(pdf.metadata, 'title', 'Unknown'),
                    'author': getattr(pdf.metadata, 'author', 'Unknown'),
                    'creation_date': getattr(pdf.metadata, 'creation_date', None)
                }
                
                # Extract text and tables from each page
                for page_num, page in enumerate(pdf.pages):
                    page_text = page.extract_text()
                    if page_text:
                        extracted_data['pages'].append({
                            'page_number': page_num + 1,
                            'text': page_text
                        })
                        extracted_data['raw_text'] += f"\n--- Page {page_num + 1} ---\n{page_text}\n"
                    
                    # Extract tables
                    tables = page.extract_tables()
                    if tables:
                        for table_idx, table in enumerate(tables):
                            extracted_data['tables'].append({
                                'page': page_num + 1,
                                'table_index': table_idx,
                                'data': table
                            })            
        except Exception as e:
            print(f"Error parsing PDF: {str(e)}")
            return None
            
        print(f"PDF parsed successfully!")
        print(f"Pages: {extracted_data['metadata']['total_pages']}")
        print(f"Tables found: {len(extracted_data['tables'])}")
        print(f"Total text length: {len(extracted_data['raw_text'])} characters")
        
        return extracted_data

In [3]:
# Initialize parser and create sample data
pdf_parser = FinancialPDFParser()
# Get results
pdf_data = pdf_parser.extract_text_from_pdf("nvda-2024.pdf")

PDF parsed successfully!
Pages: 89
Tables found: 56
Total text length: 341204 characters


In [4]:
# Print results
print("="*60)
print(f"Document metadata: {pdf_data['metadata']}")
print("="*60)
print(f"Text preview: {pdf_data['raw_text'][:200]}...")

Document metadata: {'total_pages': 89, 'title': 'Unknown', 'author': 'Unknown', 'creation_date': None}
Text preview: 
--- Page 1 ---
stocklight.com > Stocks > United States
NVIDIA Corporation > Annual Reports > 2024
Annual Report
NVIDIA Corporation Annual Report 2024
Form 10-K (NASDAQ:NVDA)
Published: February 21st,...


# Vector Database Setup

Build the vector database with given pdf data with `FinancialRAGDatabase` class. General pipeline: 
1. Use `raw_text` from previous parser
2. Split raw_text into different section 
3. Overlap chunking for each section
4. Encode chunks with selected `embedding_model`
5. Add embedding vectors to vector database

`FinancialRAGDatabase` also includes a wrapper `search` function to define how to perform search in this db.

For demostration purpose, only one pdf file included.

In [5]:
class FinancialRAGDatabase:
    """Vector database for financial document RAG"""
    
    def __init__(self, embedding_model = None, collection_name="financial_statements"):
        self.collection_name = collection_name
        self.embedding_model = SentenceTransformer('all-mpnet-base-v2') if embedding_model == None else embedding_model
        self.chroma_client = chromadb.Client(Settings(anonymized_telemetry=False))
        self.collection = self._setup_collection()

        # print(f"Embedding model: {self.embedding_model}")
        
    def _setup_collection(self):
        """Initialize or get existing ChromaDB collection"""
        try:
            # Try to get existing collection
            collection = self.chroma_client.get_collection(self.collection_name)
            print(f"Using existing collection: {self.collection_name}")
        except:
            # Create new collection
            collection = self.chroma_client.create_collection(
                name=self.collection_name,
                metadata={"description": "Financial statement chunks for RAG"}
            )
            print(f"Created new collection: {self.collection_name}")
        
        return collection
    

    def chunk_text(self, text: str, chunk_size: int = 500, overlap: int = 50) -> List[Dict]:
        """Split financial text into overlapping chunks"""
        
        # Split into sentences first
        sentences = sent_tokenize(text)
        
        chunks = []
        current_chunk = ""
        current_length = 0
        
        for sentence in sentences:
            sentence_length = len(sentence)
            
            # If adding this sentence would exceed chunk size, save current chunk
            if current_length + sentence_length > chunk_size and current_chunk:
                chunks.append({
                    'text': current_chunk.strip(),
                    'length': current_length,
                    'sentence_count': len(sent_tokenize(current_chunk))
                })
                
                # Start new chunk with one overlap sentence.
                overlap_sentences = sent_tokenize(current_chunk)[-2:] if len(sent_tokenize(current_chunk)) >= 2 else []
                current_chunk = " ".join(overlap_sentences) + " " + sentence
                current_length = len(current_chunk)
            else:
                current_chunk += " " + sentence
                current_length += sentence_length
        
        # Add the final chunk
        if current_chunk.strip():
            chunks.append({
                'text': current_chunk.strip(),
                'length': current_length,
                'sentence_count': len(sent_tokenize(current_chunk))
            })
        
        return chunks
    
    def extract_financial_sections(self, text: str) -> Dict[str, str]:
        """Split financial statements into different sections from the text"""
        
        sections = {}
        
        # Define section patterns via regex
        section_patterns = {
            'income_statement': r'(CONSOLIDATED STATEMENTS OF OPERATIONS|INCOME)(.*?)(?=CONSOLIDATED BALANCE SHEETS|BALANCE SHEET|$)',
            'balance_sheet': r'(CONSOLIDATED BALANCE SHEETS|BALANCE SHEET)(.*?)(?=MANAGEMENT\'S DISCUSSION|CASH FLOWS|$)',
            'management_discussion': r'(MANAGEMENT\'S DISCUSSION AND ANALYSIS|MD&A)(.*?)(?=RISK FACTORS|NOTES TO|$)',
            'risk_factors': r'(RISK FACTORS)(.*?)(?=NOTES TO|LEGAL|$)'
        }
        
        for section_name, pattern in section_patterns.items():
            match = re.search(pattern, text, re.DOTALL | re.IGNORECASE)
            if match:
                sections[section_name] = match.group(2).strip()
        
        # If no sections found, treat entire text as general content
        if not sections:
            sections['general_content'] = text
            
        print(f"Extracted {len(sections)} financial sections: {list(sections.keys())}")
        return sections
    
    def add_document(self, pdf_data: Dict, document_id: str = "financial_statement_1", chunk_size: int = 500):
        """Main funciton to add financial document to vector database"""
        
        text = pdf_data['raw_text']
        metadata = pdf_data['metadata']
        
        # Extract financial sections
        sections = self.extract_financial_sections(text)
        
        # Create chunks for each section
        all_chunks = []
        all_metadatas = []
        all_ids = []
        
        for section_name, section_text in sections.items():
            chunks = self.chunk_text(section_text, chunk_size)
            print(f"Created {len(chunks)} text chunks in {section_name}")

            for i, chunk in enumerate(chunks):
                chunk_id = f"{document_id}_{section_name}_{i}"
                
                chunk_metadata = {
                    'document_id': document_id,
                    'section': section_name,
                    'chunk_index': i,
                    'chunk_length': chunk['length'],
                    'sentence_count': chunk['sentence_count'],
                    'document_title': metadata.get('title', 'Unknown'),
                    'total_pages': metadata.get('total_pages', 0)
                }
                
                all_chunks.append(chunk['text'])
                all_metadatas.append(chunk_metadata)
                all_ids.append(chunk_id)
        
        # Generate embeddings
        print("Generating embeddings...")
        embeddings = self.embedding_model.encode(all_chunks).tolist()
        
        # Add to ChromaDB
        self.collection.add(
            embeddings=embeddings,
            documents=all_chunks,
            metadatas=all_metadatas,
            ids=all_ids
        )
        
        print(f"Added {len(all_chunks)} chunks to vector database")
        
        return {
            'total_chunks': len(all_chunks),
            'sections': list(sections.keys()),
            'document_id': document_id
        }
    
    def search(self, query: str, n_results: int = 5, section_filter: str = None) -> Dict:
        """A wrapper function to perform search in the vector database"""
        
        # Build where clause for filtering
        where_clause = {}
        if section_filter:
            where_clause['section'] = section_filter
        
        # Perform semantic search
        results = self.collection.query(
            query_embeddings=[self.embedding_model.encode(query).tolist()],
            n_results=n_results,
            where=where_clause if where_clause else None,
            include=['documents', 'metadatas', 'distances']
        )
        
        return {
            'query': query,
            'results': results,
            'num_results': len(results['documents'][0]) if results['documents'] else 0
        }

In [6]:
# Initialize database and add financial document
rag_db = FinancialRAGDatabase()
indexing_result = rag_db.add_document(pdf_data)

print(f"="*60)
print(f"Indexing Summary:")
print(f"   Total chunks created: {indexing_result['total_chunks']}")
print(f"   Sections indexed: {', '.join(indexing_result['sections'])}")

Created new collection: financial_statements
Extracted 4 financial sections: ['income_statement', 'balance_sheet', 'management_discussion', 'risk_factors']
Created 274 text chunks in income_statement
Created 51 text chunks in balance_sheet
Created 1 text chunks in management_discussion
Created 1 text chunks in risk_factors
Generating embeddings...
Added 327 chunks to vector database
Indexing Summary:
   Total chunks created: 327
   Sections indexed: income_statement, balance_sheet, management_discussion, risk_factors


In [7]:
# Test search functionality
test_query = "What was the total revenue for NVDA in 2024?"
search_results = rag_db.search(test_query, n_results=3)

print(f"\n🔍 Test Search Results for: '{test_query}'")
for i, (doc, metadata, distance) in enumerate(zip(
    search_results['results']['documents'][0],
    search_results['results']['metadatas'][0],
    search_results['results']['distances'][0]
)):
    print(f"\n   Result {i+1} (Similarity: {1-distance:.3f}):")
    print(f"   Document ID: {metadata['document_id']}")
    print(f"   Chunk Index: {metadata['chunk_index']}")
    print(f"   Text: {doc[:200]}...")
    print("="*60)


🔍 Test Search Results for: 'What was the total revenue for NVDA in 2024?'

   Result 1 (Similarity: 0.378):
   Document ID: financial_statement_1
   Chunk Index: 130
   Text: Revenue for fiscal year 2024 was $60.9 billion, up 126% from a year ago. Data Center revenue for fiscal year 2024 was up 217%. Strong demand was driven by enterprise software and consumer internet app...

   Result 2 (Similarity: 0.266):
   Document ID: financial_statement_1
   Chunk Index: 146
   Text: Professional Visualization revenue for fiscal year 2024 was $1.6 billion, up 1% from fiscal year 2023. In Professional Visualization, we announced new GPUs based
on the NVIDIA RTX Ada Lovelace archite...

   Result 3 (Similarity: 0.261):
   Document ID: financial_statement_1
   Chunk Index: 131
   Text: Customers across industry verticals access NVIDIA AI infrastructure both through the cloud and on-
premises. Data Center compute revenue was up 244% in the fiscal year. Networking revenue was up 133% ...


We could find that the search successfully located the correct result at page 40 of the original statement.

## Unit test for `chunk_text`

In [8]:
import unittest

class TestChunkText(unittest.TestCase):
    """Unit tests for the chunk_text method"""
    
    def setUp(self):
        """Set up test fixtures"""
        pass
    
    def test_simple_chunking(self):
        """Test basic chunking functionality"""
        rag_db = FinancialRAGDatabase(collection_name="test1")
        text = "This is sentence one. This is sentence two. This is sentence three."
        chunks = rag_db.chunk_text(text, chunk_size=50, overlap=10)
        
        # Should create multiple chunks due to size limit
        self.assertGreater(len(chunks), 1)
        
        # Each chunk should have required fields
        for chunk in chunks:
            self.assertIn('text', chunk)
            self.assertIn('length', chunk)
            self.assertIn('sentence_count', chunk)
            self.assertIsInstance(chunk['text'], str)
            self.assertIsInstance(chunk['length'], int)
            self.assertIsInstance(chunk['sentence_count'], int)
    
    def test_chunk_size_limit(self):
        """Test that chunks respect size limits"""
        rag_db = FinancialRAGDatabase(collection_name="test2")
        text = "Short sentence. " * 50  # Create long text
        chunk_size = 100
        chunks = rag_db.chunk_text(text, chunk_size=chunk_size)
        
        # Most chunks should be under the size limit (allowing some flexibility for overlap)
        oversized_chunks = [c for c in chunks if c['length'] > chunk_size * 1.5]
        self.assertLess(len(oversized_chunks), len(chunks) * 0.3)  # Less than 30% oversized
    
    def test_empty_text(self):
        """Test handling of empty input"""
        rag_db = FinancialRAGDatabase(collection_name="test3")
        chunks = rag_db.chunk_text("", chunk_size=500)
        self.assertEqual(len(chunks), 0)
    
    def test_single_sentence(self):
        """Test chunking of single sentence"""
        rag_db = FinancialRAGDatabase(collection_name="test4")
        text = "This is a single sentence."
        chunks = rag_db.chunk_text(text, chunk_size=500)
        
        self.assertEqual(len(chunks), 1)
        self.assertEqual(chunks[0]['text'].strip(), text.strip())
        self.assertEqual(chunks[0]['sentence_count'], 1)
    
    def test_chunk_overlap(self):
        """Test that chunks have proper overlap"""
        rag_db = FinancialRAGDatabase(collection_name="test5")
        text = "First sentence here. Second sentence follows. Third sentence appears. Fourth sentence concludes."
        chunks = rag_db.chunk_text(text, chunk_size=40, overlap=10)
        
        if len(chunks) > 1:
            # Check that there's some overlap between consecutive chunks
            first_chunk_words = set(chunks[0]['text'].split())
            second_chunk_words = set(chunks[1]['text'].split())
            overlap_words = first_chunk_words.intersection(second_chunk_words)
            self.assertGreater(len(overlap_words), 0, "Chunks should have overlapping words")
    
    def test_very_long_sentence(self):
        """Test handling of sentences longer than chunk size"""
        rag_db = FinancialRAGDatabase(collection_name="test6")
        long_sentence = "This is a very long sentence that exceeds the chunk size limit. " * 10
        chunks = rag_db.chunk_text(long_sentence, chunk_size=50)
        
        # Should still create at least one chunk
        self.assertGreaterEqual(len(chunks), 1)
        # The chunk should contain the long sentence (even if it exceeds limit)
        self.assertIn("very long sentence", chunks[0]['text'])
    
    def test_different_chunk_sizes(self):
        """Test chunking with different size parameters"""
        rag_db1 = FinancialRAGDatabase(collection_name="test7-1")
        rag_db2 = FinancialRAGDatabase(collection_name="test7-2")
        text = "One. Two. Three. Four. Five. Six. Seven. Eight."
        
        # Test small chunks
        small_chunks = rag_db1.chunk_text(text, chunk_size=10)
        
        # Test large chunks
        large_chunks = rag_db2.chunk_text(text, chunk_size=200)
        
        # Small chunk size should create more chunks
        self.assertGreaterEqual(len(small_chunks), len(large_chunks))

def run_tests():
    """Run the unit tests"""
    unittest.main(argv=[''], exit=False, verbosity=2)

run_tests()

test_chunk_overlap (__main__.TestChunkText.test_chunk_overlap)
Test that chunks have proper overlap ... ok
test_chunk_size_limit (__main__.TestChunkText.test_chunk_size_limit)
Test that chunks respect size limits ... 

Created new collection: test5


ok
test_different_chunk_sizes (__main__.TestChunkText.test_different_chunk_sizes)
Test chunking with different size parameters ... 

Created new collection: test2
Created new collection: test7-1


ok
test_empty_text (__main__.TestChunkText.test_empty_text)
Test handling of empty input ... 

Created new collection: test7-2


ok
test_simple_chunking (__main__.TestChunkText.test_simple_chunking)
Test basic chunking functionality ... 

Created new collection: test3


ok
test_single_sentence (__main__.TestChunkText.test_single_sentence)
Test chunking of single sentence ... 

Created new collection: test1


ok
test_very_long_sentence (__main__.TestChunkText.test_very_long_sentence)
Test handling of sentences longer than chunk size ... 

Created new collection: test4


ok

----------------------------------------------------------------------
Ran 7 tests in 6.838s

OK


Created new collection: test6


# RAG system

In [9]:
class FinancialRAGSystem:
    """Complete RAG system for financial Q&A"""
    
    def __init__(self, rag_database: FinancialRAGDatabase):
        self.rag_db = rag_database
        self.tokenizer = tiktoken.get_encoding("cl100k_base")
        
    def count_tokens(self, text: str) -> int:
        """Count tokens in text"""
        return len(self.tokenizer.encode(text))
    
    def prepare_context(self, search_results: Dict, max_tokens: int = 2000) -> str:
        """Prepare context from search results within token limit"""
        
        context_parts = []
        total_tokens = 0
        
        for doc, metadata in zip(
            search_results['results']['documents'][0],
            search_results['results']['metadatas'][0]
        ):
            # Format the context chunk
            chunk_text = f"[{metadata['section'].upper()}]: {doc}"
            chunk_tokens = self.count_tokens(chunk_text)
            
            if total_tokens + chunk_tokens <= max_tokens:
                context_parts.append(chunk_text)
                total_tokens += chunk_tokens
            else:
                break
        
        return "\n\n".join(context_parts)
    
    def generate_answer_openai(self, question: str, context: str) -> str:
        """Generate answer using OpenAI API"""
        
        system_prompt = """You are a financial analyst AI assistant. Answer questions about financial statements accurately and concisely based on the provided context. 

Guidelines:
1. Use only information from the provided context
2. Include specific numbers and percentages when available
3. If the context doesn't contain enough information, say so clearly
4. Provide clear, professional financial analysis
5. Format financial numbers properly (e.g., $125.4 million, 7.1% growth)"""

        user_prompt = f"""Context from financial statements:
{context}

Question: {question}

Please provide a concise answer based on the financial data above."""

        client = OpenAI()
        try:
            response = client.chat.completions.create(
                model="gpt-4o-mini",
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": user_prompt}
                ],
                temperature=0.1,
                max_tokens=500
            )
            
            return response.choices[0].message.content.strip()
            
        except Exception as e:
            return f"Error generating response: {str(e)}"
    
    def generate_answer_local(self, question: str, context: str) -> str:
        """Generate answer using local processing (fallback when no OpenAI API)"""
        
        # Simple keyword-based answer generation for demo
        context_lower = context.lower()
        question_lower = question.lower()
        
        # Extract relevant sentences
        sentences = sent_tokenize(context)
        relevant_sentences = []
        
        # Simple keyword matching
        question_keywords = set(word_tokenize(question_lower)) - set(stopwords.words('english'))
        
        for sentence in sentences:
            sentence_words = set(word_tokenize(sentence.lower()))
            if question_keywords.intersection(sentence_words):
                relevant_sentences.append(sentence)
        
        if relevant_sentences:
            return f"Based on the financial statements: {' '.join(relevant_sentences[:3])}"
        else:
            return "I couldn't find specific information to answer your question in the provided financial statements."
    
    def answer_question(self, question: str, use_openai: bool = False, section_filter: str = None) -> Dict:
        """Complete RAG pipeline to answer financial questions"""
        
        # Step 1: Search for relevant context
        search_results = self.rag_db.search(question, n_results=3, section_filter=section_filter)
        
        if search_results['num_results'] == 0:
            return {
                'question': question,
                'answer': "No relevant information found in the financial statements.",
                'context_used': "",
                'sources': [],
                'confidence': 0.0
            }
        
        # Step 2: Prepare context
        context = self.prepare_context(search_results)
        
        # Step 3: Generate answer
        if use_openai:
            answer = self.generate_answer_openai(question, context)
        else:
            answer = self.generate_answer_local(question, context)
        
        # Step 4: Extract sources
        sources = []
        for metadata in search_results['results']['metadatas'][0]:
            sources.append({
                'section': metadata['section'],
                'chunk_index': metadata['chunk_index'],
                'document_title': metadata['document_title']
            })
        
        # Step 5: Calculate confidence (based on search similarity)
        avg_distance = np.mean(search_results['results']['distances'][0])
        confidence = max(0.0, 1.0 - avg_distance)
        
        return {
            'question': question,
            'answer': answer,
            'context_used': context,
            'sources': sources,
            'confidence': confidence,
            'num_sources': search_results['num_results']
        }

In [10]:
# Initialize RAG system
rag_system = FinancialRAGSystem(rag_db)

print("Financial RAG System initialized successfully!")
# Demo questions for the financial statement
demo_questions = [
    "What was the total revenue for 2023?",
    "How much did revenue grow from 2023 to 2024?",
    "What are the main risk factors mentioned?",
    "What are the main drivers of revenue growth?",
]

print("💬 FINANCIAL Q&A DEMONSTRATION")
print("=" * 50)

results_summary = []

for i, question in enumerate(demo_questions, 1):
    print(f"\n❓ Question {i}: {question}")
    
    # Get answer using local processing
    result = rag_system.answer_question(question, use_openai=True)
    
    print(f"🤖 Answer: {result['answer']}")
    print(f"📊 Confidence: {result['confidence']:.2f}")
    print(f"📚 Sources: {result['num_sources']} chunks from {len(set(s['section'] for s in result['sources']))} sections")
    
    # Show top source section
    if result['sources']:
        top_section = result['sources'][0]['section']
        print(f"🎯 Primary source: {top_section}")
    
    results_summary.append({
        'question': question,
        'answer_length': len(result['answer']),
        'confidence': result['confidence'],
        'num_sources': result['num_sources']
    })
    
    print("-" * 50)

# Summary statistics
summary_df = pd.DataFrame(results_summary)
print(f"\n📈 Q&A SESSION SUMMARY:")
print(f"   Average confidence: {summary_df['confidence'].mean():.2f}")
print(f"   Average answer length: {summary_df['answer_length'].mean():.0f} characters")
print(f"   Average sources used: {summary_df['num_sources'].mean():.1f}")

Financial RAG System initialized successfully!
💬 FINANCIAL Q&A DEMONSTRATION

❓ Question 1: What was the total revenue for 2023?
🤖 Answer: The total revenue for fiscal year 2023 was $26.974 billion.
📊 Confidence: 0.27
📚 Sources: 3 chunks from 1 sections
🎯 Primary source: income_statement
--------------------------------------------------

❓ Question 2: How much did revenue grow from 2023 to 2024?
🤖 Answer: Revenue grew from $26,974 million in 2023 to $60,922 million in 2024, representing an increase of $33,948 million, or 126%.
📊 Confidence: 0.33
📚 Sources: 3 chunks from 1 sections
🎯 Primary source: income_statement
--------------------------------------------------

❓ Question 3: What are the main risk factors mentioned?
🤖 Answer: The main risk factors mentioned include:

1. Macroeconomic factors such as inflation, increased interest rates, capital market volatility, global supply chain constraints, and global economic and geopolitical developments.
2. Regulatory and legal risks.
3. C

# Summary and improvements

From the Q&A demo section, we could find the RAG system provides reasonable answers to simple questions (Q1,2) within the income statements sections and basic understanding questions (Q4). 

As for risk factors analysis (3), it's not targeted at the correct section (we have `risk factors` sections defined), probablly due to the poor section extraction by regex (as we only get 1 text chunk in `risk factors` section), which can be improved later on.

Another extension to implement is using the extracted tables from the original PDF data.