### get_company_list.py

In [3]:
import requests
import csv
import os

BASE_DIR = os.getcwd()  # Compatibile con Jupyter
save_path = os.path.join(BASE_DIR, "company_list.csv")

headers = {'User-Agent': 'Gerardo DArco gerardo@email.com'}
url = "https://www.sec.gov/files/company_tickers.json"

response = requests.get(url, headers=headers)
data = response.json()

with open(save_path, "w", newline='', encoding="utf-8") as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(["ticker", "cik"])
    for entry in data.values():
        ticker = entry['ticker']
        cik = str(entry['cik_str']).zfill(10)
        writer.writerow([ticker, cik])

print(f"✅ Salvato: {save_path}")


✅ Salvato: c:\Users\carbo\GitHub\Programming_project\scripts\company_list.csv


### download_edgar_reports.py

In [4]:
import csv
import os
import requests
import time

# Configurazione base
BASE_DIR = os.getcwd()
CSV_PATH = os.path.join(BASE_DIR, "company_list.csv")
SAVE_FOLDER = os.path.join(BASE_DIR, "filings")
FORM_TYPES = ["10-K", "10-Q"]
MAX_COMPANIES = 100  # cambia se vuoi più aziende
HEADERS = {'User-Agent': 'Gerardo DArco gerardo@email.com'}

os.makedirs(SAVE_FOLDER, exist_ok=True)

def download_form(ticker, cik, form_type, already_files):
    """Scarica il primo filing disponibile (10-K o 10-Q) se non già presente"""
    try:
        url = f"https://data.sec.gov/submissions/CIK{cik}.json"
        r = requests.get(url, headers=HEADERS)
        data = r.json()
        recent = data["filings"]["recent"]
        forms = recent["form"]
        accessions = recent["accessionNumber"]

        downloaded = 0
        for i in range(len(forms)):
            if forms[i] == form_type:
                acc_no = accessions[i].replace("-", "")
                filename = f"{ticker}_{form_type}_{acc_no}.html"
                path = os.path.join(SAVE_FOLDER, filename)

                if filename in already_files:
                    continue  # già scaricato

                filing_url = f"https://www.sec.gov/Archives/edgar/data/{int(cik)}/{acc_no}/{acc_no}-index.html"
                html = requests.get(filing_url, headers=HEADERS).text

                with open(path, "w", encoding="utf-8") as f:
                    f.write(html)

                print(f"[DOWNLOAD] {filename}")
                time.sleep(0.5)
                downloaded += 1
                break  # solo uno per tipo (più recente)

        return downloaded

    except Exception as e:
        print(f"[ERROR] {ticker} – {form_type} – {e}")
        return 0

def main():
    already_files = set(f for f in os.listdir(SAVE_FOLDER) if f.endswith(".html"))

    with open(CSV_PATH, "r") as file:
        reader = csv.DictReader(file)
        for i, row in enumerate(reader):
            if i >= MAX_COMPANIES:
                break
            ticker = row["ticker"]
            cik = row["cik"]

            for form_type in FORM_TYPES:
                prefix = f"{ticker}_{form_type}_"
                already_for_type = any(f.startswith(prefix) for f in already_files)

                if not already_for_type:
                    download_form(ticker, cik, form_type, already_files)
                else:
                    print(f"[SKIP] {ticker} – {form_type} già presente")

if __name__ == "__main__":
    main()


[DOWNLOAD] MSFT_10-K_000095017024087843.html
[DOWNLOAD] MSFT_10-Q_000095017025061046.html
[DOWNLOAD] AAPL_10-K_000032019324000123.html
[DOWNLOAD] AAPL_10-Q_000032019325000057.html
[DOWNLOAD] NVDA_10-K_000104581025000023.html
[DOWNLOAD] NVDA_10-Q_000104581024000316.html
[DOWNLOAD] AMZN_10-K_000101872425000004.html
[DOWNLOAD] AMZN_10-Q_000101872425000036.html
[DOWNLOAD] GOOGL_10-K_000165204425000014.html
[DOWNLOAD] GOOGL_10-Q_000165204425000043.html
[DOWNLOAD] META_10-K_000132680125000017.html
[DOWNLOAD] META_10-Q_000132680125000054.html
[DOWNLOAD] BRK-B_10-K_000095017025025210.html
[DOWNLOAD] BRK-B_10-Q_000095017025063112.html
[DOWNLOAD] AVGO_10-K_000173016824000139.html
[DOWNLOAD] AVGO_10-Q_000173016825000021.html
[DOWNLOAD] TSLA_10-K_000162828025003063.html
[DOWNLOAD] TSLA_10-Q_000162828025018911.html
[DOWNLOAD] WMT_10-K_000010416925000021.html
[DOWNLOAD] WMT_10-Q_000010416924000178.html
[DOWNLOAD] JPM_10-K_000001961725000270.html
[DOWNLOAD] JPM_10-Q_000001961725000421.html
[DOWNLOAD]

### extract_text.py

In [5]:
import os
from bs4 import BeautifulSoup

BASE_DIR = os.getcwd()
INPUT_FOLDER = os.path.join(BASE_DIR, "filings")
OUTPUT_FOLDER = os.path.join(BASE_DIR, "text_clean")

os.makedirs(OUTPUT_FOLDER, exist_ok=True)

def extract_text_from_html(file_path):
    with open(file_path, "r", encoding="utf-8") as f:
        soup = BeautifulSoup(f, "html.parser")
        for tag in soup(["script", "style"]):
            tag.decompose()
        text = soup.get_text(separator="\n")
        lines = [line.strip() for line in text.splitlines() if line.strip()]
        return "\n".join(lines)

def process_all_files():
    for file in os.listdir(INPUT_FOLDER):
        if not file.endswith(".html"):
            continue
        input_path = os.path.join(INPUT_FOLDER, file)
        output_filename = file.replace(".html", ".txt")
        output_path = os.path.join(OUTPUT_FOLDER, output_filename)
        if os.path.exists(output_path):
            continue
        text = extract_text_from_html(input_path)
        with open(output_path, "w", encoding="utf-8") as out:
            out.write(text)
        print(f"[EXTRACT] {file}")

if __name__ == "__main__":
    process_all_files()


[EXTRACT] AAPL_10-K_000032019324000123.html
[EXTRACT] AAPL_10-Q_000032019325000057.html
[EXTRACT] ABBV_10-K_000155115225000020.html
[EXTRACT] ABBV_10-Q_000155115225000029.html
[EXTRACT] ABT_10-K_000162828025007110.html
[EXTRACT] ABT_10-Q_000162828025021090.html
[EXTRACT] ACN_10-K_000146737324000278.html
[EXTRACT] ACN_10-Q_000146737325000100.html
[EXTRACT] ADBE_10-K_000079634325000004.html
[EXTRACT] ADBE_10-Q_000079634325000059.html
[EXTRACT] AMAT_10-K_000000695124000044.html
[EXTRACT] AMAT_10-Q_000000695125000011.html
[EXTRACT] AMD_10-K_000000248825000012.html
[EXTRACT] AMD_10-Q_000000248825000047.html
[EXTRACT] AMGN_10-K_000031815425000010.html
[EXTRACT] AMGN_10-Q_000031815425000020.html
[EXTRACT] AMZN_10-K_000101872425000004.html
[EXTRACT] AMZN_10-Q_000101872425000036.html
[EXTRACT] AVGO_10-K_000173016824000139.html
[EXTRACT] AVGO_10-Q_000173016825000021.html
[EXTRACT] AXP_10-K_000000496225000016.html
[EXTRACT] AXP_10-Q_000000496225000045.html
[EXTRACT] BAC_10-K_000007085825000139.ht

### CONTARE QUANTE AZIENDE CI SONO DA CANCELLARE (da cancellare)

In [6]:
import os

FOLDER = "filings"  # o "text_clean"

tickers = set()

for file in os.listdir(FOLDER):
    if file.endswith(".html") or file.endswith(".txt"):
        ticker = file.split("_")[0]
        tickers.add(ticker)

print(f"✅ Numero di aziende uniche: {len(tickers)}")
print(f"Esempi di ticker: {sorted(list(tickers))[:10]}...")


✅ Numero di aziende uniche: 78
Esempi di ticker: ['AAPL', 'ABBV', 'ABT', 'ACN', 'ADBE', 'AMAT', 'AMD', 'AMGN', 'AMZN', 'AVGO']...


### chunk_text.py

In [7]:
import os
import textwrap

BASE_DIR = os.getcwd()
INPUT_FOLDER = os.path.join(BASE_DIR, "text_clean")
OUTPUT_FOLDER = os.path.join(BASE_DIR, "chunks")
MAX_CHARS = 3000

os.makedirs(OUTPUT_FOLDER, exist_ok=True)

def safe_split_paragraph(para, max_chars):
    return textwrap.wrap(para, width=max_chars, break_long_words=False, replace_whitespace=False)

def chunk_by_paragraphs(text, max_chars=MAX_CHARS):
    paragraphs = text.split("\n\n")
    chunks = []
    current_chunk = ""

    for para in paragraphs:
        para = para.strip()
        if not para:
            continue
        if len(para) > max_chars:
            for sub_para in safe_split_paragraph(para, max_chars):
                if current_chunk:
                    chunks.append(current_chunk.strip())
                    current_chunk = ""
                chunks.append(sub_para.strip())
        elif len(current_chunk) + len(para) + 2 > max_chars:
            chunks.append(current_chunk.strip())
            current_chunk = para
        else:
            current_chunk += "\n\n" + para

    if current_chunk:
        chunks.append(current_chunk.strip())

    return chunks

def process_all_files():
    for file in os.listdir(INPUT_FOLDER):
        if not file.endswith(".txt"):
            continue
        input_path = os.path.join(INPUT_FOLDER, file)
        base_name = file.replace(".txt", "")
        chunk_already_present = any(f.startswith(base_name) for f in os.listdir(OUTPUT_FOLDER))
        if chunk_already_present:
            continue
        with open(input_path, "r", encoding="utf-8") as f:
            text = f.read()
        chunks = chunk_by_paragraphs(text)
        for i, chunk in enumerate(chunks):
            chunk_filename = f"{base_name}_chunk{i+1}.txt"
            output_path = os.path.join(OUTPUT_FOLDER, chunk_filename)
            with open(output_path, "w", encoding="utf-8") as out:
                out.write(chunk)
        print(f"[CHUNKED] {file} → {len(chunks)} blocchi")

if __name__ == "__main__":
    process_all_files()


[CHUNKED] AAPL_10-K_000032019324000123.txt → 1 blocchi
[CHUNKED] AAPL_10-Q_000032019325000057.txt → 1 blocchi
[CHUNKED] ABBV_10-K_000155115225000020.txt → 1 blocchi
[CHUNKED] ABBV_10-Q_000155115225000029.txt → 1 blocchi
[CHUNKED] ABT_10-K_000162828025007110.txt → 1 blocchi
[CHUNKED] ABT_10-Q_000162828025021090.txt → 1 blocchi
[CHUNKED] ACN_10-K_000146737324000278.txt → 1 blocchi
[CHUNKED] ACN_10-Q_000146737325000100.txt → 1 blocchi
[CHUNKED] ADBE_10-K_000079634325000004.txt → 1 blocchi
[CHUNKED] ADBE_10-Q_000079634325000059.txt → 1 blocchi
[CHUNKED] AMAT_10-K_000000695124000044.txt → 1 blocchi
[CHUNKED] AMAT_10-Q_000000695125000011.txt → 1 blocchi
[CHUNKED] AMD_10-K_000000248825000012.txt → 1 blocchi
[CHUNKED] AMD_10-Q_000000248825000047.txt → 1 blocchi
[CHUNKED] AMGN_10-K_000031815425000010.txt → 1 blocchi
[CHUNKED] AMGN_10-Q_000031815425000020.txt → 1 blocchi
[CHUNKED] AMZN_10-K_000101872425000004.txt → 1 blocchi
[CHUNKED] AMZN_10-Q_000101872425000036.txt → 1 blocchi
[CHUNKED] AVGO_1

### embed_and_index.py

In [8]:
pip install python-dotenv


Note: you may need to restart the kernel to use updated packages.


In [11]:
import os
import json
import faiss
import numpy as np
from tqdm import tqdm
from openai import OpenAI
from dotenv import load_dotenv

# 📁 Percorsi cartelle
BASE_DIR = os.getcwd()
CHUNKS_FOLDER = os.path.join(BASE_DIR, "chunks")
INDEX_FOLDER = os.path.join(BASE_DIR, "index")
os.makedirs(INDEX_FOLDER, exist_ok=True)

INDEX_PATH = os.path.join(INDEX_FOLDER, "company_index.faiss")
METADATA_PATH = os.path.join(INDEX_FOLDER, "metadata.json")
EMBEDDING_MODEL = "text-embedding-3-small"

# 🔐 Carica chiave OpenAI
load_dotenv()
api_key = os.getenv("OPENAI_API_KEY")
client = OpenAI(api_key=api_key)

# 📚 Carica tutti i chunk .txt
chunks = []
metadati = []

for file in sorted(os.listdir(CHUNKS_FOLDER)):
    if file.endswith(".txt"):
        file_path = os.path.join(CHUNKS_FOLDER, file)
        with open(file_path, "r", encoding="utf-8") as f:
            text = f.read()
        if text.strip():
            chunks.append(text)
            metadati.append({
                "filename": file,
                "length": len(text),
                "path": file_path
            })

# 🔁 Calcolo embedding
def get_embedding(text):
    response = client.embeddings.create(
        input=[text],
        model=EMBEDDING_MODEL
    )
    return response.data[0].embedding

print(f"🔢 Calcolo {len(chunks)} embedding con OpenAI ({EMBEDDING_MODEL})...")
embeddings = [get_embedding(chunk) for chunk in tqdm(chunks)]

# 📦 Costruzione indice FAISS
dimension = len(embeddings[0])
index = faiss.IndexFlatL2(dimension)
index.add(np.array(embeddings).astype("float32"))

# 💾 Salva indice e metadati
faiss.write_index(index, INDEX_PATH)
with open(METADATA_PATH, "w", encoding="utf-8") as f:
    json.dump(metadati, f, indent=2)

print(f"\n✅ Indice FAISS salvato in: {INDEX_PATH}")
print(f"✅ Metadati salvati in: {METADATA_PATH}")


🔢 Calcolo 156 embedding con OpenAI (text-embedding-3-small)...


100%|██████████| 156/156 [01:19<00:00,  1.96it/s]


✅ Indice FAISS salvato in: c:\Users\carbo\GitHub\Programming_project\scripts\index\company_index.faiss
✅ Metadati salvati in: c:\Users\carbo\GitHub\Programming_project\scripts\index\metadata.json





### retriever.py

In [None]:
import os
import json
import faiss
import numpy as np
from openai import OpenAI
from dotenv import load_dotenv

# 📁 Percorsi
BASE_DIR = os.getcwd()
INDEX_PATH = os.path.join(BASE_DIR, "index", "company_index.faiss")
METADATA_PATH = os.path.join(BASE_DIR, "index", "metadata.json")
CHUNKS_FOLDER = os.path.join(BASE_DIR, "chunks")
EMBEDDING_MODEL = "text-embedding-3-small"
TOP_K = 5  # Numero di risultati da restituire

# 🔐 API OpenAI
load_dotenv()
api_key = os.getenv("OPENAI_API_KEY")
client = OpenAI(api_key=api_key)

# 📦 Carica FAISS + metadati
index = faiss.read_index(INDEX_PATH)
with open(METADATA_PATH, "r", encoding="utf-8") as f:
    metadata = json.load(f)

# 🔁 Funzione per calcolare embedding della query
def get_query_embedding(query):
    response = client.embeddings.create(
        input=[query],
        model=EMBEDDING_MODEL
    )
    return np.array(response.data[0].embedding, dtype="float32").reshape(1, -1)

# 🔍 Funzione principale di retrieval
def retrieve_relevant_chunks(query, top_k=TOP_K):
    query_vector = get_query_embedding(query)
    distances, indices = index.search(query_vector, top_k)

    results = []
    for dist, idx in zip(distances[0], indices[0]):
        if idx >= len(metadata):
            continue  # sicurezza
        meta = metadata[idx]
        with open(meta["path"], "r", encoding="utf-8") as f:
            text = f.read()
        results.append({
            "filename": meta["filename"],
            "distance": float(dist),
            "text": text.strip()
        })
    return results

# ▶️ Esecuzione manuale per test  (DA TOGLIEREEEE!!!!)
if __name__ == "__main__":
    query = input("🔎 Inserisci la tua domanda: ")
    print("\n📄 Risultati più rilevanti:\n")
    for i, result in enumerate(retrieve_relevant_chunks(query), 1):
        print(f"#{i} – {result['filename']} (score: {result['distance']:.4f})")
        print(result["text"][:500], "...\n")  # mostra solo i primi 500 caratteri



📄 Risultati più rilevanti:

#1 – ACN_10-K_000146737324000278_chunk1.txt (score: 1.8867)
SEC.gov | File Unavailable
Skip to Main Content
U.S. Securities and
Exchange Commission
SEC.gov Search Form
Search SEC.gov
Company Filings
|
More Search Options
About
What We Do
Commissioners
Securities Laws
SEC Docket
Reports and Publications
Careers
Contact
Divisions
Corporation Finance
Enforcement
Investment Management
Economic and Risk Analysis
Trading and Markets
National Exam Program
All Divisions and Offices
Enforcement
Litigation Releases
Administrative Proceedings
Opinions and Adjudicat ...

#2 – WFC_10-Q_000007297125000129_chunk1.txt (score: 1.8867)
SEC.gov | File Unavailable
Skip to Main Content
U.S. Securities and
Exchange Commission
SEC.gov Search Form
Search SEC.gov
Company Filings
|
More Search Options
About
What We Do
Commissioners
Securities Laws
SEC Docket
Reports and Publications
Careers
Contact
Divisions
Corporation Finance
Enforcement
Investment Management
Economic and Risk Ana

### llm_wrapper.py 

In [20]:
import os
from openai import OpenAI
from dotenv import load_dotenv
from retriever import retrieve_relevant_chunks

# 🔐 Carica la chiave OpenAI da .env
load_dotenv()
api_key = os.getenv("OPENAI_API_KEY")
client = OpenAI(api_key=api_key)

# ⚙️ Configurazione modello
MODEL = "gpt-4o"  # GPT-4.5 (chat GPT-4 omni)
TOP_K = 5         # Numero di chunk da recuperare

# 🧠 Costruzione del prompt
def build_prompt(query, retrieved_chunks):
    context = "\n\n".join([f"[{i+1}] {chunk['text']}" for i, chunk in enumerate(retrieved_chunks)])
    prompt = (
        f"You are a financial assistant answering user questions based only on the following context.\n"
        f"If the answer is clearly stated or can be reasonably inferred, provide it.\n"
        f"If the answer is not present or cannot be inferred, respond: \"I don't know based on the documents.\"\n\n"
        f"Context:\n{context}\n\n"
        f"Question:\n{query}\n\n"
        f"Answer:"
    )
    return prompt

# 🤖 Funzione per interrogare GPT con i chunk trovati
def ask_llm(query):
    retrieved_chunks = retrieve_relevant_chunks(query, top_k=TOP_K)

    # 🔍 DEBUG: controlla se ci sono chunk
    if not retrieved_chunks:
        print("❌ Nessun chunk trovato. Verifica gli embedding o riformula la query.")
        return "I don't know (no relevant context found)."

    # 👀 Stampa i chunk recuperati
    for i, chunk in enumerate(retrieved_chunks, 1):
        print(f"\n📄 CHUNK #{i} – {chunk['filename']}")
        print(chunk['text'][:500], "...\n")  # primi 500 caratteri

    # 📜 Costruzione del prompt
    prompt = build_prompt(query, retrieved_chunks)

    # 💬 Chiamata a GPT
    response = client.chat.completions.create(
        model=MODEL,
        messages=[{"role": "user", "content": prompt}],
        temperature=0.0  # massima affidabilità
    )

    return response.choices[0].message.content.strip()

# ▶️ Test da terminale
if __name__ == "__main__":
    user_question = input("💬 Inserisci la tua domanda: ")
    answer = ask_llm(user_question)
    print("\n🧠 Risposta di Mark:\n")
    print(answer)



📄 CHUNK #1 – WFC_10-Q_000007297125000129_chunk1.txt
SEC.gov | File Unavailable
Skip to Main Content
U.S. Securities and
Exchange Commission
SEC.gov Search Form
Search SEC.gov
Company Filings
|
More Search Options
About
What We Do
Commissioners
Securities Laws
SEC Docket
Reports and Publications
Careers
Contact
Divisions
Corporation Finance
Enforcement
Investment Management
Economic and Risk Analysis
Trading and Markets
National Exam Program
All Divisions and Offices
Enforcement
Litigation Releases
Administrative Proceedings
Opinions and Adjudicat ...


📄 CHUNK #2 – AAPL_10-K_000032019324000123_chunk1.txt
SEC.gov | File Unavailable
Skip to Main Content
U.S. Securities and
Exchange Commission
SEC.gov Search Form
Search SEC.gov
Company Filings
|
More Search Options
About
What We Do
Commissioners
Securities Laws
SEC Docket
Reports and Publications
Careers
Contact
Divisions
Corporation Finance
Enforcement
Investment Management
Economic and Risk Analysis
Trading and Markets
National Exam Pr