get_company_list.py

In [14]:
import requests
import csv
import os

BASE_DIR = os.getcwd()  # Works in Jupyter as well as in a plain script
save_path = os.path.join(BASE_DIR, "company_list.csv")

headers = {'User-Agent': 'Gerardo DArco gerardo@email.com'}
url = "https://www.sec.gov/files/company_tickers.json"

# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() surfaces HTTP errors (e.g. 403/429) as such instead of
# letting them show up later as an obscure JSON decoding failure.
response = requests.get(url, headers=headers, timeout=30)
response.raise_for_status()
data = response.json()

# newline='' lets the csv module control line endings itself.
with open(save_path, "w", newline='', encoding="utf-8") as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(["ticker", "cik"])
    for entry in data.values():
        ticker = entry['ticker']
        # Store the CIK as a zero-padded, 10-character string.
        cik = str(entry['cik_str']).zfill(10)
        writer.writerow([ticker, cik])

print(f"✅ Salvato: {save_path}")


✅ Salvato: c:\Users\carbo\OneDrive\Desktop\task1\company_list.csv


download_edgar_reports.py

In [16]:
import csv
import os
import requests
import time

# Base configuration
BASE_DIR = os.getcwd()
# CSV with the "ticker" and "cik" columns read by main().
CSV_PATH = os.path.join(BASE_DIR, "company_list.csv")
# Folder where the downloaded .html files are stored.
SAVE_FOLDER = os.path.join(BASE_DIR, "filings")
# Form types to download for every company.
FORM_TYPES = ["10-K", "10-Q"]
MAX_COMPANIES = 100  # raise this value to process more companies
# Sent with every request to sec.gov / data.sec.gov.
HEADERS = {'User-Agent': 'Gerardo DArco gerardo@email.com'}

# Make sure the download folder exists before anything is written to it.
os.makedirs(SAVE_FOLDER, exist_ok=True)

def download_form(ticker, cik, form_type, already_files):
    """Download the first not-yet-saved filing index page of one form type.

    The company's recent submissions are fetched from data.sec.gov; the first
    entry whose form equals ``form_type`` and whose target file name is not in
    ``already_files`` is downloaded into ``SAVE_FOLDER`` as
    ``{ticker}_{form_type}_{accession-without-dashes}.html``.

    NOTE(review): the page saved is the filing's ``-index.html`` page, not the
    filing document itself -- confirm this is what the later steps expect.

    Args:
        ticker: Ticker symbol, used for the file name and log messages.
        cik: CIK string, inserted verbatim into the submissions URL and
            converted with ``int()`` for the archive URL.
        form_type: Form type to look for, e.g. ``"10-K"``.
        already_files: Container of file names that must not be downloaded
            again.

    Returns:
        1 if a file was downloaded, 0 otherwise (including on any error,
        which is printed rather than raised).
    """
    try:
        url = f"https://data.sec.gov/submissions/CIK{cik}.json"
        r = requests.get(url, headers=HEADERS, timeout=30)
        # Without this an HTTP error body would be parsed as if it were data.
        r.raise_for_status()
        recent = r.json()["filings"]["recent"]

        # "form" and "accessionNumber" are parallel lists; walk them together.
        # Presumably the API lists the newest filing first -- TODO confirm.
        for form, accession in zip(recent["form"], recent["accessionNumber"]):
            if form != form_type:
                continue

            acc_no = accession.replace("-", "")
            filename = f"{ticker}_{form_type}_{acc_no}.html"
            if filename in already_files:
                continue  # already downloaded

            # The archive folder uses the accession number without dashes,
            # while the index page inside it is named after the dashed form.
            filing_url = (
                f"https://www.sec.gov/Archives/edgar/data/"
                f"{int(cik)}/{acc_no}/{accession}-index.html"
            )
            page = requests.get(filing_url, headers=HEADERS, timeout=30)
            # Never store an error page on disk as if it were a filing.
            page.raise_for_status()

            path = os.path.join(SAVE_FOLDER, filename)
            with open(path, "w", encoding="utf-8") as f:
                f.write(page.text)

            print(f"[DOWNLOAD] {filename}")
            time.sleep(0.5)  # small pause between downloads
            return 1  # only one file per form type

        return 0

    except Exception as e:
        # Deliberately broad: one failing company must not stop the whole run.
        print(f"[ERROR] {ticker} – {form_type} – {e}")
        return 0

def main():
    """Download the missing filings for the first ``MAX_COMPANIES`` CSV rows.

    For every company and every entry of ``FORM_TYPES`` a download is started
    only when ``SAVE_FOLDER`` holds no ``.html`` file whose name starts with
    ``{ticker}_{form_type}_``; otherwise a ``[SKIP]`` line is printed.
    """
    already_files = {f for f in os.listdir(SAVE_FOLDER) if f.endswith(".html")}

    # Read the CSV with an explicit UTF-8 encoding instead of the platform
    # default; newline="" is what the csv module expects for the files it reads.
    with open(CSV_PATH, "r", newline="", encoding="utf-8") as file:
        reader = csv.DictReader(file)
        for i, row in enumerate(reader):
            if i >= MAX_COMPANIES:
                break
            ticker = row["ticker"]
            cik = row["cik"]

            for form_type in FORM_TYPES:
                prefix = f"{ticker}_{form_type}_"
                already_for_type = any(f.startswith(prefix) for f in already_files)

                if not already_for_type:
                    download_form(ticker, cik, form_type, already_files)
                else:
                    print(f"[SKIP] {ticker} – {form_type} già presente")

if __name__ == "__main__":
    main()


[SKIP] MSFT – 10-K già presente
[SKIP] MSFT – 10-Q già presente
[SKIP] AAPL – 10-K già presente
[SKIP] AAPL – 10-Q già presente
[SKIP] NVDA – 10-K già presente
[SKIP] NVDA – 10-Q già presente
[SKIP] GOOGL – 10-K già presente
[SKIP] GOOGL – 10-Q già presente
[SKIP] AMZN – 10-K già presente
[SKIP] AMZN – 10-Q già presente
[SKIP] META – 10-K già presente
[SKIP] META – 10-Q già presente
[SKIP] BRK-B – 10-K già presente
[SKIP] BRK-B – 10-Q già presente
[SKIP] AVGO – 10-K già presente
[SKIP] AVGO – 10-Q già presente
[SKIP] TSLA – 10-K già presente
[SKIP] TSLA – 10-Q già presente
[SKIP] WMT – 10-K già presente
[SKIP] WMT – 10-Q già presente
[SKIP] LLY – 10-K già presente
[SKIP] LLY – 10-Q già presente
[SKIP] JPM – 10-K già presente
[SKIP] JPM – 10-Q già presente
[SKIP] V – 10-K già presente
[SKIP] V – 10-Q già presente
[SKIP] MA – 10-K già presente
[SKIP] MA – 10-Q già presente
[SKIP] NFLX – 10-K già presente
[SKIP] NFLX – 10-Q già presente
[SKIP] COST – 10-K già presente
[SKIP] COST – 10-Q g

extract_text.py

In [17]:
import os
from bs4 import BeautifulSoup

# HTML pages are read from "filings"; extracted text is written to "text_clean".
BASE_DIR = os.getcwd()
INPUT_FOLDER = os.path.join(BASE_DIR, "filings")
OUTPUT_FOLDER = os.path.join(BASE_DIR, "text_clean")

# Make sure the output folder exists before any file is written to it.
os.makedirs(OUTPUT_FOLDER, exist_ok=True)

def extract_text_from_html(file_path):
    """Return the text of an HTML file with blank lines removed.

    ``<script>`` and ``<style>`` elements are discarded before the text is
    extracted; every remaining line is stripped and empty lines are dropped.
    The surviving lines are joined with a single newline.
    """
    with open(file_path, "r", encoding="utf-8") as handle:
        document = BeautifulSoup(handle, "html.parser")

    # Remove elements whose content is code or styling rather than text.
    for node in document.find_all(["script", "style"]):
        node.decompose()

    raw_text = document.get_text(separator="\n")
    stripped_rows = (row.strip() for row in raw_text.splitlines())
    return "\n".join(row for row in stripped_rows if row)

def process_all_files():
    """Write a ``.txt`` version of every ``.html`` file found in INPUT_FOLDER.

    The output goes to OUTPUT_FOLDER under the same name with ``.html``
    replaced by ``.txt``. Inputs whose output file already exists are skipped.
    """
    html_names = [name for name in os.listdir(INPUT_FOLDER) if name.endswith(".html")]

    for name in html_names:
        target_path = os.path.join(OUTPUT_FOLDER, name.replace(".html", ".txt"))
        if os.path.exists(target_path):
            # Already extracted in a previous run.
            continue

        cleaned = extract_text_from_html(os.path.join(INPUT_FOLDER, name))
        with open(target_path, "w", encoding="utf-8") as out:
            out.write(cleaned)
        print(f"[EXTRACT] {name}")

if __name__ == "__main__":
    process_all_files()


[EXTRACT] ACN_10-K_000146737324000278.html
[EXTRACT] ACN_10-Q_000146737325000100.html
[EXTRACT] ADBE_10-K_000079634325000004.html
[EXTRACT] ADBE_10-Q_000079634325000059.html
[EXTRACT] AMD_10-K_000000248825000012.html
[EXTRACT] AMD_10-Q_000000248825000047.html
[EXTRACT] AMGN_10-K_000031815425000010.html
[EXTRACT] AMGN_10-Q_000031815425000020.html
[EXTRACT] AXP_10-K_000000496225000016.html
[EXTRACT] AXP_10-Q_000000496225000045.html
[EXTRACT] BA_10-K_000001292725000015.html
[EXTRACT] BA_10-Q_000001292725000031.html
[EXTRACT] BKNG_10-K_000107553125000010.html
[EXTRACT] BKNG_10-Q_000107553125000024.html
[EXTRACT] BLK_10-K_000095017025026584.html
[EXTRACT] BLK_10-Q_000095017025065838.html
[EXTRACT] BSX_10-K_000088572525000011.html
[EXTRACT] BSX_10-Q_000088572525000026.html
[EXTRACT] BX_10-K_000119312525042469.html
[EXTRACT] BX_10-Q_000119312525111595.html
[EXTRACT] CAT_10-K_000001823025000008.html
[EXTRACT] CAT_10-Q_000001823025000016.html
[EXTRACT] CMCSA_10-K_000116669125000011.html
[EXTRAC

#CONTARE QUANTE AZIENDE CI SONO DA CANCELLARE

In [19]:
import os

FOLDER = "filings"  # or "text_clean"

# The ticker is the part of the file name that precedes the first underscore;
# collecting it into a set keeps one entry per company.
tickers = {
    name.split("_")[0]
    for name in os.listdir(FOLDER)
    if name.endswith((".html", ".txt"))
}

print(f"✅ Numero di aziende uniche: {len(tickers)}")
print(f"Esempi di ticker: {sorted(tickers)[:10]}...")


✅ Numero di aziende uniche: 77
Esempi di ticker: ['AAPL', 'ABBV', 'ABT', 'ACN', 'ADBE', 'AMD', 'AMGN', 'AMZN', 'AVGO', 'AXP']...


chunk_text.py

In [18]:
import os
import textwrap

# Clean text files are read from "text_clean"; chunk files go to "chunks".
BASE_DIR = os.getcwd()
INPUT_FOLDER = os.path.join(BASE_DIR, "text_clean")
OUTPUT_FOLDER = os.path.join(BASE_DIR, "chunks")
# Target upper bound, in characters, for the size of a chunk.
MAX_CHARS = 3000

os.makedirs(OUTPUT_FOLDER, exist_ok=True)

def safe_split_paragraph(para, max_chars):
    """Wrap ``para`` into pieces of at most ``max_chars`` characters.

    Words are never broken, so a single word longer than ``max_chars`` yields
    a piece that exceeds the limit. Existing whitespace characters (including
    newlines) are kept as they are rather than being replaced by spaces.
    """
    return textwrap.wrap(para, width=max_chars, break_long_words=False, replace_whitespace=False)

def chunk_by_paragraphs(text, max_chars=MAX_CHARS):
    """Group the blank-line separated paragraphs of ``text`` into chunks.

    Consecutive paragraphs are joined with a blank line for as long as the
    joined text stays within ``max_chars``. A paragraph longer than
    ``max_chars`` closes the chunk being built and is split on its own with
    ``safe_split_paragraph``.

    NOTE(review): paragraphs are detected by splitting on a blank line; text
    that contains only single newlines is treated as one paragraph.

    Args:
        text: The text to split.
        max_chars: Target maximum length of a chunk, in characters.

    Returns:
        A list of non-empty chunk strings, in document order.
    """
    chunks = []
    current_chunk = ""

    for para in text.split("\n\n"):
        para = para.strip()
        if not para:
            continue

        if len(para) > max_chars:
            # Close the chunk under construction so document order is kept.
            if current_chunk:
                chunks.append(current_chunk)
                current_chunk = ""
            for piece in safe_split_paragraph(para, max_chars):
                piece = piece.strip()
                if piece:
                    chunks.append(piece)
            continue

        if not current_chunk:
            # Nothing to flush and no separator needed: an empty buffer must
            # never be emitted as a chunk, nor count towards the size limit.
            current_chunk = para
        elif len(current_chunk) + 2 + len(para) > max_chars:
            chunks.append(current_chunk)
            current_chunk = para
        else:
            current_chunk += "\n\n" + para

    if current_chunk:
        chunks.append(current_chunk)

    return chunks

def process_all_files():
    """Split every ``.txt`` file of INPUT_FOLDER into chunk files.

    Chunks are written to OUTPUT_FOLDER as ``{base_name}_chunk{n}.txt`` with
    ``n`` starting at 1. An input file is skipped when OUTPUT_FOLDER already
    held a chunk file for it at the time this function was called.
    """
    suffix = ".txt"
    # One directory listing is enough: the files written below belong to the
    # input being processed and cannot make a different input look done.
    existing_chunks = os.listdir(OUTPUT_FOLDER)

    for file in os.listdir(INPUT_FOLDER):
        if not file.endswith(suffix):
            continue
        # Drop only the trailing extension, not every ".txt" in the name.
        base_name = file[:-len(suffix)]
        # Match the full "<base>_chunk" prefix so that a base name that is a
        # prefix of another base name is not mistaken for it.
        chunk_prefix = f"{base_name}_chunk"
        if any(f.startswith(chunk_prefix) for f in existing_chunks):
            continue

        input_path = os.path.join(INPUT_FOLDER, file)
        with open(input_path, "r", encoding="utf-8") as f:
            text = f.read()

        chunks = chunk_by_paragraphs(text)
        for i, chunk in enumerate(chunks, start=1):
            chunk_filename = f"{base_name}_chunk{i}.txt"
            output_path = os.path.join(OUTPUT_FOLDER, chunk_filename)
            with open(output_path, "w", encoding="utf-8") as out:
                out.write(chunk)
        print(f"[CHUNKED] {file} → {len(chunks)} blocchi")

if __name__ == "__main__":
    process_all_files()


[CHUNKED] ACN_10-K_000146737324000278.txt → 1 blocchi
[CHUNKED] ACN_10-Q_000146737325000100.txt → 1 blocchi
[CHUNKED] ADBE_10-K_000079634325000004.txt → 1 blocchi
[CHUNKED] ADBE_10-Q_000079634325000059.txt → 1 blocchi
[CHUNKED] AMD_10-K_000000248825000012.txt → 1 blocchi
[CHUNKED] AMD_10-Q_000000248825000047.txt → 1 blocchi
[CHUNKED] AMGN_10-K_000031815425000010.txt → 1 blocchi
[CHUNKED] AMGN_10-Q_000031815425000020.txt → 1 blocchi
[CHUNKED] AXP_10-K_000000496225000016.txt → 1 blocchi
[CHUNKED] AXP_10-Q_000000496225000045.txt → 1 blocchi
[CHUNKED] BA_10-K_000001292725000015.txt → 1 blocchi
[CHUNKED] BA_10-Q_000001292725000031.txt → 1 blocchi
[CHUNKED] BKNG_10-K_000107553125000010.txt → 1 blocchi
[CHUNKED] BKNG_10-Q_000107553125000024.txt → 1 blocchi
[CHUNKED] BLK_10-K_000095017025026584.txt → 1 blocchi
[CHUNKED] BLK_10-Q_000095017025065838.txt → 1 blocchi
[CHUNKED] BSX_10-K_000088572525000011.txt → 1 blocchi
[CHUNKED] BSX_10-Q_000088572525000026.txt → 1 blocchi
[CHUNKED] BX_10-K_000119

embed_and_index.py

In [21]:
pip install python-dotenv


Collecting python-dotenv
  Downloading python_dotenv-1.1.0-py3-none-any.whl.metadata (24 kB)
Downloading python_dotenv-1.1.0-py3-none-any.whl (20 kB)
Installing collected packages: python-dotenv
Successfully installed python-dotenv-1.1.0
Note: you may need to restart the kernel to use updated packages.


In [25]:
import os
import json
import faiss
import numpy as np
from tqdm import tqdm
from openai import OpenAI
from dotenv import load_dotenv

# 📁 Folder paths
BASE_DIR = os.getcwd()
CHUNKS_FOLDER = os.path.join(BASE_DIR, "chunks")
INDEX_FOLDER = os.path.join(BASE_DIR, "index")
os.makedirs(INDEX_FOLDER, exist_ok=True)

# Output files: the FAISS index and the JSON metadata list saved next to it.
INDEX_PATH = os.path.join(INDEX_FOLDER, "company_index.faiss")
METADATA_PATH = os.path.join(INDEX_FOLDER, "metadata.json")
EMBEDDING_MODEL = "text-embedding-3-small"

# 🔐 Load the OpenAI key
# The key is read from the OPENAI_API_KEY environment variable; load_dotenv()
# is presumably what makes a local .env file visible there -- TODO confirm.
load_dotenv()
api_key = os.getenv("OPENAI_API_KEY")
client = OpenAI(api_key=api_key)

# 📚 Load every non-blank .txt chunk, in sorted file-name order, together
# with one metadata record per chunk (same position in both lists).
chunks = []
metadati = []

for file in sorted(os.listdir(CHUNKS_FOLDER)):
    if not file.endswith(".txt"):
        continue

    file_path = os.path.join(CHUNKS_FOLDER, file)
    with open(file_path, "r", encoding="utf-8") as f:
        text = f.read()

    # Whitespace-only files are left out of both lists.
    if not text.strip():
        continue

    chunks.append(text)
    metadati.append(dict(filename=file, length=len(text), path=file_path))

# 🔁 Embedding computation
def get_embedding(text):
    """Return the embedding of ``text`` produced by ``EMBEDDING_MODEL``.

    One request is sent per call, with ``text`` as the only input item.
    """
    result = client.embeddings.create(model=EMBEDDING_MODEL, input=[text])
    first_item = result.data[0]
    return first_item.embedding

# Stop early with a clear message: with no chunks there is no vector to take
# the index dimension from, and nothing worth sending to the API.
if not chunks:
    raise RuntimeError(f"No non-empty .txt chunks found in: {CHUNKS_FOLDER}")

print(f"🔢 Calcolo {len(chunks)} embedding con OpenAI ({EMBEDDING_MODEL})...")
embeddings = [get_embedding(chunk) for chunk in tqdm(chunks)]

# 📦 Build the FAISS index
# The vectors are added as float32; the index dimension is the length of one
# embedding vector.
vectors = np.array(embeddings).astype("float32")
dimension = vectors.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(vectors)

# 💾 Save the index and the metadata
# metadati[i] describes the chunk whose vector was added at position i.
faiss.write_index(index, INDEX_PATH)
with open(METADATA_PATH, "w", encoding="utf-8") as f:
    json.dump(metadati, f, indent=2)

print(f"\n✅ Indice FAISS salvato in: {INDEX_PATH}")
print(f"✅ Metadati salvati in: {METADATA_PATH}")


🔢 Calcolo 154 embedding con OpenAI (text-embedding-3-small)...


100%|██████████| 154/154 [01:05<00:00,  2.34it/s]


✅ Indice FAISS salvato in: c:\Users\carbo\OneDrive\Desktop\task1\index\company_index.faiss
✅ Metadati salvati in: c:\Users\carbo\OneDrive\Desktop\task1\index\metadata.json



