In [1]:
pip install requests beautifulsoup4 pandas unstructured pdfminer.six snowflake-connector-python


Note: you may need to restart the kernel to use updated packages.


In [2]:
pip install selenium


Note: you may need to restart the kernel to use updated packages.


In [3]:
import requests

CIK = '0001318605'
url = f"https://data.sec.gov/submissions/CIK{CIK}.json"

headers = {
    "User-Agent": "Avikshith Yelakonda avikshith@example.com",  # use your actual email
    "Accept-Encoding": "gzip, deflate",
    "Host": "data.sec.gov"
}

response = requests.get(url, headers=headers)

if response.status_code == 200:
    try:
        data = response.json()
        print("✅ Successfully fetched Tesla's submission data")

        # Print latest 10-K accession number
        filings = data['filings']['recent']
        for i, form_type in enumerate(filings['form']):
            if form_type == "10-K":
                accession_number = filings['accessionNumber'][i]
                print("Tesla latest 10-K accession number:", accession_number)
                break

    except ValueError:
        print("❌ JSON decoding error — response is not JSON:")
        print(response.text[:500])
else:
    print(f"❌ HTTP Error: {response.status_code} — URL failed:")
    print(url)



✅ Successfully fetched Tesla's submission data
Tesla latest 10-K accession number: 0001628280-25-003063


In [4]:
accession_number = '0001628280-25-003063'
accession_nodash = accession_number.replace('-', '')  # → 000162828025003063


In [5]:
CIK = '0001318605'
doc_index_url = f"https://www.sec.gov/Archives/edgar/data/{int(CIK)}/{accession_nodash}/index.json"


In [11]:
import requests

headers = {
    "User-Agent": "Avikshith Yelakonda avikshith@example.com"
}

response = requests.get(doc_index_url, headers=headers)
doc_data = response.json()

files = doc_data['directory']['item']
html_files = [f for f in files if f['name'].endswith('.htm') or f['name'].endswith('.html')]

# Pick the first full HTML document (usually first one)
report_file = html_files[0]['name']
report_url = f"https://www.sec.gov/Archives/edgar/data/{int(CIK)}/{accession_nodash}/{report_file}"

print("✅ 10-K Report URL:", report_url)


✅ 10-K Report URL: https://www.sec.gov/Archives/edgar/data/1318605/000162828025003063/0001628280-25-003063-index-headers.html


In [8]:
from bs4 import BeautifulSoup
import requests
import re
import pandas as pd

# Use the final report URL from earlier step
report_url = "https://www.sec.gov/Archives/edgar/data/1318605/000162828025003063/tsla-20231231.htm"
headers = {"User-Agent": "Avikshith Yelakonda avikshith@example.com"}

response = requests.get(report_url, headers=headers)
soup = BeautifulSoup(response.content, "html.parser")

# Extract all raw text
raw_text = soup.get_text(separator="\n")



Assuming this really is an XML document, what you're doing might work, but you should know that using an XML parser will be more reliable. To parse this document as XML, make sure you have the Python package 'lxml' installed, and pass the keyword argument `features="xml"` into the BeautifulSoup constructor.




  soup = BeautifulSoup(response.content, "html.parser")


In [12]:
# === New: Fetch plain text version of the 10-K ===
txt_url = "https://www.sec.gov/Archives/edgar/data/1318605/000162828025003063/tsla-20231231.txt"

response = requests.get(txt_url, headers=headers)
text = response.text.replace('\xa0', ' ')
text = re.sub(r'\s+', ' ', text)

def extract_between(start, end, text):
    pattern = re.compile(re.escape(start) + r"(.*?)" + re.escape(end), re.IGNORECASE | re.DOTALL)
    match = pattern.search(text)
    return match.group(1).strip() if match else f"{start} NOT FOUND"

sections = {
    "Item 1. Business": extract_between("Item 1.", "Item 1A.", text),
    "Item 1A. Risk Factors": extract_between("Item 1A.", "Item 1B.", text),
    "Item 7. Management's Discussion": extract_between("Item 7.", "Item 7A.", text),
    "Item 8. Financial Statements": extract_between("Item 8.", "Item 9.", text),
}

# Save the sections to a CSV for further analysis
df = pd.DataFrame(sections.items(), columns=["Section", "Content"])
df.to_csv("tesla_2024_financial_sections.csv", index=False)
print("✅ Extracted key financial sections from .txt and saved to CSV")


✅ Extracted key financial sections from .txt and saved to CSV


In [13]:
import requests
import pandas as pd
import re

# Fetch plain text version of 10-K
txt_url = "https://www.sec.gov/Archives/edgar/data/1318605/000162828025003063/tsla-20231231.txt"
headers = {"User-Agent": "Avikshith Yelakonda avikshith@example.com"}

response = requests.get(txt_url, headers=headers)
lines = response.text.splitlines()

# Clean and normalize lines
cleaned_lines = [re.sub(r'\s+', ' ', line).strip() for line in lines if line.strip()]

# Map to hold section starts
section_markers = {
    "Item 1.": "Item 1. Business",
    "Item 1A.": "Item 1A. Risk Factors",
    "Item 7.": "Item 7. Management's Discussion",
    "Item 8.": "Item 8. Financial Statements"
}

# Find the line numbers where each item starts
section_indices = {}
for idx, line in enumerate(cleaned_lines):
    for key in section_markers.keys():
        if line.upper().startswith(key.upper()) and key not in section_indices:
            section_indices[key] = idx

# Sort by order of appearance
sorted_keys = sorted(section_indices.items(), key=lambda x: x[1])
sorted_keys.append(("EOF", len(cleaned_lines)))  # Add end marker

# Extract content between items
sections = {}
for i in range(len(sorted_keys) - 1):
    start_key, start_idx = sorted_keys[i]
    end_idx = sorted_keys[i + 1][1]
    content = "\n".join(cleaned_lines[start_idx:end_idx]).strip()
    sections[section_markers[start_key]] = content

# Save to CSV
df = pd.DataFrame(sections.items(), columns=["Section", "Content"])
df.to_csv("tesla_2024_financial_sections.csv", index=False)
print("✅ Sections extracted and saved to tesla_2024_financial_sections.csv")


✅ Sections extracted and saved to tesla_2024_financial_sections.csv


In [15]:
import requests
import pandas as pd
import re

# Step 1: Fetch the plain text 10-K report from SEC
txt_url = "https://www.sec.gov/Archives/edgar/data/1318605/000162828025003063/tsla-20231231.txt"
headers = {"User-Agent": "Avikshith Yelakonda avikshith@example.com"}

response = requests.get(txt_url, headers=headers)
lines = response.text.splitlines()

# Step 2: Clean each line
cleaned_lines = [re.sub(r'\s+', ' ', line).strip() for line in lines if line.strip()]

# Optional: Preview the first 100 lines to inspect structure
print("\nPreview of cleaned lines (first 100):")
for i, line in enumerate(cleaned_lines[:100]):
    print(f"{i+1:03d}: {line}")

# Step 3: Detect section headers using regex
section_indices = {}
for idx, line in enumerate(cleaned_lines):
    line_upper = line.upper()
    if re.match(r'^ITEM\s+1\.', line_upper) and "Item 1." not in section_indices:
        section_indices["Item 1."] = idx
    elif re.match(r'^ITEM\s+1A\.', line_upper) and "Item 1A." not in section_indices:
        section_indices["Item 1A."] = idx
    elif re.match(r'^ITEM\s+7\.', line_upper) and "Item 7." not in section_indices:
        section_indices["Item 7."] = idx
    elif re.match(r'^ITEM\s+8\.', line_upper) and "Item 8." not in section_indices:
        section_indices["Item 8."] = idx

# Step 4: Ensure sections were found
print("\n✅ Detected section indices:")
for k, v in section_indices.items():
    print(f"{k} found at line {v}")

# Step 5: Extract text between sections
section_map = {
    "Item 1.": "Item 1. Business",
    "Item 1A.": "Item 1A. Risk Factors",
    "Item 7.": "Item 7. Management's Discussion",
    "Item 8.": "Item 8. Financial Statements"
}

sorted_keys = sorted(section_indices.items(), key=lambda x: x[1])
sorted_keys.append(("EOF", len(cleaned_lines)))  # artificial end

sections = {}
for i in range(len(sorted_keys) - 1):
    start_key, start_idx = sorted_keys[i]
    end_idx = sorted_keys[i + 1][1]
    section_name = section_map[start_key]
    content = "\n".join(cleaned_lines[start_idx:end_idx]).str_



Preview of cleaned lines (first 100):
001: <?xml version="1.0" encoding="UTF-8"?>
002: <Error><Code>NoSuchKey</Code><Message>The specified key does not exist.</Message><Key>edgar/data/1318605/000162828025003063/tsla-20231231.txt</Key><RequestId>2A2VGPB65F84CN43</RequestId><HostId>GLPG/7yiRqEvrw5XErYWvf1K19/e2YP0f98c20k2hTbAgpWvUHc6EcpqarRKkaLjy+BavIR8EhRwtqooL9INQsqWuZF7NZKju0+8jGROFcQ=</HostId></Error>

✅ Detected section indices:
