# Optimized Financial Filing Analysis Notebook


This notebook demonstrates an optimized approach to analyzing SEC financial filings.
The code is modular and efficient, and each step comes with a descriptive explanation.

### Key Features
- **File Downloading and Processing**: Efficient handling of SEC EDGAR data.
- **Mapping CIKs to Tickers**: Using SEC-provided ticker files for mapping.
- **Filing Analysis**: Counting 8-K filings by month and weekday.
- **Sentiment Analysis**: Examining sentiment in 10-K filings.

Let's dive into the analysis.


## Utility Functions

In [None]:

import os
import re
import requests
from datetime import date
import numpy as np

def download_file(url, filename, headers=None, timeout=30):
    """Download a file from the given URL if it doesn't already exist locally.

    The local copy acts as a cache: when the destination file is already
    present, no request is made at all.

    Args:
        url: Address of the resource to fetch.
        filename: Name of the local copy, resolved against the current
            working directory (an absolute path is used as-is).
        headers: Optional dict of HTTP request headers. SEC EDGAR asks
            automated clients to identify themselves with a ``User-Agent``
            header; pass one here when fetching from sec.gov.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The path of the local file.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: On connection problems or timeouts.
    """
    destfile = os.path.join(os.getcwd(), filename)
    if os.path.exists(destfile):
        return destfile

    # Write to a side file first so that a failed or interrupted download
    # never leaves a truncated file behind that later runs would treat as a
    # valid cached copy.
    partial = destfile + ".part"
    try:
        with requests.get(url, stream=True, headers=headers, timeout=timeout) as response:
            # Without this check an error page (e.g. 403/404) would be
            # saved and cached as if it were the requested data.
            response.raise_for_status()
            with open(partial, "wb") as file:
                for chunk in response.iter_content(chunk_size=65536):
                    file.write(chunk)
        os.replace(partial, destfile)
    finally:
        # Only still present when something above failed.
        if os.path.exists(partial):
            os.remove(partial)
    return destfile

def read_lines(filepath):
    """Read a text file and return its lines with surrounding whitespace removed.

    Args:
        filepath: Path of the file to read.

    Returns:
        A list with one stripped string per line, in file order. Blank
        lines are kept as empty strings.
    """
    with open(filepath, "r") as handle:
        # The list must be built while the file is still open.
        stripped = list(map(str.strip, handle))
    return stripped


## Processing SEC Data

In [None]:

def process_data(urls, start_line=11, filing_type='10-K'):
    """Return the CIKs that filed `filing_type` in every given master index.

    Each index file is downloaded (or read from the local cache), its
    header rows are skipped, and the pipe-delimited rows are scanned. The
    first field of a row is taken as the CIK and the third as the form type.

    Args:
        urls: Iterable of ``(url, filename)`` pairs, one per index file.
        start_line: Number of leading lines to skip in each file.
        filing_type: Form type a row must match exactly to be counted.

    Returns:
        A list (in no particular order) of the CIK strings present in all
        of the index files. Empty when `urls` is empty.
    """
    cik_sets = []  # One set of matching CIKs per index file
    for url, filename in urls:
        filepath = download_file(url, filename)
        ciks = set()
        for line in read_lines(filepath)[start_line:]:
            fields = line.split('|')
            # Rows with fewer than three fields cannot carry a form type.
            if len(fields) > 2 and fields[2] == filing_type:
                ciks.add(fields[0])
        cik_sets.append(ciks)

    # set.intersection() needs at least one argument.
    if not cik_sets:
        return []
    return list(set.intersection(*cik_sets))

# Fourth-quarter SEC master index files for 1998-2000, each paired with the
# name of its local copy
data_urls = [
    (
        f'https://www.sec.gov/Archives/edgar/full-index/{index_year}/QTR4/master.idx',
        f'master_{index_year}_Q4.txt',
    )
    for index_year in (1998, 1999, 2000)
]

# CIKs that appear with the default filing type in all three index files
unique_ciks = process_data(data_urls)
print(f"The number of unique CIKs is {len(unique_ciks)}.")


## Mapping CIKs to Tickers

In [None]:

def map_ciks_to_tickers(ciks, url):
    """Look up the stock ticker for each CIK in a downloaded ticker list.

    The list is fetched from `url` (cached locally as ``ticker.txt``) and
    scanned for "ticker, whitespace, digits" pairs. When a CIK occurs more
    than once, the last ticker found for it wins.

    Args:
        ciks: Iterable of CIK strings to look up.
        url: Location of the ticker list.

    Returns:
        A dict mapping every requested CIK to its ticker, or to an empty
        string when the list has no entry for it.
    """
    local_copy = download_file(url, "ticker.txt")
    with open(local_copy, "r") as handle:
        raw_text = handle.read()

    # Build the reverse index: the list gives ticker first, CIK second.
    ticker_by_cik = {}
    for match in re.finditer(r'([a-z-]+)\s+(\d+)', raw_text):
        symbol, cik_digits = match.groups()
        ticker_by_cik[cik_digits] = symbol

    resolved = {}
    for cik in ciks:
        resolved[cik] = ticker_by_cik.get(cik, "")
    return resolved

# Resolve every shared CIK to a ticker using the SEC ticker list
cik_ticker_mapping = map_ciks_to_tickers(
    unique_ciks,
    'https://www.sec.gov/include/ticker.txt',
)
# CIKs without a match are mapped to an empty string
missing_tickers = len([ticker for ticker in cik_ticker_mapping.values() if not ticker])
print(f"Of the {len(cik_ticker_mapping)} CIKs, {missing_tickers} are missing a ticker.")


## Analyzing Filings for 1999

In [None]:

def analyze_filings(urls, ciks, filing_type='8-K', start_line=11):
    """Count filings of one type by calendar month and by day of the week.

    Rows of each index file are pipe-delimited; the first field is taken
    as the CIK, the third as the form type and the fourth as the filing
    date in ``YYYY-MM-DD`` form. Only rows whose CIK is in `ciks` and whose
    form type equals `filing_type` are counted.

    Args:
        urls: Iterable of ``(url, filename)`` pairs, one per index file.
        ciks: Collection of CIK strings to restrict the count to.
        filing_type: Form type a row must match exactly to be counted.
        start_line: Number of leading lines to skip in each file.

    Returns:
        A tuple ``(months, days)``: `months` holds 12 counts (January
        first) and `days` holds 7 counts (Monday first).

    Raises:
        ValueError: If a matching row carries a malformed filing date.
    """
    # A set keeps the per-row membership test constant-time even when the
    # caller passes a long list of CIKs.
    wanted = set(ciks)
    months = [0] * 12
    days = [0] * 7

    for url, filename in urls:
        filepath = download_file(url, filename)
        for line in read_lines(filepath)[start_line:]:
            fields = line.split('|')
            # Rows with fewer than four fields have no filing date.
            if len(fields) < 4 or fields[2] != filing_type:
                continue
            if fields[0] not in wanted:
                continue
            y, m, d = map(int, fields[3].split('-'))
            date_obj = date(y, m, d)
            months[m - 1] += 1
            days[date_obj.weekday()] += 1
    return months, days

# One (URL, local filename) pair for each quarter of 1999
filing_urls = [
    (
        f'https://www.sec.gov/Archives/edgar/full-index/1999/QTR{quarter}/master.idx',
        f'master_1999_Q{quarter}.txt',
    )
    for quarter in (1, 2, 3, 4)
]

months, days = analyze_filings(filing_urls, unique_ciks)
month_labels = 'Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec'.split()
day_labels = 'Mon Tue Wed Thu Fri Sat Sun'.split()

# Monthly breakdown, January first
print("Filings by Month in 1999:")
for month, count in zip(month_labels, months):
    print(f"{month}: {count}")

# Weekday breakdown, Monday first
print("Filings by Day of the Week in 1999:")
for day, count in zip(day_labels, days):
    print(f"{day}: {count}")


## Sentiment Analysis of 10-K Filings

In [None]:

def analyze_sentiment(cik, years, word_files):
    """Count sentiment words in the "Item 1" section of a company's filings.

    For every year the filing text is downloaded (or read from the local
    cache), the text between "ITEM 1" and "ITEM 2" is extracted, split into
    alphabetic words and upper-cased. Each word is then checked against
    every sentiment word list.

    Args:
        cik: CIK of the company, used to build the download URL.
        years: Sequence of year identifiers, one filing per entry.
        word_files: Sequence of word collections, one per sentiment
            category. Despite the name these are the loaded words, not file
            names; they are compared against upper-cased text.

    Returns:
        A tuple ``(counts, lengths)``. `counts` is a NumPy array of shape
        ``(len(years), len(word_files))`` with the number of matching words
        per year and category. `lengths` is a list with the total number
        of words in the extracted section per year; it is 0 when no
        section was found.
    """
    # Sets make each membership test constant-time; with plain lists every
    # word of the filing would be compared against the whole list.
    vocabularies = [set(word_list) for word_list in word_files]
    counts = np.zeros((len(years), len(vocabularies)))  # Rows: years, columns: categories
    lengths = []  # Total word count of the extracted section, per year

    for i, year in enumerate(years):
        url = f'https://www.sec.gov/Archives/edgar/data/{cik}/{year}.txt'
        # The cache name must include the CIK: download_file reuses any
        # existing file of the same name, so a name built from the year
        # alone would return another company's filing.
        filepath = download_file(url, f'cik_{cik}_{year}.txt')
        with open(filepath, "r") as file:
            # "." does not match newlines, so fold the text onto one line.
            text = file.read().replace('\n', '|')

        # NOTE(review): the pattern is greedy and also matches headings
        # such as "ITEM 1A" or "ITEM 10" - it spans from the first
        # "ITEM 1" to the last "ITEM 2". Confirm this is the intended span.
        section = re.findall(r'ITEM 1(.*)ITEM 2', text, flags=re.IGNORECASE)
        words = re.findall(r'[A-Za-z]+', section[0]) if section else []
        words = [word.upper() for word in words]
        lengths.append(len(words))

        for j, vocabulary in enumerate(vocabularies):
            counts[i, j] = sum(1 for word in words if word in vocabulary)

    return counts, lengths

# Read the sentiment vocabularies (negative, positive, uncertain, litigious) into memory
def load_word_lists(filenames):
    """Fetch each sentiment word file and return its entries in upper case.

    Args:
        filenames: Iterable of word-list file names. Each name is used both
            in the download URL and as the name of the local copy.

    Returns:
        A list with one entry per file name, in the same order; each entry
        is the list of stripped, upper-cased lines of that file.
    """
    def _read_upper(name):
        # Replace with actual URLs or local paths
        local_path = download_file(f'https://example.com/{name}', name)
        with open(local_path, "r") as handle:
            return [entry.strip().upper() for entry in handle.readlines()]

    return [_read_upper(name) for name in filenames]

# One word-list file per sentiment category, in report column order
word_files = [
    f'1.9_LM_{category}.txt'
    for category in ('negative', 'positive', 'uncertainty', 'litigious')
]

# Read every word list into memory
word_lists = load_word_lists(word_files)

# Run the analysis for one company over three years
cik = '894490'  # Replace with a valid CIK from `unique_ciks`
years = ['1998', '1999', '2000']
counts, lengths = analyze_sentiment(cik, years, word_lists)

# Share of each category among all words of the section, in percent:
# every row of counts is divided by that year's total word count
percentages = counts / np.asarray(lengths)[:, np.newaxis] * 100

# Print the table: one header line, then one row per year
print("Sentiment Analysis Results:")
headers = ["Year", "Negative (%)", "Positive (%)", "Uncertain (%)", "Litigious (%)"]
print("{:<6} {:<15} {:<15} {:<15} {:<15}".format(*headers))
for i, year in enumerate(years):
    print("{:<6} {:<15.2f} {:<15.2f} {:<15.2f} {:<15.2f}".format(year, *percentages[i]))
