# Hedge Fund Holdings Analysis via EDGAR

## 1. Setup and Imports

In [119]:
from sec_edgar_downloader import Downloader
from bs4 import BeautifulSoup

import os
import re
import requests
import unicodedata
import pandas as pd
import csv
import xml.etree.ElementTree as ET

In [128]:
# Create the filing downloader. The two positional arguments are presumably the
# requester name and contact e-mail that sec_edgar_downloader identifies itself
# with to EDGAR -- TODO confirm against the library's documentation.
dl = Downloader("Narain Nair", "nair4986@gmail.com")
# Request "13F-HR" filings for the identifier "1167483" (presumably the filer's
# CIK; the processing loop later in this file reads from a "0001167483" folder).
# NOTE(review): the value displayed as this cell's output is this call's return
# value -- presumably the number of filings downloaded; confirm.
dl.get("13F-HR", "1167483")

95

## 2. Parse and Structure Filing Data

In [129]:
# Matches the first information-table element, case-insensitively, with or
# without a namespace prefix on the tag (e.g. <ns1:informationTable ...>).
_INFO_TABLE_PATTERN = re.compile(
    r"<(?:[\w.-]+:)?INFORMATIONTABLE[\s\S]*?</(?:[\w.-]+:)?INFORMATIONTABLE\s*>",
    re.IGNORECASE,
)


def extract_info_table_xml(txt_file_path):
    """
    Extracts the information-table XML string from the full .txt filing.

    The search is case-insensitive and accepts an optional namespace prefix on
    the opening and closing tags, so both <informationTable> and
    <ns1:informationTable> documents are found. Only the first table in the
    file is returned.

    Args:
        txt_file_path: Path to the filing text file to scan.

    Returns:
        The text from the opening tag through the first matching closing tag,
        or None when the file contains no information table.
    """
    # Undecodable bytes are dropped (errors='ignore') so one stray byte in the
    # submission does not abort the whole extraction.
    with open(txt_file_path, 'r', encoding='utf-8', errors='ignore') as f:
        content = f.read()

    match = _INFO_TABLE_PATTERN.search(content)
    return match.group(0) if match else None

In [130]:
def _local_name(tag):
    """Returns an element tag without its '{namespace}' qualifier."""
    return tag.rsplit('}', 1)[-1].strip()


def _collect_fields(element, prefix, row):
    """Stores the stripped text of every descendant of element in row, keyed by its underscore-joined path."""
    for child in element:
        name = _local_name(child.tag)
        key = f"{prefix}_{name}" if prefix else name
        row[key] = child.text.strip() if child.text else ''
        # Descend so values held in nested elements are not lost.
        _collect_fields(child, key, row)


def parse_info_table_xml(xml_string):
    """
    Parses the extracted XML string into a list of holdings as a pandas DataFrame.

    Each direct child of the root whose local tag name is "infoTable" becomes
    one row. Tags are compared by local name, so documents using a default
    namespace, a prefixed namespace, or no namespace are all handled.

    Every direct child of an infoTable yields a column named after its tag
    (a container element with no text of its own yields ''). Elements nested
    below that yield additional columns named "<parent>_<child>", e.g.
    "shrsOrPrnAmt_sshPrnamt", so their values are kept as well.

    Args:
        xml_string: The information-table XML text. None or a blank string is
            treated as "no holdings".

    Returns:
        A DataFrame with one row per infoTable element and string-valued
        columns; empty when there is no input or no infoTable element.

    Raises:
        xml.etree.ElementTree.ParseError: If xml_string is not well-formed XML.
    """
    if not xml_string or not xml_string.strip():
        return pd.DataFrame()

    root = ET.fromstring(xml_string)

    data = []
    for info in root:
        if _local_name(info.tag) != "infoTable":
            continue
        row = {}
        _collect_fields(info, '', row)
        data.append(row)

    return pd.DataFrame(data)

In [131]:
# Two-digit accession years at or above this value are read as 19xx instead of
# 20xx. NOTE(review): 90 is a chosen cut-off covering 1990s filings; adjust if
# older or far-future suffixes ever need to be handled.
_ACCESSION_CENTURY_PIVOT = 90


def filter_2016(folder_name):
    """
    Reports whether a filing folder is dated after 2016.

    The year is taken from the first two-digit group enclosed in hyphens in
    the folder name (e.g. the "20" in "0000919574-20-007148"). Suffixes at or
    above the century pivot are treated as 19xx so that 1990s filings are not
    mistaken for 209x ones.

    Args:
        folder_name: Name of the filing folder.

    Returns:
        True when the derived year is strictly greater than 2016; False for
        2016 and earlier, or when the name has no "-NN-" group.
    """
    match = re.search(r'-([0-9]{2})-', folder_name)
    if not match:
        return False

    year_suffix = int(match.group(1))
    century = 1900 if year_suffix >= _ACCESSION_CENTURY_PIVOT else 2000
    return century + year_suffix > 2016

In [135]:
# Walk every downloaded filing folder, keep those dated after 2016, and collect
# the holdings parsed from each filing's .txt submission into one DataFrame.
folder_path = "./sec-edgar-filings/0001167483/13F-HR/"
holdings = []

for folder in os.listdir(folder_path):
    if not filter_2016(folder):
        continue

    access_folder_path = os.path.join(folder_path, folder)
    # A stray plain file whose name passes the filter would make os.listdir raise.
    if not os.path.isdir(access_folder_path):
        continue

    for file in os.listdir(access_folder_path):
        if file.endswith(".txt"):
            file_path = os.path.join(access_folder_path, file)
            print(f"Processing: {file_path}")

            xml_str = extract_info_table_xml(file_path)
            # Filings without an information table are skipped.
            if xml_str:
                df = parse_info_table_xml(xml_str)
                # Tag each row with where it came from so filings stay distinguishable.
                df["source_file"] = file
                df["accession"] = folder
                holdings.append(df)

# pd.concat raises ValueError on an empty list, so fall back to an empty frame
# when no filing produced a table.
if holdings:
    master_holdings_df = pd.concat(holdings, ignore_index=True)
else:
    print("No information tables found; master_holdings_df is empty.")
    master_holdings_df = pd.DataFrame()

Processing: ./sec-edgar-filings/0001167483/13F-HR/0000919574-20-007148/full-submission.txt
Processing: ./sec-edgar-filings/0001167483/13F-HR/0000919574-24-004713/full-submission.txt
Processing: ./sec-edgar-filings/0001167483/13F-HR/0000919574-18-001706/full-submission.txt
Processing: ./sec-edgar-filings/0001167483/13F-HR/0000919574-22-006727/full-submission.txt
Processing: ./sec-edgar-filings/0001167483/13F-HR/0000919574-20-005337/full-submission.txt
Processing: ./sec-edgar-filings/0001167483/13F-HR/0000919574-18-005626/full-submission.txt
Processing: ./sec-edgar-filings/0001167483/13F-HR/0000919574-23-001481/full-submission.txt
Processing: ./sec-edgar-filings/0001167483/13F-HR/0000919574-21-007099/full-submission.txt
Processing: ./sec-edgar-filings/0001167483/13F-HR/0000919574-25-003217/full-submission.txt
Processing: ./sec-edgar-filings/0001167483/13F-HR/0000919574-19-001612/full-submission.txt
Processing: ./sec-edgar-filings/0001167483/13F-HR/0000919574-24-001349/full-submission.txt