In [13]:
import os
import re
import glob
import logging
import datetime
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
from fuzzywuzzy import process

'''
TODO
- separate issuer and investment
- make sure each issuer has a company name

June 30 2021

Start with BlackRock Capital Investment Corp (BKCC, cik==1326003)
'''
# https://www.sec.gov/robots.txt
def get_standard_name(col, choices, score_cutoff=60):
    """Map ``col`` onto its closest entry in ``choices``.

    ``process.extractOne`` is asked for the best candidate and its score;
    the candidate is returned only when the score is strictly greater than
    ``score_cutoff``. Otherwise ``col`` is handed back unchanged.
    """
    candidate, similarity = process.extractOne(col, choices)
    return candidate if similarity > score_cutoff else col

def stopping_criterion(qtr:str)->str:
    """Return the regex that marks the last row of interest in a schedule.

    ``qtr`` is accepted but currently ignored: the same pattern is returned
    for every quarter.
    """
    # Alternatives that were tried and are currently disabled:
    #   qtr == '2023-12-31' -> r'Total\s*Cash\s*Equivalents'
    #   otherwise           -> r'Total\s*Investments'
    marker = r'Total\s*Non-Control/Non-Affiliate\s*Investments'
    return marker
    
def concat(*dfs)->list:
    """Return the rows of every frame in ``dfs`` as one flat list of lists.

    Rows keep the order of the frames and, within a frame, the row order of
    ``frame.values``.
    """
    return [record for frame in dfs for record in frame.values.tolist()]

def common_subheaders()->tuple:
    """Return the known industry sub-header names as regex fragments.

    Every space in a name is replaced by ``\\s*`` so the fragment tolerates
    missing or repeated whitespace. No other character is escaped, so
    parentheses in a name keep their regex meaning.
    """
    industry_names = (
        'Advertising, Public Relations and Marketing ',
        'Air Transportation',
        'Amusement and Recreation',
        'Apparel Manufacturing',
        'Building Equipment Contractors',
        'Business Support Services',
        'Chemicals',
        'Communications Equipment Manufacturing',
        'Credit Related Activities',
        'Computer Systems Design and Related Services',
        'Credit (Nondepository)',
        'Data Processing and Hosting Services',
        'Educational Support Services',
        'Electronic Component Manufacturing',
        'Equipment Leasing',
        'Facilities Support Services',
        'Grocery Stores',
        'Hospitals',
        'Insurance',
        'Lessors of Nonfinancial Licenses',
        'Management, Scientific, and Technical Consulting Services',
        'Motion Picture and Video Industries',
        'Other Information Services',
        'Other Manufacturing',
        'Other Publishing',
        'Other Real Estate Activities',
        'Other Telecommunications',
        'Plastics Manufacturing',
        'Radio and Television Broadcasting',
        'Real Estate Leasing',
        'Restaurants',
        'Retail',
        'Satellite Telecommunications',
        'Scientific Research and Development Services',
        'Texttile Furnishings Mills',
        'Traveler Arrangement',
        'Software Publishing',
        'Utility System Construction',
        'Wholesalers',
        'Wired Telecommunications Carriers',
        'Wireless Telecommunications Carriers',
    )
    return tuple(name.replace(' ', r'\s*') for name in industry_names)

def standard_field_names()->tuple:
    """Return the canonical column names used as fuzzy-matching targets."""
    canonical = (
        'Portfolio Company / Type of Investment',
        'portfolio company',
        'business description',
        'type of investment',
        'investment date',
        'index rate',
        'shares/ units',
        'total rate',
        'reference rate and spread',
        'pik rate',
        'maturity date',
        'principal',
        'cost',
        'footnotes',
        'industry',
        # TODO change standard names for more dynamic fuzzywuzzy matching
        'principal amount',
        'fair value',
    )
    return canonical

def company_control_headers()->tuple:
    """Return the section headings of a schedule as regex fragments.

    Spaces are replaced by ``\\s*``; every other character (including
    parentheses, ``.`` and ``%``) is passed through unescaped.
    """
    section_titles = (
        'Debt Investments',
        'Debt Investments (82.23%)',
        'Debt Investments (A)',
        'Debt Investments (continued)',
        'Equity Securities',
        'Equity Securities (continued)',
        'Cash and Cash Equivalents',
    )
    return tuple(title.replace(' ', r'\s*') for title in section_titles)

def exceptions()->list:
    """Return the schedule files that must be skipped.

    Paths are built with ``os.path.join`` so they compare equal to the
    ``os.path.join(qtr, file)`` paths that ``main`` filters against on every
    platform. The previous hard-coded ``'\\'`` separator relied on the
    invalid escape ``\\S`` and could only ever match on Windows.
    """
    return [
        os.path.join('2022-09-30', 'Schedule_of_Investments_0.csv'),
    ]

def except_rows()->tuple:
    """Return the phrases that identify boilerplate rows to discard.

    ``main`` joins these with ``'|'`` into a single pattern; ``_clean`` then
    drops every row in which any cell contains a case-insensitive match.

    The return annotation used to say ``list`` although a tuple has always
    been returned; the annotation now matches the value.
    """
    return (
        'As of',
        'HMS Income Fund',
        'schedule of investments',
    )
    
def strip_string(
    columns_names:list,
    standardize:bool=False
)->tuple:
    """Turn raw column labels into underscore-separated names.

    Args:
        columns_names: Raw labels. Non-string entries (for example NaN or
            numbers taken from a data row) are converted with ``str`` first;
            previously they made ``re.sub`` raise ``TypeError``.
        standardize: When true, each label is first replaced by its closest
            entry of ``standard_field_names()`` via ``get_standard_name``.

    Returns:
        A tuple with one name per input label, every run of whitespace
        replaced by a single ``'_'``.
    """
    labels = (str(col) for col in columns_names)
    if standardize:
        standard_fields = standard_field_names()
        labels = (get_standard_name(label, standard_fields) for label in labels)
    return tuple(re.sub(r'\s+', '_', label) for label in labels)

def get_key_fields(
    df_cur:pd.DataFrame,
)->tuple:
    """Locate the header row of ``df_cur`` and derive column names from it.

    Rows are scanned top-down. A row qualifies as the header when one of its
    non-null cells, lower-cased, contains an entry of
    ``standard_field_names() + common_subheaders()`` as a substring and the
    row holds at least four distinct non-null values. The column names are
    then built by joining, per column, the non-null cells of every row up to
    and including that one, and standardising the result.

    NOTE(review): the test is a plain substring check on lower-cased text,
    so keys containing capitals or ``\\s*`` fragments cannot match.

    Returns:
        ``(names, row_position)``. When no row qualifies, the first row is
        used as-is and the position is ``0``.
    """
    keywords = standard_field_names() + common_subheaders()
    for position, (_, record) in enumerate(df_cur.iterrows()):
        cells = record.dropna().tolist()
        found = False
        for cell in cells:
            text = str(cell).lower()
            if any(keyword in text for keyword in keywords):
                found = True
                break
        if not found or len(set(cells)) < 4:
            continue
        header_rows = df_cur.iloc[:position + 1]
        cols = header_rows.apply(lambda column: ' '.join(column.dropna()), axis=0).tolist()
        return strip_string(cols, standardize=True), position
    # Fallback: `found` still carries the verdict of the last scanned row.
    return strip_string(df_cur.iloc[0].tolist(), standardize=found), 0


# Function to extract date and convert to datetime object
def extract_date(file_path):
    """Return the first ``YYYY-MM-DD`` date found in ``file_path``.

    Args:
        file_path: Path text that is expected to contain a date.

    Returns:
        The date as a ``datetime.datetime`` at midnight.

    Raises:
        ValueError: If no ``YYYY-MM-DD`` substring is present, or the digits
            do not form a valid calendar date. A missing date used to raise
            an ``AttributeError`` on ``None`` that did not name the path.
    """
    match = re.search(r'\d{4}-\d{2}-\d{2}', file_path)
    if match is None:
        raise ValueError(f'no YYYY-MM-DD date found in path: {file_path!r}')
    return datetime.datetime.strptime(match.group(), '%Y-%m-%d')


def merge_duplicate_columns(
    df:pd.DataFrame,
    merged_pair_idxs:dict=None
)->tuple:
    """Collapse columns that share a name into one text column per name.

    For every name the cells of the same-named columns are converted to
    strings, de-duplicated in first-seen order and joined with a space. The
    merged column is appended under that name after the originals are
    removed, so every processed column ends up holding strings.

    Args:
        df: Frame whose columns may contain repeated names.
        merged_pair_idxs: Mapping ``name -> boolean column mask`` recorded by
            an earlier call. When empty or omitted, the masks are computed
            from ``df`` and stored in the mapping. When non-empty, the stored
            masks are replayed, so ``df`` must have the same column layout as
            the frame they were recorded from.

    Returns:
        ``(merged_frame, merged_pair_idxs)``; the frame has a fresh
        ``RangeIndex``.

    Fixes over the previous version: the default mapping is no longer a
    shared mutable ``{}`` (masks leaked between unrelated calls), and the
    joined text no longer depends on ``set`` iteration order.
    """
    if merged_pair_idxs is None:
        merged_pair_idxs = {}
    record_masks = not merged_pair_idxs
    col_names = df.columns.unique() if record_masks else list(merged_pair_idxs)
    for col_name in col_names:
        if record_masks:
            # The mask refers to the frame as it looks *now*, i.e. after the
            # previous names were already merged and moved to the end.
            mask = df.columns == col_name
            merged_pair_idxs[col_name] = mask
        else:
            mask = merged_pair_idxs[col_name]
        duplicate_data = df.loc[:, mask]
        merged_data = duplicate_data.apply(
            lambda row: ' '.join(dict.fromkeys(row.dropna().astype(str))),
            axis=1,
        )
        # Copy so the assignment below targets an independent frame.
        df = df.loc[:, ~mask].copy()
        df[col_name] = merged_data
    return df.reset_index(drop=True), merged_pair_idxs

def extract_subheaders(
    df:pd.DataFrame,
    control:bool,
)->pd.DataFrame:
    """Label every row of ``df`` with the sub-header section it falls under.

    Args:
        df: Schedule frame; a new column is added to it in place.
        control: When true the labels go to ``'company_control'`` and are
            matched against ``company_control_headers()``; otherwise they go
            to ``'Type_of_Investment'`` and are matched against
            ``common_subheaders()``.

    Returns:
        ``df`` itself. If the target column already exists it is returned
        untouched; if no header row is found the new column is all ``None``.
    """
    col_name = 'company_control' if control else 'Type_of_Investment'
    if col_name in df.columns:
        return df
    # Rows whose first cell matches any of the header patterns (case-insensitive).
    include = df.apply(
        lambda row: re.search('|'.join(company_control_headers() if control else common_subheaders()), str(row[0]), re.IGNORECASE) is not None,
        axis=1
    )  
    
    # Despite the name, this is True for rows that contain NONE of the listed
    # tokens in any cell. The match is a case-insensitive substring search, so
    # 'Co' also hits words such as 'continued'.
    exclude = ~df.apply(
        lambda row: row.astype(str).str.contains('total|Inc|Ltd|LLC|Holdings|LP|Co|Corporation', case=False, na=False).any(),
        axis=1
    )
    idx = df[include & exclude].index.tolist()
    df[col_name] = None
    if not idx:
        return df

    # NOTE(review): `idx` holds index labels but is also passed to `.iloc`
    # below, which presumably relies on a default RangeIndex - confirm.
    prev_header = subheader = None
    # Everything from the last header row onwards gets that header's text. The
    # second cell is used when the first one is a float (presumably NaN).
    df.loc[idx[-1]:,col_name] = df.iloc[idx[-1],1] if isinstance(df.iloc[idx[-1],0],float)  else df.iloc[idx[-1],0]
    for j,i in enumerate(idx[:-1]):
        prev_header = subheader
        subheader = df.iloc[i,1] if isinstance(df.iloc[i,0],float)  else df.iloc[i,0]
        # `.loc` slices include both ends, so the next header row is written
        # here too and then overwritten by its own iteration / the line above.
        df.loc[idx[j]:idx[j+1],col_name] = subheader if subheader != '' else prev_header
    return df


def remove_row_duplicates(row:pd.Series)->pd.Series:
    """Blank out repeated values within a row.

    The first occurrence of each value is kept; any later value already
    present in the output built so far is replaced by ``np.nan``. The result
    is a new Series with a default integer index.
    """
    deduped = []
    for value in row:
        deduped.append(np.nan if value in deduped else value)
    return pd.Series(deduped)

  
def _clean(
    file_path:str,
    except_rows:str,
    merged_pair_idxs:dict=None,
)->tuple:
    """Load one schedule CSV and normalise it into a tidy frame.

    Steps: read the file, turn zero-width spaces / blank cells into NaN, drop
    empty rows and rows matching ``except_rows``, name the columns (first
    file only), merge same-named columns, blank repeated text within a row,
    and drop columns that are empty or hold at most three values.

    Args:
        file_path: CSV to read; its first column is used as the index.
        except_rows: Regex; rows with any cell containing a case-insensitive
            match are removed.
        merged_pair_idxs: Column masks returned by the call for the first
            file of the same quarter. When empty or omitted, the header is
            detected with ``get_key_fields`` and fresh masks are recorded.

    Returns:
        ``(frame, merged_pair_idxs)``.

    The default used to be a shared ``{}`` that ``merge_duplicate_columns``
    fills in place, so a second call relying on the default silently skipped
    header detection and replayed stale masks. A fresh dict is now created
    per call.
    """
    if merged_pair_idxs is None:
        merged_pair_idxs = {}
    df = pd.read_csv(file_path,index_col=0,na_values=[' ', ''])
    df.replace(['\u200b',None, r'^\s*$'],np.nan,regex=True,inplace=True) #':','$','%'
    df.dropna(axis=0,how='all',inplace=True)

    boilerplate = df.apply(
        lambda row: row.astype(str).str.contains(except_rows, case=False, na=False).any(),
        axis=1,
    )
    df = df[~boilerplate].copy()
    if not merged_pair_idxs:
        # NOTE(review): the header row position is returned but not used, so
        # the header row itself stays in the data - confirm this is intended.
        important_fields, _ = get_key_fields(df)
        df.columns = important_fields
    df, merged_pair_idxs = merge_duplicate_columns(df, merged_pair_idxs=merged_pair_idxs)

    # Rows in which more than one non-numeric cell repeats an earlier cell.
    duplicate_idx = df.apply(
        lambda row: row[pd.to_numeric(row, errors='coerce').isna()].duplicated().sum() > 1,
        axis=1,
    )
    clean_rows = df.loc[duplicate_idx].apply(remove_row_duplicates, axis=1).reset_index(drop=True)
    j = 0
    for i, flag in enumerate(duplicate_idx):
        if not flag:
            continue
        # Positional write-back: clean_rows is numbered 0..k-1 in row order.
        df.iloc[i,:] = clean_rows.loc[j,:]
        j += 1
    df.replace([r'^\s*$'],np.nan,regex=True,inplace=True) #':','$','%'
    df.dropna(axis=1,how='all',inplace=True)

    sparse_columns = (~df.isna()).sum(axis=0) <= 3
    df.drop(columns=df.columns[sparse_columns],inplace=True)
    return df.reset_index(drop=True), merged_pair_idxs
    
def _has_stop_marker(df:pd.DataFrame, qtr:str)->bool:
    """Return True when any cell of ``df`` matches ``stopping_criterion(qtr)``."""
    pattern = stopping_criterion(qtr)
    hits = df.apply(
        lambda row: row.astype(str).str.contains(pattern, case=False, na=False).any(),
        axis=1,
    )
    return bool(hits.to_numpy().any())


def main()->None:
    """Build the per-quarter schedule CSV and the combined table.

    For the first quarter directory that contains
    ``Schedule_of_Investments_0.csv`` the numbered CSV files are cleaned in
    order until one contains the stopping marker; the pieces are stacked and
    written to ``<qtr>/output/<qtr>.csv``. Afterwards every
    ``*/output/*.csv`` is concatenated in date order into
    ``<cik>_soi_table.csv``, where ``cik`` is a module-level global.

    Fixes over the previous version:
      * the sparse-column ``drop`` result was discarded; it is now applied;
      * running out of files before the marker raised ``IndexError``; the
        scan now stops at the last file and reports it;
      * an empty file list or an empty output glob no longer crashes.
    """
    qtrs = os.listdir('.')
    ex = exceptions()
    ex_rows = '|'.join(except_rows())
    for qtr in qtrs:
        if '.csv' in qtr or not os.path.exists(os.path.join(qtr,'Schedule_of_Investments_0.csv')):
            continue
        # qtr = '2016-06-30'
        print(qtr)

        # Order the files by the integer suffix in "..._<n>.csv".
        soi_files = sorted(
            (os.path.join(qtr,file) for file in os.listdir(qtr) if file.endswith('.csv')),
            key=lambda f: int(f.split('_')[-1].split('.')[0]),
        )
        soi_files = [f for f in soi_files if f not in ex]
        if not soi_files:
            print(f'{qtr}: no schedule files left after exclusions, skipping')
            continue

        merged_pair_idxs = {}
        dfs = []
        found_stop = False
        for position, soi_file in enumerate(soi_files):
            if position:
                print(soi_file)
            # The masks recorded for the first file are replayed on the rest.
            df,merged_pair_idxs = _clean(soi_file,except_rows=ex_rows,merged_pair_idxs=merged_pair_idxs)
            dfs.append(df)
            if _has_stop_marker(df, qtr):
                found_stop = True
                break
        if not found_stop:
            print(f'{qtr}: stopping marker not found, used all {len(soi_files)} files')

        date_final = dfs[0]
        if len(dfs) > 1:
            date_final = pd.concat(dfs,axis=0,ignore_index=True)
        # date_final = extract_subheaders(date_final,control=True)
        # date_final = extract_subheaders(date_final,control=False)

        # Drop near-empty columns before the constant 'qtr' column is added.
        columns_to_drop = date_final.notna().sum() <= 2
        date_final = date_final.drop(columns=columns_to_drop[columns_to_drop].index)
        date_final['qtr'] = qtr.split('\\')[-1]

        output_dir = os.path.join(qtr,'output')
        os.makedirs(output_dir, exist_ok=True)
        date_final.to_csv(os.path.join(output_dir,f'{qtr}.csv'),index=False)
        # NOTE(review): only the first matching quarter is processed per run;
        # this looks like a debugging leftover - confirm before removing.
        break

    # Use glob to find files
    files = sorted(glob.glob('*/output/*.csv'), key=extract_date)
    if not files:
        print('no */output/*.csv files found, nothing to combine')
        return
    single_truth = pd.concat([
        pd.read_csv(path) for path in files
    ],axis=0,ignore_index=True)
    unnamed = ['Unnamed' in col for col in single_truth.columns]
    single_truth.drop(columns=single_truth.columns[unnamed],inplace=True)
    single_truth.to_csv(f'{cik}_soi_table.csv',index=False)
    
    
# from utils import init_logger
import warnings

# Suppress future warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
# Also silence pandas' chained-assignment warning.
warnings.filterwarnings('ignore', category=pd.errors.SettingWithCopyWarning)
# init_logger()
# `main` reads this global when naming the combined output file, so it must be
# assigned before the call below.
cik = 1535778
main()



2012-06-30


0           Portfolio Company / Type of Investment  (1)
1     Academy, Ltd, LIBOR Plus 4.50%, Current Coupon...
2     Ameritech College Operations, LLC, 18% Secured...
3     CHMB, Inc., 12% Secured Debt (Maturity – Octob...
4     Ipreo Holdings LLC, LIBOR Plus 6.50%, Current ...
5     IRTH Holdings, Inc., 12% Secured Debt (Maturit...
6     Metropolitan Health Networks, Inc., LIBOR Floo...
7     Multiplan, Inc., LIBOR Plus 3.25%, Current Cou...
8     NAPCO Precast, LLC, 18.00% Secured Debt (Matur...
9     National Healing Corporation, LIBOR Plus 6.75%...
10    NRI Clinical Research, LLC, 14.00% Secured Deb...
11    Pacific Architects and Engineers Incorporated,...
12    Phillips Plastic Corporation, LIBOR Plus 5.00%...
13    Principle Environmental, LLC, 12.00% Secured D...
14    Ulterra Drilling Technologies, L.P., LIBOR Flo...
15    UniTek Global Services, Inc., LIBOR Plus 7.50%...
16    VFH Parent LLC, LIBOR Plus 6.00%, Current Coup...
17    Visant Corporation, Base Rate of 1.25% Plu

In [115]:
# File inspected by this scratch cell (path written with a Windows separator).
file_path = r"2022-09-30\Schedule_of_Investments_0.csv"

def stopping_criterion(qtr:str)->str:
    """Return the regex that marks the last row of interest in a schedule.

    ``qtr`` is accepted but currently ignored: the same pattern is returned
    for every quarter.
    """
    # Alternatives that were tried and are currently disabled:
    #   qtr == '2023-12-31' -> r'Total\s*Cash\s*Equivalents'
    #   otherwise           -> r'Total\s*Investments'
    marker = r'Total\s*Non-Control/Non-Affiliate\s*Investments'
    return marker


def _clean(
    file_path:str,
    merged_pair_idxs:dict={},
)->pd.DataFrame:
    """Debug stub of the cleaner: load the CSV and drop empty rows only.

    ``merged_pair_idxs`` is accepted but not used. Despite the annotation,
    the return value is the pair ``(frame, None)``. The remaining cleaning
    steps are kept below as comments.
    """
    df = pd.read_csv(file_path,index_col=0,na_values=[' ', ''])
    # Treat zero-width spaces, None and whitespace-only cells as missing.
    df.replace(['\u200b',None, r'^\s*$'],np.nan,regex=True,inplace=True) #':','$','%'
    df.dropna(axis=0,how='all',inplace=True)
    return df ,None
    # df = df[~df.apply(lambda row:row.astype(str).str.match(regex_pattern).all(),axis=1)]
    # if not merged_pair_idxs:
    #     important_fields,idx = get_key_fields(df)
    #     df.columns = important_fields
    # df,merge_pair_idxs = merge_duplicate_columns(df,merged_pair_idxs=merged_pair_idxs)
    # duplicate_idx = df.apply(lambda row:row[pd.to_numeric(row,errors='coerce').isna()].duplicated().sum() > 1 ,axis=1)
    # clean_rows = df.loc[duplicate_idx].apply(remove_row_duplicates, axis=1).reset_index(drop=True)
    # j = 0
    # for i,flag in enumerate(duplicate_idx):
    #     if not flag:
    #         continue
    #     df.iloc[i,:] = clean_rows.loc[j,:]
    #     j += 1
    # df.replace([r'^\s*$'],np.nan,regex=True,inplace=True) #':','$','%'
    # df.dropna(axis=1,how='all',inplace=True)
    
    # columns = (~df.isna()).sum(axis=0) <= 3 
    # df.drop(columns=df.columns[columns],inplace=True)
    # return df.reset_index(drop=True),merge_pair_idxs


# Load the file with the stub cleaner and show the frame and the mask value
# it returned (None for the stub).
df,merged_pair_idxs = _clean(file_path)
display(df)
display(merged_pair_idxs)
# Flag each row in which at least one cell matches the stopping pattern.
index_list = df.apply(
    lambda row:row.astype(str).str.contains(stopping_criterion(None), case=False, na=False).any(),
    axis=1
)
# Number of flagged rows; the bare name below echoes it as the cell result.
index_list_sum = index_list.sum()
index_list_sum

Unnamed: 0,0,1,2,3,4,5,6,7,8
1,PART I,PART I,PART I,PART I,PART I,PART I,,,
2,FINANCIAL INFORMATION,FINANCIAL INFORMATION,FINANCIAL INFORMATION,FINANCIAL INFORMATION,FINANCIAL INFORMATION,FINANCIAL INFORMATION,,,
3,Item 1.,Item 1.,Item 1.,Consolidated Financial Statements,Consolidated Financial Statements,Consolidated Financial Statements,,,
4,,,,"Consolidated Balance Sheets— September 30, 2...","Consolidated Balance Sheets— September 30, 2...","Consolidated Balance Sheets— September 30, 2...",1.0,1.0,1.0
5,,,,Consolidated Statements of Operations (unaudit...,Consolidated Statements of Operations (unaudit...,Consolidated Statements of Operations (unaudit...,2.0,2.0,2.0
6,,,,Consolidated Statements of Changes in Net Asse...,Consolidated Statements of Changes in Net Asse...,Consolidated Statements of Changes in Net Asse...,3.0,3.0,3.0
7,,,,Consolidated Statements of Cash Flows (unaudit...,Consolidated Statements of Cash Flows (unaudit...,Consolidated Statements of Cash Flows (unaudit...,4.0,4.0,4.0
8,,,,Consolidated Schedule of Investments (unaudite...,Consolidated Schedule of Investments (unaudite...,Consolidated Schedule of Investments (unaudite...,5.0,5.0,5.0
9,,,,Consolidated Schedule of Investments—December ...,Consolidated Schedule of Investments—December ...,Consolidated Schedule of Investments—December ...,28.0,28.0,28.0
10,,,,Notes to Consolidated Financial Statements (un...,Notes to Consolidated Financial Statements (un...,Notes to Consolidated Financial Statements (un...,45.0,45.0,45.0


None

0

In [100]:
# Scratch cell: run the cleaner on one continuation file, reusing the column
# masks already held in `merged_pair_idxs`. The path is a raw string because
# "\S" is not a valid escape sequence in a normal string literal.
# NOTE(review): the second argument is positional, so it only reaches the
# masks parameter of a two-parameter `_clean(file_path, merged_pair_idxs)`.
df,merged_pair_idxs = _clean(r"2013-09-30\Schedule_of_Investments_1.csv",merged_pair_idxs)
# df = pd.read_csv("2013-09-30\Schedule_of_Investments_1.csv",index_col=0,na_values=[' ', ''])

# df.replace(['\u200b',None, r'^\s*$'],np.nan,regex=True,inplace=True) #':','$','%'
# df.dropna(axis=0,how='all',inplace=True)
# df,merge_pair_idxs = merge_duplicate_columns(df,merged_pair_idxs=merged_pair_idxs)

df

Unnamed: 0,Portfolio_Company_/_Type_of_Investment,industry,principal_amount,cost,fair_value
0,"Polyconcept North America Holdings, Inc., LIB...",Promotional Products to Corporations and Cons...,979.0,969.0,974.0
1,"Relativity Media, LLC, 10.00% Secured Debt (M...",Full-scale Film and Television Production and...,976.0,976.0,976.0
2,"SCE Partners, LLC, LIBOR Plus 7.25%, Current ...","Hotel and Casino in Sioux City, IA",1000.0,990.0,995.0
3,"Sotera Defense Solutions, Inc., LIBOR Plus 6....",Defense Industry Intelligence Services,952.0,920.0,880.0
4,"Sutherland Global Services, Inc., LIBOR Plus ...",Business Process Outsourcing Provider,975.0,956.0,974.0
5,"Synagro Infrastructure Company, Inc., LIBOR P...",Waste Management Services,1000.0,980.0,988.0
6,"Tervita Corporation, LIBOR Plus 5.00%, Curren...",Oil and Gas Environmental Services,498.0,493.0,488.0
7,"Therakos, Inc., LIBOR Plus 6.25%, Current Cou...",Immune System Disease Treatment,993.0,965.0,994.0
8,"Universal Fiber Systems, LLC, LIBOR plus 5.75...",Manufacturer of Synthetic Fibers,1753.0,1729.0,1762.0
9,"Vantage Oncology, Inc., 9.50% Secured Bond, (...",Outpatient Radiation Oncology Treatment Centers,1000.0,1000.0,1009.0
