In [72]:
import os
import re
import glob
import datetime
import warnings
import pandas as pd
import numpy as np
from fuzzywuzzy import process

import sys
import warnings
sys.path.insert(0, '../') 
# from utils import init_logger

def common_subheaders()->tuple:
    return tuple(map(lambda header:header.replace(' ', r'\s*'),
        ('Advertising, Public Relations and Marketing ',
        'Air Transportation',
        'Amusement and Recreation',
        'Apparel Manufacturing',
        'Building Equipment Contractors',
        'Business Support Services',
        'Chemicals',
        'Communications Equipment Manufacturing',
        'Credit Related Activities',
        'Computer Systems Design and Related Services',
        'Credit (Nondepository)',
        'Data Processing and Hosting Services',
        'Educational Support Services',
        'Electronic Component Manufacturing',
        'Equipment Leasing',
        'Facilities Support Services',
        'Grocery Stores',
        'Hospitals',
        'Insurance',
        'Lessors of Nonfinancial Licenses',
        'Management, Scientific, and Technical Consulting Services',
        'Motion Picture and Video Industries',
        'Other Information Services',
        'Other Manufacturing',
        'Other Publishing',
        'Other Real Estate Activities',
        'Other Telecommunications',
        'Plastics Manufacturing',
        'Radio and Television Broadcasting',
        'Real Estate Leasing',
        'Restaurants',
        'Retail',
        'Satellite Telecommunications',
        'Scientific Research and Development Services',
        'Texttile Furnishings Mills',
        'Traveler Arrangement',
        'Software Publishing',
        'Utility System Construction',
        'Wholesalers',
        'Wired Telecommunications Carriers',
        'Wireless Telecommunications Carriers',
        )
    ))


def standard_field_names()->tuple:
    return (
        'Portfolio Company',
        'Portfolio Company /Principal Business',
        'Investment /Interest Rate /Maturity',
        'Principal',
        'Cost',
        'Value',
        'Percent of Class Held',
        'Investment',
        'CDO Fund Investments',
        'Percent of Interests Held',
        'Industry',
        'Spread Above Index',
        'Aquisition Date',
        'Interest Rate',
        'Maturity',
        'Principal/Shares',
        'Investment Type',
        'of Net Assets',
        'business description',
        'type of investment',
        'investment date',
        'reference rate and spread',
        'pik rate',
        'maturity date',
        'cost',
        'footnotes',
        'industry',
        'principal amount', # TODO change stand names for more dynamic fuzzywuzzy matching
        'fair value',
    )

    
def company_control_headers()->tuple:
    return tuple(map(lambda header:header.replace(' ', r'\s*'),
        (
        'Debt Investments',
        'Debt Investments (82.23%)',
        'Debt Investments (A)',
        'Debt Investments (continued)',
        'Equity Securities',
        'Equity Securities (continued)',
        'Cash and Cash Equivalents',
        )
    ))

def exceptions()->tuple:
    return (
        '2006-12-31\\Schedule_of_Investments_1.csv', 
        '2006-12-31\\Schedule_of_Investments_3.csv', 
        '2008-03-31\\Schedule_of_Investments_7.csv',
        '2008-03-31\\Schedule_of_Investments_8.csv',  
        '2008-12-31\\Schedule_of_Investments_6.csv',
        '2008-12-31\\Schedule_of_Investments_11.csv',
        '2008-12-31\\Schedule_of_Investments_13.csv'
    )

def except_rows()->tuple:
    return (
        'Timet',
    )

# https://www.sec.gov/robots.txt
def get_standard_name(col, choices, score_cutoff=60):
    best_match, score = process.extractOne(col, choices)
    if score > score_cutoff:
        return best_match
    return col

def stopping_criterion(qtr:str)->str:
    return '{}'.format(r'Total\s*Investments')

 
def concat(*dfs)->list:
    final = []
    for df in dfs:
        final.extend(df.values.tolist())
    return final

    
def get_key_fields(
    df_cur:pd.DataFrame,
)->tuple:
    important_fields = standard_field_names() + common_subheaders()
    for idx,row in enumerate(df_cur.iterrows()):
        found = any(any(
            key in str(field).lower() 
            for key in important_fields)
                    for field in row[-1].dropna().tolist()
            )
        if found and len(set(row[-1].dropna().tolist())) >= 6:
            cols = df_cur.iloc[:idx + 1].apply(lambda row: ' '.join(row.dropna()), axis=0).tolist()
            fields = strip_string(cols,standardize=found) 
            return fields
    return strip_string(df_cur.iloc[0].tolist())

def strip_string(
    columns_names:list,
    standardize:bool=False
)->tuple:
    # columns = tuple(map(lambda col:re.sub(r'[^a-z]', '', str(col).lower()),columns_names))
    if standardize:
        standard_fields = standard_field_names()
        return tuple(
            re.sub(r'\s+', '_',get_standard_name(str(col),standard_fields)) for col in columns_names
        )
    return tuple(re.sub(r'\s+', '_',str(col)) for col in columns_names)


# Function to extract date and convert to datetime object
def extract_date(file_path):
    # Extract date from file path (assuming date is always in 'YYYY-MM-DD' format)
    date_str = re.search(r'\d{4}-\d{2}-\d{2}', file_path).group()
    return datetime.datetime.strptime(date_str, '%Y-%m-%d')


def merge_duplicate_columns(
    df:pd.DataFrame,
    merged_pair_idxs:dict={}
)->pd.DataFrame:
    duplicate_cols = merged_pair_idxs.keys()
    flag = not merged_pair_idxs.keys()
    if flag: 
        duplicate_cols = df.columns.unique() 
    for col_name in duplicate_cols:
        display(col_name)
        mask = merged_pair_idxs.get(col_name)
        if flag:
            mask = df.columns == col_name
            merged_pair_idxs[col_name] = mask
        duplicate_data = df.loc[:, mask]
        merged_data = duplicate_data.apply(lambda row: ' '.join(set(row.dropna().astype(str))), axis=1)
        df = df.loc[:, ~mask]
        df[col_name] = merged_data
        display(df)
    return df.reset_index(drop=True),merged_pair_idxs

def extract_subheaders(
    df:pd.DataFrame,
    control:bool,
)->pd.DataFrame:
    col_name = 'company_control' if control else 'Type_of_Investment'
    if col_name in df.columns:
        return df
    include = df.apply(
        lambda row: re.search('|'.join(company_control_headers() if control else common_subheaders()), str(row[0]), re.IGNORECASE) is not None,
        axis=1
    )  
    
    exclude = ~df.apply(
        lambda row: row.astype(str).str.contains('total|Inc|Ltd|LLC|Holdings|LP|Co|Corporation', case=False, na=False).any(),
        axis=1
    )
    idx = df[include & exclude].index.tolist()
    df[col_name] = None
    if not idx:
        return df

    prev_header = subheader = None
    df.loc[idx[-1]:,col_name] = df.iloc[idx[-1],1] if isinstance(df.iloc[idx[-1],0],float)  else df.iloc[idx[-1],0]
    for j,i in enumerate(idx[:-1]):
        prev_header = subheader
        subheader = df.iloc[i,1] if isinstance(df.iloc[i,0],float)  else df.iloc[i,0]
        df.loc[idx[j]:idx[j+1],col_name] = subheader if subheader != '' else prev_header
    return df


def remove_row_duplicates(row:pd.Series)->pd.Series: 
    out = []
    for v in row:
        if v in out and not str(v).replace('$','').isnumeric():
            out.append(np.nan)
        else:
            out.append(v)
    return pd.Series(out)

  

def _clean(
    file_path:str,
    except_rows:str,
    merged_pair_idxs:dict={},
)->pd.DataFrame:
    df = pd.read_csv(file_path,index_col=0,na_values=[' ', ''])
    # df.replace(['\xa0','\u200b',r'^\s$',r'^\s%'],np.nan,regex=True,inplace=True) #':','$','%' r'^\s*$',r'^\s*%'
    df.replace(['\xa0','\u200b',r'^\s$',r'^\s*%',' ','˄'],'',regex=True,inplace=True)
    df = df.applymap(lambda x: x.strip() if isinstance(x, str) else x)
    df.dropna(axis=0,how='all',inplace=True)
    df = df[~df.apply(lambda row:row.astype(str).str.contains(except_rows,case=False, na=False).any(),axis=1)]
    duplicate_idx = df.apply(lambda row:row[pd.to_numeric(row,errors='coerce').isna()].duplicated().sum() > 1 ,axis=1)
    clean_rows = df.loc[duplicate_idx].apply(remove_row_duplicates, axis=1).reset_index(drop=True)
    j = 0
    for i,flag in enumerate(duplicate_idx):
        if not flag:
            continue
        df.iloc[i,:] = clean_rows.loc[j,:]
        j += 1
    if not merged_pair_idxs:
        important_fields = strip_string(get_header_rows(df),standardize=True)#get_key_fields(df)
        df.columns = important_fields
    df,merge_pair_idxs = merge_duplicate_columns(df,merged_pair_idxs=merged_pair_idxs)

    df.replace([r'^\s*$'],np.nan,regex=True,inplace=True) #':','$','%'
    df.dropna(axis=1,how='all',inplace=True)
    
    columns = (~df.isna()).sum(axis=0) <= 4  if df.shape[0] > 12 else 1
    df.drop(columns=df.columns[columns],inplace=True)
    return df.reset_index(drop=True),merge_pair_idxs




def get_header_rows(
    df_cur:pd.DataFrame,
)->tuple:
    for idx,row in df_cur.iterrows():
        found = any(str(v).replace("$",'').replace("%",'').isnumeric() for v in row)
        if found:
            out = df_cur.iloc[:idx,:].apply(lambda row: ' '.join(row[row.notna()].values), axis=0)
            return out
    
    return strip_string(df_cur.iloc[0].tolist())


def main()->None:
    qtrs = os.listdir('.')
    ex = exceptions()
    ex_rows = '|'.join(except_rows())
    for qtr in qtrs:
        if '.csv' in qtr or not os.path.exists(os.path.join(qtr,f'Schedule_of_Investments_0.csv')):
            continue
        qtr = '2008-12-31'
        # logger.info(qtr)
        display(qtr)
        index_list_sum = i = 0
        soi_files = sorted([
            os.path.join(qtr,file) 
            for file in os.listdir(qtr)
            if file.endswith('.csv')
        ],key=lambda f: int(f.split('_')[-1].split('.')[0]))
        # soi_files = [f for f in soi_files] # if f not in ex]
        df,merged_pair_idxs = _clean(soi_files[i],except_rows=ex_rows,merged_pair_idxs={})
        if soi_files[i] in ex:
            merged_pair_idxs = {}

        index_list = df.apply(
            lambda row:row.astype(str).str.contains(stopping_criterion(qtr), case=False, na=False).any(),
            axis=1
        )
        index_list_sum = index_list.sum()
        dfs = [df]     
        i += 1

        while index_list_sum == 0:
            # logger.info(soi_files[i])
            soi_files[i] = r'2008-12-31\\Schedule_of_Investments_14.csv'
            if soi_files[i] == r'2008-12-31\\Schedule_of_Investments_14.csv':
                merged_pair_idxs = {
                    'Portfolio_Company_/Principal_Business': np.array([ True, False, False, False, False, False, False, False, False,False, False, False, False, False,False]),
                    '': np.array([ True, False,  True, True, False, True,  True, True, False,True,  True, True, False, True,False]),
                    'Investment': np.array([ True, False, False, False, False, False]),
                    'Percent_of_Interests_Held': np.array([ True,  False,  False, False, False, False]),
                    'Cost': np.array([True,  False, False, False, False, False]),
                    'Value': np.array([True,  False, False, False, False, False])
                }
            display(soi_files[i])


            # display(merged_pair_idxs)
            df,merged_pair_idxs = _clean(soi_files[i],except_rows=ex_rows,merged_pair_idxs=merged_pair_idxs if soi_files[i] not in ex else {})
            dfs.append(df)
            index_list = df.apply(
                lambda row:row.astype(str).str.contains(stopping_criterion(qtr), case=False, na=False).any(),
                axis=1
            )
            index_list_sum = index_list.sum()
            i += 1
            break
        date_final = dfs[0]
        if len(dfs) > 1:
            date_final = pd.concat(dfs,axis=0,ignore_index=True)#pd.DataFrame(concat(*dfs))
        # date_final = extract_subheaders(date_final,control=True)
        # date_final = extract_subheaders(date_final,control=False)

        date_final['qtr'] = qtr.split('\\')[-1]
        if not os.path.exists(os.path.join(qtr,'output')):
            os.makedirs(os.path.join(qtr,'output'))
        columns_to_drop = date_final.notna().sum() <= 2
        date_final.drop(columns=columns_to_drop[columns_to_drop].index)
        date_final.to_csv(os.path.join(qtr,'output',f'{qtr}.csv'),index=False)
        break
    
    # Use glob to find files
    files = sorted(glob.glob(f'*/output/*.csv'), key=extract_date)
    single_truth = pd.concat([
        pd.read_csv(df) for df in files
    ],axis=0,ignore_index=True)
    single_truth.drop(columns=single_truth.columns[['Unnamed' in col for col in single_truth.columns]],inplace=True)
    single_truth.to_csv(f'{cik}_soi_table.csv',index=False)
    
warnings.simplefilter(action='ignore', category=FutureWarning)
cik = 1372807
# logger = init_logger(cik)
# logger.info(cik)

main()

    


'2008-12-31'



'Portfolio_Company_/Principal_Business'

Unnamed: 0,Unnamed: 1,Investment_/Interest_Rate_/Maturity,Unnamed: 3,Principal,Unnamed: 5,Unnamed: 6,Unnamed: 7,Cost,Unnamed: 9,Unnamed: 10,Unnamed: 11,Value,Unnamed: 13,Unnamed: 14,Portfolio_Company_/Principal_Business
0,,InvestmentInterestRate¹/Maturity,,Principal,,,,Cost,,,,Value2,,,PortfolioCompany/PrincipalBusiness
1,,SeniorSecuredLoan—DeferredDrawTermLoan(FirstLi...,,$,356819.0,,,,356819.0,,,,356819.0,,"AdvancedLightingTechnologies,Inc.6HomeandOffic..."
3,,"SeniorSecuredLoan—RevolvingLoan3.9%,Due6/13",,,960000.0,,,,952585.0,,,,960000.0,,"AdvancedLightingTechnologies,Inc.HomeandOffice..."
5,,"JuniorSecuredLoan—SecondLienTermLoanNote8.5%,D...",,,5000000.0,,,,5000000.0,,,,5000000.0,,"AdvancedLightingTechnologies,Inc.6HomeandOffic..."
7,,"SeniorSecuredLoan—TermLoan(FirstLien)4.6%,Due6/13",,,1834277.0,,,,1834277.0,,,,1834277.0,,"AdvancedLightingTechnologies,Inc.6HomeandOffic..."
9,,"SeniorSecuredLoan—TermLoan7.0%,Due4/12",,,3118560.0,,,,3118560.0,,,,3118560.0,,"AeroProductsInternational,Inc.6PersonalandNonD..."
11,,"SeniorSecuredLoan—DelayedDrawTermLoan7.5%,Due3/13",,,429397.0,,,,429397.0,,,,429397.0,,AerostructuresAcquisitionLLC6AerospaceandDefense
13,,"SeniorSecuredLoan—TermLoan7.5%,Due3/13",,,5436949.0,,,,5436949.0,,,,5436949.0,,AerostructuresAcquisitionLLC6AerospaceandDefense
15,,"SeniorSecuredLoan—TrancheBTermLoan4.2%,Due4/13",,,3832209.0,,,,3829883.0,,,,3458569.0,,"AGAMedicalCorporation6Healthcare,EducationandC..."
17,,"SeniorSecuredLoan—DelayedDrawTermLoan3.5%,Due5/13",,,442044.0,,,,436817.0,,,,419942.0,,"AGSLLC6Hotels,Motels,Inns,andGaming"


''

Unnamed: 0,Investment_/Interest_Rate_/Maturity,Principal,Cost,Value,Portfolio_Company_/Principal_Business,Unnamed: 6
0,InvestmentInterestRate¹/Maturity,Principal,Cost,Value2,PortfolioCompany/PrincipalBusiness,
1,SeniorSecuredLoan—DeferredDrawTermLoan(FirstLi...,$,,,"AdvancedLightingTechnologies,Inc.6HomeandOffic...",356819
3,"SeniorSecuredLoan—RevolvingLoan3.9%,Due6/13",,,,"AdvancedLightingTechnologies,Inc.HomeandOffice...",960000 952585
5,"JuniorSecuredLoan—SecondLienTermLoanNote8.5%,D...",,,,"AdvancedLightingTechnologies,Inc.6HomeandOffic...",5000000
7,"SeniorSecuredLoan—TermLoan(FirstLien)4.6%,Due6/13",,,,"AdvancedLightingTechnologies,Inc.6HomeandOffic...",1834277
9,"SeniorSecuredLoan—TermLoan7.0%,Due4/12",,,,"AeroProductsInternational,Inc.6PersonalandNonD...",3118560
11,"SeniorSecuredLoan—DelayedDrawTermLoan7.5%,Due3/13",,,,AerostructuresAcquisitionLLC6AerospaceandDefense,429397
13,"SeniorSecuredLoan—TermLoan7.5%,Due3/13",,,,AerostructuresAcquisitionLLC6AerospaceandDefense,5436949
15,"SeniorSecuredLoan—TrancheBTermLoan4.2%,Due4/13",,,,"AGAMedicalCorporation6Healthcare,EducationandC...",3458569 3829883 3832209
17,"SeniorSecuredLoan—DelayedDrawTermLoan3.5%,Due5/13",,,,"AGSLLC6Hotels,Motels,Inns,andGaming",442044 419942 436817


'Investment_/Interest_Rate_/Maturity'

Unnamed: 0,Principal,Cost,Value,Portfolio_Company_/Principal_Business,Unnamed: 5,Investment_/Interest_Rate_/Maturity
0,Principal,Cost,Value2,PortfolioCompany/PrincipalBusiness,,InvestmentInterestRate¹/Maturity
1,$,,,"AdvancedLightingTechnologies,Inc.6HomeandOffic...",356819,SeniorSecuredLoan—DeferredDrawTermLoan(FirstLi...
3,,,,"AdvancedLightingTechnologies,Inc.HomeandOffice...",960000 952585,"SeniorSecuredLoan—RevolvingLoan3.9%,Due6/13"
5,,,,"AdvancedLightingTechnologies,Inc.6HomeandOffic...",5000000,"JuniorSecuredLoan—SecondLienTermLoanNote8.5%,D..."
7,,,,"AdvancedLightingTechnologies,Inc.6HomeandOffic...",1834277,"SeniorSecuredLoan—TermLoan(FirstLien)4.6%,Due6/13"
9,,,,"AeroProductsInternational,Inc.6PersonalandNonD...",3118560,"SeniorSecuredLoan—TermLoan7.0%,Due4/12"
11,,,,AerostructuresAcquisitionLLC6AerospaceandDefense,429397,"SeniorSecuredLoan—DelayedDrawTermLoan7.5%,Due3/13"
13,,,,AerostructuresAcquisitionLLC6AerospaceandDefense,5436949,"SeniorSecuredLoan—TermLoan7.5%,Due3/13"
15,,,,"AGAMedicalCorporation6Healthcare,EducationandC...",3458569 3829883 3832209,"SeniorSecuredLoan—TrancheBTermLoan4.2%,Due4/13"
17,,,,"AGSLLC6Hotels,Motels,Inns,andGaming",442044 419942 436817,"SeniorSecuredLoan—DelayedDrawTermLoan3.5%,Due5/13"


'Principal'

Unnamed: 0,Cost,Value,Portfolio_Company_/Principal_Business,Unnamed: 4,Investment_/Interest_Rate_/Maturity,Principal
0,Cost,Value2,PortfolioCompany/PrincipalBusiness,,InvestmentInterestRate¹/Maturity,Principal
1,,,"AdvancedLightingTechnologies,Inc.6HomeandOffic...",356819,SeniorSecuredLoan—DeferredDrawTermLoan(FirstLi...,$
3,,,"AdvancedLightingTechnologies,Inc.HomeandOffice...",960000 952585,"SeniorSecuredLoan—RevolvingLoan3.9%,Due6/13",
5,,,"AdvancedLightingTechnologies,Inc.6HomeandOffic...",5000000,"JuniorSecuredLoan—SecondLienTermLoanNote8.5%,D...",
7,,,"AdvancedLightingTechnologies,Inc.6HomeandOffic...",1834277,"SeniorSecuredLoan—TermLoan(FirstLien)4.6%,Due6/13",
9,,,"AeroProductsInternational,Inc.6PersonalandNonD...",3118560,"SeniorSecuredLoan—TermLoan7.0%,Due4/12",
11,,,AerostructuresAcquisitionLLC6AerospaceandDefense,429397,"SeniorSecuredLoan—DelayedDrawTermLoan7.5%,Due3/13",
13,,,AerostructuresAcquisitionLLC6AerospaceandDefense,5436949,"SeniorSecuredLoan—TermLoan7.5%,Due3/13",
15,,,"AGAMedicalCorporation6Healthcare,EducationandC...",3458569 3829883 3832209,"SeniorSecuredLoan—TrancheBTermLoan4.2%,Due4/13",
17,,,"AGSLLC6Hotels,Motels,Inns,andGaming",442044 419942 436817,"SeniorSecuredLoan—DelayedDrawTermLoan3.5%,Due5/13",


'Cost'

Unnamed: 0,Value,Portfolio_Company_/Principal_Business,Unnamed: 3,Investment_/Interest_Rate_/Maturity,Principal,Cost
0,Value2,PortfolioCompany/PrincipalBusiness,,InvestmentInterestRate¹/Maturity,Principal,Cost
1,,"AdvancedLightingTechnologies,Inc.6HomeandOffic...",356819,SeniorSecuredLoan—DeferredDrawTermLoan(FirstLi...,$,
3,,"AdvancedLightingTechnologies,Inc.HomeandOffice...",960000 952585,"SeniorSecuredLoan—RevolvingLoan3.9%,Due6/13",,
5,,"AdvancedLightingTechnologies,Inc.6HomeandOffic...",5000000,"JuniorSecuredLoan—SecondLienTermLoanNote8.5%,D...",,
7,,"AdvancedLightingTechnologies,Inc.6HomeandOffic...",1834277,"SeniorSecuredLoan—TermLoan(FirstLien)4.6%,Due6/13",,
9,,"AeroProductsInternational,Inc.6PersonalandNonD...",3118560,"SeniorSecuredLoan—TermLoan7.0%,Due4/12",,
11,,AerostructuresAcquisitionLLC6AerospaceandDefense,429397,"SeniorSecuredLoan—DelayedDrawTermLoan7.5%,Due3/13",,
13,,AerostructuresAcquisitionLLC6AerospaceandDefense,5436949,"SeniorSecuredLoan—TermLoan7.5%,Due3/13",,
15,,"AGAMedicalCorporation6Healthcare,EducationandC...",3458569 3829883 3832209,"SeniorSecuredLoan—TrancheBTermLoan4.2%,Due4/13",,
17,,"AGSLLC6Hotels,Motels,Inns,andGaming",442044 419942 436817,"SeniorSecuredLoan—DelayedDrawTermLoan3.5%,Due5/13",,


'Value'

Unnamed: 0,Portfolio_Company_/Principal_Business,Unnamed: 2,Investment_/Interest_Rate_/Maturity,Principal,Cost,Value
0,PortfolioCompany/PrincipalBusiness,,InvestmentInterestRate¹/Maturity,Principal,Cost,Value2
1,"AdvancedLightingTechnologies,Inc.6HomeandOffic...",356819,SeniorSecuredLoan—DeferredDrawTermLoan(FirstLi...,$,,
3,"AdvancedLightingTechnologies,Inc.HomeandOffice...",960000 952585,"SeniorSecuredLoan—RevolvingLoan3.9%,Due6/13",,,
5,"AdvancedLightingTechnologies,Inc.6HomeandOffic...",5000000,"JuniorSecuredLoan—SecondLienTermLoanNote8.5%,D...",,,
7,"AdvancedLightingTechnologies,Inc.6HomeandOffic...",1834277,"SeniorSecuredLoan—TermLoan(FirstLien)4.6%,Due6/13",,,
9,"AeroProductsInternational,Inc.6PersonalandNonD...",3118560,"SeniorSecuredLoan—TermLoan7.0%,Due4/12",,,
11,AerostructuresAcquisitionLLC6AerospaceandDefense,429397,"SeniorSecuredLoan—DelayedDrawTermLoan7.5%,Due3/13",,,
13,AerostructuresAcquisitionLLC6AerospaceandDefense,5436949,"SeniorSecuredLoan—TermLoan7.5%,Due3/13",,,
15,"AGAMedicalCorporation6Healthcare,EducationandC...",3458569 3829883 3832209,"SeniorSecuredLoan—TrancheBTermLoan4.2%,Due4/13",,,
17,"AGSLLC6Hotels,Motels,Inns,andGaming",442044 419942 436817,"SeniorSecuredLoan—DelayedDrawTermLoan3.5%,Due5/13",,,


'2008-12-31\\\\Schedule_of_Investments_14.csv'

'Portfolio_Company_/Principal_Business'

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,Portfolio_Company_/Principal_Business
0,,,,,,,,,,,,,,,TimeDepositsandMoneyMarketAccount
2,,Investment,Yield,,,,Cost,,,,Value2,,,,TimeDepositsandMoneyMarketAccount
3,,TimeDeposit,,,0.1,,,$,10462702.0,,,,10462702.0,,USBankEurodollarSweepCL2
5,,TimeDeposit,,,0.2,,,,1723295.0,,,,1723295.0,,JPMorganAssetAccount
7,,MoneyMarketAccount,,,0.19,,,,10.0,,,,10.0,,JPMorganBusinessMoneyMarketAccount
9,,,,,,,,$,12186007.0,,,,12186007.0,,TotalInvestmentinTimeDepositandMoneyMarketAcco...
11,,,,,,,,$,546626619.0,,,,514225273.0,,TotalInvestments5(205%ofnetassetvalueatfairvalue)


''

Unnamed: 0,2,5,9,13,Portfolio_Company_/Principal_Business,Unnamed: 6
0,,,,,TimeDepositsandMoneyMarketAccount,
2,Investment,,,,TimeDepositsandMoneyMarketAccount,Yield Cost Value2
3,TimeDeposit,0.1,10462702.0,10462702.0,USBankEurodollarSweepCL2,$
5,TimeDeposit,0.2,1723295.0,1723295.0,JPMorganAssetAccount,
7,MoneyMarketAccount,0.19,10.0,10.0,JPMorganBusinessMoneyMarketAccount,
9,,,12186007.0,12186007.0,TotalInvestmentinTimeDepositandMoneyMarketAcco...,$
11,,,546626619.0,514225273.0,TotalInvestments5(205%ofnetassetvalueatfairvalue),$


'Investment'

Unnamed: 0,5,9,13,Portfolio_Company_/Principal_Business,Unnamed: 5,Investment
0,,,,TimeDepositsandMoneyMarketAccount,,
2,,,,TimeDepositsandMoneyMarketAccount,Yield Cost Value2,Investment
3,0.1,10462702.0,10462702.0,USBankEurodollarSweepCL2,$,TimeDeposit
5,0.2,1723295.0,1723295.0,JPMorganAssetAccount,,TimeDeposit
7,0.19,10.0,10.0,JPMorganBusinessMoneyMarketAccount,,MoneyMarketAccount
9,,12186007.0,12186007.0,TotalInvestmentinTimeDepositandMoneyMarketAcco...,$,
11,,546626619.0,514225273.0,TotalInvestments5(205%ofnetassetvalueatfairvalue),$,


'Percent_of_Interests_Held'

Unnamed: 0,9,13,Portfolio_Company_/Principal_Business,Unnamed: 4,Investment,Percent_of_Interests_Held
0,,,TimeDepositsandMoneyMarketAccount,,,
2,,,TimeDepositsandMoneyMarketAccount,Yield Cost Value2,Investment,
3,10462702.0,10462702.0,USBankEurodollarSweepCL2,$,TimeDeposit,0.1
5,1723295.0,1723295.0,JPMorganAssetAccount,,TimeDeposit,0.2
7,10.0,10.0,JPMorganBusinessMoneyMarketAccount,,MoneyMarketAccount,0.19
9,12186007.0,12186007.0,TotalInvestmentinTimeDepositandMoneyMarketAcco...,$,,
11,546626619.0,514225273.0,TotalInvestments5(205%ofnetassetvalueatfairvalue),$,,


'Cost'

Unnamed: 0,13,Portfolio_Company_/Principal_Business,Unnamed: 3,Investment,Percent_of_Interests_Held,Cost
0,,TimeDepositsandMoneyMarketAccount,,,,
2,,TimeDepositsandMoneyMarketAccount,Yield Cost Value2,Investment,,
3,10462702.0,USBankEurodollarSweepCL2,$,TimeDeposit,0.1,10462702.0
5,1723295.0,JPMorganAssetAccount,,TimeDeposit,0.2,1723295.0
7,10.0,JPMorganBusinessMoneyMarketAccount,,MoneyMarketAccount,0.19,10.0
9,12186007.0,TotalInvestmentinTimeDepositandMoneyMarketAcco...,$,,,12186007.0
11,514225273.0,TotalInvestments5(205%ofnetassetvalueatfairvalue),$,,,546626619.0


'Value'

Unnamed: 0,Portfolio_Company_/Principal_Business,Unnamed: 2,Investment,Percent_of_Interests_Held,Cost,Value
0,TimeDepositsandMoneyMarketAccount,,,,,
2,TimeDepositsandMoneyMarketAccount,Yield Cost Value2,Investment,,,
3,USBankEurodollarSweepCL2,$,TimeDeposit,0.1,10462702.0,10462702.0
5,JPMorganAssetAccount,,TimeDeposit,0.2,1723295.0,1723295.0
7,JPMorganBusinessMoneyMarketAccount,,MoneyMarketAccount,0.19,10.0,10.0
9,TotalInvestmentinTimeDepositandMoneyMarketAcco...,$,,,12186007.0,12186007.0
11,TotalInvestments5(205%ofnetassetvalueatfairvalue),$,,,546626619.0,514225273.0


In [58]:
file_path = r'2008-12-31\\Schedule_of_Investments_14.csv'


def standard_field_names()->tuple:
    return (
        'Portfolio Company',
        'Portfolio Company /Principal Business',
        'Investment /Interest Rate /Maturity',
        'Principal',
        'Cost',
        'Value',
        'Percent of Class Held',
        'Investment',
        'CDO Fund Investments',
        'Percent of Interests Held',
        'Industry',
        'Spread Above Index',
        'Aquisition Date',
        'Interest Rate',
        'Maturity',
        'Principal/Shares',
        'Investment Type',
        'of Net Assets',
        'business description',
        'type of investment',
        'investment date',
        'reference rate and spread',
        'pik rate',
        'maturity date',
        'cost',
        'footnotes',
        'industry',
        'principal amount', # TODO change stand names for more dynamic fuzzywuzzy matching
        'fair value',
    )

def merge_duplicate_columns(
    df:pd.DataFrame,
    merged_pair_idxs:dict={}
)->pd.DataFrame:
    duplicate_cols = merged_pair_idxs.keys()
    flag = not merged_pair_idxs.keys()
    if flag: 
        duplicate_cols = df.columns.unique() 
    for col_name in duplicate_cols:
        display(col_name)
        display(df)
        mask = merged_pair_idxs.get(col_name)
        if flag:
            mask = df.columns == col_name
            merged_pair_idxs[col_name] = mask
        duplicate_data = df.loc[:, mask]
        merged_data = duplicate_data.apply(lambda row: ' '.join(set(row.dropna().astype(str))), axis=1)
        df = df.loc[:, ~mask]
        df[col_name] = merged_data
    return df.reset_index(drop=True),merged_pair_idxs


def _clean(
    file_path:str,
    except_rows:str,
    merged_pair_idxs:dict={},
)->pd.DataFrame:
    df = pd.read_csv(file_path,index_col=0,na_values=[' ', ''])
    # df.replace(['\xa0','\u200b',r'^\s$',r'^\s%'],np.nan,regex=True,inplace=True) #':','$','%' r'^\s*$',r'^\s*%'
    df.replace(['\xa0','\u200b',r'^\s*$',r'^\s*%',' ','˄'],'',regex=True,inplace=True)
    df = df.applymap(lambda x: x.strip() if isinstance(x, str) else x)
    df.dropna(axis=0,how='all',inplace=True)
    df = df[~df.apply(lambda row:row.astype(str).str.contains(except_rows,case=False, na=False).any(),axis=1)]
    duplicate_idx = df.apply(lambda row:row[pd.to_numeric(row,errors='coerce').isna()].duplicated().sum() > 1 ,axis=1)
    clean_rows = df.loc[duplicate_idx].apply(remove_row_duplicates, axis=1).reset_index(drop=True)
    j = 0
    for i,flag in enumerate(duplicate_idx):
        if not flag:
            continue
        df.iloc[i,:] = clean_rows.loc[j,:]
        j += 1
    if not merged_pair_idxs:
        important_fields = strip_string(get_header_rows(df),standardize=True)#get_key_fields(df)
        df.columns = important_fields
    df,merge_pair_idxs = merge_duplicate_columns(df,merged_pair_idxs=merged_pair_idxs)

    df.replace([r'^\s*$'],np.nan,regex=True,inplace=True) #':','$','%'
    df.dropna(axis=1,how='all',inplace=True)
    
    columns = (~df.isna()).sum(axis=0) <= 4  if df.shape[0] > 12 else 1
    df.drop(columns=df.columns[columns],inplace=True)
    return df.reset_index(drop=True),merge_pair_idxs


merged_pair_idxs = {
    'Portfolio_Company_/Principal_Business': np.array([ True, False, False, False, False, False, False, False, False,False, False, False, False, False, False]),
    '': np.array([ True, False,  True, True, False, True,  True, True, False,True,  True, True, False, True,False]),
    'Investment': np.array([ True, False, False, False, False, False]),
    'Percent_of_Interests_Held': np.array([ True,  False,  False, False, False, False]),
    'Cost': np.array([True,  False, False, False, False, False]),
    'Value': np.array([True,  False, False, False, False, False])
}

ex = exceptions()
ex_rows = '|'.join(except_rows())
df,merged_pair_idxs = _clean(file_path,except_rows=ex_rows,merged_pair_idxs=merged_pair_idxs)
display(df)
# display(merged_pair_idxs)
index_list = df.apply(
    lambda row:row.astype(str).str.contains(stopping_criterion(None), case=False, na=False).any(),
    axis=1
)
index_list_sum = index_list.sum()
index_list_sum

'Portfolio_Company_/Principal_Business'

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,TimeDepositsandMoneyMarketAccount,,,,,,,,,,,,,,
2,TimeDepositsandMoneyMarketAccount,,Investment,Yield,,,,Cost,,,,Value2,,,
3,USBankEurodollarSweepCL2,,TimeDeposit,,,0.1,,,$,10462702.0,,,,10462702.0,
5,JPMorganAssetAccount,,TimeDeposit,,,0.2,,,,1723295.0,,,,1723295.0,
7,JPMorganBusinessMoneyMarketAccount,,MoneyMarketAccount,,,0.19,,,,10.0,,,,10.0,
9,TotalInvestmentinTimeDepositandMoneyMarketAcco...,,,,,,,,$,12186007.0,,,,12186007.0,
11,TotalInvestments5(205%ofnetassetvalueatfairvalue),,,,,,,,$,546626619.0,,,,514225273.0,


''

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,Portfolio_Company_/Principal_Business
0,,,,,,,,,,,,,,,TimeDepositsandMoneyMarketAccount
2,,Investment,Yield,,,,Cost,,,,Value2,,,,TimeDepositsandMoneyMarketAccount
3,,TimeDeposit,,,0.1,,,$,10462702.0,,,,10462702.0,,USBankEurodollarSweepCL2
5,,TimeDeposit,,,0.2,,,,1723295.0,,,,1723295.0,,JPMorganAssetAccount
7,,MoneyMarketAccount,,,0.19,,,,10.0,,,,10.0,,JPMorganBusinessMoneyMarketAccount
9,,,,,,,,$,12186007.0,,,,12186007.0,,TotalInvestmentinTimeDepositandMoneyMarketAcco...
11,,,,,,,,$,546626619.0,,,,514225273.0,,TotalInvestments5(205%ofnetassetvalueatfairvalue)


'Investment'

Unnamed: 0,2,5,9,13,Portfolio_Company_/Principal_Business,Unnamed: 6
0,,,,,TimeDepositsandMoneyMarketAccount,
2,Investment,,,,TimeDepositsandMoneyMarketAccount,Yield Cost Value2
3,TimeDeposit,0.1,10462702.0,10462702.0,USBankEurodollarSweepCL2,$
5,TimeDeposit,0.2,1723295.0,1723295.0,JPMorganAssetAccount,
7,MoneyMarketAccount,0.19,10.0,10.0,JPMorganBusinessMoneyMarketAccount,
9,,,12186007.0,12186007.0,TotalInvestmentinTimeDepositandMoneyMarketAcco...,$
11,,,546626619.0,514225273.0,TotalInvestments5(205%ofnetassetvalueatfairvalue),$


'Percent_of_Interests_Held'

Unnamed: 0,5,9,13,Portfolio_Company_/Principal_Business,Unnamed: 5,Investment
0,,,,TimeDepositsandMoneyMarketAccount,,
2,,,,TimeDepositsandMoneyMarketAccount,Yield Cost Value2,Investment
3,0.1,10462702.0,10462702.0,USBankEurodollarSweepCL2,$,TimeDeposit
5,0.2,1723295.0,1723295.0,JPMorganAssetAccount,,TimeDeposit
7,0.19,10.0,10.0,JPMorganBusinessMoneyMarketAccount,,MoneyMarketAccount
9,,12186007.0,12186007.0,TotalInvestmentinTimeDepositandMoneyMarketAcco...,$,
11,,546626619.0,514225273.0,TotalInvestments5(205%ofnetassetvalueatfairvalue),$,


'Cost'

Unnamed: 0,9,13,Portfolio_Company_/Principal_Business,Unnamed: 4,Investment,Percent_of_Interests_Held
0,,,TimeDepositsandMoneyMarketAccount,,,
2,,,TimeDepositsandMoneyMarketAccount,Yield Cost Value2,Investment,
3,10462702.0,10462702.0,USBankEurodollarSweepCL2,$,TimeDeposit,0.1
5,1723295.0,1723295.0,JPMorganAssetAccount,,TimeDeposit,0.2
7,10.0,10.0,JPMorganBusinessMoneyMarketAccount,,MoneyMarketAccount,0.19
9,12186007.0,12186007.0,TotalInvestmentinTimeDepositandMoneyMarketAcco...,$,,
11,546626619.0,514225273.0,TotalInvestments5(205%ofnetassetvalueatfairvalue),$,,


'Value'

Unnamed: 0,13,Portfolio_Company_/Principal_Business,Unnamed: 3,Investment,Percent_of_Interests_Held,Cost
0,,TimeDepositsandMoneyMarketAccount,,,,
2,,TimeDepositsandMoneyMarketAccount,Yield Cost Value2,Investment,,
3,10462702.0,USBankEurodollarSweepCL2,$,TimeDeposit,0.1,10462702.0
5,1723295.0,JPMorganAssetAccount,,TimeDeposit,0.2,1723295.0
7,10.0,JPMorganBusinessMoneyMarketAccount,,MoneyMarketAccount,0.19,10.0
9,12186007.0,TotalInvestmentinTimeDepositandMoneyMarketAcco...,$,,,12186007.0
11,514225273.0,TotalInvestments5(205%ofnetassetvalueatfairvalue),$,,,546626619.0


Unnamed: 0,Portfolio_Company_/Principal_Business,Investment,Percent_of_Interests_Held,Cost,Value
0,TimeDepositsandMoneyMarketAccount,,,,
1,TimeDepositsandMoneyMarketAccount,Investment,,,
2,USBankEurodollarSweepCL2,TimeDeposit,0.1,10462702.0,10462702.0
3,JPMorganAssetAccount,TimeDeposit,0.2,1723295.0,1723295.0
4,JPMorganBusinessMoneyMarketAccount,MoneyMarketAccount,0.19,10.0,10.0
5,TotalInvestmentinTimeDepositandMoneyMarketAcco...,,,12186007.0,12186007.0
6,TotalInvestments5(205%ofnetassetvalueatfairvalue),,,546626619.0,514225273.0


1