In [70]:
import pandas as pd
import numpy as np
import datetime as dt
import plotly.express as px
import plotly.graph_objects as go
import os
from docx import Document
from tqdm import tqdm

In [71]:
# Locate the data directory relative to the notebook's working directory.
# `__name__` is the module name ('__main__' in a notebook), not a file path,
# so the working directory is asked for explicitly instead of being derived
# from it.
curr_path = os.path.realpath(os.getcwd())
data_dir = os.path.join(curr_path, 'data')

# Keep every entry of the data directory except hidden ones such as .DS_Store.
data_folders = [f for f in os.listdir(data_dir) if not f.startswith('.')]

In [72]:
# Show the non-hidden entries that were found in the data directory.
data_folders

['arbitrage',
 'multi_asset',
 'conservative_hybrid',
 'equity_savings',
 'aggressive_hybrid']

In [73]:
# Load the first table of every .docx file under data/<folder>/ into
# dfs[<folder>][<file name up to the first '.'>].
# Each stored frame keeps only the first column of the table and is indexed by
# the parsed 'NAV date' values (cell values stay as the text read from the file).

# Position of the table row that holds the column titles; that row and the
# rows above it are dropped from the data.
HEADER_ROW = 4

dfs = {}
# Raw indexes of files whose dates could not be parsed, kept for inspection.
index_list = []

for data_folder in data_folders:
    dfs[data_folder] = {}
    # NOTE: this rebinds `curr_path` (set earlier to the working directory)
    # to the folder currently being processed.
    curr_path = os.path.join(data_dir, data_folder)
    files = [f for f in os.listdir(curr_path) if not f.startswith('.')]
    print(f"Processing {data_folder} folder")
    for file in tqdm(files):
        if not file.endswith('.docx'):
            continue

        doc = Document(os.path.join(curr_path, file))
        table = doc.tables[0]
        data = [[cell.text for cell in row.cells] for row in table.rows]

        df = pd.DataFrame(data)

        # Promote the header row to column names and keep only the rows below it.
        df.columns = df.iloc[HEADER_ROW].values
        df = df[HEADER_ROW + 1:]

        # Index by the 'NAV date' text, keeping only what precedes the first '<'.
        df.index = [raw_date.split('<')[0] for raw_date in df['NAV date']]

        try:
            # First attempt: let pandas infer the date format.
            df.index = pd.to_datetime(df.index)
        except (ValueError, TypeError):
            try:
                # Second attempt: explicit day-month-year format, e.g. 02-Mar-2020.
                df.index = pd.to_datetime(df.index, format="%d-%b-%Y")
            except (ValueError, TypeError):
                # Best effort: report the problem and keep the unparsed text index
                # so the remaining files are still loaded.
                index_list.append(df.index)
                print(f"type of index: {type(df.index[0])}")
                display(df.head(1))
                print(f"Error converting date in {file}")

        # Keep only the first column of the table.
        df = df.iloc[:, :1]

        dfs[data_folder][file.split('.')[0]] = df

Processing arbitrage folder


100%|██████████| 5/5 [00:01<00:00,  4.23it/s]


Processing multi_asset folder


100%|██████████| 5/5 [00:00<00:00,  5.09it/s]


Processing conservative_hybrid folder


100%|██████████| 5/5 [00:01<00:00,  4.33it/s]


Processing equity_savings folder


100%|██████████| 5/5 [00:01<00:00,  4.26it/s]


Processing aggressive_hybrid folder


100%|██████████| 5/5 [00:01<00:00,  4.11it/s]


In [74]:
# Top-level keys of `dfs`: one entry per processed data folder.
dfs.keys()

dict_keys(['arbitrage', 'multi_asset', 'conservative_hybrid', 'equity_savings', 'aggressive_hybrid'])

In [75]:
# Keys stored for the 'arbitrage' folder: file names up to their first '.'.
dfs['arbitrage'].keys()

dict_keys(['Invesco_India_Arbitrage_Fund', 'Kotak_Equity_Arbitrage_Fund', 'SBI_Arbitrage_Opportunities_Fund', 'ICICI_Prudential_Equity_Arbitrage_Fund', 'HDFC_Arbitrage_Fund_Wholesale_Plan'])

In [76]:
# Inspect the frame loaded for one fund of the 'arbitrage' folder.
dfs['arbitrage']['Invesco_India_Arbitrage_Fund']

Unnamed: 0,Net Asset Value
2020-03-02,24.8453
2020-03-03,24.836
2020-03-04,24.8369
2020-03-05,24.8587
2020-03-06,24.8667
...,...
2025-02-14,33.5898
2025-02-17,33.5797
2025-02-18,33.6183
2025-02-19,33.6177


In [78]:
# Row count of each frame stored under 'arbitrage', one number per line.
# The loop variable stays `df` so the notebook namespace ends up unchanged.
for df in dfs['arbitrage'].values():
    print(df.shape[0])

1285
1285
1288
1285
1285


In [79]:
# Per-folder summary: the folder name, then "<key>: <row count>" for every
# frame stored under it.
for big_key in dfs:
    print(f"{big_key}:")
    for key in dfs[big_key]:
        df = dfs[big_key][key]
        print(f"{key}: {df.shape[0]}")

arbitrage:
Invesco_India_Arbitrage_Fund: 1285
Kotak_Equity_Arbitrage_Fund: 1285
SBI_Arbitrage_Opportunities_Fund: 1288
ICICI_Prudential_Equity_Arbitrage_Fund: 1285
HDFC_Arbitrage_Fund_Wholesale_Plan: 1285
multi_asset:
SBI_Multi_Asset_Allocation_Fund: 1263
ICICI_Prudential_Multi_Asset_Fund: 1285
Nippon_India_Multi_Asset_Allocation_Fund: 1106
Kotak_Multi_Asset_Allocation_Fund: 343
UTI_Multi_Asset_Allocation_Fund: 1285
conservative_hybrid:
HDFC_Hybrid_Debt_Fund: 1261
SBI_Conservative_Hybrid_Fund: 1263
Kotak_Debt_Hybrid_Fund: 1260
ICICI_Prudential_Regular_Savings_Fund: 1260
UTI_Conservative_Hybrid_Fund: 1260
equity_savings:
ICICI_Prudential_Equity_Savings_Fund: 1285
HDFC_Equity_Savings_Fund: 1285
Kotak_Equity_Savings_Fund: 1285
SBI_Equity_Savings_Fund: 1289
DSP_Equity_Savings_Fund: 1285
aggressive_hybrid:
Canara_Robeco_Equity_Hybrid_fund: 1285
DSP_Aggresive_Hybrid_Fund: 1285
HDFC_Hybrid_Equity_Fund: 1285
SBI_Equity_Hybrid_Fund: 1288
ICICI_Prudential_Equity_and_Debt_Fund: 1285


In [80]:
# Last rows of the same fund's frame, to check where the series ends.
dfs['arbitrage']['Invesco_India_Arbitrage_Fund'].tail()

Unnamed: 0,Net Asset Value
2025-02-14,33.5898
2025-02-17,33.5797
2025-02-18,33.6183
2025-02-19,33.6177
2025-02-20,33.6235


In [81]:
# `df` is not assigned in this cell: it is whatever frame the most recently
# executed loop left bound (the loops in earlier cells use `df` as their loop
# variable).
# NOTE(review): the result depends on execution order — consider indexing
# `dfs` explicitly here instead.
df

Unnamed: 0,Net Asset Value
2020-03-02,139.44
2020-03-03,142.24
2020-03-04,141.66
2020-03-05,141.47
2020-03-06,138.48
...,...
2025-02-14,393.00
2025-02-17,393.39
2025-02-18,393.86
2025-02-19,394.89
