In [186]:
import pandas as pd
import numpy as np
import datetime as dt
import plotly.express as px
import plotly.graph_objects as go
import os
from docx import Document
from tqdm import tqdm

In [187]:
curr_path = os.path.dirname(os.path.realpath(__name__))
data_dir = os.path.join(curr_path, 'data')

# list dir without .DS_Store
data_folders = [f for f in os.listdir(data_dir) if not f.startswith('.')]

In [188]:
data_folders

['arbitrage',
 'multi_asset',
 'conservative_hybrid',
 'equity_savings',
 'aggressive_hybrid']

In [189]:
dfs = {}
for data_folder in data_folders:
    dfs[data_folder] = {}
    curr_path = os.path.join(data_dir, data_folder)
    files = [f for f in os.listdir(curr_path) if not f.startswith('.')]
    print(f"Processing {data_folder} folder")
    for file in tqdm(files):
        if file.endswith('.docx'):
            doc = Document(os.path.join(curr_path, file))
            table = doc.tables[0]
            data = []
            for row in table.rows:
                data.append([cell.text for cell in row.cells])

            df = pd.DataFrame(data)

            # rename columns and drop first 5 rows
            df.columns = df.iloc[4].values
            df = df[5:]
            df.index = df['NAV date']
            df.index.name = 'date'
            indexes = []
            for ind in df.index:
                indexes.append(ind.split('<')[0])
            df.index = indexes
            # display(df.head(10))
            # print(df.index[:10])
            try:
                df.index = pd.to_datetime(df.index)
                
            except:
                try:
                    df.index = pd.to_datetime(df.index, format="%d-%b-%Y")
                except:
                    print(f"type of index: {type(df.index[0])}")
                    display(df.head(1))
                    print(f"Error converting date in {file}")
    
            df = df.iloc[:, :1]
            df = df.add_suffix(f'_{file.split(".")[0]}')
            # df = df[~df.index.duplicated(keep='first')]
            
            dfs[data_folder][file.split('.')[0]] = df

Processing arbitrage folder


100%|██████████| 5/5 [00:01<00:00,  4.10it/s]


Processing multi_asset folder


100%|██████████| 5/5 [00:00<00:00,  5.08it/s]


Processing conservative_hybrid folder


100%|██████████| 5/5 [00:01<00:00,  4.20it/s]


Processing equity_savings folder


100%|██████████| 5/5 [00:01<00:00,  3.86it/s]


Processing aggressive_hybrid folder


100%|██████████| 5/5 [00:01<00:00,  4.08it/s]


In [190]:
dfs.keys()

dict_keys(['arbitrage', 'multi_asset', 'conservative_hybrid', 'equity_savings', 'aggressive_hybrid'])

In [191]:
dfs['arbitrage'].keys()

dict_keys(['Invesco_India_Arbitrage_Fund', 'Kotak_Equity_Arbitrage_Fund', 'SBI_Arbitrage_Opportunities_Fund', 'ICICI_Prudential_Equity_Arbitrage_Fund', 'HDFC_Arbitrage_Fund_Wholesale_Plan'])

In [192]:
dfs['arbitrage']['Invesco_India_Arbitrage_Fund']

Unnamed: 0,Net Asset Value_Invesco_India_Arbitrage_Fund
2020-03-02,24.8453
2020-03-03,24.836
2020-03-04,24.8369
2020-03-05,24.8587
2020-03-06,24.8667
...,...
2025-02-14,33.5898
2025-02-17,33.5797
2025-02-18,33.6183
2025-02-19,33.6177


In [193]:
for df in dfs['arbitrage'].values():

    print(len(df.index))

1285
1285
1288
1285
1285


In [194]:
for big_key in dfs.keys():
    print(f"{big_key}:")
    for key, df in dfs[big_key].items():
        print(f"{key}: {len(df.index)}")

arbitrage:
Invesco_India_Arbitrage_Fund: 1285
Kotak_Equity_Arbitrage_Fund: 1285
SBI_Arbitrage_Opportunities_Fund: 1288
ICICI_Prudential_Equity_Arbitrage_Fund: 1285
HDFC_Arbitrage_Fund_Wholesale_Plan: 1285
multi_asset:
SBI_Multi_Asset_Allocation_Fund: 1263
ICICI_Prudential_Multi_Asset_Fund: 1285
Nippon_India_Multi_Asset_Allocation_Fund: 1106
Kotak_Multi_Asset_Allocation_Fund: 343
UTI_Multi_Asset_Allocation_Fund: 1285
conservative_hybrid:
HDFC_Hybrid_Debt_Fund: 1261
SBI_Conservative_Hybrid_Fund: 1263
Kotak_Debt_Hybrid_Fund: 1260
ICICI_Prudential_Regular_Savings_Fund: 1260
UTI_Conservative_Hybrid_Fund: 1260
equity_savings:
ICICI_Prudential_Equity_Savings_Fund: 1285
HDFC_Equity_Savings_Fund: 1285
Kotak_Equity_Savings_Fund: 1285
SBI_Equity_Savings_Fund: 1289
DSP_Equity_Savings_Fund: 1285
aggressive_hybrid:
Canara_Robeco_Equity_Hybrid_fund: 1285
DSP_Aggresive_Hybrid_Fund: 1285
HDFC_Hybrid_Equity_Fund: 1285
SBI_Equity_Hybrid_Fund: 1288
ICICI_Prudential_Equity_and_Debt_Fund: 1285


In [195]:
df_merged = pd.concat(list(dfs['arbitrage'].values())[1:3], axis=1)

InvalidIndexError: Reindexing only valid with uniquely valued Index objects

In [None]:
list(dfs['arbitrage'].values())[1:3]

[           Net Asset Value_Kotak_Equity_Arbitrage_Fund
 2020-03-02                                     28.9058
 2020-03-03                                     28.8917
 2020-03-04                                     28.8987
 2020-03-05                                     28.9122
 2020-03-06                                     28.9292
 ...                                                ...
 2025-02-14                                     38.9752
 2025-02-17                                     38.9679
 2025-02-18                                     39.0128
 2025-02-19                                     39.0094
 2025-02-20                                     39.0103
 
 [1229 rows x 1 columns],
            Net Asset Value_SBI_Arbitrage_Opportunities_Fund
 2020-03-02                                          26.3494
 2020-03-03                                          26.3293
 2020-03-04                                          26.3292
 2020-03-05                                          26.

In [None]:
index1 = list(dfs['arbitrage'].values())[1].index
index2 = list(dfs['arbitrage'].values())[2].index

In [None]:
index1

DatetimeIndex(['2020-03-02', '2020-03-03', '2020-03-04', '2020-03-05',
               '2020-03-06', '2020-03-09', '2020-03-11', '2020-03-12',
               '2020-03-13', '2020-03-16',
               ...
               '2025-02-07', '2025-02-10', '2025-02-11', '2025-02-12',
               '2025-02-13', '2025-02-14', '2025-02-17', '2025-02-18',
               '2025-02-19', '2025-02-20'],
              dtype='datetime64[ns]', length=1229, freq=None)

In [None]:
index2

DatetimeIndex(['2020-03-02', '2020-03-03', '2020-03-04', '2020-03-05',
               '2020-03-06', '2020-03-09', '2020-03-11', '2020-03-12',
               '2020-03-13', '2020-03-16',
               ...
               '2025-02-07', '2025-02-10', '2025-02-11', '2025-02-12',
               '2025-02-13', '2025-02-14', '2025-02-17', '2025-02-18',
               '2025-02-19', '2025-02-20'],
              dtype='datetime64[ns]', length=1232, freq=None)

In [None]:
index1.unique()

DatetimeIndex(['2020-03-02', '2020-03-03', '2020-03-04', '2020-03-05',
               '2020-03-06', '2020-03-09', '2020-03-11', '2020-03-12',
               '2020-03-13', '2020-03-16',
               ...
               '2025-02-07', '2025-02-10', '2025-02-11', '2025-02-12',
               '2025-02-13', '2025-02-14', '2025-02-17', '2025-02-18',
               '2025-02-19', '2025-02-20'],
              dtype='datetime64[ns]', length=1229, freq=None)

In [None]:
index2.unique()

DatetimeIndex(['2020-03-02', '2020-03-03', '2020-03-04', '2020-03-05',
               '2020-03-06', '2020-03-09', '2020-03-11', '2020-03-12',
               '2020-03-13', '2020-03-16',
               ...
               '2025-02-07', '2025-02-10', '2025-02-11', '2025-02-12',
               '2025-02-13', '2025-02-14', '2025-02-17', '2025-02-18',
               '2025-02-19', '2025-02-20'],
              dtype='datetime64[ns]', length=1232, freq=None)

In [None]:
df_merged

Unnamed: 0,Net Asset Value_Kotak_Equity_Arbitrage_Fund,Net Asset Value_SBI_Arbitrage_Opportunities_Fund
2020-03-02,28.9058,26.3494
2020-03-03,28.8917,26.3293
2020-03-04,28.8987,26.3292
2020-03-05,28.9122,26.3461
2020-03-06,28.9292,26.3748
...,...,...
2025-02-14,38.9752,34.9810
2025-02-17,38.9679,34.9729
2025-02-18,39.0128,35.0093
2025-02-19,39.0094,35.0097
