In [69]:
import pandas as pd
import numpy as np
import datetime as dt
import plotly.express as px
import plotly.graph_objects as go
import os
from docx import Document
from tqdm import tqdm
from collections import Counter
import seaborn as sns
import utils
import matplotlib.pyplot as plt

In [52]:
curr_path = os.path.dirname(os.path.realpath(__name__))
data_dir = os.path.join(curr_path, 'data')

# list dir without .DS_Store
data_folders = [f for f in os.listdir(data_dir) if not f.startswith('.')]

In [53]:
data_folders

['arbitrage',
 'multi_asset',
 'conservative_hybrid',
 'equity_savings',
 'aggressive_hybrid']

In [54]:
dfs = {}
for data_folder in data_folders:
    dfs[data_folder] = {}
    curr_path = os.path.join(data_dir, data_folder)
    files = [f for f in os.listdir(curr_path) if not f.startswith('.')]
    print(f"Processing {data_folder} folder")
    for file in tqdm(files):
        if file.endswith('.docx'):
            doc = Document(os.path.join(curr_path, file))
            table = doc.tables[0]
            data = []
            for row in table.rows:
                data.append([cell.text for cell in row.cells])

            df = pd.DataFrame(data)

            # rename columns and drop first 5 rows
            df.columns = df.iloc[4].values
            df = df[5:]
            df.index = df['NAV date']
            df.index.name = 'date'
            indexes = []
            for ind in df.index:
                indexes.append(ind.split('<')[0])
            df.index = indexes
            # display(df.head(10))
            # print(df.index[:10])
            try:
                df.index = pd.to_datetime(df.index)
                
            except:
                try:
                    df.index = pd.to_datetime(df.index, format="%d-%b-%Y")
                except:
                    print(f"type of index: {type(df.index[0])}")
                    display(df.head(1))
                    print(f"Error converting date in {file}")
    
            df = df.iloc[:, :1]
            df.columns = ["NAV"]
            # set NAV columns as float type
            df['NAV'] = df['NAV'].str.replace(',', '').astype(float)
            # df = df.add_suffix(f'_{file.split(".")[0]}')
            df = df[~df.index.duplicated(keep='last')]
            
            dfs[data_folder][file.split('.')[0]] = df

Processing arbitrage folder


100%|██████████| 5/5 [00:01<00:00,  4.14it/s]


Processing multi_asset folder


100%|██████████| 5/5 [00:01<00:00,  4.48it/s]


Processing conservative_hybrid folder


100%|██████████| 5/5 [00:01<00:00,  4.41it/s]


Processing equity_savings folder


100%|██████████| 5/5 [00:01<00:00,  4.39it/s]


Processing aggressive_hybrid folder


100%|██████████| 5/5 [00:01<00:00,  4.33it/s]


In [55]:
dfs.keys()

dict_keys(['arbitrage', 'multi_asset', 'conservative_hybrid', 'equity_savings', 'aggressive_hybrid'])

In [56]:
dfs['arbitrage'].keys()

dict_keys(['Invesco_India_Arbitrage_Fund', 'Kotak_Equity_Arbitrage_Fund', 'SBI_Arbitrage_Opportunities_Fund', 'ICICI_Prudential_Equity_Arbitrage_Fund', 'HDFC_Arbitrage_Fund_Wholesale_Plan'])

In [57]:
dfs['arbitrage']['Invesco_India_Arbitrage_Fund']

Unnamed: 0,NAV
2020-03-02,24.8453
2020-03-03,24.8360
2020-03-04,24.8369
2020-03-05,24.8587
2020-03-06,24.8667
...,...
2025-02-14,33.5898
2025-02-17,33.5797
2025-02-18,33.6183
2025-02-19,33.6177


In [58]:
for df in dfs['arbitrage'].values():

    print(len(df.index))

1229
1229
1232
1229
1229


In [59]:
for big_key in dfs.keys():
    print(f"{big_key}:")
    for key, df in dfs[big_key].items():
        print(f"{key}: {len(df.index)}")

arbitrage:
Invesco_India_Arbitrage_Fund: 1229
Kotak_Equity_Arbitrage_Fund: 1229
SBI_Arbitrage_Opportunities_Fund: 1232
ICICI_Prudential_Equity_Arbitrage_Fund: 1229
HDFC_Arbitrage_Fund_Wholesale_Plan: 1229
multi_asset:
SBI_Multi_Asset_Allocation_Fund: 1209
ICICI_Prudential_Multi_Asset_Fund: 1229
HDFC_Multi_Asset_Fund: 1229
Nippon_India_Multi_Asset_Allocation_Fund: 1106
UTI_Multi_Asset_Allocation_Fund: 1229
conservative_hybrid:
HDFC_Hybrid_Debt_Fund: 1207
SBI_Conservative_Hybrid_Fund: 1209
Kotak_Debt_Hybrid_Fund: 1206
ICICI_Prudential_Regular_Savings_Fund: 1206
UTI_Conservative_Hybrid_Fund: 1206
equity_savings:
ICICI_Prudential_Equity_Savings_Fund: 1229
HDFC_Equity_Savings_Fund: 1229
Kotak_Equity_Savings_Fund: 1229
SBI_Equity_Savings_Fund: 1232
DSP_Equity_Savings_Fund: 1229
aggressive_hybrid:
Canara_Robeco_Equity_Hybrid_fund: 1229
DSP_Aggresive_Hybrid_Fund: 1229
HDFC_Hybrid_Equity_Fund: 1229
SBI_Equity_Hybrid_Fund: 1232
ICICI_Prudential_Equity_and_Debt_Fund: 1229


In [60]:
merged_df = None
for big_key in dfs.keys():
    # print(f"Processing {big_key}")
    for key, df in tqdm(dfs[big_key].items()):
        df = df.copy().add_suffix(f'_{big_key}_{key}')
        if merged_df is None:
            merged_df = df
        else:
            merged_df = merged_df.merge(df, how='outer', left_index=True, right_index=True)

merged_df.dropna(inplace=True)
merged_df.sort_index(inplace=True)

100%|██████████| 5/5 [00:00<00:00, 1617.67it/s]
100%|██████████| 5/5 [00:00<00:00, 1767.36it/s]
100%|██████████| 5/5 [00:00<00:00, 2139.51it/s]
100%|██████████| 5/5 [00:00<00:00, 2305.32it/s]
100%|██████████| 5/5 [00:00<00:00, 1906.33it/s]


In [61]:
merged_df

Unnamed: 0,NAV_arbitrage_Invesco_India_Arbitrage_Fund,NAV_arbitrage_Kotak_Equity_Arbitrage_Fund,NAV_arbitrage_SBI_Arbitrage_Opportunities_Fund,NAV_arbitrage_ICICI_Prudential_Equity_Arbitrage_Fund,NAV_arbitrage_HDFC_Arbitrage_Fund_Wholesale_Plan,NAV_multi_asset_SBI_Multi_Asset_Allocation_Fund,NAV_multi_asset_ICICI_Prudential_Multi_Asset_Fund,NAV_multi_asset_HDFC_Multi_Asset_Fund,NAV_multi_asset_Nippon_India_Multi_Asset_Allocation_Fund,NAV_multi_asset_UTI_Multi_Asset_Allocation_Fund,...,NAV_equity_savings_ICICI_Prudential_Equity_Savings_Fund,NAV_equity_savings_HDFC_Equity_Savings_Fund,NAV_equity_savings_Kotak_Equity_Savings_Fund,NAV_equity_savings_SBI_Equity_Savings_Fund,NAV_equity_savings_DSP_Equity_Savings_Fund,NAV_aggressive_hybrid_Canara_Robeco_Equity_Hybrid_fund,NAV_aggressive_hybrid_DSP_Aggresive_Hybrid_Fund,NAV_aggressive_hybrid_HDFC_Hybrid_Equity_Fund,NAV_aggressive_hybrid_SBI_Equity_Hybrid_Fund,NAV_aggressive_hybrid_ICICI_Prudential_Equity_and_Debt_Fund
2020-08-31,25.4211,29.5394,26.7231,27.3938,15.078,32.1628,287.8636,37.309,9.9220,38.2235,...,15.15,39.897,15.9160,14.7307,13.740,184.65,173.529,55.886,151.2363,142.45
2020-09-01,25.3963,29.5134,26.6938,27.3812,15.058,32.4572,291.2120,37.488,9.9562,38.3948,...,15.20,40.036,15.9548,14.7996,13.807,185.71,174.930,56.125,152.6903,143.93
2020-09-02,25.4084,29.5244,26.7084,27.3935,15.067,32.5219,291.9228,37.622,10.0002,38.5434,...,15.24,40.126,16.0017,14.8511,13.851,186.78,175.594,56.378,153.6755,144.42
2020-09-03,25.4188,29.5370,26.7201,27.4074,15.072,32.6015,289.9180,37.565,9.9214,38.4933,...,15.23,40.094,16.0169,14.8543,13.852,186.84,175.436,56.298,153.4914,143.71
2020-09-04,25.4265,29.5504,26.7204,27.4153,15.078,32.3780,285.4421,37.205,9.8052,38.1414,...,15.14,39.768,15.9315,14.7546,13.782,185.16,173.647,55.494,151.5306,141.64
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2025-02-13,33.5721,38.9609,34.9659,35.7932,19.636,60.4248,772.1037,73.649,20.9540,76.9516,...,23.17,69.808,26.9512,24.6317,23.309,370.30,371.043,118.515,299.8113,395.68
2025-02-14,33.5898,38.9752,34.9810,35.8078,19.642,60.2079,769.5966,73.397,20.8358,76.4988,...,23.14,69.594,26.7976,24.4879,23.249,366.50,367.602,118.026,298.3150,393.00
2025-02-17,33.5797,38.9679,34.9729,35.8005,19.640,60.0667,770.4111,73.382,20.7984,76.3587,...,23.15,69.669,26.8137,24.4576,23.232,366.69,367.209,117.851,297.9564,393.39
2025-02-18,33.6183,39.0128,35.0093,35.8409,19.662,60.0516,771.3346,73.380,20.8171,76.3092,...,23.17,69.661,26.8587,24.4015,23.283,366.61,366.961,117.465,297.7959,393.86


In [62]:
merged_df.columns

Index(['NAV_arbitrage_Invesco_India_Arbitrage_Fund',
       'NAV_arbitrage_Kotak_Equity_Arbitrage_Fund',
       'NAV_arbitrage_SBI_Arbitrage_Opportunities_Fund',
       'NAV_arbitrage_ICICI_Prudential_Equity_Arbitrage_Fund',
       'NAV_arbitrage_HDFC_Arbitrage_Fund_Wholesale_Plan',
       'NAV_multi_asset_SBI_Multi_Asset_Allocation_Fund',
       'NAV_multi_asset_ICICI_Prudential_Multi_Asset_Fund',
       'NAV_multi_asset_HDFC_Multi_Asset_Fund',
       'NAV_multi_asset_Nippon_India_Multi_Asset_Allocation_Fund',
       'NAV_multi_asset_UTI_Multi_Asset_Allocation_Fund',
       'NAV_conservative_hybrid_HDFC_Hybrid_Debt_Fund',
       'NAV_conservative_hybrid_SBI_Conservative_Hybrid_Fund',
       'NAV_conservative_hybrid_Kotak_Debt_Hybrid_Fund',
       'NAV_conservative_hybrid_ICICI_Prudential_Regular_Savings_Fund',
       'NAV_conservative_hybrid_UTI_Conservative_Hybrid_Fund',
       'NAV_equity_savings_ICICI_Prudential_Equity_Savings_Fund',
       'NAV_equity_savings_HDFC_Equity_Savings_F

In [63]:
merged_df

Unnamed: 0,NAV_arbitrage_Invesco_India_Arbitrage_Fund,NAV_arbitrage_Kotak_Equity_Arbitrage_Fund,NAV_arbitrage_SBI_Arbitrage_Opportunities_Fund,NAV_arbitrage_ICICI_Prudential_Equity_Arbitrage_Fund,NAV_arbitrage_HDFC_Arbitrage_Fund_Wholesale_Plan,NAV_multi_asset_SBI_Multi_Asset_Allocation_Fund,NAV_multi_asset_ICICI_Prudential_Multi_Asset_Fund,NAV_multi_asset_HDFC_Multi_Asset_Fund,NAV_multi_asset_Nippon_India_Multi_Asset_Allocation_Fund,NAV_multi_asset_UTI_Multi_Asset_Allocation_Fund,...,NAV_equity_savings_ICICI_Prudential_Equity_Savings_Fund,NAV_equity_savings_HDFC_Equity_Savings_Fund,NAV_equity_savings_Kotak_Equity_Savings_Fund,NAV_equity_savings_SBI_Equity_Savings_Fund,NAV_equity_savings_DSP_Equity_Savings_Fund,NAV_aggressive_hybrid_Canara_Robeco_Equity_Hybrid_fund,NAV_aggressive_hybrid_DSP_Aggresive_Hybrid_Fund,NAV_aggressive_hybrid_HDFC_Hybrid_Equity_Fund,NAV_aggressive_hybrid_SBI_Equity_Hybrid_Fund,NAV_aggressive_hybrid_ICICI_Prudential_Equity_and_Debt_Fund
2020-08-31,25.4211,29.5394,26.7231,27.3938,15.078,32.1628,287.8636,37.309,9.9220,38.2235,...,15.15,39.897,15.9160,14.7307,13.740,184.65,173.529,55.886,151.2363,142.45
2020-09-01,25.3963,29.5134,26.6938,27.3812,15.058,32.4572,291.2120,37.488,9.9562,38.3948,...,15.20,40.036,15.9548,14.7996,13.807,185.71,174.930,56.125,152.6903,143.93
2020-09-02,25.4084,29.5244,26.7084,27.3935,15.067,32.5219,291.9228,37.622,10.0002,38.5434,...,15.24,40.126,16.0017,14.8511,13.851,186.78,175.594,56.378,153.6755,144.42
2020-09-03,25.4188,29.5370,26.7201,27.4074,15.072,32.6015,289.9180,37.565,9.9214,38.4933,...,15.23,40.094,16.0169,14.8543,13.852,186.84,175.436,56.298,153.4914,143.71
2020-09-04,25.4265,29.5504,26.7204,27.4153,15.078,32.3780,285.4421,37.205,9.8052,38.1414,...,15.14,39.768,15.9315,14.7546,13.782,185.16,173.647,55.494,151.5306,141.64
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2025-02-13,33.5721,38.9609,34.9659,35.7932,19.636,60.4248,772.1037,73.649,20.9540,76.9516,...,23.17,69.808,26.9512,24.6317,23.309,370.30,371.043,118.515,299.8113,395.68
2025-02-14,33.5898,38.9752,34.9810,35.8078,19.642,60.2079,769.5966,73.397,20.8358,76.4988,...,23.14,69.594,26.7976,24.4879,23.249,366.50,367.602,118.026,298.3150,393.00
2025-02-17,33.5797,38.9679,34.9729,35.8005,19.640,60.0667,770.4111,73.382,20.7984,76.3587,...,23.15,69.669,26.8137,24.4576,23.232,366.69,367.209,117.851,297.9564,393.39
2025-02-18,33.6183,39.0128,35.0093,35.8409,19.662,60.0516,771.3346,73.380,20.8171,76.3092,...,23.17,69.661,26.8587,24.4015,23.283,366.61,366.961,117.465,297.7959,393.86


In [68]:
px.line(merged_df)

In [76]:
utils.plot_correlation_matrix(merged_df, height=1000, width=1200)