## Description

Description:

Filter the three main data sources (Fund Info, Returns and Holdings)
based on some parameters and save the result.

Approach:
1. Match fund summary to each portfolio/date pair
2. Filter fund/date pairs based on that information
3. Filter returns based on final sample of holdings

Parameters: 
- Obj_codes
- Percentage
- Flags

In [1]:
import os
import sys

# add the 'src' directory as one where we can import modules
src_dir = os.path.join(os.getcwd(), os.pardir, 'src')
sys.path.append(src_dir)

In [2]:
# import my method from the source code
from data.basic_functions import * 
from scipy import sparse

import numpy as np
import pandas as pd

### Load the three data files

In [None]:
# Directory holding the raw input files (relative to the working directory)
data_path = '../data/raw/'

# Tabular sources are read through the project helper `load_data`
summary = load_data(data_path, 'total_summary')
returns = load_data(data_path, 'monthly_returns')
# NOTE(review): `info_matched` is referenced further down in this notebook,
# but its load is disabled here -- confirm whether it should be re-enabled.
# info_matched = load_data(data_path,'info_df_total')

# Holdings are stored as a SciPy sparse matrix in .npz format
npz_path = os.path.join(data_path, 'large_sparse.npz')
holdings = sparse.load_npz(npz_path)

# Functions

In [78]:
def port_ID_split(port_ID):
    """
    Split a combined portfolio identifier into portfolio number and date.

    Only identifiers that are exactly 18 characters long are handled: the
    first 7 characters are parsed as the portfolio number and everything
    from the 9th character onward as the report date
    (e.g. '1028236-2010-09-30').

    Parameters
    ----------
    port_ID : str
        Identifier of the form '<7 chars portno><1 char sep><10 chars date>'.

    Returns
    -------
    tuple or None
        ``(port_no, report_dt)`` where ``report_dt`` is a ``pd.Timestamp``;
        ``None`` when ``port_ID`` is not 18 characters long.
    """
    if len(port_ID) != 18:
        return None

    number_part, date_part = port_ID[:7], port_ID[8:]
    # Parse the date first, then the number (same order as before, so the
    # same error surfaces for malformed input).
    report_dt = pd.Timestamp(date_part)
    port_no = pd.to_numeric(number_part)
    return (port_no, report_dt)

In [79]:
def port_ID_to_port_info(port_ID,info_df):
    """
    Look up the fund characteristics valid for a portfolio on its report date.

    The port_ID is split into portfolio number and report date via
    ``port_ID_split``. ``info_df`` is then filtered to rows with that
    ``crsp_portno`` whose ``[begdt, enddt]`` interval contains the report
    date, and the flags / objective code of the first such row are returned.

    Parameters
    ----------
    port_ID : str
        Identifier of the form '<portno>-<YYYY-MM-DD>' (18 characters).
    info_df : pandas.DataFrame
        Must provide the columns ``crsp_portno``, ``begdt``, ``enddt``,
        ``crsp_obj_cd``, ``index_fund_flag`` and ``et_flag``.

    Returns
    -------
    tuple
        ``(port_ID, port_no, report_dt, index_fund_flag, et_flag,
        crsp_obj_cd)``. The last three entries are NaN when no row of
        ``info_df`` matches; all six entries are NaN when ``port_ID`` does
        not have 18 characters.

    Notes
    -----
    The "no matching row" case is detected explicitly instead of through a
    bare ``except:``. A bare except also swallowed ``KeyboardInterrupt``
    (the lookup loop could not be interrupted cleanly) and hid genuine
    errors such as a missing column, which now propagate.
    """
    if len(port_ID) != 18:
        return (np.nan, np.nan, np.nan, np.nan, np.nan, np.nan)

    port_no, report_dt = port_ID_split(port_ID)

    # All summary rows that belong to this portfolio number
    same_port = info_df.loc[info_df['crsp_portno'].values == port_no]

    # ... restricted to the row(s) whose validity window covers the date
    valid = same_port.loc[
        (same_port.begdt <= report_dt) &
        (same_port.enddt >= report_dt)]

    if valid.empty:
        # No fund-summary row for this portfolio/date pair
        return (port_ID, port_no, report_dt, np.nan, np.nan, np.nan)

    crsp_obj_cd = valid['crsp_obj_cd'].values[0]
    index_fund_flag = valid['index_fund_flag'].values[0]
    et_flag = valid['et_flag'].values[0]

    return (port_ID, port_no, report_dt, index_fund_flag, et_flag, crsp_obj_cd)

In [80]:
def port_map_to_port_ID(port_map):
    """
    Rebuild readable portfolio IDs from the keys of a portfolio map.

    Every key is expected to look like '<portno>-<days>', where ``<days>``
    is an integer number of days counted from 1960-01-01. The day offset is
    converted to a calendar date and re-attached to the portfolio number.

    Parameters
    ----------
    port_map : dict
        Mapping whose keys are strings of the form '<portno>-<days>'. Only
        the keys are used.

    Returns
    -------
    list of str
        One '<portno>-YYYY-MM-DD' string per key, in key order. An empty
        map gives an empty list.
    """
    # Use the argument, not the notebook-level `port_no_map` global that the
    # previous version read by accident.
    keys = list(port_map)
    if not keys:
        return []

    port_numbers = []
    day_offsets = []
    for key in keys:
        number_part, offset_part = key.split('-')
        port_numbers.append(number_part)
        day_offsets.append(offset_part)

    offsets = pd.to_numeric(day_offsets)
    dates = pd.to_timedelta(offsets, unit='D') + pd.Timestamp('1960-1-1')

    return [number + '-' + str(day.date())
            for number, day in zip(port_numbers, dates)]

# Data

In [72]:
info = summary

In [75]:
# Index maps written to the interim data folder.
# NOTE(review): if stock_map was also saved from a Python object it needs
# allow_pickle=True as well -- confirm.
stock_map = np.load('../data/interim/stock_map.npy')  # TODO make portnomap with real date
# The portfolio map is a Python object wrapped in a 0-d array (hence the
# `.item()` below), so NumPy has to unpickle it. `allow_pickle` defaults to
# False since NumPy 1.16.3 and the load would otherwise raise ValueError.
# Only enable this for trusted, locally produced files.
port_no_map_a = np.load('../data/interim/port_no_map.npy', allow_pickle=True)
port_no_map = port_no_map_a.item()

In [83]:
# Replace the loaded map with the list of readable port_IDs returned by
# port_map_to_port_ID.
# NOTE(review): the name is rebound to a different kind of object, so
# running this cell a second time passes that list back into the function.
port_no_map = port_map_to_port_ID(port_no_map)

NameError: name 'pd' is not defined

In [77]:
# Report rows / columns of the raw summary table
print('Shape of summary before cleaning is: {:,} / {:,}'.format(*info.shape))

Shape of summary before cleaning is: 166,461 / 12


In [136]:
# 1. Keep only rows that have a portfolio number (drop missing crsp_portno)
# 2. Drop duplicates based on all but two columns (crsp_fundno, fund_name)
# 3. Downcast portno to the smallest integer type
#
# The downcast replaces the column via `assign` instead of
# `.loc[:, col] = ...`: recent pandas versions write `.loc[:, col]` into the
# existing float column in place, which silently discards the integer dtype,
# and assigning into a filtered frame may raise SettingWithCopyWarning.
info_clean = (
    info[info['crsp_portno'].notna()]
    .drop_duplicates(
        subset=info.columns.difference(['crsp_fundno','fund_name']).tolist())
    .assign(crsp_portno=lambda df: pd.to_numeric(df['crsp_portno'],
                                                 downcast='integer'))
)
# info_clean = info_clean.set_index('crsp_portno')

In [133]:
# Report rows / columns of the cleaned summary table
print('Shape after cleaning is: {:,} / {:,}'.format(*info_clean.shape))

Shape after cleaning is: 95,614 / 12


# Match obj code to port_ID

In [398]:
%%time

# Look up the fund info for the last 10,000 port_IDs only. Every call
# filters info_clean again from scratch (see port_ID_to_port_info), which is
# why the recorded run of this cell took several minutes.
# NOTE(review): later recorded outputs show far more than 10,000 rows, so
# they seem to stem from a run with a larger slice -- confirm.
test = [port_ID_to_port_info(x,info_clean) for x in port_no_map[-10_000:]]

CPU times: user 6min 21s, sys: 1.29 s, total: 6min 22s
Wall time: 6min 23s


In [545]:
labels = ['port_ID','port_no','report_dt','index_fund_flag','et_flag','crsp_obj_cd']

In [546]:
info_df = pd.DataFrame.from_records(test, columns=labels)

In [547]:
info_df.head()

Unnamed: 0,port_ID,port_no,report_dt,index_fund_flag,et_flag,crsp_obj_cd
0,1028236-2010-09-30,1028236.0,2010-09-30,D,F,EDCM
1,1028236-2010-10-31,1028236.0,2010-10-31,D,F,EDCM
2,1028236-2010-11-30,1028236.0,2010-11-30,D,F,EDCM
3,1028236-2010-12-31,1028236.0,2010-12-31,D,F,EDCM
4,1028236-2011-01-31,1028236.0,2011-01-31,D,F,EDCM


In [548]:
info_df[info_df['crsp_obj_cd'].isna()].shape # -> For some portfolios there is simply no row in fund_header

(1143, 6)

# Clean data

In [549]:
# A missing ETF flag is treated as "not an ETF": fill the gaps with 'N'
info_df['et_flag'] = info_df['et_flag'].fillna('N')

In [550]:
# A missing index-fund flag is treated as "not an index fund": fill with 'N'
info_df['index_fund_flag'] = info_df['index_fund_flag'].fillna('N')

In [551]:
# Create new flag var 'mutual_fund': 'Y' when the row is neither an index
# fund nor an ETF (both flags equal 'N'), 'N' for every other row
info_df['mutual_fund'] = np.where(
    info_df['index_fund_flag'].eq('N') & info_df['et_flag'].eq('N'),
    'Y',
    'N',
)

In [556]:
# Create subsample of crsp_obj_cd: the 20 codes with the most rows keep
# their own label, every other row (including missing codes) becomes 'Other'
most_common = (
    info_df
    .groupby('crsp_obj_cd')['port_no']
    .count()
    .sort_values()
    .iloc[-20:]
    .index
)

info_df['s_crsp_obj_cd'] = info_df['crsp_obj_cd'].where(
    info_df['crsp_obj_cd'].isin(most_common), 'Other')

info_df.sample(5)

Unnamed: 0,port_ID,port_no,report_dt,index_fund_flag,et_flag,crsp_obj_cd,mutual_fund,s_crsp_obj_cd
165136,1032558-2016-11-30,1032558.0,2016-11-30,B,F,O,N,O
112546,1030897-2014-05-31,1030897.0,2014-05-31,N,N,I,Y,I
77294,1029859-2013-08-31,1029859.0,2013-08-31,N,N,EDYB,Y,EDYB
75948,1029841-2015-07-31,1029841.0,2015-07-31,D,F,EFSF,N,Other
182137,1033328-2015-12-31,1033328.0,2015-12-31,N,N,I,Y,I


In [553]:
# Fill any still-missing mutual_fund values with 'N'
info_df.loc[info_df['mutual_fund'].isna(),'mutual_fund'] = 'N'

In [554]:
info_df.head()

Unnamed: 0,port_ID,port_no,report_dt,index_fund_flag,et_flag,crsp_obj_cd,mutual_fund,s_crsp_obj_cd
0,1028236-2010-09-30,1028236.0,2010-09-30,D,F,EDCM,N,EDCM
1,1028236-2010-10-31,1028236.0,2010-10-31,D,F,EDCM,N,EDCM
2,1028236-2010-11-30,1028236.0,2010-11-30,D,F,EDCM,N,EDCM
3,1028236-2010-12-31,1028236.0,2010-12-31,D,F,EDCM,N,EDCM
4,1028236-2011-01-31,1028236.0,2011-01-31,D,F,EDCM,N,EDCM


# Save info_df

In [555]:
# Persist the matched fund info in Feather format. pandas' own writer is
# used because the standalone `feather` name is not imported in any visible
# cell of this notebook.
# NOTE(review): unlike the other paths in this notebook this one does not
# start with '../' -- confirm the intended target directory.
path = 'data/info_df_total.feather'
info_df.to_feather(path)

### Restrict to last 200k lines

In [69]:
holdings_s = holdings[-200_000:] # TODO -> implement in another script

### Shape

In [71]:
holdings_s.shape

(200000, 2382968)

In [65]:
info_matched.shape

(200000, 8)

### Selection criteria

In [32]:
# crsp_obj_cd values a fund must have to be selected (used for mask_1 below)
selected_obj_codes = ('EDYG','EDYB' ,'EDYH' ,'EDYS' ,'EDYI')
# NOTE(review): not used in any visible cell; presumably the "Percentage"
# parameter named in the description -- confirm.
selected_portion = 0.1

### Select funds from summary

First select the funds based on their characteristics in the fund_summary table

TODO: Flags

Then select the matching fund returns and fund holdings

In [34]:
summary.head()

Unnamed: 0,crsp_fundno,crsp_portno,fund_name,first_offer_dt,index_fund_flag,et_flag,begdt,enddt,crsp_obj_cd,lipper_class,lipper_class_name,lipper_obj_cd
0,1.0,,AARP Income Trust: AARP Bond Fund for Income,1997-02-01,,,1997-12-31,1998-12-30,ICQM,,,
1,1.0,,AARP Income Trust: AARP Bond Fund for Income,1997-02-01,,,1998-12-31,1999-12-30,ICQY,,,BBB
2,1.0,,AARP Income Trust: AARP Bond Fund for Income,1997-02-01,,,1999-12-31,2000-07-31,ICQY,BBB,Corporate Debt Funds BBB-Rated,BBB
3,2.0,,AARP Managed Investment Portfolios Trust: AARP...,1997-02-03,,,1997-12-31,1998-12-30,EDYG,,,
4,2.0,,AARP Managed Investment Portfolios Trust: AARP...,1997-02-03,,,1998-12-31,1999-12-30,EDYB,,,GI


In [50]:
# Implement multiple masks
mask_1 = summary['crsp_obj_cd'].isin(selected_obj_codes) # only certain obj_codes
# mask_2 = summary['mutual_fund'] == 'Y' # only mutual funds TODO needs to be implemented
# (info_df['index_fund_flag'] == 'N') & (info_df['et_flag'] == 'N')

# info_df.loc[(info_df['index_fund_flag'] == 'N') & (info_df['et_flag'] == 'N'),'mutual_fund'] = 'Y'
# info_df.loc[info_df['mutual_fund'].isna(),'mutual_fund'] = 'N'

In [37]:
summary_s = summary.loc[mask_1]

In [44]:
summary.drop_duplicates?

### Filter returns

In [45]:
unique_fundnos = summary_s['crsp_fundno'].drop_duplicates()

In [48]:
# Keep only the return rows whose fund number survived the summary filter
mask = returns['crsp_fundno'].isin(unique_fundnos)
returns_s = returns.loc[mask]

### Filter holdings

In [None]:
# Split the matched info rows and the holdings rows into a train and a test
# part with boolean row masks.
# NOTE(review): `X_train_s`, `test_mask` and `sparse_matrix_ss` are not
# defined in any visible cell -- they have to exist before this cell can run.
# (The unbalanced closing parenthesis that made this cell a syntax error has
# been removed.)
train_mask = info_df['port_no'].isin(list(X_train_s))

info_train = info_df[train_mask]
info_test = info_df[test_mask]

sparse_matrix_train = sparse_matrix_ss[train_mask.values,:]
sparse_matrix_test = sparse_matrix_ss[test_mask.values,:]

### Packages

In [41]:
test = load_data('TEST_DF')

In [40]:
save_data(test,'TEST_DF')

In [None]:
# Reload the final train / test artefacts from disk. The info tables are
# read with pandas' own Feather reader because the standalone `feather` name
# is not imported in any visible cell of this notebook.
path = '../data/final/info_train.feather'
info_train = pd.read_feather(path)

path = '../data/final/info_test.feather'
info_test = pd.read_feather(path)

# Holdings matrices are stored as SciPy sparse .npz files
path = '../data/final/sparse_matrix_train.npz'
sparse_matrix_train = sparse.load_npz(path)

path = '../data/final/sparse_matrix_test.npz'
sparse_matrix_test = sparse.load_npz(path)

X_train = sparse_matrix_train
X_test = sparse_matrix_test

In [4]:
# Labels: one bucketed objective code per info row
y_train = info_train['s_crsp_obj_cd'].tolist()
y_test = info_test['s_crsp_obj_cd'].tolist()

# Row sums of the holdings matrices as flat 1-D arrays
data_train = np.asarray(X_train.sum(axis=1)).ravel()
data_test = np.asarray(X_test.sum(axis=1)).ravel()

# Rows are kept only if their absolute row sum is below this threshold
TOL = 300

index_train = np.abs(data_train) < TOL
index_test = np.abs(data_test) < TOL

# Apply the same boolean row mask to matrix, labels and info table
X_train = X_train[index_train]
X_test = X_test[index_test]

y_train = [label for label, keep in zip(y_train, index_train) if keep]
y_test = [label for label, keep in zip(y_test, index_test) if keep]

info_train = info_train[index_train]
info_test = info_test[index_test]

# Stacked train + test data
X_total = sparse.vstack((X_train, X_test))
y_total = y_train + y_test

# Integer group id per training row, one group per port_no
groups = info_train.groupby(['port_no']).ngroup()

print('Rows training: \n{:,} rows info\n{:,} rows data'.format(len(y_train),X_train.shape[0]))
print('')
print('Rows testing: \n{:,} rows info\n{:,} rows data'.format(len(y_test),X_test.shape[0]))
print('')
print('Total rows: {:,}'.format(X_train.shape[0] + X_test.shape[0]))

Rows training: 
33,976 rows info
33,976 rows data

Rows testing: 
15,993 rows info
15,993 rows data

Total rows: 49,969
