In [2]:
import feather
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from scipy import sparse

# Load Data

## Load Sparse matrix

In [3]:
# Location of the full holdings matrix, stored in SciPy's compressed .npz format.
path = 'data/large_sparse.npz'
sparse_matrix = sparse.load_npz(file=path)

In [4]:
# Report the dimensions and the number of stored non-zero entries of the full matrix.
n_rows, n_cols = sparse_matrix.shape
print('Loaded {:,} rows and {:,} columns of sparse data'.format(n_rows, n_cols))
print('With {:,} nonzero elements'.format(sparse_matrix.getnnz()))

Loaded 738,860 rows and 2,382,968 columns of sparse data
With 177,604,206 nonzero elements


#plt.spy(sparse_matrix, markersize = 0.004)

## Load dictionaries for large sparse matrix

In [5]:
# Each .npy file holds a Python dict wrapped in a 0-d object array, so it is
# stored via pickle. NumPy >= 1.16.3 refuses to unpickle unless
# allow_pickle=True is passed explicitly; `.item()` then unwraps the dict.
# NOTE: unpickling can execute arbitrary code -- only load files you created
# yourself or otherwise trust.
stock_map = np.load('data/stock_map.npy', allow_pickle=True).item()

port_no_map = np.load('data/port_no_map.npy', allow_pickle=True).item()

In [6]:
# Reverse lookup for stock_map: maps each value back to its key.
# If a value occurs more than once, the key seen last wins.
inv_stock_map = dict(zip(stock_map.values(), stock_map.keys()))

## Load info_df

In [5]:
# Read the per-row metadata and drop its last row in a single step, so that
# re-running this cell never trims the frame twice.
# NOTE(review): presumably the last row is dropped to mirror the `[-200_000:-1]`
# slice taken from sparse_matrix below -- confirm.
path = 'data/info_df_total.feather'
info_df = feather.read_dataframe(path)[:-1]

In [6]:
# Eyeball three randomly chosen metadata rows (no seed: different rows on every run).
info_df.sample(n=3)

Unnamed: 0,port_ID,port_no,report_dt,index_fund_flag,et_flag,crsp_obj_cd,mutual_fund,s_crsp_obj_cd
108578,1030788-2013-10-31,1030788.0,2013-10-31,D,F,EDSI,N,Other
54452,1029375-2014-09-30,1029375.0,2014-09-30,E,F,EDYH,N,EDYH
9856,1028412-2013-05-31,1028412.0,2013-05-31,D,F,EDSM,N,Other


# Prepare subset of sparse data

In [7]:
# Work on a subset: start 200,000 rows from the end and stop before the final
# row, which yields 199,999 rows.
SUBSET_ROWS = 200_000
sparse_matrix_ss = sparse_matrix[-SUBSET_ROWS:-1]

In [8]:
# Report the dimensions and the number of stored non-zero entries of the subset.
ss_rows, ss_cols = sparse_matrix_ss.shape
print('Loaded {:,} rows and {:,} columns of sparse data'.format(ss_rows, ss_cols))
print('With {:,} nonzero elements'.format(sparse_matrix_ss.getnnz()))

Loaded 199,999 rows and 2,382,968 columns of sparse data
With 42,841,300 nonzero elements


#plt.spy(sparse_matrix_ss, markersize = 0.004)

# Sample selection function

In [9]:
# Keep only rows that carry an objective code.
info_df_wo_na = info_df.loc[info_df['crsp_obj_cd'].notna()]

# One row per port_no (first occurrence wins) together with its
# s_crsp_obj_cd label, which is later used to stratify the split.
obj_per_unique_portno = (
    info_df_wo_na[['port_no', 's_crsp_obj_cd']]
    .drop_duplicates(subset='port_no')
)

In [10]:
# Stratification labels, one per unique port_no, in the same order as the frame.
label_column = obj_per_unique_portno["s_crsp_obj_cd"]
labels = list(label_column)

In [11]:
# Split at the port_no level (not the row level) so that all rows of one
# port_no end up on the same side; stratified by label and seeded so the
# split is repeatable.
split_options = dict(test_size=0.33, stratify=labels, random_state=42)
X_train_s, X_test_s = train_test_split(
    obj_per_unique_portno['port_no'],
    **split_options
)

In [12]:
# Check if some portno appear in both train and test
set(list(X_train_s)) & set(list(X_test_s))

set()

# Set up train/test boolean mask

In [28]:
# Broad list of crsp_obj_cd values.
# NOTE(review): 'EDSC' used to be listed twice here; the duplicate has been
# removed, which does not change `.isin(classes)` membership. If the second
# entry was a typo for a different code, add the intended code instead.
# NOTE(review): the following cell reassigns `classes`, so on a top-to-bottom
# run this list is never used -- delete one of the two cells.
classes = ['EDCM', 'EDCS',
           'EDSC',
           'EDYB', 'EDYG', 'EDYH', 'EDYI', 'EDYS',
           'EF', 'EFRE', 'EFRM', 'EFSN']

In [13]:
# crsp_obj_cd values that are kept in the train/test samples.
classes = [
    'EDYG',
    'EDYB',
    'EDYH',
    'EDYS',
    'EDYI',
]

In [14]:
# Condition shared by both samples: a mutual fund whose objective code is one
# of the selected classes.
eligible = (info_df['mutual_fund'] == 'Y') & info_df['crsp_obj_cd'].isin(classes)

# Row-level boolean masks: eligible rows whose port_no fell into the
# respective side of the port_no-level split.
train_mask = info_df['port_no'].isin(list(X_train_s)) & eligible
test_mask = info_df['port_no'].isin(list(X_test_s)) & eligible

## Create samples

In [15]:
# The masks are computed from info_df but applied *positionally* (via .values)
# to sparse_matrix_ss, so row i of info_df must describe row i of the matrix.
# Fail loudly on a length mismatch instead of silently selecting wrong rows.
assert len(info_df) == sparse_matrix_ss.shape[0], (
    'info_df has {:,} rows but sparse_matrix_ss has {:,}; '
    'the boolean masks would be misaligned'.format(len(info_df), sparse_matrix_ss.shape[0])
)

# Metadata rows for each sample.
info_train = info_df[train_mask]
info_test = info_df[test_mask]

# Matching rows of the sparse matrix, selected with the same masks.
sparse_matrix_train = sparse_matrix_ss[train_mask.values, :]
sparse_matrix_test = sparse_matrix_ss[test_mask.values, :]

In [16]:
# Row counts of the metadata frames and the matching sparse matrices;
# within each sample the two numbers should agree.
n_train_info, n_train_data = info_train.shape[0], sparse_matrix_train.shape[0]
n_test_info, n_test_data = info_test.shape[0], sparse_matrix_test.shape[0]

print('Rows training: \n{:,} rows info\n{:,} rows data'.format(n_train_info, n_train_data))
print('')
print('Rows testing: \n{:,} rows info\n{:,} rows data'.format(n_test_info, n_test_data))
print('')
print('Total rows: {:,}'.format(n_train_data + n_test_data))

Rows training: 
34,003 rows info
34,003 rows data

Rows testing: 
16,000 rows info
16,000 rows data

Total rows: 50,003


# Save info_train/test and sparse_matrix_train/test

In [17]:
# Persist the metadata of both samples as feather files.
for frame, path in (
    (info_train, 'data/final/info_train.feather'),
    (info_test, 'data/final/info_test.feather'),
):
    feather.write_dataframe(frame, path)

In [18]:
# Persist the sparse matrices of both samples in SciPy's .npz format.
for path, matrix in (
    ('data/final/sparse_matrix_train.npz', sparse_matrix_train),
    ('data/final/sparse_matrix_test.npz', sparse_matrix_test),
):
    sparse.save_npz(path, matrix)

# Check samples against the database (see extra notebook)

In [None]:
# Spot check: pick one random row of the training matrix so that its entries
# can be compared by hand against the database.
np.random.seed()  # no fixed seed: a different row is drawn on every run
# Bound the draw by the actual number of training rows. The previous
# hard-coded upper bound (132000) is larger than the training matrix, so most
# draws raised IndexError.
test = np.random.randint(0, sparse_matrix_train.shape[0])

# Slice the row once and read both the stored values and their column indices.
sampled_row = sparse_matrix_train[test]
holdings = sampled_row.data
indices = sampled_row.indices

In [None]:
# Metadata of the sampled row and the number of stored entries in it.
print(info_train.iloc[test])

print(len(holdings))

# List (stock, value) pairs, largest value first. The same permutation is
# applied to the column indices and to the values; sorting the values alone
# (as before) paired each stock with a value belonging to a different column.
order = np.argsort(holdings)[::-1]
list(zip(
    [inv_stock_map.get(col) for col in indices[order]],
    list(holdings[order])
))

# Plot col and row sums

In [None]:
# Column totals (transposed into a single column) and row totals of the
# subset matrix, each wrapped in a one-column DataFrame.
col_sums = pd.DataFrame(sparse_matrix_ss.sum(axis=0).T)
row_sums = pd.DataFrame(sparse_matrix_ss.sum(axis=1))

In [None]:
# Distribution of the per-row totals of sparse_matrix_ss.
n, bins, patches = plt.hist(row_sums.values, bins = 10, facecolor='g', alpha=0.75)

# Labels describe what is actually plotted; the previous 'Smarts' /
# 'Histogram of IQ' texts were leftovers from a matplotlib example.
plt.xlabel('Row sum')
plt.ylabel('Number of rows')
plt.title('Histogram of row sums (sparse_matrix_ss)')
plt.grid(True)
plt.show()