In [1]:
# Mount Google Drive into the Colab runtime at /content/gdrive; the CSV paths
# read later in this notebook live under that mount point.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [93]:
# igraph
!pip install python-igraph
from igraph import *

# package to show plot of igraph
!pip install cairocffi
import cairocffi



In [20]:
import pandas as pd
import numpy as np


In [37]:
# read in distance matrix
# Single place to change if the distance-matrix folder on Drive moves.
DIST_MATRIX_DIR = '/content/gdrive/My Drive/capstone/distance matrix'

def _read_dist_matrix(file_name):
  '''
  Load one distance-matrix CSV from DIST_MATRIX_DIR, using the first CSV column as the row index.
  '''
  return pd.read_csv(DIST_MATRIX_DIR + '/' + file_name, index_col=0)

industry_matrix = _read_dist_matrix('industry_matrix.csv')
mktcap_matrix = _read_dist_matrix('mktcap_matrix.csv')
n_instruments_matrix = _read_dist_matrix('n_instruments_matrix.csv')
turnover_matrix = _read_dist_matrix('turnover_matrix.csv')
concentration_matrix = _read_dist_matrix('concentration_matrix.csv')
# note the capital "S" in this file name
invstyle_matrix = _read_dist_matrix('invStyle_matrix.csv')
total_hld_matrix = _read_dist_matrix('total_hld_matrix.csv')

In [38]:
def get_adjacency_mat_from_dist_mat(dist_mat):
  '''
  Turn a distance matrix into an adjacency (edge-weight) matrix.

  Each off-diagonal distance d becomes the weight (largest - d + smallest), where
  largest/smallest are taken over the off-diagonal entries only, so the smallest
  distance receives the largest weight and vice versa. The diagonal and any
  missing distances get weight 0.

  dist_mat: DataFrame of pairwise distances (any numeric dtype); it is not modified.
  Returns a new float DataFrame with the same index and columns as dist_mat.
  '''
  # Work on an explicit float copy: writing NaN through DataFrame.values fails for
  # integer data and is not guaranteed to write through to the frame.
  weights = dist_mat.to_numpy(dtype=float, copy=True)

  # Exclude the diagonal (self-distance) from the max/min computation
  np.fill_diagonal(weights, np.nan)
  largest = np.nanmax(weights)
  smallest = np.nanmin(weights)

  # Inverse distance to be edge weight
  weights = largest - weights + smallest
  # Diagonal / missing entries carry no edge
  weights[np.isnan(weights)] = 0.0

  return pd.DataFrame(weights, index=dist_mat.index, columns=dist_mat.columns)

In [39]:
# Turn each of the 7 distance matrices into its adjacency matrix, in one pass.
# The target names on the left line up positionally with the inputs on the right.
(ad_mktcap,
 ad_industry,
 ad_turnover,
 ad_concentration,
 ad_invstyle,
 ad_n_instruments,
 ad_total_hld) = map(get_adjacency_mat_from_dist_mat,
                     (mktcap_matrix,
                      industry_matrix,
                      turnover_matrix,
                      concentration_matrix,
                      invstyle_matrix,
                      n_instruments_matrix,
                      total_hld_matrix))

In [68]:
# Map each feature name to its adjacency matrix.
feature_dic = dict(
    portfolio_mkt=ad_mktcap,
    portfolio_industry=ad_industry,
    turnover=ad_turnover,
    concentration=ad_concentration,
    invStyle=ad_invstyle,
    n_instruments=ad_n_instruments,
    total_hld=ad_total_hld,
)

# Rescale every matrix by its own largest entry and make the row labels strings.
# Iterate over a snapshot of the items because the dict values are replaced in the loop.
for feature, feature_ad_mat in list(feature_dic.items()):
  peak = feature_ad_mat.max().max()
  scaled = feature_ad_mat / peak
  scaled.index = scaled.index.map(str)
  feature_dic[feature] = scaled

In [70]:
# Get the intersection investor list from all 7 adjacency mat
# the investors list is the intersection of all 7 feature mat index
inv_intersect_ls = set.intersection(
    *(set(feature_mat.index) for feature_mat in feature_dic.values()))

print(len(inv_intersect_ls))
# Expose the shared investors as a sorted list rather than a set: it is used
# below to index DataFrames and to label graph vertices, both of which need a
# sequence with a stable order.
ls_investors = sorted(inv_intersect_ls)

225


In [89]:
# # if adjacency matrix have missing investors, fill in
# def fill_missing(df, ls_investors):
#   missing_investors = ls_investors - set(df.index)
#   for index in missing_investors:
#     df.loc[:,str(index)] = 0
#     df.loc[index] = 0
#   df.fillna(0,inplace=True)
#   df.sort_index(inplace=True)
#   return df

# for feature in feature_dic:
#   feature_ad_mat = feature_dic[feature]
#   feature_dic[feature] = fill_missing(feature_ad_mat,ls_investors)


# create feature dict using only investors in the intersection and sort by both col row index
# Select with a sorted list (a set is not a valid DataFrame indexer); picking rows
# and columns by the same sorted labels leaves both axes in that sorted order.
investors = sorted(ls_investors)
intersect_feature_dic = {}
for feature in feature_dic:
  adj_mat = feature_dic[feature].loc[investors, investors]
  intersect_feature_dic[feature] = adj_mat


# create an empty adjacency matrix using sorted intersection index
adjacency = pd.DataFrame(data = 0.0,
                         index=intersect_feature_dic['total_hld'].index,
                         columns=intersect_feature_dic['total_hld'].index)


# add up all features
for feature in intersect_feature_dic:
  adjacency += intersect_feature_dic[feature]

# # change column order to make column order coordinate with index order
# adjacency = adjacency[[str(i) for i in list(adjacency.index)]]

In [90]:
# Display the combined adjacency matrix (sum of the per-feature matrices built above)
adjacency

LGCYINVESTORID,10061989,10075382,10110259,10233291,10235680,2000264,2001934,2001935,2001989,2001999,2002019,2002142,2002146,2002158,2002179,2002180,2002195,2002205,2002222,2002292,2002295,2002303,2002362,2002389,2002446,2002451,2002468,2002496,2002506,2002511,2002548,2002582,2002628,2002658,2002659,2002678,2002695,2002713,2002715,2002801,...,3015075,3318171,3365036,3676543,3706224,3828352,3965008,4040011,4059340,4889409,4924814,4950170,4962258,5050280,5200477,5205742,5206490,5206923,5275446,5308615,5436717,5480445,5505145,5520904,5529187,5713469,5819294,5844860,5915225,5986931,6053249,6129655,6139212,6248967,6271047,6921752,7185253,9098246,9113038,9969058
LGCYINVESTORID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1
10061989,0.000000,5.117883,4.443423,5.230592,5.047739,4.842475,5.023472,4.740072,5.066365,4.926741,5.643603,5.276924,5.381739,5.096421,4.803101,5.029735,5.050604,5.075438,4.874387,5.596114,5.004836,5.538905,5.076159,5.415548,5.196345,4.780825,4.889076,5.413981,4.966220,4.667987,5.109988,4.920678,5.489124,4.819141,5.203088,5.123322,5.353369,4.731662,4.617665,5.457465,...,5.121113,4.856562,4.806564,5.371328,5.226285,5.227323,5.356751,5.295001,4.694719,4.601240,4.908843,5.138862,4.990374,5.283594,4.254359,5.148458,5.421754,5.332191,5.558167,4.485573,4.707733,5.558091,4.563647,4.979926,5.158062,5.916761,5.044509,5.359866,5.221795,4.414188,5.524020,5.782314,5.337294,5.272897,5.270389,4.612794,5.046228,5.199865,4.406134,5.539406
10075382,5.117883,0.000000,5.597658,5.913743,5.890947,4.840502,4.187476,3.970648,4.458919,4.150608,5.078726,5.095742,4.836948,4.358852,4.754323,4.647800,4.260273,4.355428,4.543296,5.098704,4.475552,5.224117,4.441618,5.029265,4.546818,3.957892,4.160610,4.781714,4.246561,3.922354,4.328697,4.114551,4.841495,4.236802,4.567258,4.396654,4.624315,4.069299,3.934612,4.677277,...,4.673919,4.508852,4.112984,5.339365,5.196943,5.986658,5.479327,5.513830,4.014410,4.086181,4.722353,4.775532,5.513655,5.241732,4.908497,4.956269,5.120643,4.960155,5.140098,4.551455,4.161327,4.967918,3.886590,4.573093,4.695447,5.141205,5.843342,5.088918,4.655650,4.480968,4.743729,5.100865,5.469612,4.526607,4.770405,4.294837,4.302241,5.185021,4.696609,5.653267
10110259,4.443423,5.597658,0.000000,5.377915,5.241696,5.347374,4.697801,3.700966,5.151012,4.587845,4.406914,5.340854,4.986874,4.915128,5.215037,5.277254,4.673428,4.942300,4.909382,4.821246,4.682006,5.072880,5.068446,4.236903,4.943798,4.547374,4.257130,4.910916,4.392177,4.508118,4.795527,4.774177,4.874549,4.556678,5.070390,4.886238,4.801988,4.469910,4.549807,4.898168,...,4.541693,4.268475,4.596269,4.721506,4.383409,5.549791,4.747518,4.837884,4.656162,4.729560,4.681633,4.931748,5.163454,4.957441,5.620728,4.890949,4.545987,4.962929,4.435229,4.108715,4.732730,4.753704,4.499867,3.781667,4.818501,4.529664,5.459207,4.385047,4.830760,4.745572,4.112105,4.430296,4.728527,5.134461,4.136295,4.955252,4.991951,4.495316,5.135658,5.012965
10233291,5.230592,5.913743,5.377915,0.000000,6.456974,4.767015,4.202148,4.085740,4.686595,4.215563,5.287623,5.166357,5.033237,4.468913,4.963340,4.714202,4.307930,4.477341,4.645422,5.011951,4.222421,5.123549,4.588318,4.757883,4.438351,4.019862,4.298667,4.754536,4.384560,3.821824,4.416161,4.188990,4.904006,4.441571,4.704210,4.456751,4.705694,4.196872,3.767585,4.682769,...,4.881713,4.267859,4.015103,5.634658,4.934530,6.180806,5.356810,5.760839,4.034098,4.489333,4.902112,4.575807,6.304697,5.061020,5.254205,4.995914,4.983765,4.944095,5.236454,4.549170,4.384916,4.831768,3.995697,4.363835,4.473830,5.486064,5.886350,5.018020,4.619410,4.615082,4.975116,5.132989,5.465780,4.581003,4.823354,4.266616,4.357847,5.193580,4.617180,5.835204
10235680,5.047739,5.890947,5.241696,6.456974,0.000000,4.710823,4.089859,3.922397,4.459292,4.075168,5.124102,5.005697,4.834408,4.363212,4.804657,4.576775,4.172212,4.360113,4.472499,4.889200,4.135842,4.977657,4.372361,4.662159,4.405169,3.893846,4.131323,4.652492,4.217214,3.727469,4.292554,4.032409,4.770081,4.233357,4.532084,4.351081,4.555878,4.037887,3.693675,4.594131,...,4.688993,4.172504,3.894012,5.370441,4.885313,5.968019,5.198953,5.540359,3.879538,4.273215,4.734756,4.462994,6.008563,4.966056,5.018223,4.811046,4.861136,4.897664,5.162170,4.352850,4.256474,4.758838,3.849717,4.251821,4.422727,5.178652,5.846477,4.861226,4.667620,4.471852,4.757424,4.969197,5.349797,4.436237,4.614071,4.136244,4.159439,5.052006,4.535414,5.612455
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6921752,4.612794,4.294837,4.955252,4.266616,4.136244,5.388298,5.078256,3.860055,5.806943,4.857958,4.314552,5.418535,5.303992,5.230467,5.313957,5.892347,5.026308,5.268099,5.248845,5.240962,5.207511,5.088084,5.260696,4.743766,5.100006,4.991313,4.470126,5.304973,4.588630,5.099061,5.065739,5.417250,4.988981,4.925772,5.300063,5.173208,5.072304,4.615409,5.277389,5.154667,...,4.690669,4.544775,5.249858,4.421570,4.349685,4.312327,4.767031,4.422057,5.172592,4.909702,4.962161,5.526999,4.257424,5.118012,5.112217,4.916790,4.760997,5.231195,4.739483,3.979249,4.965957,5.181239,4.903032,4.265817,5.264827,4.576985,4.389863,4.555721,4.960619,4.917241,4.309976,4.804695,4.556549,5.768520,4.380416,0.000000,5.557416,4.564208,5.089152,4.647050
7185253,5.046228,4.302241,4.991951,4.357847,4.159439,5.475617,5.746142,4.569528,5.932105,5.631470,4.983464,5.663203,5.413541,5.860282,5.342139,5.962400,5.725220,6.011689,5.236880,5.279303,5.358036,5.373986,6.315771,4.825469,5.629845,5.650805,5.124323,5.723703,5.262838,5.650336,5.771457,5.849459,5.769845,5.211826,6.102591,5.883427,5.717105,5.374992,5.696685,5.796201,...,4.959578,4.659256,5.357931,4.645569,4.556146,4.428246,4.587949,4.521182,5.761222,5.050058,4.785094,5.496969,4.214170,5.031286,5.150440,4.896325,4.834666,5.224777,4.626689,4.045445,5.301322,5.563161,5.642905,4.373171,5.446389,5.322418,4.536422,4.434170,5.333933,4.714497,4.915718,4.998576,4.580245,6.070515,4.406118,5.557416,0.000000,4.410193,5.041271,4.717819
9098246,5.199865,5.185021,4.495316,5.193580,5.052006,4.425470,4.203375,3.999766,4.603921,4.122001,4.871101,4.785816,5.072892,4.253713,4.970028,4.686111,4.329921,4.267679,4.957479,5.118211,4.612682,5.284181,4.396686,5.238828,4.402913,3.944432,4.110333,4.916872,4.178834,4.022973,4.231015,4.346339,4.752182,4.388700,4.465408,4.289580,4.646966,3.974305,4.112594,4.605988,...,5.162630,4.778815,4.649619,5.030374,4.997048,5.252441,5.381828,5.049919,4.039056,4.495424,5.245005,4.960178,4.995437,5.039615,4.352970,5.194280,5.144506,4.880138,5.596220,4.611182,4.777743,5.050809,3.755948,4.811896,4.788306,5.320256,5.064038,5.193679,4.693638,5.135546,5.023132,5.484237,5.129673,4.617962,5.288707,4.564208,4.410193,0.000000,4.376656,5.604396
9113038,4.406134,4.696609,5.135658,4.617180,4.535414,5.231358,5.393014,4.353525,5.172111,5.264310,4.453681,5.288478,4.968881,5.302678,5.129255,5.367846,5.374235,5.126803,4.947832,4.950773,4.970437,4.921818,5.084093,4.667270,5.184158,5.225613,4.929199,5.410727,5.042961,5.149921,5.200888,5.280087,5.134877,5.123778,5.227546,5.436550,5.441191,5.162479,5.092939,5.280677,...,4.591073,4.443499,4.888534,4.360503,4.281992,4.868768,4.757842,4.567226,5.178648,4.591753,4.731852,5.184367,4.431911,4.818563,4.971804,4.734929,4.670776,5.108073,4.500835,4.196725,4.655585,5.178285,5.040378,4.033044,5.064940,4.652448,4.928944,4.244588,4.931378,4.857021,4.688129,4.503028,4.878109,5.371439,4.093968,5.089152,5.041271,4.376656,0.000000,5.287001


In [114]:
def drop_edges_output_mat(adjacency, pct=0.5, thres = None):
  '''
  Remove some proportion of edges if the weight is low. If threshold is provided, use the threshold

  adjacency: DataFrame of edge weights; it is not modified.
  pct: fraction (0-1) of edges to drop; only used when thres is None.
  thres: explicit cut-off; entries that are not strictly greater than it become 0.
  Returns a new DataFrame of the same shape with the dropped entries set to 0.
  '''
  if thres is None:
    weights = adjacency.to_numpy()
    # The diagonal holds self-pairs, not edges; leaving it in would pull the
    # percentile down and drop fewer than pct of the real edges.
    is_edge = ~np.eye(*weights.shape, dtype=bool)
    candidates = weights[is_edge] if is_edge.any() else weights.ravel()
    thres = np.percentile(candidates, pct*100)
  print('threshold is', thres)
  # remove bottom pct of edges
  adjacency_filtered = adjacency[adjacency > thres].fillna(0)
  return adjacency_filtered

# drop_edges_output_mat(adjacency, thres = 6).to_csv('/content/gdrive/My Drive/capstone/distance matrix/all_features_adj_mat_thres_6.csv')

threshold is 6


In [108]:
# Count the distinct column labels that remain after thresholding at 5
filtered_at_5 = drop_edges_output_mat(adjacency, thres = 5)
len(set(filtered_at_5.columns))

threshold is 5


225

In [91]:
def drop_edges(adjacency, pct=0.5, thres = None):
  '''
  Remove some proportion of edges if the weight is low. If threshold is provided, use the threshold

  adjacency: DataFrame of edge weights.
  pct: fraction (0-1) of edges to drop; only used when thres is None.
  thres: explicit cut-off passed through to drop_edges_output_mat.
  Returns an igraph Graph built from the filtered matrix; vertex i corresponds
  to row i of adjacency.
  '''
  # Reuse the matrix-level filter so the thresholding rule lives in one place
  # (it also prints the threshold that was applied).
  adjacency_filtered = drop_edges_output_mat(adjacency, pct=pct, thres=thres)
  # create undirected, weighted graph
  g = Graph.Weighted_Adjacency(adjacency_filtered.values.tolist(),mode=ADJ_UNDIRECTED)
  return g

In [100]:

g = drop_edges(adjacency,thres=2)
# Walktrap community detection on the weighted graph
communities = g.community_walktrap(weights=g.es["weight"]).as_clustering()
# The graph is built from the rows of `adjacency` in order, so take the labels
# from its index as a list; an unordered set would not line up with the vertices.
plot(communities,
     vertex_label = list(adjacency.index),
     vertex_size = 16)


threshold is 2


TypeError: ignored

In [None]:
# # construct a layout
# layout = g.layout_fruchterman_reingold(weights=g.es["weight"])

# # construct the plot settings
# plot_settings = dict(
#         layout=layout,
#         edge_width=rescale(g.es["weight"], out_range=(0.0, 5.0))
# )

# # plot the graph
# plot(g.community_walktrap(weights=g.es["weight"]).as_clustering(),**plot_settings)
# # plot(g, **plot_settings)