# Biased Clustering

This notebook uses time-biased clustering to detect trends in financial journals.

In [1]:
import json

import numpy as np
import pandas as pd
import scipy.sparse
from tqdm import tqdm

from Biased_Clusters import get_clusters_dist, get_clusters_timeline, get_top_keywords, get_silhouette, cal_cluster_bias

In [2]:
# read the cleaned article table from disk and preview its first five rows
df = pd.read_csv(filepath_or_buffer='data/data_cleaned.csv')
df.head(n=5)

Unnamed: 0,Year,Year_Scaled,Year_STD,Month,Keywords,Abstract,Abstract_Cleaned,Abstract Length,Keywords_Cleaned,Number of Keywords,Month_Cleaned
0,2020,1.309502,172.088518,March,"['Capital structure', 'Corporate taxation', 'D...","Absent theoretical guidance, empiricists have ...",absent theoret guidance empiricist forc reli u...,1047,"['capital structure', 'corporate taxation', 'd...",5,3
1,2020,1.309502,172.088518,March,"['Credit spreads', 'LBO risk', 'Structural mod...",Recent decades have witnessed several waves of...,recent decad wit sever wave buyout activity fi...,580,"['credit spreads', 'lbo risk', 'structural mod...",4,3
2,2020,1.309502,172.088518,March,"['Fire sales', 'Liquidity management', 'Mutual...",We develop three novel measures of the incenti...,develop three novel measur incent equiti mutua...,586,"['fire sales', 'liquidity management', 'mutual...",3,3
3,2020,1.309502,172.088518,March,"['Asset pricing', 'Leverage constraints', 'Lot...",We test whether the low-risk effect is driven ...,test whether lowrisk effect driven leverag con...,861,"['asset pricing', 'leverage constraints', 'lot...",5,3
4,2020,1.309502,172.088518,March,"['Gender gap', 'Entrepreneurship', 'Angel inve...",We study whether early stage investors have ge...,studi whether earli stage investor gender bias...,742,"['gender gap', 'entrepreneurship', 'angel inve...",4,3


## Biased Clustering

In [3]:
# build trend score table

# load training data
x_vector = np.load('data/x_vector.npy')

# one record per bias amount that get_silhouette managed to score
data = []
# (bias amount, error message) for every bias amount that raised ValueError
skipped_bias = []

# get Silhouette score, std year for various bias amount
# sweep: integer steps 1, 11, ..., 991 scaled by 0.01 -> bias 0.01, 0.11, ..., 9.91
for step in tqdm(range(1, 1000, 10)):
    # keep the loop counter and the derived bias under separate names so the
    # loop variable is never overwritten inside its own loop
    bias_amount = step * .01

    try:
        data.append(get_silhouette(df, x_vector, bias_amount))
    except ValueError as err:
        # keep the sweep going, but remember what was dropped and why instead
        # of silently leaving a gap in the result table
        skipped_bias.append((bias_amount, str(err)))

# report skipped bias amounts after the loop so the progress bar stays intact
if skipped_bias:
    print(f'Skipped {len(skipped_bias)} bias amount(s) that raised ValueError:')
    for skipped_amount, reason in skipped_bias:
        print(f'  bias={skipped_amount:.2f}: {reason}')

# create a dataframe to store the results
df_result = pd.DataFrame(data)

# save df_result to csv file
df_result.to_csv('data/table_trend_score.csv', index=False)

# display the first 5 rows of df_result
df_result.head()

100%|██████████| 100/100 [01:04<00:00,  1.55it/s]


Unnamed: 0,step,Silhouette Score,silhouette_clsuter_1,std_year_cluster_1,silhouette_by_std_year_cluster_1,silhouette_clsuter_2,std_year_cluster_2,silhouette_by_std_year_cluster_2,silhouette_clsuter_3,std_year_cluster_3,...,silhouette_clsuter_14,std_year_cluster_14,silhouette_by_std_year_cluster_14,silhouette_clsuter_15,std_year_cluster_15,silhouette_by_std_year_cluster_15,std_tfidf,avg_std,avg_std_year,avg_silhouette_by_std_year
0,0.01,0.401087,0.138561,0.890458,0.155606,0.091259,1.033853,0.08827,0.471724,0.777657,...,0.548945,0.96518,0.568749,0.148511,0.891817,0.166527,1.398914,0.162029,0.949418,0.408044
1,0.11,0.392377,0.417502,1.011822,0.412624,0.10258,0.912889,0.112369,0.464789,1.226889,...,0.106491,0.883233,0.12057,0.415601,1.177908,0.35283,1.381413,0.163185,0.938209,0.410346
2,0.21,0.394902,0.305599,0.711571,0.429471,0.14167,0.857936,0.165129,0.072841,1.081688,...,0.12919,0.885802,0.145845,0.160048,0.954357,0.167702,1.389396,0.162186,0.949212,0.402559
3,0.31,0.394768,0.507219,0.969751,0.52304,0.118583,0.903324,0.131274,0.545162,0.961835,...,0.501449,0.693164,0.723421,0.397489,0.959855,0.414114,1.389313,0.155819,0.956667,0.410818
4,0.41,0.39166,0.517036,0.879088,0.58815,0.10548,0.917896,0.114915,0.405788,1.181116,...,0.374557,0.858492,0.436296,0.498144,0.693164,0.718652,1.383925,0.173859,0.94876,0.404727


In [4]:
# NOTE: `json` and `scipy.sparse` are imported in the notebook's top import cell.

# load terms sparse matrix
terms_sparse_matrix = scipy.sparse.load_npz('data/terms_sparse_matrix.npz')

# load terms label (the file is parsed as JSON despite its .txt extension);
# the encoding is explicit so the result does not depend on the platform default
with open("data/terms_label.txt", "r", encoding="utf-8") as fp:
    terms_label = json.load(fp)

# bias amount used for the final clustering run
# NOTE(review): presumably picked from the trend-score table above — TODO confirm.
# Keep in sync with the "0.57" embedded in the CSV file names written in the save cell.
CLUSTER_BIAS = 0.57

# make predictions
summary, predictions = cal_cluster_bias(df, x_vector, terms_sparse_matrix, terms_label, CLUSTER_BIAS)

In [5]:
# display the summary table returned by cal_cluster_bias (one row per Cluster Id)
summary

Unnamed: 0,Terms,Trend Score,Bias Avg Std Year,Silhoutte Score,Number of Articles,Percentage of Articles,Cluster Id,Timeline
0,"effect, find, returns, estim, term, structur, ...",0.475847,0.944438,0.449408,263,0.092022,1,"1974-1978, 1980-1982, 1984-1992, 1995-2020"
1,"ep, dates, noinform, clientel, aggreg, larg, a...",0.493886,1.001476,0.494615,14,0.004899,2,"1986-1988, 1994, 2001, 2006, 2012, 2014-2016"
2,"returns, interv, increas, find, asset, aggreg,...",0.908194,0.646902,0.587512,10,0.003499,3,"1989, 2006, 2008, 2010-2011, 2014-2015, 2017"
3,"order, relat, use, control, ownership, tax, st...",0.141796,0.885826,0.125607,403,0.141008,4,"1974-1977, 1979-1980, 1983-2020"
4,"team, signals, interquartil, label, japanes, c...",1.037225,0.855856,0.887715,10,0.003499,5,"1983, 1994-1998, 2001-2002, 2015, 2020"
5,"use, effect, activ, trade, market, posit, earn...",0.368486,0.871256,0.321045,276,0.096571,6,"1975, 1977-1978, 1980, 1983-1993, 1995-2020"
6,"acquisit, announc, return, firms, invest, incr...",0.616076,0.908284,0.559572,131,0.045836,7,"1975, 1978, 1983, 1985-1987, 1989-1992, 1994-2020"
7,"perform, models, data, factor, empir, portfoli...",0.377243,1.182285,0.446009,171,0.059832,8,1974-2020
8,"condit, paper, invest, predict, find, explain,...",0.377685,1.017945,0.384462,257,0.089923,9,"1974-1994, 1996-2020"
9,"investor, use, portfolio, forward, call, valua...",0.367841,1.230563,0.452651,217,0.075927,10,"1974-1987, 1989-2020"


In [6]:
# preview the first five articles together with their assigned Cluster Id
predictions.head(n=5)

Unnamed: 0,Year,Year_Scaled,Year_STD,Month,Keywords,Abstract,Abstract_Cleaned,Abstract Length,Keywords_Cleaned,Number of Keywords,Month_Cleaned,Cluster Id
0,2020,1.309502,172.088518,March,"['Capital structure', 'Corporate taxation', 'D...","Absent theoretical guidance, empiricists have ...",absent theoret guidance empiricist forc reli u...,1047,"['capital structure', 'corporate taxation', 'd...",5,3,10
1,2020,1.309502,172.088518,March,"['Credit spreads', 'LBO risk', 'Structural mod...",Recent decades have witnessed several waves of...,recent decad wit sever wave buyout activity fi...,580,"['credit spreads', 'lbo risk', 'structural mod...",4,3,14
2,2020,1.309502,172.088518,March,"['Fire sales', 'Liquidity management', 'Mutual...",We develop three novel measures of the incenti...,develop three novel measur incent equiti mutua...,586,"['fire sales', 'liquidity management', 'mutual...",3,3,6
3,2020,1.309502,172.088518,March,"['Asset pricing', 'Leverage constraints', 'Lot...",We test whether the low-risk effect is driven ...,test whether lowrisk effect driven leverag con...,861,"['asset pricing', 'leverage constraints', 'lot...",5,3,9
4,2020,1.309502,172.088518,March,"['Gender gap', 'Entrepreneurship', 'Angel inve...",We study whether early stage investors have ge...,studi whether earli stage investor gender bias...,742,"['gender gap', 'entrepreneurship', 'angel inve...",4,3,4


In [7]:
# preview the number of articles per (Year, Cluster Id) pair
get_clusters_dist(predictions).head(n=5)

Unnamed: 0,Year,Cluster Id,Number of Articles
0,1974,1,2
1,1974,4,1
2,1974,8,1
3,1974,9,4
4,1974,10,4


In [8]:
# write the cluster summary and the per-article predictions to CSV,
# leaving the DataFrame index out of both files
summary.to_csv(path_or_buf='data/summary_bias_0.57.csv', index=False)
predictions.to_csv(path_or_buf='data/predictions_0.57.csv', index=False)