In [1]:
# Mount Google Drive (Colab only) so the CSV files under /content/gdrive
# can be read and written by the cells below.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [2]:
import pandas as pd
from scipy import stats
import numpy as np
from tqdm.notebook import tqdm

In [7]:
# Read in the Vjt, IQRjt and weighted IQRjt tables plus the
# instrument -> industry mapping from the mounted Drive.
df_industry_mapping = pd.read_csv('/content/gdrive/My Drive/capstone/full database/instrument_industry_match.csv')
df_Vjt = pd.read_csv('/content/gdrive/My Drive/capstone/data for trend/variance_jt_instrument.csv')
df_IQRjt = pd.read_csv('/content/gdrive/My Drive/capstone/data for trend/225 investors/quantile_range_jt_instrument.csv')
df_weighted_IQRjt = pd.read_csv('/content/gdrive/My Drive/capstone/data for trend/225 investors/weighted_delta_IQR_jt.csv')
# Store the industry id as a string so it can be compared with string
# literals such as '54' later in the notebook.
# NOTE(review): depending on the pandas version, astype(str) may turn a
# missing id into the literal string 'nan', which the `x == x` NaN filter in
# get_time_series_t_test would not remove -- confirm the mapping has no gaps.
df_industry_mapping.HierarchicalId_2 = df_industry_mapping.HierarchicalId_2.astype(str)

In [8]:
# Preview the weighted-IQR table. The 'Unnamed: 0' column in the output looks
# like a saved DataFrame index -- presumably the CSV was written with its
# index included (TODO confirm).
df_weighted_IQRjt.head()

Unnamed: 0,INSTRID,HOLDDATE,weighted_delta
0,8589934597,2010-06-30,5.582703
1,8589934597,2010-09-30,0.00833
2,8589934597,2010-12-31,0.008464
3,8589934597,2011-03-31,0.004761
4,8589934597,2011-06-30,0.625148


In [9]:
# Attach the industry id (HierarchicalId_2) to every row of each table with a
# left join on INSTRID, keeping only the columns the t-test needs.
industry_test_columns = ['INSTRID', 'HOLDDATE', 'Delta', 'HierarchicalId_2']
merged_Vjt = df_Vjt.merge(df_industry_mapping, how='left', on='INSTRID')[industry_test_columns]
merged_IQRjt = df_IQRjt.merge(df_industry_mapping, how='left', on='INSTRID')[industry_test_columns]
# The weighted table names its value column 'weighted_delta'; rename it to
# 'Delta' so all three frames share one schema.
merged_weighted_IQRjt = (
    df_weighted_IQRjt
    .merge(df_industry_mapping, how='left', on='INSTRID')
    [['INSTRID', 'HOLDDATE', 'weighted_delta', 'HierarchicalId_2']]
    .rename(columns={'weighted_delta': 'Delta'})
)

In [7]:
# HOLDDATE_list = list(set(merged_IQRjt.HOLDDATE))
# HOLDDATE_list.sort() # from '2010-06-30' to '2020-06-30'
# industry_list = [x for x in list(set(merged_IQRjt.HierarchicalId)) if x == x]

In [24]:
def get_time_series_t_test(df, nan_policy='raise'):
  """Welch t-test of each industry's Delta between consecutive HOLDDATEs.

  For every distinct, non-missing value of ``HierarchicalId_2`` and every pair
  of adjacent dates in the sorted list of distinct ``HOLDDATE`` values, the
  ``Delta`` values observed at the later date are compared with those observed
  at the earlier date using ``scipy.stats.ttest_ind(..., equal_var=False)``.

  Parameters
  ----------
  df : pandas.DataFrame
    Must contain the columns ``HOLDDATE``, ``HierarchicalId_2`` and ``Delta``.
    ``HOLDDATE`` values are ordered with ``sorted``; ISO date strings such as
    '2010-06-30' therefore end up in chronological order.
  nan_policy : str, optional
    Forwarded to ``scipy.stats.ttest_ind``. The default ``'raise'`` keeps the
    original behaviour (a ``ValueError`` if a sample contains NaN); pass
    ``'omit'`` to ignore NaN ``Delta`` values instead.

  Returns
  -------
  pandas.DataFrame
    One row per (industry, later date) with the columns ``HierarchicalId_2``,
    ``HOLDDATE`` (the later date of the pair), ``t_statistic`` and ``p_value``.
    ``p_value`` is the two-sided p-value divided by two, i.e. a one-tailed
    p-value in the direction of the observed difference; read it together
    with the sign of ``t_statistic``. Rows are ordered by industry, then date.
  """
  result_columns = ['HierarchicalId_2', 'HOLDDATE', 't_statistic', 'p_value']

  # Sorted (not set-ordered) so the output row order is reproducible between
  # runs; missing dates / industries are dropped up front.
  holddates = sorted(df['HOLDDATE'].dropna().unique())
  industries = sorted(df['HierarchicalId_2'].dropna().unique(), key=str)

  records = []
  for industry in tqdm(industries):
    # Filter the full frame once per industry instead of once per date pair.
    industry_rows = df.loc[df['HierarchicalId_2'] == industry, ['HOLDDATE', 'Delta']]
    # Each date's sample is used twice (as "current" and as "before"), so
    # extract it once. A date with no rows yields an empty sample.
    delta_by_date = {
        date: industry_rows.loc[industry_rows['HOLDDATE'] == date, 'Delta']
        for date in holddates
    }
    for date_before, date_current in zip(holddates, holddates[1:]):
      t_statistic, p_value = stats.ttest_ind(
          delta_by_date[date_current],
          delta_by_date[date_before],
          equal_var=False,
          nan_policy=nan_policy)
      records.append({
          'HierarchicalId_2': industry,
          'HOLDDATE': date_current,
          't_statistic': t_statistic,
          # Halve the two-sided p-value to report a one-tailed p-value.
          'p_value': p_value / 2,
      })

  return pd.DataFrame(records, columns=result_columns)

In [33]:
# Spot check: run the same Welch t-test by hand for one industry and one pair
# of dates on the IQR table, and print the pieces that go into it.
df = merged_IQRjt
industry = '54'
date_before = '2012-12-31'
date_current = '2013-03-31'

in_industry = df['HierarchicalId_2'] == industry
sample_current = df.loc[(df['HOLDDATE'] == date_current) & in_industry, 'Delta']
sample_before = df.loc[(df['HOLDDATE'] == date_before) & in_industry, 'Delta']

welch_result = stats.ttest_ind(
    sample_current, sample_before, equal_var=False, nan_policy='raise')
t_statistic, p_value = welch_result

# Two-sided result first, then the sample means (current, before) and the
# sample sizes (before, current).
print(t_statistic, p_value)
for sample in (sample_current, sample_before):
  print(sample.mean())
for sample in (sample_before, sample_current):
  print(len(sample))

0.9527016032733964 0.34164345465943247
0.00017547998262951948
0.00011260532254828837
214
216


In [34]:
# !pip install bioinfokit

# Cross-check the scipy result above with bioinfokit's two-sample t-test
# (unequal variances) on the same two samples.
from bioinfokit.analys import get_data, stat

# Long format: group 'A' is the earlier sample, group 'B' the later one.
label = ['A'] * len(sample_before) + ['B'] * len(sample_current)
samples = [*sample_before, *sample_current]
df_test = pd.DataFrame({'label': label, 'data': samples})

res = stat()
res.ttest(df=df_test, xfac="label", res="data", test_type=2, evar=False)
print(res.summary)



Two sample t-test with unequal variance (Welch's t-test)

------------------  -------------
Mean diff            -6.28747e-05
t                    -0.952702
Std Error             6.59962e-05
df                  255
P-value (one-tail)    0.170822
P-value (two-tail)    0.341643
Lower 95.0%          -0.000192842
Upper 95.0%           6.70923e-05
------------------  -------------

Parameter estimates

Level      Number         Mean      Std Dev    Std Error    Lower 95.0%    Upper 95.0%
-------  --------  -----------  -----------  -----------  -------------  -------------
A             214  0.000112605  0.000282847  1.9335e-05     7.44928e-05    0.000150718
B             216  0.00017548   0.000927382  6.31003e-05    5.11055e-05    0.000299854



In [28]:
# Run the consecutive-HOLDDATE Welch t-tests per industry on the IQR and the
# weighted-IQR tables (one progress bar per call).
df_IQRjt_t_test = get_time_series_t_test(merged_IQRjt)
df_weighted_IQRjt_t_test = get_time_series_t_test(merged_weighted_IQRjt)
# The Vjt run is disabled.
# NOTE(review): presumably because df_Vjt['Delta'] contains NaNs and the test
# is called with nan_policy='raise' -- confirm.
# df_Vjt_t_test = get_time_series_t_test(merged_Vjt)

HBox(children=(FloatProgress(value=0.0, max=11.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=11.0), HTML(value='')))




In [29]:
# df_IQRjt_t_test
# Persist both result tables to the '225 investors' folder on Drive.
# NOTE(review): to_csv is called without index=False, so the row index is
# written as an extra unnamed first column.
df_weighted_IQRjt_t_test.to_csv('/content/gdrive/My Drive/capstone/data for trend/225 investors/weighted_IQRjt_t_test.csv')
df_IQRjt_t_test.to_csv('/content/gdrive/My Drive/capstone/data for trend/225 investors/IQRjt_t_test.csv')

In [8]:

# Number of Vjt rows whose Delta is missing, followed by the total row count.
print(df_Vjt['Delta'].isna().sum())
print(df_Vjt.shape[0])

43904
315889
