In [33]:
import pandas as pd
import numpy as np
import matplotlib 
import seaborn as sns
import random
import matplotlib.pyplot as plt
from dateutil.parser import parse
from scipy import signal
from scipy.interpolate import interp1d
from scipy import stats
from statsmodels.tsa.stattools import adfuller, kpss, acf, pacf, grangercausalitytests
from statsmodels.nonparametric.smoothers_lowess import lowess
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from sklearn.metrics import mean_squared_error

df_collection = {}

dfs_may = pd.read_html('stock_data/May_2000.htm', header =0)
df_collection['May'] = pd.DataFrame(dfs_may[0])
dfs_june = pd.read_html('stock_data/June_2000.htm', header =0)
df_collection['June'] = dfs_june[0]
dfs_july = pd.read_html('stock_data/July_2000.htm', header =0)
df_collection['July'] = dfs_july[0]
dfs_august = pd.read_html('stock_data/August_2000.htm', header =0)
df_collection['Aug'] = dfs_august[0]
dfs_sept = pd.read_html('stock_data/Sept_2000.htm', header =0)
df_collection['Sept'] = dfs_sept[0]
dfs_oct = pd.read_html('stock_data/Oct_2000.htm', header =0)
df_collection['Oct'] = dfs_oct[0]

print((df_collection['May']))

        Date Contract  Units  $Volume  LowPrice  HighPrice  AvgPrice  \
0   05/01/00      Dem    224  112.043     0.490      0.550     0.500   
1   05/01/00   Reform      2    0.067     0.019      0.048     0.034   
2   05/01/00      Rep    116   57.950     0.488      0.501     0.500   
3   05/02/00      Dem     87   44.369     0.501      0.522     0.510   
4   05/02/00   Reform     50    0.196     0.003      0.005     0.004   
..       ...      ...    ...      ...       ...        ...       ...   
88  05/30/00   Reform     53    0.318     0.006      0.006     0.006   
89  05/30/00      Rep    249  126.944     0.504      0.510     0.510   
90  05/31/00      Dem    252  126.890     0.492      0.514     0.504   
91  05/31/00   Reform    115    0.445     0.003      0.004     0.004   
92  05/31/00      Rep     67   33.803     0.495      0.507     0.505   

    LastPrice  
0       0.550  
1       0.019  
2       0.500  
3       0.508  
4       0.003  
..        ...  
88      0.006  
89     

In [34]:
for key in df_collection:
    df = df_collection[key]
    df_collection[key] = df_collection[key][df_collection[key].Contract != 'Reform']
    df_collection[key] = df_collection[key].drop(['Units','$Volume','LowPrice','HighPrice','LastPrice'], axis = 1)
    df_collection[key] = df_collection[key].dropna()
    df_collection[key]['NormalizedPrice'] = 0
    
    for index,row in df_collection[key].iterrows():
        if(row['Contract'] == 'Dem'):
            demPrice = float(row['AvgPrice'])
            #print(demPrice)
            repPrice = df_collection[key].loc[df_collection[key]['Date'] == row['Date']]
            repPrice = repPrice[repPrice.Contract != 'Dem']
            repPrice.reset_index(drop=True, inplace=True)
            if(repPrice.empty):
                df_collection[key].loc[index,'NormalizedPrice'] = -1
            else:
                repPrice = repPrice.iloc[0]['AvgPrice']
                repPrice = float(repPrice)
                df_collection[key].loc[index,'NormalizedPrice'] = demPrice / (demPrice+repPrice)
        if(row['Contract'] == 'Rep'):
            repPrice = float(row['AvgPrice'])
            #print(demPrice)
            demPrice = df_collection[key].loc[df_collection[key]['Date'] == row['Date']]
            demPrice = demPrice[demPrice.Contract != 'Rep']
            demPrice.reset_index(drop=True, inplace=True)
            if(demPrice.empty):
                df_collection[key].loc[index,'NormalizedPrice'] = -1
            else:
                demPrice = demPrice.loc[0,'AvgPrice']
                demPrice = float(demPrice)
                df_collection[key].loc[index,'NormalizedPrice'] = repPrice / (demPrice+repPrice)

In [35]:
frames = []
for key in df_collection:
    frames.append(df_collection[key])
    
df_all_normalized = pd.concat(frames)
df_all_normalized

Unnamed: 0,Date,Contract,AvgPrice,NormalizedPrice
0,05/01/00,Dem,0.500,0.500000
2,05/01/00,Rep,0.500,0.500000
3,05/02/00,Dem,0.510,0.507463
5,05/02/00,Rep,0.495,0.492537
6,05/03/00,Dem,0.509,0.508492
...,...,...,...,...
86,10/29/00,Rep,0.674,0.670647
87,10/30/00,Dem,0.356,0.352475
89,10/30/00,Rep,0.654,0.647525
90,10/31/00,Dem,0.381,0.382530


In [36]:
df_plsa = pd.read_csv ('plsa.csv')
print(df_plsa)

         date     topic  probability
0    05/03/00    warner     0.008921
1    05/04/00    connor     0.005673
2    05/05/00      bush     0.006957
3    05/02/00     cable     0.010275
4    05/20/00  giuliani     0.014083
..        ...       ...          ...
179  06/12/00      code     0.006021
180  06/13/00     court     0.010863
181  06/14/00     court     0.013246
182  06/22/00      drug     0.005958
183  06/25/00  saturday     0.003432

[184 rows x 3 columns]


In [37]:
topics = df_plsa.topic.unique()

df_topics_collection = {} 

for topic in topics:
    df_topics_collection[topic] = df_plsa.loc[df_plsa['topic'] == topic]

#print(df_topics_collection[topics[1]])

In [45]:
#Run a sample granger test 
df = df_topics_collection[topics[2]]
df = df.rename(columns={"date": "Date"})
df = pd.merge(df, df_all_normalized, on="Date")
temp_df = df.loc[(df['Contract'] == 'Rep')]

temp_df    

res = grangercausalitytests(df[['probability', 'NormalizedPrice']], maxlag=5)
res


Granger Causality
number of lags (no zero) 1
ssr based F test:         F=0.8525  , p=0.3617  , df_denom=38, df_num=1
ssr based chi2 test:   chi2=0.9198  , p=0.3375  , df=1
likelihood ratio test: chi2=0.9096  , p=0.3402  , df=1
parameter F test:         F=0.8525  , p=0.3617  , df_denom=38, df_num=1

Granger Causality
number of lags (no zero) 2
ssr based F test:         F=0.2803  , p=0.7573  , df_denom=35, df_num=2
ssr based chi2 test:   chi2=0.6406  , p=0.7259  , df=2
likelihood ratio test: chi2=0.6356  , p=0.7278  , df=2
parameter F test:         F=0.2803  , p=0.7573  , df_denom=35, df_num=2

Granger Causality
number of lags (no zero) 3
ssr based F test:         F=0.2918  , p=0.8310  , df_denom=32, df_num=3
ssr based chi2 test:   chi2=1.0668  , p=0.7851  , df=3
likelihood ratio test: chi2=1.0525  , p=0.7886  , df=3
parameter F test:         F=0.2918  , p=0.8310  , df_denom=32, df_num=3

Granger Causality
number of lags (no zero) 4
ssr based F test:         F=0.1406  , p=0.9657  , df_d

{1: ({'ssr_ftest': (0.8524938095918814, 0.36167367053895305, 38.0, 1),
   'ssr_chi2test': (0.9197959524543983, 0.33752855419014605, 1),
   'lrtest': (0.9096303357590045, 0.34021252053964746, 1),
   'params_ftest': (0.8524938095918764, 0.3616736705389544, 38.0, 1.0)},
  [<statsmodels.regression.linear_model.RegressionResultsWrapper at 0x7fd92c330f10>,
   <statsmodels.regression.linear_model.RegressionResultsWrapper at 0x7fd92ca20880>,
   array([[0., 1., 0.]])]),
 2: ({'ssr_ftest': (0.28027810554315613, 0.7572534038858721, 35.0, 2),
   'ssr_chi2test': (0.6406356698129282, 0.725918278235963, 2),
   'lrtest': (0.635559620623269, 0.7277630166854896, 2),
   'params_ftest': (0.28027810554315247, 0.7572534038858751, 35.0, 2.0)},
  [<statsmodels.regression.linear_model.RegressionResultsWrapper at 0x7fd92ea5ffd0>,
   <statsmodels.regression.linear_model.RegressionResultsWrapper at 0x7fd92ec6cd00>,
   array([[0., 0., 1., 0., 0.],
          [0., 0., 0., 1., 0.]])]),
 3: ({'ssr_ftest': (0.291777258