In [53]:
import pandas as pd
import numpy as np
import matplotlib 
import seaborn as sns
import random
import matplotlib.pyplot as plt
from dateutil.parser import parse
from scipy import signal
from scipy.interpolate import interp1d
from scipy import stats
from statsmodels.tsa.stattools import adfuller, kpss, acf, pacf, grangercausalitytests
from statsmodels.nonparametric.smoothers_lowess import lowess
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from sklearn.metrics import mean_squared_error

#This cell reads stock data from the Iowa Stock exchange that has been 
#pre-downloaded and stored in a folder called 'stock_data' 
#The output of this cell is a dictionary with each Month as a key and
#a dataframe containging that Month's stock data as the value 

df_collection = {}

dfs_may = pd.read_html('stock_data/May_2000.htm', header =0)
df_collection['May'] = pd.DataFrame(dfs_may[0])
dfs_june = pd.read_html('stock_data/June_2000.htm', header =0)
df_collection['June'] = dfs_june[0]
dfs_july = pd.read_html('stock_data/July_2000.htm', header =0)
df_collection['July'] = dfs_july[0]
dfs_august = pd.read_html('stock_data/August_2000.htm', header =0)
df_collection['Aug'] = dfs_august[0]
dfs_sept = pd.read_html('stock_data/Sept_2000.htm', header =0)
df_collection['Sept'] = dfs_sept[0]
dfs_oct = pd.read_html('stock_data/Oct_2000.htm', header =0)
df_collection['Oct'] = dfs_oct[0]

        Date Contract  Units  $Volume  LowPrice  HighPrice  AvgPrice  \
0   05/01/00      Dem    224  112.043     0.490      0.550     0.500   
1   05/01/00   Reform      2    0.067     0.019      0.048     0.034   
2   05/01/00      Rep    116   57.950     0.488      0.501     0.500   
3   05/02/00      Dem     87   44.369     0.501      0.522     0.510   
4   05/02/00   Reform     50    0.196     0.003      0.005     0.004   
..       ...      ...    ...      ...       ...        ...       ...   
88  05/30/00   Reform     53    0.318     0.006      0.006     0.006   
89  05/30/00      Rep    249  126.944     0.504      0.510     0.510   
90  05/31/00      Dem    252  126.890     0.492      0.514     0.504   
91  05/31/00   Reform    115    0.445     0.003      0.004     0.004   
92  05/31/00      Rep     67   33.803     0.495      0.507     0.505   

    LastPrice  
0       0.550  
1       0.019  
2       0.500  
3       0.508  
4       0.003  
..        ...  
88      0.006  
89     

In [54]:
#In order to perform any analysis for this data, we need to normalize this data. 
#We use the formula from our paper (Dem Price /(Dem Price + Rep Price)) as the basis
#for normalization. This new Noramalized value is added as a new column for each month's
#dataframe. 
for key in df_collection:
    df = df_collection[key]
    df_collection[key] = df_collection[key][df_collection[key].Contract != 'Reform']
    df_collection[key] = df_collection[key].drop(['Units','$Volume','LowPrice','HighPrice','LastPrice'], axis = 1)
    df_collection[key] = df_collection[key].dropna()
    df_collection[key]['NormalizedPrice'] = 0
    
    for index,row in df_collection[key].iterrows():
        if(row['Contract'] == 'Dem'):
            demPrice = float(row['AvgPrice'])
            #print(demPrice)
            repPrice = df_collection[key].loc[df_collection[key]['Date'] == row['Date']]
            repPrice = repPrice[repPrice.Contract != 'Dem']
            repPrice.reset_index(drop=True, inplace=True)
            if(repPrice.empty):
                df_collection[key].loc[index,'NormalizedPrice'] = -1
            else:
                repPrice = repPrice.iloc[0]['AvgPrice']
                repPrice = float(repPrice)
                df_collection[key].loc[index,'NormalizedPrice'] = demPrice / (demPrice+repPrice)
        if(row['Contract'] == 'Rep'):
            repPrice = float(row['AvgPrice'])
            #print(demPrice)
            demPrice = df_collection[key].loc[df_collection[key]['Date'] == row['Date']]
            demPrice = demPrice[demPrice.Contract != 'Rep']
            demPrice.reset_index(drop=True, inplace=True)
            if(demPrice.empty):
                df_collection[key].loc[index,'NormalizedPrice'] = -1
            else:
                demPrice = demPrice.loc[0,'AvgPrice']
                demPrice = float(demPrice)
                df_collection[key].loc[index,'NormalizedPrice'] = repPrice / (demPrice+repPrice)

In [55]:
#We want to create a master dataframe with the stock data for all months. 
#This master dataframe is called df_all_normalized 
frames = []
for key in df_collection:
    frames.append(df_collection[key])
    
df_all_normalized = pd.concat(frames)
df_all_normalized = df_all_normalized[df_all_normalized.NormalizedPrice != -1]
df_all_normalized

Unnamed: 0,Date,Contract,AvgPrice,NormalizedPrice
0,05/01/00,Dem,0.500,0.500000
2,05/01/00,Rep,0.500,0.500000
3,05/02/00,Dem,0.510,0.507463
5,05/02/00,Rep,0.495,0.492537
6,05/03/00,Dem,0.509,0.508492
...,...,...,...,...
86,10/29/00,Rep,0.674,0.670647
87,10/30/00,Dem,0.356,0.352475
89,10/30/00,Rep,0.654,0.647525
90,10/31/00,Dem,0.381,0.382530


In [56]:
#The input for this cell is a list of topics for each day and it's respective probability. 
#This is provided via the PLSA algorithm and is called `plsa_again.csv`. 
#The output of this cell is a new dataframe containging the data we would like: df_plsa
df_plsa = pd.read_csv('plsa_again.csv', error_bad_lines=False)
df_plsa = df_plsa.drop(df_plsa.columns[3], axis=1)
df_plsa = df_plsa.drop(df_plsa.columns[3], axis=1)
print(df_plsa)

         date        topic  probability
0    05/03/00       warner     0.009858
1    05/03/00       disney     0.008048
2    05/03/00      company     0.007265
3    05/03/00        cable     0.007149
4    05/03/00         sale     0.005059
..        ...          ...          ...
915  10/25/00       doctor     0.008764
916  10/25/00         site     0.008665
917  10/25/00      company     0.008446
918  10/25/00       health     0.008025
919  10/25/00  information     0.007380

[920 rows x 3 columns]


In [57]:
#This cell finds all unique topics and adds each topic as a key to a dictionary: df_topics_collection
#The value for each key in this topic is a dataframe containing the dates and probilities for that topic  
topics = df_plsa.topic.unique()

df_topics_collection = {} 

for topic in topics:
    df_topics_collection[topic] = df_plsa.loc[df_plsa['topic'] == topic]

In [84]:
#This method iterates through each topic and creates stationary time series for the probabilities 
#And the stock data. This is then used in a granger test, who's results are manually inspected to find 
#Relevent topics. These topics are added to a dictionary, relevent_topic
relevent_topic = {}
for topic in topics: 
    df = df_topics_collection[topic]
    df = df.rename(columns={"date": "Date"})
    df = pd.merge(df, df_all_normalized, on="Date")
    temp_df = df.loc[(df['Contract'] == 'Dem')]
    temp_df['Probablity_stationary'] = temp_df['probability']-temp_df['probability'].shift(1)
    temp_df['NormalizedPrice_stationary'] = temp_df['NormalizedPrice']-temp_df['NormalizedPrice'].shift(1)
    temp_df = temp_df.dropna()
    #temp_df = temp_df['NormalizedPrice_stationary'].dropna()
    try:
        res = grangercausalitytests(temp_df[['Probablity_stationary', 'NormalizedPrice_stationary']], maxlag=5)
        relevent_topic[topic] = res
        print(res)
    except:
        continue

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_df['Probablity_stationary'] = temp_df['probability']-temp_df['probability'].shift(1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_df['NormalizedPrice_stationary'] = temp_df['NormalizedPrice']-temp_df['NormalizedPrice'].shift(1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_df['Pro


Granger Causality
number of lags (no zero) 1
ssr based F test:         F=3.9473  , p=0.0684  , df_denom=13, df_num=1
ssr based chi2 test:   chi2=4.8582  , p=0.0275  , df=1
likelihood ratio test: chi2=4.2425  , p=0.0394  , df=1
parameter F test:         F=3.9473  , p=0.0684  , df_denom=13, df_num=1

Granger Causality
number of lags (no zero) 2
ssr based F test:         F=3.0253  , p=0.0939  , df_denom=10, df_num=2
ssr based chi2 test:   chi2=9.0759  , p=0.0107  , df=2
likelihood ratio test: chi2=7.0974  , p=0.0288  , df=2
parameter F test:         F=3.0253  , p=0.0939  , df_denom=10, df_num=2

Granger Causality
number of lags (no zero) 3
ssr based F test:         F=1.7981  , p=0.2351  , df_denom=7, df_num=3
ssr based chi2 test:   chi2=10.7887 , p=0.0129  , df=3
likelihood ratio test: chi2=7.9986  , p=0.0460  , df=3
parameter F test:         F=1.7981  , p=0.2351  , df_denom=7, df_num=3

Granger Causality
number of lags (no zero) 4
ssr based F test:         F=0.9678  , p=0.5123  , df_den

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_df['NormalizedPrice_stationary'] = temp_df['NormalizedPrice']-temp_df['NormalizedPrice'].shift(1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_df['Probablity_stationary'] = temp_df['probability']-temp_df['probability'].shift(1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_df['Nor


Granger Causality
number of lags (no zero) 1
ssr based F test:         F=0.2265  , p=0.6420  , df_denom=13, df_num=1
ssr based chi2 test:   chi2=0.2788  , p=0.5975  , df=1
likelihood ratio test: chi2=0.2764  , p=0.5991  , df=1
parameter F test:         F=0.2265  , p=0.6420  , df_denom=13, df_num=1

Granger Causality
number of lags (no zero) 2
ssr based F test:         F=0.5041  , p=0.6186  , df_denom=10, df_num=2
ssr based chi2 test:   chi2=1.5122  , p=0.4695  , df=2
likelihood ratio test: chi2=1.4407  , p=0.4866  , df=2
parameter F test:         F=0.5041  , p=0.6186  , df_denom=10, df_num=2

Granger Causality
number of lags (no zero) 3
ssr based F test:         F=2.7103  , p=0.1252  , df_denom=7, df_num=3
ssr based chi2 test:   chi2=16.2619 , p=0.0010  , df=3
likelihood ratio test: chi2=10.7916 , p=0.0129  , df=3
parameter F test:         F=2.7103  , p=0.1252  , df_denom=7, df_num=3

Granger Causality
number of lags (no zero) 4
ssr based F test:         F=2.4036  , p=0.2082  , df_den

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_df['Probablity_stationary'] = temp_df['probability']-temp_df['probability'].shift(1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_df['NormalizedPrice_stationary'] = temp_df['NormalizedPrice']-temp_df['NormalizedPrice'].shift(1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_df['Pro


Granger Causality
number of lags (no zero) 1
ssr based F test:         F=0.5333  , p=0.4723  , df_denom=24, df_num=1
ssr based chi2 test:   chi2=0.6000  , p=0.4386  , df=1
likelihood ratio test: chi2=0.5934  , p=0.4411  , df=1
parameter F test:         F=0.5333  , p=0.4723  , df_denom=24, df_num=1

Granger Causality
number of lags (no zero) 2
ssr based F test:         F=0.5611  , p=0.5789  , df_denom=21, df_num=2
ssr based chi2 test:   chi2=1.3894  , p=0.4992  , df=2
likelihood ratio test: chi2=1.3535  , p=0.5083  , df=2
parameter F test:         F=0.5611  , p=0.5789  , df_denom=21, df_num=2

Granger Causality
number of lags (no zero) 3
ssr based F test:         F=0.4064  , p=0.7502  , df_denom=18, df_num=3
ssr based chi2 test:   chi2=1.6934  , p=0.6384  , df=3
likelihood ratio test: chi2=1.6385  , p=0.6507  , df=3
parameter F test:         F=0.4064  , p=0.7502  , df_denom=18, df_num=3

Granger Causality
number of lags (no zero) 4
ssr based F test:         F=0.4833  , p=0.7478  , df_d

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_df['Probablity_stationary'] = temp_df['probability']-temp_df['probability'].shift(1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_df['NormalizedPrice_stationary'] = temp_df['NormalizedPrice']-temp_df['NormalizedPrice'].shift(1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_df['Pro

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_df['Probablity_stationary'] = temp_df['probability']-temp_df['probability'].shift(1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_df['NormalizedPrice_stationary'] = temp_df['NormalizedPrice']-temp_df['NormalizedPrice'].shift(1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_df['Pro

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_df['Probablity_stationary'] = temp_df['probability']-temp_df['probability'].shift(1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_df['NormalizedPrice_stationary'] = temp_df['NormalizedPrice']-temp_df['NormalizedPrice'].shift(1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_df['Pro

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_df['NormalizedPrice_stationary'] = temp_df['NormalizedPrice']-temp_df['NormalizedPrice'].shift(1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_df['Probablity_stationary'] = temp_df['probability']-temp_df['probability'].shift(1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_df['Nor

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_df['Probablity_stationary'] = temp_df['probability']-temp_df['probability'].shift(1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_df['NormalizedPrice_stationary'] = temp_df['NormalizedPrice']-temp_df['NormalizedPrice'].shift(1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_df['Pro

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_df['Probablity_stationary'] = temp_df['probability']-temp_df['probability'].shift(1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_df['NormalizedPrice_stationary'] = temp_df['NormalizedPrice']-temp_df['NormalizedPrice'].shift(1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_df['Pro

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_df['Probablity_stationary'] = temp_df['probability']-temp_df['probability'].shift(1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_df['NormalizedPrice_stationary'] = temp_df['NormalizedPrice']-temp_df['NormalizedPrice'].shift(1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_df['Pro

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_df['NormalizedPrice_stationary'] = temp_df['NormalizedPrice']-temp_df['NormalizedPrice'].shift(1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_df['Probablity_stationary'] = temp_df['probability']-temp_df['probability'].shift(1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_df['Nor

In [85]:
for key in relevent_topic: 
    print(key)

company
bush
gore
lazio
campaign
clinton
bush
gore
campaign
clinton


{'bush': {1: ({'ssr_ftest': (0.8401872015526504, 0.36327390422022854, 56.0, 1),
    'ssr_chi2test': (0.8851972302072566, 0.3467821401325717, 1),
    'lrtest': (0.8786224522865496, 0.34857925635258524, 1),
    'params_ftest': (0.8401872015526454, 0.3632739042202323, 56.0, 1.0)},
   [<statsmodels.regression.linear_model.RegressionResultsWrapper at 0x7fe5d6919f40>,
    <statsmodels.regression.linear_model.RegressionResultsWrapper at 0x7fe5d7159a60>,
    array([[0., 1., 0.]])]),
  2: ({'ssr_ftest': (0.794016128809264, 0.45732863742582486, 53.0, 2),
    'ssr_chi2test': (1.737846621544804, 0.4194028727915985, 2),
    'lrtest': (1.712319833634524, 0.42479018393575385, 2),
    'params_ftest': (0.7940161288092309, 0.45732863742583874, 53.0, 2.0)},
   [<statsmodels.regression.linear_model.RegressionResultsWrapper at 0x7fe5d71080a0>,
    <statsmodels.regression.linear_model.RegressionResultsWrapper at 0x7fe5d7108070>,
    array([[0., 0., 1., 0., 0.],
           [0., 0., 0., 1., 0.]])]),
  3: ({'s

In [126]:
#We find relevant words for each topic, and create a new CSV 'words_per_topic.csv'
#This CSV is loaded into a data frame. 
df_word_per_topic = pd.read_csv('words_per_topic.csv', error_bad_lines=False, header=None)
df_word_per_topic

Unnamed: 0,0,1
0,bush,"lehrer,missile,fund,oil,mccain,defense,tax,imm..."
1,gore,"oil,lehrer,missile,convention,clancy,teacher,t..."
2,campaign,"school,mccain,immigrant,cheney,bush,clinton,pl..."
3,clinton,"lieberman,missile,gore,tire,bush,defense,chene..."


In [69]:
#We load another CSV called 'word_frequency.csv' to find the frequency of all words that we are interested 
#in. This is loaded into a dataframe df_word_frequency 
df_word_frequency = pd.read_csv('word_frequency.csv', error_bad_lines=False)
df_word_frequency

Unnamed: 0,date,word,frequency
0,05/03/00,debate,11
1,05/03/00,defense,27
2,05/03/00,print,5
3,05/03/00,judge,19
4,05/03/00,planet,1
...,...,...,...
9195,06/25/00,missile,6
9196,06/25/00,lehrer,0
9197,06/25/00,jones,0
9198,06/25/00,company,359


In [125]:
#In this cell, we find the positive and negatively correlated words for each topic.
#We run a pearson coefficient test on these topics and then find all topics that add up to our probability mass.
#This then creates two new topics which are reported in two new CSVs '[topic]_[positive/negative].csv' 
#which are written to disk.  

from scipy.stats import pearsonr
import operator
import math
for index, row in df_word_per_topic.iterrows():
    topic = row[0]
    words = row[1].split(",")
    pearson_results = {}
    prob_mass = 0.75
    for word in words:
        df_word_freq = df_word_frequency[df_word_frequency.word == word]
        df_word_freq = df_word_freq.rename(columns={"date": "Date"})
        df_word_freq = pd.merge(df_word_freq, df_all_normalized, on="Date")
        df_word_freq = df_word_freq.loc[(df_word_freq['Contract'] == 'Dem')]
        #df_word_freq['frequency_stationary'] = df_word_freq['frequency']-df_word_freq['frequency'].shift(1)
        df_word_freq['NormalizedPrice_stationary'] = df_word_freq['NormalizedPrice']-df_word_freq['NormalizedPrice'].shift(1)
        #print(df_word_freq)
        df_word_freq = df_word_freq.dropna()
        corr,_ = pearsonr(df_word_freq['frequency'], df_word_freq['NormalizedPrice'])
        if(math.isnan(corr) == False):
            pearson_results[word] = corr
            
    sorted_ascending = sorted(pearson_results.items(), key=operator.itemgetter(1))
    temp_mass = prob_mass
    negative_words = []
    for key in sorted_ascending:
        if key[1] > 0:
            break;
        elif temp_mass + key[1] > 0:
            negative_words.append([key[0],key[1]])
            temp_mass = temp_mass + key[1]
    df_neg = pd.DataFrame(negative_words, columns = ['Word', 'Probability'])
    file_name = "./prior_csvs/" + topic + " _negative.csv"
    df_neg.to_csv(file_name)
    
    sorted_descending = sorted(pearson_results.items(), key=operator.itemgetter(1),reverse=True)
    temp_mass = prob_mass
    positive_words = []
    for key in sorted_descending:
        if key[1] < 0:
            break;
        elif temp_mass - key[1] > 0:
            positive_words.append([key[0],key[1]])
            temp_mass = temp_mass - key[1]
            print(temp_mass)
    df_pos = pd.DataFrame(positive_words, columns = ['Word', 'Probability'])
    file_name = "./prior_csvs/" + topic + " _positive.csv"
    df_pos.to_csv(file_name)



0.2828414915861492
0.013118895104955175
0.007076801527531878
0.2828414915861492
0.04283556596591276
0.022464666351033988
0.004348167106644794




0.2828414915861492
0.015757745259956213
0.482916253673807
0.3303656814439444
0.18117737608346776
0.0785979701213549
0.019727698457722646
0.017855516878596725
