In [1]:
import numpy as np
import pandas as pd
from datetime import timedelta
from scipy.stats import zscore
#from kaggle.competitions import twosigmanews

pd.options.mode.chained_assignment = None
pd.options.display.max_columns = 999

# Get 2Sigma environment
#env = twosigmanews.make_env()

In [2]:
# Daten laden

stock_data = pd.read_csv('stock_data/Rohdaten/stock_pfizer_d.csv')
stock_data.rename(columns={'date':'date', '1. open':'open', '2. high':'high', '3. low':'low','4. close':'close', '5. volume':'volume'}, inplace=True)
twitter_data = pd.read_csv('Final_data/SVM-TFIDF/Pfizer_Ticker_classified_svm_tfidf_bigram.csv')

In [3]:
stock_data['date'] = pd.to_datetime(stock_data['date'])
twitter_data['date'] = pd.to_datetime(twitter_data['date'])

In [4]:
stock_data.head()

Unnamed: 0,date,open,high,low,close,volume
0,2019-04-08,43.07,43.28,42.91,43.14,14620975.0
1,2019-04-09,42.98,43.0,42.58,42.84,17860595.0
2,2019-04-10,42.92,42.92,42.6467,42.73,12526176.0
3,2019-04-11,42.79,42.83,41.99,42.27,19179152.0
4,2019-04-12,42.34,42.34,41.58,41.71,20245333.0


In [5]:
twitter_data.head()

Unnamed: 0,date,text,score
0,2019-06-09 23:38:01,labcorp selects new ceo,1.0
1,2019-06-09 23:30:00,bioxcel pharmaceuticals ceo drug pipeline ai c...,-1.0
2,2019-06-09 21:49:40,over more tweets good luck,1.0
3,2019-06-09 21:45:19,over more tweets good luck,1.0
4,2019-06-09 19:58:14,know changed,1.0


In [6]:
def date_(x):
    return x.date()

In [7]:
# Es werden nur die Schlusskurse betrachtet

stock_data = stock_data[['date', 'close', 'volume']]
stock_data['date'] = stock_data['date'].map(date_)
stock_data['date'] = pd.to_datetime(stock_data['date'])
stock_data.head()

Unnamed: 0,date,close,volume
0,2019-04-08,43.14,14620975.0
1,2019-04-09,42.84,17860595.0
2,2019-04-10,42.73,12526176.0
3,2019-04-11,42.27,19179152.0
4,2019-04-12,41.71,20245333.0


In [8]:
twitter_data = twitter_data[['date','text','score']]
twitter_data['date_only'] = twitter_data['date'].map(date_)
twitter_data['date_only'] = pd.to_datetime(twitter_data['date_only'])
twitter_data.head()

Unnamed: 0,date,text,score,date_only
0,2019-06-09 23:38:01,labcorp selects new ceo,1.0,2019-06-09
1,2019-06-09 23:30:00,bioxcel pharmaceuticals ceo drug pipeline ai c...,-1.0,2019-06-09
2,2019-06-09 21:49:40,over more tweets good luck,1.0,2019-06-09
3,2019-06-09 21:45:19,over more tweets good luck,1.0,2019-06-09
4,2019-06-09 19:58:14,know changed,1.0,2019-06-09


In [9]:
date_ranges = pd.date_range(start=stock_data['date'].min(), end=stock_data['date'].max())
all_dates = stock_data['date'].values
append_df = pd.DataFrame()
for i in range(len(date_ranges)):
    d = date_ranges[i]
    internal_df = pd.DataFrame(columns=['day', 'close_value'],index=[i])
    df = stock_data[stock_data['date']==pd.to_datetime(d)]
    if df.shape[0]!=0:
        internal_df['close_value'] = df['close'].values[0]
        internal_df['day'] = d
        append_df = append_df.append(internal_df)
    else:
        internal_df['close_value'] = 0.0
        internal_df['day'] = d
        append_df = append_df.append(internal_df)

In [10]:
true_close_values = append_df['close_value'].values
modified_ = []
for i in range(len(true_close_values)):
    close_ = true_close_values[i]
    if close_!=0.0:
        modified_.append(close_)
    else:
        i_p_2 = true_close_values[next((k for k, x in enumerate(true_close_values[i-1:]) if x), None)]
        modified_.append((true_close_values[i-1]+i_p_2)/2.0)

In [11]:
true_close_values = modified_
modified_new = []
for i in range(len(true_close_values)):
    close_ = true_close_values[i]
    if close_!=0.0:
        modified_new.append(close_)
    else:
        i_p_2 = true_close_values[next((k for k, x in enumerate(true_close_values[i-1:]) if x), None)]
        modified_new.append((true_close_values[i-1]+i_p_2)/2.0)

In [12]:
append_df['close_value'] = modified_new
append_df.head(15)

Unnamed: 0,day,close_value
0,2019-04-08,43.14
1,2019-04-09,42.84
2,2019-04-10,42.73
3,2019-04-11,42.27
4,2019-04-12,41.71
5,2019-04-13,42.425
6,2019-04-14,21.365
7,2019-04-15,42.09
8,2019-04-16,40.92
9,2019-04-17,39.88


In [18]:
twitter_data_avg_sentiments = twitter_data.groupby('date_only').agg({'score':np.mean}).reset_index()
twitter_data_avg_sentiments.head(15)

Unnamed: 0,date_only,score
0,2019-04-07,0.0
1,2019-04-08,0.314286
2,2019-04-09,0.243243
3,2019-04-10,0.25
4,2019-04-11,0.321101
5,2019-04-12,0.25
6,2019-04-13,0.157895
7,2019-04-14,0.222222
8,2019-04-15,0.134615
9,2019-04-16,0.128205


In [20]:
dates_common = list(set(twitter_data_avg_sentiments['date_only'].values).intersection(set(append_df['day'].values)))
difference_df = pd.DataFrame()
for date in dates_common:
    internal_df = pd.DataFrame(columns=['sentiment_value','difference_before_sentiment','difference_after_sentiment'],index=[0])
    if date == min(dates_common) or date==max(dates_common):
        pass
    else:
        internal_df['sentiment_value'] = twitter_data_avg_sentiments[nt_df_petro_china_avg_sentiments['date_only']==date]['sentiment_max'].values[0]
        internal_df['difference_before_sentiment'] = append_df[append_df['day']==pd.to_datetime(date)]['open_value'].values[0] - append_df[append_df['day']==(pd.to_datetime(date)-timedelta(days=1))]['open_value'].values[0]
        internal_df['difference_after_sentiment'] = append_df[append_df['day']==pd.to_datetime(date)]['open_value'].values[0] - append_df[append_df['day']==(pd.to_datetime(date)+timedelta(days=1))]['open_value'].values[0]
        internal_df['date'] = date
        difference_df = difference_df.append(internal_df)

KeyError: 'date'

In [None]:
from statsmodels.tsa.stattools import grangercausalitytests
granger_test_result = grangercausalitytests(differen_df)