In [1]:
import pandas as pd
import numpy as np
import datetime as dt
import pandas_datareader as pdr
from sklearn.preprocessing import MinMaxScaler

In [2]:
# Load the per-ticker sheets for the two chosen (best performing) stocks
# from the processed workbook.
zts, bio = (
    pd.read_excel('~/capp30254_fight_potatoes/data processing/data_processed_v2_0513.xlsx', sheet_name=sheet)
    for sheet in ('ZTS', 'BIO')
)
# Report dates of the ZTS sheet; passed to window() further down.
date = zts['Report Date']

In [3]:
# Get data for daily stock indicators: download each ticker's daily price
# history via pandas_datareader and stack the per-ticker frames into one
# long frame tagged with a 'symbol' column.
# Note: the end date is "today", so the downloaded range changes from run to run.
tickers = ['ZTS','BIO']
ticker_frames = []

for i in tickers:
    test_data = pdr.get_data_yahoo(i, start = dt.datetime(2020,2,28), end = dt.date.today())
    test_data['symbol'] = i
    ticker_frames.append(test_data)

# DataFrame.append is deprecated (and removed in pandas 2.0) and re-copies the
# accumulated frame on every iteration; collect the pieces and concatenate once.
all_data = pd.concat(ticker_frames)

# Creating Return column: close-to-close percent change, computed within each
# symbol so one ticker's first row is never compared with the other's last row.
all_data['return'] = all_data.groupby('symbol')['Close'].pct_change()

  all_data = all_data.append(test_data)
  all_data = all_data.append(test_data)


In [4]:
# RSI: relative strength index over 5- and 15-row windows, per symbol,
# plus the ratio of the short-window RSI to the long-window RSI.

def _rolling_mean_by_symbol(column, window):
    # Rolling mean of `column` over `window` rows, restarted for each symbol.
    return all_data.groupby('symbol')[column].transform(
        lambda moves: moves.rolling(window=window).mean()
    )

# Row-to-row change in close within each symbol (first row per symbol is NaN).
all_data['Diff'] = all_data.groupby('symbol')['Close'].diff()
# Gains: positive changes kept, negative changes floored at 0.
all_data['Up'] = all_data['Diff'].clip(lower=0)
# Losses: negative changes kept as magnitudes, positive changes set to 0.
all_data['Down'] = all_data['Diff'].clip(upper=0).abs()

all_data['avg_5up'] = _rolling_mean_by_symbol('Up', 5)
all_data['avg_5down'] = _rolling_mean_by_symbol('Down', 5)

all_data['avg_15up'] = _rolling_mean_by_symbol('Up', 15)
all_data['avg_15down'] = _rolling_mean_by_symbol('Down', 15)

# Relative strength = average gain / average loss.
# A zero average loss gives inf here (RSI of 100); 0/0 gives NaN.
all_data['RS_5'] = all_data['avg_5up'].div(all_data['avg_5down'])
all_data['RS_15'] = all_data['avg_15up'].div(all_data['avg_15down'])

all_data['RSI_5'] = 100 - 100 / (1 + all_data['RS_5'])
all_data['RSI_15'] = 100 - 100 / (1 + all_data['RS_15'])

all_data['RSI_ratio'] = all_data['RSI_5'].div(all_data['RSI_15'])

In [5]:
# MACD: difference between two exponential moving averages of the close
# (spans 5 and 15), each computed separately per symbol.
close_by_symbol = all_data.groupby('symbol')['Close']
all_data['5Ewm'] = close_by_symbol.transform(lambda px: px.ewm(span=5, adjust=False).mean())
all_data['15Ewm'] = close_by_symbol.transform(lambda px: px.ewm(span=15, adjust=False).mean())
# NOTE(review): this is the slow EMA minus the fast EMA; the conventional MACD
# is fast minus slow, so the sign here is flipped relative to that -- confirm
# this is intended before relying on the sign.
all_data['MACD'] = all_data['15Ewm'].sub(all_data['5Ewm'])

In [6]:
# keep only RSI and MACD
# Expose the date index as a regular column so it can serve as a merge key.
# Guarded so that re-running this cell does not overwrite the dates with the
# positional index produced by the reset below.
if 'Report Date' not in all_data.columns:
    all_data['Report Date'] = all_data.index
# reset_index returns a new frame; the result was previously discarded, so the
# reset never took effect. Assign it back.
all_data = all_data.reset_index(drop=True)
all_data = all_data[['symbol', 'Report Date', 'RSI_ratio', 'MACD']]

In [7]:
# Attach the daily indicators to each stock's frame, matching on report date
# and ticker symbol. Left join: rows without a matching indicator row are
# kept, with NaN indicators.
indicator_join = dict(
    how = 'left',
    left_on = ['Report Date', 'Ticker'],
    right_on = ['Report Date', 'symbol'],
)
zts = zts.merge(all_data, **indicator_join)
bio = bio.merge(all_data, **indicator_join)

In [33]:
# Show the merged column layout; processing() selects features by column
# position, so this order matters.
zts.columns

Index(['Ticker', 'Report Date', 'Stock price',
       '(Dividends + Share Buyback) / FCF', 'Asset Turnover',
       'CapEx / (Depr + Amor)', 'Current Ratio', 'Debt Ratio',
       'Dividends / FCF', 'Dummy_Dividends', 'Gross Profit Margin',
       'Interest Coverage', 'Inventory Turnover', 'Log Revenue',
       'Net Profit Margin', 'Quick Ratio', 'R&D / Gross Profit',
       'R&D / Revenue', 'Return on Assets', 'Return on Equity',
       'Return on Research Capital', 'Share Buyback / FCF', 'tweet_polarity',
       'tweet_subjectivity', 'p_diff', 's_diff', 'price_diff', 'symbol',
       'RSI_ratio', 'MACD'],
      dtype='object')

In [8]:
# Convert tweet polarity/subjectivity to relative (percent) change and
# build price-movement labels.

def processing(df, change = 0):
    '''
    Transforms absolute polarity and subjectivity to relative terms
    (percent change from the previous row) and derives a three-class
    price-movement label. Y_dummy ∈ {-1, 0 ,1}

    Note: df is modified in place -- the columns 'p_diff', 's_diff' and
    'price_diff' are added (or overwritten if they already exist).

    Inputs:
        df: pandas df with 'tweet_polarity', 'tweet_subjectivity' and
            'Stock price' columns. Features are taken by position (fourth
            column through the third-from-last), and 'symbol' must fall
            inside that slice, so the column order matters.
        change: threshold for pct change; ex: 0.025 means +/- 2.5%

    Returns:
        X: df features (the positional slice above, without 'symbol')
        y_price: single-column df of stock prices
        y_binary: numpy array, Y_dummy ∈ {-1, 0 ,1}: -1 when the price fell
            by more than `change`, 1 when it rose by more than `change`,
            0 otherwise (including the first row, whose change is NaN)
    '''
    df['p_diff'] = df['tweet_polarity'].pct_change()
    df['s_diff'] = df['tweet_subjectivity'].pct_change()
    df['price_diff'] = df['Stock price'].pct_change()

    # NaN comparisons are False, so rows with an undefined change fall
    # through to the default label 0.
    condlist = [df['price_diff'] < -change, df['price_diff'] > change]
    choices = [-1, 1]
    y_binary = np.select(condlist, choices, 0)

    y_price = df[['Stock price']]
    # Drop without inplace=True: an in-place drop on a slice of df triggers
    # SettingWithCopyWarning; this returns a fresh frame instead.
    X = df.iloc[:, 3: -2].drop(columns=['symbol'])
    return X, y_price, y_binary

In [35]:
# Example: run processing on ZTS with a 2.5% threshold for the movement label.
# t -> features, typ -> stock price frame, tyb -> {-1, 0, 1} labels.
t, typ, tyb = processing(zts, change=0.025)

In [31]:
# feature normalization
def normalize(X_train, X_test):
    '''
    Normalize features using sklearn MinMaxScaler.

    The scaler is fit on the training features only and the same scaling is
    then applied to the test features, so scaled test values are not
    guaranteed to stay within the training range.

    Inputs:
        X_train: pandas df of training features
        X_test: pandas df of test features (same columns as X_train)

    Returns:
        (X_train_norm, X_test_norm): pandas dfs carrying the column names of
        the corresponding input. The row index is reset to 0..n-1; the
        original index is not kept.
    '''
    norm = MinMaxScaler().fit(X_train)
    # Column names come from the arguments themselves. Reading them from a
    # module-level `X` only worked when the caller happened to have defined
    # a matching global.
    X_train_norm = pd.DataFrame(norm.transform(X_train), columns = X_train.columns)
    X_test_norm = pd.DataFrame(norm.transform(X_test), columns = X_test.columns)
    return X_train_norm, X_test_norm

In [23]:
# function to select data given date window
def window(start, end, df, date):
    '''
    Restrict df to the rows whose date lies between start and end
    (both bounds inclusive).

    Inputs:
        start/end: start and end dates
            ex: start = '2022-01-23' ('YYYY-MM-DD')
        df: pd dataframe
        date: pd series with the dates of all rows in the data; it is
            matched to df through the row index

    Returns: pd dataframe with the selected dates as the first column,
        followed by the columns of df, for the rows inside the window
    '''
    in_window = date.between(start, end, inclusive='both')
    selected_dates = date.loc[in_window]
    # Inner join on the row index: df rows whose date was filtered out are dropped.
    return pd.concat([selected_dates, df], join="inner", axis = 1)

In [38]:
# Example: select the 2021 Q4 rows from the processing() outputs.
start, end = '2021-10-01', '2021-12-31'
# stock price for the window
typ1 = window(start, end, typ, date)
# features for the window (t comes straight from processing(); no scaling
# has been applied to it at this point)
zts_norm1 = window(start, end, t, date)

In [32]:
# Load the processed per-stock CSVs and parse the report dates.
zts = pd.read_csv('~/capp30254_fight_potatoes/data processing/zts_all_date_processed.csv')
# convert report date to pd datetime
zts['Report Date'] = pd.to_datetime(zts['Report Date'])

bio = pd.read_csv('~/capp30254_fight_potatoes/data processing/bio_all_date_processed.csv')
bio['Report Date'] = pd.to_datetime(bio['Report Date'])

# Report dates of ZTS, kept as the module-level date series.
date = zts['Report Date']

In [33]:
# Subset data for different train/test windows and write one CSV per
# stock / window / split. Each entry is [[train_start, train_end], [test_start, test_end]].
all_windows = [
    [['2020-03-02', '2020-12-31'], ['2021-01-01', '2021-06-30']],
    [['2020-06-01', '2021-03-31'], ['2021-04-01', '2021-09-30']],
    [['2020-09-01', '2021-06-30'], ['2021-07-01', '2021-12-31']],
    [['2021-01-01', '2021-09-30'], ['2021-10-01', '2022-04-24']],
    [['2021-03-01', '2021-12-31'], ['2022-01-01', '2022-04-24']],
]
stock = [zts, bio]
stock_str = ['zts', 'bio']

# Output header: the loaded columns plus 's_diff', which the loop adds.
# Guarded so that re-running the cell (when zts already carries 's_diff')
# does not append the name twice and break the header length.
cols = list(zts.columns)
if 's_diff' not in cols:
    cols.append('s_diff')

for i, tic in enumerate(stock):
    # Everything below depends only on the stock, not on the window, so it is
    # built once per stock instead of once per window.

    # process (subjectivity pct change); note this adds 's_diff' to the
    # stock's frame in place.
    tic['s_diff'] = tic['tweet_subjectivity'].pct_change()
    # Window each stock on its own report dates rather than on the
    # module-level `date` (taken from ZTS), so BIO rows are never paired with
    # ZTS dates should the two calendars differ.
    stock_dates = tic['Report Date']
    # Features: every column after the first three.
    X = tic.iloc[:, 3:]
    # get Y_price
    typ = tic[['Stock price']]
    Y_dummy = pd.DataFrame(tic, columns = ['Y_boolean'])

    for group, (train_wd, test_wd) in enumerate(all_windows):
        train_start, train_end = train_wd
        test_start, test_end = test_wd

        X_train = window(train_start, train_end, X, stock_dates)
        X_test = window(test_start, test_end, X, stock_dates)
        # normalize data; column 0 is the date that window() prepends, so it
        # is excluded from scaling.
        X_train_norm, X_test_norm = normalize(X_train.iloc[:, 1:], X_test.iloc[:, 1:])

        splits = [
            ('train', train_start, train_end, X_train_norm),
            ('test', test_start, test_end, X_test_norm),
        ]
        for split, split_start, split_end, X_norm in splits:
            # Price and label for the window, joined on their shared date column.
            targets = pd.merge(
                window(split_start, split_end, typ, stock_dates),
                window(split_start, split_end, Y_dummy, stock_dates),
            )
            # Both pieces carry a fresh 0..n-1 index, so they line up row by row.
            out = pd.concat([targets, X_norm], ignore_index=True, axis = 1)
            out.columns = cols
            out.to_csv(f'~/capp30254_fight_potatoes/data processing/window_data/{stock_str[i]}_{split}_group{group}_{split_start}_{split_end}.csv')

In [14]:
# phase 3
# Reload the processed per-stock CSVs so this phase starts from the data on disk.
zts, bio = (
    pd.read_csv(csv_path)
    for csv_path in (
        '~/capp30254_fight_potatoes/data processing/zts_all_date_processed.csv',
        '~/capp30254_fight_potatoes/data processing/bio_all_date_processed.csv',
    )
)
# convert report date to pd datetime
for frame in (zts, bio):
    frame['Report Date'] = pd.to_datetime(frame['Report Date'])
date = zts['Report Date']

In [15]:
# phase 3 normalization

def normalize(X, X_test=None):
    '''
    Normalize features using sklearn MinMaxScaler.

    This definition replaces the two-argument normalize(X_train, X_test)
    defined earlier in the notebook. It accepts both call forms so that
    cells written against either version keep working regardless of which
    definition was executed last.

    Inputs:
        X: pandas df of features; the scaler is fit on this frame
        X_test: optional pandas df of features to scale with the scaler
            fit on X

    Returns:
        X_norm (pandas df) when X_test is None, otherwise the tuple
        (X_norm, X_test_norm). Returned frames keep the input column names
        and carry a fresh 0..n-1 row index.
    '''
    scaler = MinMaxScaler().fit(X)
    # make into pd df
    X_norm = pd.DataFrame(scaler.transform(X), columns = X.columns)
    if X_test is None:
        return X_norm
    X_test_norm = pd.DataFrame(scaler.transform(X_test), columns = X_test.columns)
    return X_norm, X_test_norm

In [18]:
# Phase 3 export: scale every feature column of each stock and write the
# result next to the processed data.
# NOTE(review): the scaler here is fit on each stock's full history (no
# train/test split) -- confirm that is what phase 3 intends.
stock = [zts, bio]
stock_str = ['zts', 'bio']
for i, tic in enumerate(stock):
    df = tic
    df['s_diff'] = df['tweet_subjectivity'].pct_change()
    # Take the header only after 's_diff' is in place. Capturing zts.columns
    # before the loop yields a header one name short whenever the CSV does
    # not already contain 's_diff', and the column assignment below fails.
    cols = df.columns
    # First three columns are carried through unscaled; the rest are features.
    Y = df.iloc[:, :3]
    X = df.iloc[:, 3:]
    X_norm = normalize(X)
    # put back together
    out = pd.concat([Y, X_norm], ignore_index=True, axis = 1)
    out.columns = cols
    out.to_csv(f'~/capp30254_fight_potatoes/data processing/{stock_str[i]}_norm.csv')
    