In [5]:
import pandas as pd
import numpy as np
from data import transformations as trf

In [6]:
# Monthly training frame produced by the earlier transformation step.
data = pd.read_csv("dataset/transformed_data/initial_train_data.csv")

# Weekly price series, reduced to date, ticker and adjusted close.
weekly = pd.read_csv(
    "dataset/raw_data/timeseries/time_series_weekly_adjusted.csv"
).loc[:, ["timestamp", "symbol", "adjusted close"]]

In [7]:
# Calculate fundamental metrics

data = data.assign(
    # Price / earnings-per-share; rows whose EPS is exactly zero get 0 instead of an infinite ratio.
    PERatio=lambda df: np.where(df["eps"] != 0, df["adjusted close"] / df["eps"], 0),
    # Current assets minus current liabilities.
    currentEquity=lambda df: df["totalCurrentAssets"] - df["totalCurrentLiabilities"],
)

# Keep only the identifiers, the price and the fundamentals used further down.
data = data[
    [
        "symbol",
        "timestamp",
        "adjusted close",
        "freeCashFlow",
        "eps",
        "Sector",
        "grossProfitRatio",
        "operatingIncomeRatio",
        "totalEquity",
        "PERatio",
        "currentEquity",
    ]
]

In [8]:
# Add economic indicators

ffr = pd.read_csv("dataset/raw_data/economy/federal_funds_rate.csv")
cpi = pd.read_csv("dataset/raw_data/economy/cpi.csv")
sp500_bench = pd.read_csv("dataset/info_data/S&P500-prices.csv")

# Parse every date column so the as-of joins below compare real datetimes.
for frame in (data, weekly, ffr, cpi, sp500_bench):
    frame["timestamp"] = pd.to_datetime(frame["timestamp"])

ffr = ffr.rename(columns={"value": "ffr"})
cpi = cpi.rename(columns={"value": "cpi"})

# Benchmark return over the period that ENDS at each timestamp (the "last month" return).
# The frame is sorted chronologically first, so the result no longer depends on the
# row order of the CSV. Previously the return was computed on a sorted view but then
# shifted in raw file order, which is only correct for a newest-first file and
# otherwise pulls in returns from two periods in the future.
sp500_bench = sp500_bench.sort_values("timestamp")
sp500_bench["bm_performance_last_month"] = sp500_bench["adjusted close"].pct_change(periods=1)
sp500_bench = sp500_bench[["timestamp", "bm_performance_last_month"]]

# Attach to each row the latest indicator value published on or before its timestamp.
# merge_asof requires both sides to be sorted by the join key.
for indicator in (ffr, cpi, sp500_bench):
    data = pd.merge_asof(
        data.sort_values("timestamp"),
        indicator.sort_values("timestamp"),
        on="timestamp",
        direction="backward",
    )

# Restore per-symbol chronological order and use a shorter name for the price column.
data = data.sort_values(["symbol", "timestamp"]).rename(columns={"adjusted close": "close"})
weekly = weekly.sort_values(["symbol", "timestamp"]).rename(columns={"adjusted close": "close"})

In [9]:
# Add technical indicators

timeperiods = [3, 6, 12]

# Simple and exponential moving averages of the close over each window.
for add_moving_average in (trf.calculate_sma, trf.calculate_ema):
    data = add_moving_average(data, "close", timeperiods)

# Rolling high/low uses only the 12-row window.
data = trf.calculate_rolling_high_low(data, "close", [12])
data = trf.calculate_rsi(data, "close", timeperiods)

# NOTE(review): (12, 26, 9) is presumably (fast, slow, signal) — confirm against trf.calculate_macd.
data, histogram = trf.calculate_macd(data, "close", [(12, 26, 9)])



In [10]:
# Transform columns into ML readable values

# One-hot encode the sector, then pass the frame to the project helper for the timestamp column.
# NOTE(review): extract_month_quarter presumably adds month/quarter features — confirm in data.transformations.
data = trf.extract_month_quarter(
    pd.get_dummies(data, columns=["Sector"]),
    "timestamp",
)

In [11]:
# Remove stocks with too little history: keep symbols with more than 26 monthly rows
# (just over two years) and, for the weekly series, more than 26 * 4.5 = 117 rows.

monthly_counts = data["symbol"].value_counts()
data = data[data["symbol"].isin(monthly_counts[monthly_counts > 26].index)]

weekly_counts = weekly["symbol"].value_counts()
weekly = weekly[weekly["symbol"].isin(weekly_counts[weekly_counts > 26 * 4.5].index)]

In [12]:
# Label data
# trf.label_performance is a project helper applied to the "close" column of both
# frames; later cells read a "performance" column from its result.
# NOTE(review): presumably a per-symbol return label — confirm in data.transformations
# whether it is forward- or backward-looking.

data = trf.label_performance(data, "close")
weekly = trf.label_performance(weekly, "close")

In [13]:
# Drop every row dated on or after the cutoff; the most recent months are incomplete.

cutoff = "2023-04-01"
data = data.loc[data["timestamp"] < cutoff]
weekly = weekly.loc[weekly["timestamp"] < cutoff]

In [14]:
# Add performance metrics

# Previous row's performance within each symbol. For the weekly frame this is the
# previous weekly row, despite the column name.
for frame in (data, weekly):
    frame["performance_last_month"] = frame.groupby("symbol")["performance"].shift(1)

# Moving averages of the lagged performance (monthly frame only).
for add_moving_average in (trf.calculate_sma, trf.calculate_ema):
    data = add_moving_average(data, "performance_last_month", timeperiods)



In [15]:
# Persist both final frames without writing the index column.
for frame, destination in (
    (data, "dataset/final_data/gold_data.csv"),
    (weekly, "dataset/final_data/weekly_gold_data.csv"),
):
    frame.to_csv(destination, index=False)