In [11]:
import pandas as pd
import numpy as np
import ta, ta.trend, ta.momentum
import joblib
import matplotlib.pyplot as plt

def add_ta_features(df, timeframe,rsi_window=14, sma_windows=(5, 20)):
    df = df.copy()
    
    rsi = ta.momentum.RSIIndicator(close=df["close"], window=rsi_window)
    df["rsi_14"] = rsi.rsi()
    
    macd = ta.trend.MACD(close=df["close"])
    df["macd"] = macd.macd()
    df["macd_signal"] = macd.macd_signal()

    sma_short = ta.trend.SMAIndicator(close=df["close"], window=sma_windows[0])
    sma_long = ta.trend.SMAIndicator(close=df["close"], window=sma_windows[1])
    
    df[f"sma_{sma_windows[0]}"] = sma_short.sma_indicator()
    df[f"sma_{sma_windows[1]}"] = sma_long.sma_indicator()

    df["return"] = df["close"].pct_change(periods=timeframe)

    df["future_return"] = df["close"].shift(-1) / df["close"] - 1
    df["target"] = (df["future_return"] > 0).astype(int)
    df = df.dropna()
    return df

def aggregate_and_forwardfill_news(news_df, timeframe='1H'):
    news_df['timestamp'] = pd.to_datetime(news_df['timestamp'])
    numeric_cols = news_df.select_dtypes(include='number').columns.tolist()
    numeric_cols.append('timestamp')
    news_df = news_df[numeric_cols]
    news_df = news_df.set_index('timestamp')
    news_aggregated = news_df.resample(timeframe).mean()
    sentiment_columns = news_aggregated.columns
    news_aggregated[sentiment_columns] = news_aggregated[sentiment_columns].ffill()
    news_aggregated[sentiment_columns] = news_aggregated[sentiment_columns].fillna(0)
    news_aggregated = news_aggregated.reset_index()
    return news_aggregated

def merge_df(df1, df2):
    df1['timestamp'] = pd.to_datetime(df1['timestamp'])
    df2['timestamp'] = pd.to_datetime(df2['timestamp'])

    df1 = df1.set_index('timestamp')
    df2 = df2.set_index('timestamp')

    merged_df = df1.join(df2, how='left')
    merged_df = merged_df.reset_index()
    sentiment_cols = df2.columns
    merged_df[sentiment_cols] = merged_df[sentiment_cols].fillna(0)
    return merged_df

def keep_news(df):
    news_df = df[df['ticker_sentiment'] != 0.0]
    return news_df

In [22]:
btc_hourly = pd.read_csv("./datasets/crypto/hourly/BTC.csv")
btc_4hourly = pd.read_csv("./datasets/crypto/4hourly/BTC.csv")
btc_daily = pd.read_csv("./datasets/crypto/daily/BTC.csv")
btc_weekly = pd.read_csv("./datasets/crypto/weekly/BTC.csv")
btc_news = pd.read_csv("./datasets/crypto/news/BTC_news.csv")


eth_hourly = pd.read_csv("./datasets/crypto/hourly/BTC.csv")
eth_4hourly = pd.read_csv("./datasets/crypto/4hourly/BTC.csv")
eth_daily = pd.read_csv("./datasets/crypto/daily/BTC.csv")
eth_weekly = pd.read_csv("./datasets/crypto/weekly/BTC.csv")
eth_news = pd.read_csv("./datasets/crypto/news/ETH_news.csv")


aapl_hourly = pd.read_csv("./datasets/stocks/hourly/AAPL.csv")
aapl_4hourly = pd.read_csv("./datasets/stocks/4hourly/AAPL.csv")
aapl_daily = pd.read_csv("./datasets/stocks/daily/AAPL.csv")
aapl_weekly = pd.read_csv("./datasets/stocks/weekly/AAPL.csv")
aapl_news = pd.read_csv("./datasets/stocks/news/AAPL_news.csv")


tsla_hourly = pd.read_csv("./datasets/stocks/hourly/TSLA.csv")
tsla_4hourly = pd.read_csv("./datasets/stocks/4hourly/TSLA.csv")
tsla_daily = pd.read_csv("./datasets/stocks/daily/TSLA.csv")
tsla_weekly = pd.read_csv("./datasets/stocks/weekly/TSLA.csv")
tsla_news = pd.read_csv("./datasets/stocks/news/TSLA_news.csv")


amzn_hourly = pd.read_csv("./datasets/stocks/hourly/AMZN.csv")
amzn_4hourly = pd.read_csv("./datasets/stocks/4hourly/AMZN.csv")
amzn_daily = pd.read_csv("./datasets/stocks/daily/AMZN.csv")
amzn_weekly = pd.read_csv("./datasets/stocks/weekly/AMZN.csv")
amzn_news = pd.read_csv("./datasets/stocks/news/AMZN_news.csv")

In [24]:
btc_hourly = add_ta_features(btc_hourly, 1)
btc_4hourly = add_ta_features(btc_4hourly, 4)
btc_daily = add_ta_features(btc_daily, 24)
btc_weekly = add_ta_features(btc_weekly, 1)

btc_news_hourly = aggregate_and_forwardfill_news(btc_news, '1h')
btc_news_4hourly = aggregate_and_forwardfill_news(btc_news, '4h')
btc_news_daily = aggregate_and_forwardfill_news(btc_news, '1d')
btc_news_weekly = aggregate_and_forwardfill_news(btc_news, '1W')

eth_hourly = add_ta_features(eth_hourly, 1)
eth_4hourly = add_ta_features(eth_4hourly, 4)
eth_daily = add_ta_features(eth_daily, 24)
eth_weekly = add_ta_features(eth_weekly, 1)

eth_news_hourly = aggregate_and_forwardfill_news(eth_news, '1h')
eth_news_4hourly = aggregate_and_forwardfill_news(eth_news, '4h')
eth_news_daily = aggregate_and_forwardfill_news(eth_news, '1d')
eth_news_weekly = aggregate_and_forwardfill_news(eth_news, '1W')

aapl_hourly = add_ta_features(aapl_hourly, 1)
aapl_4hourly = add_ta_features(aapl_4hourly, 4)
aapl_daily = add_ta_features(aapl_daily, 24)
aapl_weekly = add_ta_features(aapl_weekly, 1)

aapl_news_hourly = aggregate_and_forwardfill_news(aapl_news, '1h')
aapl_news_4hourly = aggregate_and_forwardfill_news(aapl_news, '4h')
aapl_news_daily = aggregate_and_forwardfill_news(aapl_news, '1d')
aapl_news_weekly = aggregate_and_forwardfill_news(aapl_news, 'W-FRI')

tsla_hourly = add_ta_features(tsla_hourly, 1)
tsla_4hourly = add_ta_features(tsla_4hourly, 4)
tsla_daily = add_ta_features(tsla_daily, 24)
tsla_weekly = add_ta_features(tsla_weekly, 1)

tsla_news_hourly = aggregate_and_forwardfill_news(tsla_news, '1h')
tsla_news_4hourly = aggregate_and_forwardfill_news(tsla_news, '4h')
tsla_news_daily = aggregate_and_forwardfill_news(tsla_news, '1d')
tsla_news_weekly = aggregate_and_forwardfill_news(tsla_news, 'W-FRI')

amzn_hourly = add_ta_features(amzn_hourly, 1)
amzn_4hourly = add_ta_features(amzn_4hourly, 4)
amzn_daily = add_ta_features(amzn_daily, 24)
amzn_weekly = add_ta_features(amzn_weekly, 1)

amzn_news_hourly = aggregate_and_forwardfill_news(amzn_news, '1h')
amzn_news_4hourly = aggregate_and_forwardfill_news(amzn_news, '4h')
amzn_news_daily = aggregate_and_forwardfill_news(amzn_news, '1d')
amzn_news_weekly = aggregate_and_forwardfill_news(amzn_news, 'W-FRI')

In [14]:
merged_btc_hourly = merge_df(btc_hourly, btc_news_hourly)
merged_btc_4hourly = merge_df(btc_4hourly, btc_news_4hourly)
merged_btc_daily = merge_df(btc_daily, btc_news_daily)
merged_btc_weekly = merge_df(btc_weekly, btc_news_weekly)

merged_eth_hourly = merge_df(eth_hourly, eth_news_hourly)
merged_eth_4hourly = merge_df(eth_4hourly, eth_news_4hourly)
merged_eth_daily = merge_df(eth_daily, eth_news_daily)
merged_eth_weekly = merge_df(eth_weekly, eth_news_weekly)

merged_aapl_hourly = merge_df(aapl_hourly, aapl_news_hourly)
merged_aapl_4hourly = merge_df(aapl_4hourly, aapl_news_4hourly)
merged_aapl_daily = merge_df(aapl_daily, aapl_news_daily)
merged_aapl_weekly = merge_df(aapl_weekly, aapl_news_weekly)

merged_tsla_hourly = merge_df(tsla_hourly, tsla_news_hourly)
merged_tsla_4hourly = merge_df(tsla_4hourly, tsla_news_4hourly)
merged_tsla_daily = merge_df(tsla_daily, tsla_news_daily)
merged_tsla_weekly = merge_df(tsla_weekly, tsla_news_weekly)

merged_amzn_hourly = merge_df(amzn_hourly, amzn_news_hourly)
merged_amzn_4hourly = merge_df(amzn_4hourly, amzn_news_4hourly)
merged_amzn_daily = merge_df(amzn_daily, amzn_news_daily)
merged_amzn_weekly = merge_df(amzn_weekly, amzn_news_weekly)

In [15]:
# Saving for BTC
merged_btc_hourly.to_csv("./merged_datasets/crypto/hourly/BTC.csv")
merged_btc_4hourly.to_csv("./merged_datasets/crypto/4hourly/BTC.csv")
merged_btc_daily.to_csv("./merged_datasets/crypto/daily/BTC.csv")
merged_btc_weekly.to_csv("./merged_datasets/crypto/weekly/BTC.csv")

# Saving for ETH
merged_eth_hourly.to_csv("./merged_datasets/crypto/hourly/ETH.csv")
merged_eth_4hourly.to_csv("./merged_datasets/crypto/4hourly/ETH.csv")
merged_eth_daily.to_csv("./merged_datasets/crypto/daily/ETH.csv")
merged_eth_weekly.to_csv("./merged_datasets/crypto/weekly/ETH.csv")

# Saving for AAPL
merged_aapl_hourly.to_csv("./merged_datasets/stocks/hourly/AAPL.csv")
merged_aapl_4hourly.to_csv("./merged_datasets/stocks/4hourly/AAPL.csv")
merged_aapl_daily.to_csv("./merged_datasets/stocks/daily/AAPL.csv")
merged_aapl_weekly.to_csv("./merged_datasets/stocks/weekly/AAPL.csv")

# Saving for TSLA
merged_tsla_hourly.to_csv("./merged_datasets/stocks/hourly/TSLA.csv")
merged_tsla_4hourly.to_csv("./merged_datasets/stocks/4hourly/TSLA.csv")
merged_tsla_daily.to_csv("./merged_datasets/stocks/daily/TSLA.csv")
merged_tsla_weekly.to_csv("./merged_datasets/stocks/weekly/TSLA.csv")

# Saving for AMZN
merged_amzn_hourly.to_csv("./merged_datasets/stocks/hourly/AMZN.csv")
merged_amzn_4hourly.to_csv("./merged_datasets/stocks/4hourly/AMZN.csv")
merged_amzn_daily.to_csv("./merged_datasets/stocks/daily/AMZN.csv")
merged_amzn_weekly.to_csv("./merged_datasets/stocks/weekly/AMZN.csv")

In [26]:
eth_daily.head(30)

Unnamed: 0,timestamp,open,high,low,close,volume,rsi_14,macd,macd_signal,sma_5,sma_20,return,future_return,target
33,2018-03-06,11455.0,11455.0,10555.48,10716.48,323842200.0,54.545257,499.104319,452.028326,11237.792,10596.4995,0.28428,-0.075256,0
34,2018-03-07,10716.48,10899.0,9389.31,9910.0,514808000.0,46.997706,392.249531,440.072567,11011.992,10591.995,0.228937,-0.064416,0
35,2018-03-08,9910.0,10099.0,9060.0,9271.64,394641300.0,42.039103,253.138008,402.685655,10573.424,10547.578,0.041406,-0.004815,0
36,2018-03-09,9267.07,9410.0,8329.0,9227.0,567082900.0,41.707711,137.701709,349.688866,10115.824,10456.9505,0.080458,-0.049505,0
37,2018-03-10,9230.0,9490.0,8667.07,8770.22,341210200.0,38.374244,9.252623,281.601617,9579.068,10376.29,-0.071933,0.087039,1
38,2018-03-11,8770.22,9740.0,8450.0,9533.57,403491300.0,46.12382,-30.595551,219.162184,9342.486,10295.3185,-0.046652,-0.042191,0
39,2018-03-12,9533.57,9888.88,8780.0,9131.34,395562700.0,43.051678,-93.553687,156.61901,9186.754,10191.836,-0.101244,0.002044,1
40,2018-03-13,9131.34,9474.0,8823.0,9150.0,367354000.0,43.240554,-140.325175,97.230173,9162.426,10127.456,-0.171162,-0.107104,0
41,2018-03-14,9151.92,9333.78,7900.28,8170.0,426058000.0,36.410545,-253.546995,27.074739,8951.026,10045.404,-0.213169,0.008688,1
42,2018-03-15,8184.01,8430.0,7650.0,8240.98,422130700.0,37.184457,-333.701902,-45.080589,8845.178,9950.901,-0.261097,0.002308,1
