In [157]:
import warnings
import pathlib
import os
import sys
import concurrent.futures
import re

import pandas as pd

from benzinga import news_data
from datetime import datetime
from bs4 import BeautifulSoup, MarkupResemblesLocatorWarning

warnings.filterwarnings("ignore")

In [2]:
api_key = '092fe3da34ad421f99c42265ce3a7cbd'

In [3]:
project_folder = pathlib.Path(os.path.abspath("__file__")).resolve().parent.parent
sys.path.insert(1, str(project_folder))

from classes.yahoo_parser import SP500Parser

In [4]:
yahoo_parser = SP500Parser()

start = '2013-01-01'
end = datetime.today().strftime('%Y-%m-%d')
stocks = yahoo_parser.download_sp500_data(start, end)

[*********************100%%**********************]  503 of 503 completed

2 Failed downloads:
['BF.B']: Exception('%ticker%: No price data found, symbol may be delisted (1d 2013-01-01 -> 2024-02-23)')
['BRK.B']: Exception('%ticker%: No timezone found, symbol may be delisted')


In [5]:
def remove_html_tags(text):
    soup = BeautifulSoup(text, "html.parser")
    return soup.get_text()

def get_news(ticker, page, date_from, date_to, display_output="full"):
    news = paper.news(company_tickers=ticker, display_output=display_output, date_from=date_from, date_to=date_to, page=page, pagesize=100)
    if (len(news) == 0):
        return []
    df = pd.DataFrame(news)
    df['teaser'] = df['teaser'].apply(remove_html_tags)
    df['body'] = df['body'].apply(remove_html_tags)
    return df

def create_datasets_folder():
    if not os.path.exists('datasets'):
        os.makedirs('datasets')

def get_news_by_ticker(ticker):
    try:
        page = 0
        main_df = pd.DataFrame()
        date_from = start
        total = 0
        while True:
            if page > 100:
                date_from  = datetime.strptime(main_df['updated'].iloc[-1], "%a, %d %b %Y %H:%M:%S %z").strftime('%Y-%m-%d')
                page = 0
            news_df = get_news(ticker, page, date_from, today_date, 'full')
            if (len(news_df) == 0):
                break
            main_df = pd.concat([main_df, news_df], ignore_index=True)
            main_df = main_df.drop_duplicates(subset=['id'])
            page += 1
            total += 1
            print(f"{ticker} - {total} page. Added rows: {len(news_df)} total: {len(main_df)}")
        if ticker == 'BRK.B' or ticker == 'BRK.A':
            ticker = 'BRK-B'
        main_df.to_csv(f"datasets/news_sp_500_{ticker}.csv")
        return ticker
    except Exception as e:
        print(e)

def merge_all_in_one_file():
    main_df = pd.DataFrame()
    for ticker in tickers:
        if ticker == 'BRK.B':
            ticker = 'BRK-B'
        df = pd.read_csv(f"datasets/news_sp_500_{ticker}.csv")
        main_df = pd.concat([main_df, df], ignore_index=True)
        main_df = main_df.drop_duplicates(subset=['id'])
        os.remove(f"datasets/news_sp_500_{ticker}.csv")
    print(f"Rows in total {len(main_df)}")
    main_df.to_csv(f"datasets/news_sp_500.csv")

def run_concurent(max_workers=10):
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [executor.submit(get_news_by_ticker, ticker) for ticker in tickers]
        for future in concurrent.futures.as_completed(futures):
            ticker = future.result()
            print(f"Ticker {ticker} done")

def check_all_files():
    for ticker in tickers:
        if ticker == 'BRK.B':
            ticker = 'BRK-B'
        df = pd.read_csv(f"datasets/news_sp_500_{ticker}.csv")
        if len(df) == 0:
            print(f"{ticker} - {len(df)}")

def zip_all_datasets():
    zf = zipfile.ZipFile('news_datasets.zip', mode='w')
    for ticker in tickers:
        if ticker == 'BRK.B':
            ticker = 'BRK-B'
        zf.write(f"datasets/news_sp_500_{ticker}.csv")
    zf.write(f"datasets/news_sp_500.csv")
    zf.close()

In [7]:
main_df = pd.DataFrame()
today_date = datetime.today().strftime('%Y-%m-%d')
paper = news_data.News(api_key, log=False)
tickers = yahoo_parser.get_sp500_tickers()

In [7]:
create_datasets_folder()
run_concurent()

Ticker AAL done
Ticker ABNB done
Ticker ACGL done
Ticker ABT done
Ticker ACN done
Ticker ADI done
Ticker ABBV done
Ticker ADBE done
Ticker A done
AAPL - 1 page. Added rows: 100 total: 29
Ticker ADP done
Ticker ADM done
Ticker ADSK done
Ticker AIZ done
Ticker AFL done
Ticker AIG done
Ticker AEP done
Ticker AEE done
Ticker AES done
Ticker AJG done
Ticker ALB done
Ticker ALL done
Ticker ALGN done
Ticker ALLE done
Ticker AMAT done
Ticker AMCR done
Ticker AKAM done
Ticker AMD done
Ticker AME done
Ticker AMT done
Ticker ANSS done
Ticker AMZN done
Ticker AMP done
AAPL - 2 page. Added rows: 100 total: 58
Ticker ANET done
Ticker AOS done
AMGN - 1 page. Added rows: 100 total: 28
Ticker APD done
Ticker AVGO done
Ticker AVY done
AMGN - 2 page. Added rows: 100 total: 60
Ticker AWK done
Ticker APTV done
Ticker ARE done
Ticker APH done
Ticker ATO done
Ticker APA done
Ticker AVB done
Ticker AXON done
Ticker AON done
Ticker AZO done
Ticker BALL done
Ticker BAC done
Ticker BAX done
AAPL - 3 page. Added 

In [8]:
merge_all_in_one_file()

Rows in total 45407


In [19]:
df_news = pd.read_csv('datasets/news_sp_500.csv').iloc[:,2:][['updated', 'stocks', 'body']].dropna()

df_news['stocks'] = df_news['stocks'].apply(eval).apply(lambda x: [entry['name'] for entry in x])

df_news = df_news.explode('stocks')

df_news['updated'] = pd.to_datetime(df_news['updated']).dt.tz_localize(None)

df_news['updated'] = df_news['updated'].dt.date
df_news['updated'] = pd.to_datetime(df_news['updated'])

df_news.head()

Unnamed: 0,updated,stocks,body
1,2013-01-02,AAPL,Futures Up Strong on Fiscal Cliff Deal\nU.S. e...
1,2013-01-02,BROAD,Futures Up Strong on Fiscal Cliff Deal\nU.S. e...
1,2013-01-02,BZSUM,Futures Up Strong on Fiscal Cliff Deal\nU.S. e...
1,2013-01-02,CAR,Futures Up Strong on Fiscal Cliff Deal\nU.S. e...
1,2013-01-02,EARLY,Futures Up Strong on Fiscal Cliff Deal\nU.S. e...


In [15]:
df_stocks = stocks[['Symbol', 'Adj Close']]
df_stocks['pct_diff'] = df_stocks.groupby('Symbol')['Adj Close'].pct_change() * 100
df_stocks = df_stocks.dropna().reset_index()
df_stocks['Date'] = pd.to_datetime(df_stocks['Date'])

In [16]:
df_merged = df_news.merge(df_stocks,
                          how='inner',
                          left_on=['updated', 'stocks'],
                          right_on=['Date', 'Symbol'])[['Symbol', 'body', 'pct_diff']]

In [20]:
df.to_csv('datasets/merged.csv')

In [17]:
df_merged = pd.read_csv('datasets/merged.csv')[['Symbol', 'body', 'pct_diff']]

In [18]:
df_merged

Unnamed: 0,Symbol,body,pct_diff
0,AAPL,Microsoft (NASDAQ: MSFT) has purchased technol...,-1.262273
1,AAPL,Apple (NASDAQ: AAPL) is reportedly interested ...,-1.262273
2,AAPL,The year has opened with a bit of good news fo...,-1.262273
3,AAPL,"On Thursday, the NASDAQ stock market is appare...",-1.262273
4,AAPL,Sony (NYSE: SNE) is getting a lot of attention...,-1.262273
...,...,...,...
143526,EBAY,After Ford Motor (NYSE:F) decided to cut the o...,-1.572028
143527,F,After Ford Motor (NYSE:F) decided to cut the o...,1.991338
143528,WMT,After Ford Motor (NYSE:F) decided to cut the o...,0.333254
143529,WMT,Walmart Inc. (NYSE:WMT) announced on Tuesday e...,0.333254


In [117]:
import pandas as pd
import numpy as np
import nltk

from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.tokenize import word_tokenize 

import re
from string import punctuation

nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /Users/teal/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /Users/teal/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [18]:
def text_to_wordlist(text, remove_stop_words=True, stem_words=False): 
    text = text.replace('\n', '')
    text = text.replace('\r\r', '')
    text = re.sub(r"[^A-Za-z0-9]", " ", text).lower()
    text = re.sub(r'\d+', '', text)
    
    # Remove punctuation from text
    text = ''.join([c for c in text if c not in punctuation])
    
    # Optionally, remove stop words
    if remove_stop_words:
        stop_words = set(stopwords.words("english")) 
        word_tokens = word_tokenize(text) 
        text = [word for word in word_tokens if word not in stop_words] 
    
    # Optionally, shorten words to their stems
    if stem_words:
        # text = text.split()
        stemmer = SnowballStemmer('english')
        text = [stemmer.stem(word) for word in text]
    
    # Return a list of words
    return text

In [19]:
df_merged['body_preprocessed'] = df_merged['body'].apply(lambda x: text_to_wordlist(x, stem_words=True))

KeyboardInterrupt: 

In [73]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

vectorizer = TfidfVectorizer(
    input="content",
    tokenizer=text_to_wordlist,
    token_pattern=None,
)

X_train, X_test, y_train, y_test = train_test_split(df_merged, df_merged["pct_diff"], test_size=0.3)


X_train_body_vectorized = vectorizer.fit_transform(X_train["body"])
X_test_body_vectorized = vectorizer.transform(X_test["body"])

In [74]:
from sklearn.preprocessing import OneHotEncoder
from scipy.sparse import hstack

encoder = OneHotEncoder(drop='first', handle_unknown='ignore')

X_train_encoded = encoder.fit_transform(np.array(X_train['Symbol']).reshape(-1, 1))
X_train_proccesed = hstack([X_train_body_vectorized, X_train_encoded])

X_test_encoded = encoder.transform(np.array(X_test['Symbol']).reshape(-1, 1))
X_test_proccesed = hstack([X_test_body_vectorized, X_test_encoded])

In [75]:
from catboost import CatBoostRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error, r2_score

model = CatBoostRegressor()

model.fit(X_train_proccesed, y_train)

scores = {
    'mean_squared_error': mean_squared_error(y_test, model.predict(X_test_proccesed)),
    'mean_squared_error': mean_squared_error(y_test, model.predict(X_test_proccesed)),
    'mean_absolute_percentage_error': mean_absolute_percentage_error(y_test, model.predict(X_test_proccesed)),
    'r2': r2_score(y_test, model.predict(X_test_proccesed)),
}
scores

Learning rate set to 0.084821
0:	learn: 3.5141730	total: 1.04s	remaining: 17m 20s
1:	learn: 3.4933734	total: 1.81s	remaining: 15m 3s
2:	learn: 3.4747799	total: 2.56s	remaining: 14m 10s
3:	learn: 3.4595400	total: 3.41s	remaining: 14m 9s
4:	learn: 3.4453526	total: 4.22s	remaining: 14m
5:	learn: 3.4327733	total: 5.1s	remaining: 14m 4s
6:	learn: 3.4229517	total: 5.99s	remaining: 14m 10s
7:	learn: 3.4136131	total: 6.86s	remaining: 14m 10s
8:	learn: 3.4062459	total: 7.7s	remaining: 14m 8s
9:	learn: 3.3993854	total: 8.55s	remaining: 14m 6s
10:	learn: 3.3927377	total: 9.45s	remaining: 14m 9s
11:	learn: 3.3863389	total: 10.4s	remaining: 14m 14s
12:	learn: 3.3814835	total: 11.3s	remaining: 14m 18s
13:	learn: 3.3767415	total: 12.2s	remaining: 14m 17s
14:	learn: 3.3728779	total: 13.1s	remaining: 14m 19s
15:	learn: 3.3687898	total: 13.9s	remaining: 14m 15s
16:	learn: 3.3645428	total: 14.8s	remaining: 14m 13s
17:	learn: 3.3603848	total: 15.6s	remaining: 14m 9s
18:	learn: 3.3569955	total: 16.4s	remai

{'mean_squared_error': 10.48606994087936,
 'mean_absolute_percentage_error': 6558457133422.015,
 'r2': 0.15355473743178216}

In [76]:
from xgboost import XGBRegressor
model = XGBRegressor()

model.fit(X_train_proccesed, y_train)

scores = {
    'mean_squared_error': mean_squared_error(y_test, model.predict(X_test_proccesed)),
    'mean_absolute_percentage_error': mean_absolute_percentage_error(y_test, model.predict(X_test_proccesed)),
    'r2': r2_score(y_test, model.predict(X_test_proccesed)),
}
scores

{'mean_squared_error': 10.668367248496512,
 'mean_absolute_percentage_error': 6499537059397.321,
 'r2': 0.13883953018240736}

In [77]:
from lightgbm import LGBMRegressor
 
model = LGBMRegressor(metric='mse')

model.fit(X_train_proccesed, y_train)

scores = {
    'mean_squared_error': mean_squared_error(y_test, model.predict(X_test_proccesed)),
    'mean_absolute_percentage_error': mean_absolute_percentage_error(y_test, model.predict(X_test_proccesed)),
    'r2': r2_score(y_test, model.predict(X_test_proccesed)),
}
scores

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 5.592813 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1902983
[LightGBM] [Info] Number of data points in the train set: 100471, number of used features: 40428
[LightGBM] [Info] Start training from score 0.107487


{'mean_squared_error': 10.633911638477876,
 'mean_absolute_percentage_error': 6149678652337.21,
 'r2': 0.14162081888576372}

In [24]:
df_merged[df_merged['Symbol'] == 'AAPL']

# Only AAPL

In [58]:
df_aapl = df_merged[df_merged['Symbol'] == 'AAPL'][['body', 'pct_diff']]

In [59]:
df_aapl['body_preprocessed'] = df_aapl['body'].apply(lambda x: text_to_wordlist(x, stem_words=True))

In [61]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

vectorizer = TfidfVectorizer(
    input="content",
    tokenizer=text_to_wordlist,
    token_pattern=None,
)

X_train, X_test, y_train, y_test = train_test_split(df_aapl, df_aapl["pct_diff"], test_size=0.3)


X_train_body_vectorized = vectorizer.fit_transform(X_train["body"])
X_test_body_vectorized = vectorizer.transform(X_test["body"])

In [62]:
from catboost import CatBoostRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error, r2_score

model = CatBoostRegressor()

model.fit(X_train_body_vectorized, y_train)

scores = {
    'mean_squared_error': mean_squared_error(y_test, model.predict(X_test_body_vectorized)),
    'mean_absolute_percentage_error': mean_absolute_percentage_error(y_test, model.predict(X_test_body_vectorized)),
    'r2': r2_score(y_test, model.predict(X_test_body_vectorized)),
}
scores

Learning rate set to 0.057529
0:	learn: 2.0949785	total: 295ms	remaining: 4m 54s
1:	learn: 2.0930747	total: 445ms	remaining: 3m 42s
2:	learn: 2.0916982	total: 558ms	remaining: 3m 5s
3:	learn: 2.0890859	total: 669ms	remaining: 2m 46s
4:	learn: 2.0873476	total: 774ms	remaining: 2m 34s
5:	learn: 2.0856431	total: 887ms	remaining: 2m 26s
6:	learn: 2.0846084	total: 996ms	remaining: 2m 21s
7:	learn: 2.0831892	total: 1.11s	remaining: 2m 17s
8:	learn: 2.0820847	total: 1.23s	remaining: 2m 15s
9:	learn: 2.0809140	total: 1.34s	remaining: 2m 13s
10:	learn: 2.0787592	total: 1.47s	remaining: 2m 12s
11:	learn: 2.0783463	total: 1.58s	remaining: 2m 10s
12:	learn: 2.0773026	total: 1.69s	remaining: 2m 8s
13:	learn: 2.0765409	total: 1.81s	remaining: 2m 7s
14:	learn: 2.0752359	total: 1.92s	remaining: 2m 5s
15:	learn: 2.0740933	total: 2.03s	remaining: 2m 4s
16:	learn: 2.0734786	total: 2.14s	remaining: 2m 3s
17:	learn: 2.0724447	total: 2.25s	remaining: 2m 2s
18:	learn: 2.0702818	total: 2.36s	remaining: 2m 1s


{'mean_squared_error': 4.1158709165198495,
 'mean_absolute_percentage_error': 3116774256794.7705,
 'r2': 0.05578023078144623}

In [63]:
from xgboost import XGBRegressor
model = XGBRegressor()

model.fit(X_train_body_vectorized, y_train)

scores = {
    'mean_squared_error': mean_squared_error(y_test, model.predict(X_test_body_vectorized)),
    'mean_absolute_percentage_error': mean_absolute_percentage_error(y_test, model.predict(X_test_body_vectorized)),
    'r2': r2_score(y_test, model.predict(X_test_body_vectorized)),
}
scores

{'mean_squared_error': 4.381126780122696,
 'mean_absolute_percentage_error': 6707651970997.321,
 'r2': -0.005071976538681078}

In [64]:
from lightgbm import LGBMRegressor
 
model = LGBMRegressor(metric='mse')

model.fit(X_train_body_vectorized, y_train)

scores = {
    'mean_squared_error': mean_squared_error(y_test, model.predict(X_test_body_vectorized)),
    'mean_absolute_percentage_error': mean_absolute_percentage_error(y_test, model.predict(X_test_body_vectorized)),
    'r2': r2_score(y_test, model.predict(X_test_body_vectorized)),
}
scores

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.247370 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 407666
[LightGBM] [Info] Number of data points in the train set: 8607, number of used features: 8083
[LightGBM] [Info] Start training from score 0.089547


{'mean_squared_error': 4.224062468412791,
 'mean_absolute_percentage_error': 7944728957177.876,
 'r2': 0.030960064106703045}

Merge all news of one week for every ticker in one string 

In [20]:
df_news = pd.read_csv('datasets/news_sp_500.csv').iloc[:,2:][['updated', 'stocks', 'body']].dropna()

df_news['stocks'] = df_news['stocks'].apply(eval).apply(lambda x: [entry['name'] for entry in x])

df_news = df_news.explode('stocks')

df_news['updated'] = pd.to_datetime(df_news['updated']).dt.tz_localize(None)

df_news['updated'] = df_news['updated'].dt.date
df_news['updated'] = pd.to_datetime(df_news['updated'])

df_news.head()

Unnamed: 0,updated,stocks,body
1,2013-01-02,AAPL,Futures Up Strong on Fiscal Cliff Deal\nU.S. e...
1,2013-01-02,BROAD,Futures Up Strong on Fiscal Cliff Deal\nU.S. e...
1,2013-01-02,BZSUM,Futures Up Strong on Fiscal Cliff Deal\nU.S. e...
1,2013-01-02,CAR,Futures Up Strong on Fiscal Cliff Deal\nU.S. e...
1,2013-01-02,EARLY,Futures Up Strong on Fiscal Cliff Deal\nU.S. e...


In [51]:
df_news_daily = df_news.groupby([df_news['updated'], 'stocks'])['body'].apply(lambda x: '\n'.join(x)).reset_index()

In [47]:
import pandas as pd
import pandas as pd
from ta.trend import MACD

def apply_features(group):
    group.index = pd.to_datetime(group.index)

    # Compute lag features, moving averages, etc., for the group
    for lag in range(1, 4):
        group[f'lag_{lag}'] = group['Close'].shift(lag)
    group['weekly_return'] = group['Close'].pct_change(5)
    group['5_day_MA'] = group['Close'].rolling(window=5).mean()
    group['20_day_MA'] = group['Close'].rolling(window=20).mean()
    group['5_day_volatility'] = group['Close'].rolling(window=5).std()
    group['momentum'] = group['Close'] - group['Close'].shift(1)
    
    # MACD, ensuring you handle NaNs as per your strategy

    macd = MACD(close=group['Close'], window_slow=26, window_fast=12, window_sign=9)
    group['MACD'] = macd.macd()
    group['MACD_signal'] = macd.macd_signal()
    group['MACD_histogram'] = macd.macd_diff()

    # Adjusting for multi-stock data: adding week_of_year and month
    group['week_of_year'] = group.index.isocalendar().week
    group['month'] = group.index.month
    
    return group.dropna()  # Optionally drop NaNs

# Apply the function to each group and recombine
# data_grouped = data.groupby('Symbol').apply(apply_features)


In [66]:
df_stocks = stocks.groupby('Symbol').apply(apply_features)

In [67]:
df_stocks.index = df_stocks.index.droplevel()
df_stocks.reset_index(inplace=True)
df_stocks

Unnamed: 0,Date,Symbol,Adj Close,Close,High,Low,Open,Volume,lag_1,lag_2,...,weekly_return,5_day_MA,20_day_MA,5_day_volatility,momentum,MACD,MACD_signal,MACD_histogram,week_of_year,month
0,2013-02-20,A,27.471472,30.214592,30.650930,30.200287,30.643778,5410260.0,30.765379,30.221745,...,-0.053339,31.020028,31.796495,0.878207,-0.550787,-0.056695,0.209343,-0.266038,8,2
1,2013-02-21,A,27.074751,29.778255,30.143063,29.663805,30.143063,4774450.0,30.214592,30.765379,...,-0.069721,30.573677,31.701001,0.814060,-0.436337,-0.177837,0.131907,-0.309744,8,2
2,2013-02-22,A,27.185310,29.899857,30.092991,29.742489,29.921316,4690150.0,29.778255,30.214592,...,-0.062360,30.175965,31.598712,0.382559,0.121601,-0.261022,0.053321,-0.314343,8,2
3,2013-02-25,A,26.853619,29.535049,30.200287,29.535049,30.107296,5064255.0,29.899857,29.778255,...,-0.022722,30.038626,31.455651,0.474282,-0.364807,-0.352323,-0.027808,-0.324515,9,2
4,2013-02-26,A,26.645504,29.306152,29.535049,28.748213,29.055794,8647888.0,29.535049,29.899857,...,-0.047431,29.746781,31.305079,0.347222,-0.228897,-0.438099,-0.109866,-0.328233,9,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1348219,2024-02-15,ZTS,189.649994,189.649994,190.339996,183.860001,183.860001,2725600.0,184.080002,183.490005,...,-0.031162,190.246002,190.657500,6.625424,5.569992,-0.425651,0.179822,-0.605472,7,2
1348220,2024-02-16,ZTS,188.389999,188.389999,190.350006,187.929993,189.399994,1953700.0,189.649994,184.080002,...,-0.045256,188.460001,190.658000,5.315994,-1.259995,-0.585697,0.026718,-0.612414,7,2
1348221,2024-02-20,ZTS,186.550003,186.550003,189.410004,186.240005,187.300003,2502800.0,188.389999,189.649994,...,-0.051553,186.432001,190.425500,2.664132,-1.839996,-0.851194,-0.148865,-0.702330,8,2
1348222,2024-02-21,ZTS,188.380005,188.380005,188.860001,186.660004,186.660004,3179300.0,186.550003,188.389999,...,0.026650,187.410001,190.439500,2.164911,1.830002,-0.903522,-0.299796,-0.603726,8,2


In [76]:
df_merged = df_news_daily.merge(df_stocks,
                                how='inner',
                                left_on=['updated', 'stocks'],
                                right_on=['Date', 'Symbol'])


df_merged.drop(['stocks', 'updated'], axis=1, inplace=True)
df_merged.columns = df_merged.columns.str.lower()
df_merged

Unnamed: 0,body,date,symbol,adj close,close,high,low,open,volume,lag_1,...,weekly_return,5_day_ma,20_day_ma,5_day_volatility,momentum,macd,macd_signal,macd_histogram,week_of_year,month
0,Shares of Apple (NASDAQ: AAPL) are trading dow...,2013-02-20,AAPL,13.771608,16.030357,16.346071,16.028570,16.346071,476302400.0,16.428213,...,-0.040714,16.447143,16.481375,0.262181,-0.397856,-0.427553,-0.489380,0.061827,8,2
1,Shares of Apple (NASDAQ: AAPL) are trading dow...,2013-02-20,AMZN,13.320500,13.320500,13.715000,13.318500,13.510000,70578000.0,13.487500,...,0.029803,13.399600,13.330875,0.105347,-0.167000,0.012436,-0.008985,0.021421,8,2
2,Boeing (NYSE: BA) and its negotiations team ar...,2013-02-20,BA,63.067375,74.779999,76.250000,74.750000,75.639999,7551300.0,74.650002,...,-0.015923,74.834000,75.151500,0.147749,0.129997,-0.211090,-0.116235,-0.094855,8,2
3,\r\n\tAlthough Congress is in recess for the w...,2013-02-20,CI,56.092823,59.270000,60.580002,59.220001,60.570000,1876800.0,60.430000,...,-0.043724,60.824001,59.843500,1.020284,-1.160000,1.135656,1.358660,-0.223004,8,2
4,"With the fiscal cliff having been averted, “bu...",2013-02-20,COST,80.679810,101.080002,102.629997,101.050003,102.180000,1915100.0,101.900002,...,-0.006682,101.759999,102.346000,0.428543,-0.820000,-0.034452,0.111248,-0.145701,8,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
86227,"On CNBC’s ""Mad Money Lightning Round,"" Jim Cra...",2024-02-02,SLB,48.722450,49.000000,49.180000,48.490002,48.900002,19020300.0,49.000000,...,-0.074074,49.850000,49.987000,1.886797,0.000000,-0.406141,-0.320799,-0.085342,5,2
86228,"Meta Platforms, Inc. (NASDAQ:META) announced o...",2024-02-02,TSLA,187.910004,187.910004,188.690002,182.000000,185.039993,110505100.0,188.860001,...,0.025430,189.315997,209.535501,1.875200,-0.949997,-13.777086,-12.046669,-1.730417,5,2
86229,Billionaire Elon Musk tried to play down Apple...,2024-02-02,WBD,10.250000,10.250000,10.370000,10.130000,10.370000,16107500.0,10.460000,...,-0.034840,10.264000,10.543500,0.209833,-0.210000,-0.233812,-0.223187,-0.010625,5,2
86230,"Plug Power, Inc. (NASDAQ:PLUG) shares are trad...",2024-02-02,WMT,169.570007,169.570007,170.580002,167.919998,168.149994,7218900.0,168.309998,...,0.032264,166.751999,162.653501,2.055830,1.260010,2.503528,1.903171,0.600356,5,2


In [85]:
def text_to_wordlist(text, remove_stop_words=True, stem_words=False): 
    text = text.replace('\n', '')
    text = text.replace('\r', '')
    text = re.sub(r"[^A-Za-z0-9]", " ", text).lower()
    text = re.sub(r'\d+', '', text)
    
    # Remove punctuation from text
    text = ''.join([c for c in text if c not in punctuation])
    
    # Optionally, remove stop words
    if remove_stop_words:
        stop_words = set(stopwords.words("english")) 
        word_tokens = word_tokenize(text) 
        text = [word for word in word_tokens if word not in stop_words] 
    
    # Optionally, shorten words to their stems
    if stem_words:
        # text = text.split()
        stemmer = SnowballStemmer('english')
        text = [stemmer.stem(word) for word in text]
    
    # Return a list of words
    return text

In [144]:
import inflect 
q = inflect.engine() 
  
def is_digit(string):
    try:
        float(string)
        return True
    except ValueError:
        return False 
        
def convert_num(text): 
    # split strings into list of texts 
    # temp_string = text.split() 
    # initialise empty list 
    new_str = [] 
  
    for word in text: 
        # if text is a digit, convert the digit 
        # to numbers and append into the new_str list 
        if is_digit(word): 
            temp = q.number_to_words(word) 
            new_str.append(temp) 
  
        # append the texts as it is 
        else: 
            new_str.append(word) 
  
    # join the texts of new_str to form a string 
    temp_str = ' '.join(new_str) 
    return temp_str 



def text_to_wordlist(text, remove_stop_words=True, stem_words=False, convert_numbers=True): 
    text = text.replace('\n', '')
    text = text.replace('\r', '')
    text = re.sub(r"[^A-Za-z0-9]", " ", text).lower()
    text = re.sub(r'\d+', '', text)
    
    # Remove punctuation from text
    text = ''.join([c for c in text if c not in punctuation])
    
    # Optionally, remove stop words
    if remove_stop_words:
        stop_words = set(stopwords.words("english")) 
        word_tokens = word_tokenize(text) 
        text = [word for word in word_tokens if word not in stop_words] 
    
    # Optionally, shorten words to their stems
    if stem_words:
        stemmer = SnowballStemmer('english')
        text = [stemmer.stem(word) for word in text]
        
    if convert_numbers:
        text = convert_num(text)
        
    # Return a list of words
    return text


In [147]:
%%time
df_merged['body_preprocessed'] = df_merged['body'].apply(lambda x: text_to_wordlist(x, stem_words=True))

CPU times: user 15min 8s, sys: 4.25 s, total: 15min 12s
Wall time: 15min 16s


In [168]:
df_merged.drop('body', axis=1).to_csv('datasets/df_merged.csv')

In [171]:
import time

In [187]:
df_merged.drop("body", axis=1, inplace=True)

In [218]:
%%time

X_train, X_test, y_train, y_test = train_test_split(df_merged.drop(['date', 'symbol', 'weekly_return'], axis=1),
                                                    df_merged["weekly_return"],
                                                    test_size=0.3)


X_train = vectorizer.fit_transform(X_train["body_preprocessed"])
X_test = vectorizer.transform(X_test["body_preprocessed"])


model = CatBoostRegressor()

model.fit(X_train, y_train)


y_pred = model.predict(X_test)

scores = {
    'mean_squared_error': mean_squared_error(y_test, model.predict(X_test)),
    'mean_absolute_percentage_error': mean_absolute_percentage_error(y_test, model.predict(X_test)),
    'r2': r2_score(y_test, model.predict(X_test)),
}
print(scores)

CPU times: user 22.2 ms, sys: 23.5 ms, total: 45.8 ms
Wall time: 52.1 ms


In [220]:
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_percentage_error
from catboost import CatBoostRegressor

# Assuming vectorizer is defined outside the snippet

# Define a ColumnTransformer to handle both text and numeric features
preprocessor = ColumnTransformer(
    transformers=[
        ('text', TfidfVectorizer(), 'body_preprocessed'),  # Text feature
        # Add other numeric features here if any
        # ('numeric', 'passthrough', ['numeric_feature1', 'numeric_feature2']),
    ],
    remainder='passthrough'  # Keep the remaining columns as they are
)

# Define the pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
])

# Split data
X_train, X_test, y_train, y_test = train_test_split(
    df_merged.drop(['date', 'symbol', 'weekly_return'], axis=1),
    df_merged["weekly_return"],
    test_size=0.3
)

# Preprocess data
X_train_processed = pipeline.fit_transform(X_train)
X_test_processed = pipeline.transform(X_test)

# Initialize and train the model
model = CatBoostRegressor()
model.fit(X_train_processed, y_train)

# Make predictions
y_pred = model.predict(X_test_processed)

# Calculate scores
scores = {
    'mean_squared_error': mean_squared_error(y_test, y_pred),
    'mean_absolute_percentage_error': mean_absolute_percentage_error(y_test, y_pred),
    'r2': r2_score(y_test, y_pred),
}
print(scores)


Learning rate set to 0.07826
0:	learn: 0.0594483	total: 1.22s	remaining: 20m 13s
1:	learn: 0.0572738	total: 2.14s	remaining: 17m 49s
2:	learn: 0.0552361	total: 3.1s	remaining: 17m 11s
3:	learn: 0.0534326	total: 4.11s	remaining: 17m 2s
4:	learn: 0.0517727	total: 5.2s	remaining: 17m 14s
5:	learn: 0.0503338	total: 6.21s	remaining: 17m 9s
6:	learn: 0.0491500	total: 7.27s	remaining: 17m 10s
7:	learn: 0.0479706	total: 8.3s	remaining: 17m 9s
8:	learn: 0.0468571	total: 9.26s	remaining: 16m 59s
9:	learn: 0.0457921	total: 10.2s	remaining: 16m 52s
10:	learn: 0.0448724	total: 11.3s	remaining: 16m 54s
11:	learn: 0.0440140	total: 12.3s	remaining: 16m 53s
12:	learn: 0.0431950	total: 13.2s	remaining: 16m 44s
13:	learn: 0.0424706	total: 14.2s	remaining: 16m 38s
14:	learn: 0.0417427	total: 15.2s	remaining: 16m 36s
15:	learn: 0.0411340	total: 16.2s	remaining: 16m 37s
16:	learn: 0.0405875	total: 17.3s	remaining: 16m 37s
17:	learn: 0.0399826	total: 18.2s	remaining: 16m 30s
18:	learn: 0.0394867	total: 19.1s