In [1]:
import warnings
import pathlib
import os
import sys
import concurrent.futures


import pandas as pd

from benzinga import news_data
from datetime import datetime
from bs4 import BeautifulSoup, MarkupResemblesLocatorWarning

warnings.filterwarnings("ignore")

In [2]:
api_key = '092fe3da34ad421f99c42265ce3a7cbd'

In [3]:
project_folder = pathlib.Path(os.path.abspath("__file__")).resolve().parent.parent
sys.path.insert(1, str(project_folder))

from classes.yahoo_parser import SP500Parser

In [4]:
yahoo_parser = SP500Parser()

start = '2013-01-01'
end = datetime.today().strftime('%Y-%m-%d')
stocks = yahoo_parser.download_sp500_data(start, end)

[*********************100%%**********************]  503 of 503 completed

2 Failed downloads:
['BF.B']: Exception('%ticker%: No price data found, symbol may be delisted (1d 2013-01-01 -> 2024-02-05)')
['BRK.B']: Exception('%ticker%: No timezone found, symbol may be delisted')


In [5]:
def remove_html_tags(text):
    soup = BeautifulSoup(text, "html.parser")
    return soup.get_text()

def get_news(ticker, page, date_from, date_to, display_output="full"):
    news = paper.news(company_tickers=ticker, display_output=display_output, date_from=date_from, date_to=date_to, page=page, pagesize=100)
    if (len(news) == 0):
        return []
    df = pd.DataFrame(news)
    df['teaser'] = df['teaser'].apply(remove_html_tags)
    df['body'] = df['body'].apply(remove_html_tags)
    return df

def create_datasets_folder():
    if not os.path.exists('datasets'):
        os.makedirs('datasets')

def get_news_by_ticker(ticker):
    try:
        page = 0
        main_df = pd.DataFrame()
        date_from = start
        total = 0
        while True:
            if page > 100:
                date_from  = datetime.strptime(main_df['updated'].iloc[-1], "%a, %d %b %Y %H:%M:%S %z").strftime('%Y-%m-%d')
                page = 0
            news_df = get_news(ticker, page, date_from, today_date, 'full')
            if (len(news_df) == 0):
                break
            main_df = pd.concat([main_df, news_df], ignore_index=True)
            main_df = main_df.drop_duplicates(subset=['id'])
            page += 1
            total += 1
            print(f"{ticker} - {total} page. Added rows: {len(news_df)} total: {len(main_df)}")
        if ticker == 'BRK.B' or ticker == 'BRK.A':
            ticker = 'BRK-B'
        main_df.to_csv(f"datasets/news_sp_500_{ticker}.csv")
        return ticker
    except Exception as e:
        print(e)

def merge_all_in_one_file():
    main_df = pd.DataFrame()
    for ticker in tickers:
        if ticker == 'BRK.B':
            ticker = 'BRK-B'
        df = pd.read_csv(f"datasets/news_sp_500_{ticker}.csv")
        main_df = pd.concat([main_df, df], ignore_index=True)
        main_df = main_df.drop_duplicates(subset=['id'])
        os.remove(f"datasets/news_sp_500_{ticker}.csv")
    print(f"Rows in total {len(main_df)}")
    main_df.to_csv(f"datasets/news_sp_500.csv")

def run_concurent(max_workers=10):
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [executor.submit(get_news_by_ticker, ticker) for ticker in tickers]
        for future in concurrent.futures.as_completed(futures):
            ticker = future.result()
            print(f"Ticker {ticker} done")

def check_all_files():
    for ticker in tickers:
        if ticker == 'BRK.B':
            ticker = 'BRK-B'
        df = pd.read_csv(f"datasets/news_sp_500_{ticker}.csv")
        if len(df) == 0:
            print(f"{ticker} - {len(df)}")

def zip_all_datasets():
    zf = zipfile.ZipFile('news_datasets.zip', mode='w')
    for ticker in tickers:
        if ticker == 'BRK.B':
            ticker = 'BRK-B'
        zf.write(f"datasets/news_sp_500_{ticker}.csv")
    zf.write(f"datasets/news_sp_500.csv")
    zf.close()

In [6]:
main_df = pd.DataFrame()
today_date = datetime.today().strftime('%Y-%m-%d')
paper = news_data.News(api_key, log=False)
tickers = yahoo_parser.get_sp500_tickers()

In [7]:
create_datasets_folder()
run_concurent()

Ticker AAL done
Ticker ABNB done
Ticker ACGL done
Ticker ABT done
Ticker ACN done
Ticker ADI done
Ticker ABBV done
Ticker ADBE done
Ticker A done
AAPL - 1 page. Added rows: 100 total: 29
Ticker ADP done
Ticker ADM done
Ticker ADSK done
Ticker AIZ done
Ticker AFL done
Ticker AIG done
Ticker AEP done
Ticker AEE done
Ticker AES done
Ticker AJG done
Ticker ALB done
Ticker ALL done
Ticker ALGN done
Ticker ALLE done
Ticker AMAT done
Ticker AMCR done
Ticker AKAM done
Ticker AMD done
Ticker AME done
Ticker AMT done
Ticker ANSS done
Ticker AMZN done
Ticker AMP done
AAPL - 2 page. Added rows: 100 total: 58
Ticker ANET done
Ticker AOS done
AMGN - 1 page. Added rows: 100 total: 28
Ticker APD done
Ticker AVGO done
Ticker AVY done
AMGN - 2 page. Added rows: 100 total: 60
Ticker AWK done
Ticker APTV done
Ticker ARE done
Ticker APH done
Ticker ATO done
Ticker APA done
Ticker AVB done
Ticker AXON done
Ticker AON done
Ticker AZO done
Ticker BALL done
Ticker BAC done
Ticker BAX done
AAPL - 3 page. Added 

In [8]:
merge_all_in_one_file()

Rows in total 45407


In [15]:
df_news = pd.read_csv('datasets/news_sp_500.csv').iloc[:,2:][['updated', 'stocks', 'body']].dropna()

In [16]:
df_news['stocks'] = df_news['stocks'].apply(eval).apply(lambda x: [entry['name'] for entry in x])

In [17]:
df_news = df_news.explode('stocks')

In [18]:
df_news['updated'] = pd.to_datetime(df_news['updated']).dt.tz_localize(None)

In [20]:
df_news['body'] = df_news['body'].apply(lambda x: x.replace('\n',''))

In [21]:
df_stocks = stocks[['Symbol', 'Adj Close']]
df_stocks['Pct Diff'] = df_stocks.groupby('Symbol')['Adj Close'].pct_change() * 100
df_stocks.dropna(inplace=True)
df_stocks.reset_index(inplace=True)

In [22]:
df_news['updated'] = df_news['updated'].dt.date

df_news['updated'] = pd.to_datetime(df_news['updated'])
df_stocks['Date'] = pd.to_datetime(df_stocks['Date'])

In [23]:
df_merged = df_news.merge(df_stocks,
                          how='inner',
                          left_on=['updated', 'stocks'],
                          right_on=['Date', 'Symbol'])[['Symbol', 'body', 'Pct Diff']]

In [26]:
df_merged

Unnamed: 0,Symbol,body,Pct Diff
0,AAPL,Microsoft (NASDAQ: MSFT) has purchased technol...,-1.262217
1,AAPL,Apple (NASDAQ: AAPL) is reportedly interested ...,-1.262217
2,AAPL,The year has opened with a bit of good news fo...,-1.262217
3,AAPL,"On Thursday, the NASDAQ stock market is appare...",-1.262217
4,AAPL,Sony (NYSE: SNE) is getting a lot of attention...,-1.262217
...,...,...,...
143526,EBAY,After Ford Motor (NYSE:F) decided to cut the o...,-1.572028
143527,F,After Ford Motor (NYSE:F) decided to cut the o...,1.991338
143528,WMT,After Ford Motor (NYSE:F) decided to cut the o...,0.333254
143529,WMT,Walmart Inc. (NYSE:WMT) announced on Tuesday e...,0.333254


In [33]:
df_merged.to_csv('datasets/merged.csv')

In [37]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
import re
from string import punctuation

In [43]:
stop_words = ['the','a','an','and','but','if','or','because','as','what','which','this','that','these','those','then',
              'just','so','than','such','both','through','about','for','is','of','while','during','to','What','Which',
              'Is','If','While','This']

def text_to_wordlist(text, remove_stop_words=True, stem_words=False):
    text = re.sub(r"[^A-Za-z0-9]", " ", text)

# Remove punctuation from text
    text = ''.join([c for c in text if c not in punctuation])
    
    # Optionally, remove stop words
    if remove_stop_words:
        text = text.split()
        text = [w for w in text if not w in stop_words]
        text = " ".join(text)
    
    # Optionally, shorten words to their stems
    if stem_words:
        text = text.split()
        stemmer = SnowballStemmer('english')
        stemmed_words = [stemmer.stem(word) for word in text]
        text = " ".join(stemmed_words)
    
    # Return a list of words
    return(text.lower())

In [47]:
df_merged['body_preprocessed'] = df_merged['body'].apply(text_to_wordlist)

In [50]:
df_merged

Unnamed: 0,Symbol,body,Pct Diff,body_preprocessed
0,AAPL,Microsoft (NASDAQ: MSFT) has purchased technol...,-1.262217,microsoft nasdaq msft has purchased technology...
1,AAPL,Apple (NASDAQ: AAPL) is reportedly interested ...,-1.262217,apple nasdaq aapl reportedly interested in acq...
2,AAPL,The year has opened with a bit of good news fo...,-1.262217,the year has opened with bit good news caffein...
3,AAPL,"On Thursday, the NASDAQ stock market is appare...",-1.262217,on thursday nasdaq stock market apparently hav...
4,AAPL,Sony (NYSE: SNE) is getting a lot of attention...,-1.262217,sony nyse sne getting lot attention week after...
...,...,...,...,...
143526,EBAY,After Ford Motor (NYSE:F) decided to cut the o...,-1.572028,after ford motor nyse f decided cut output its...
143527,F,After Ford Motor (NYSE:F) decided to cut the o...,1.991338,after ford motor nyse f decided cut output its...
143528,WMT,After Ford Motor (NYSE:F) decided to cut the o...,0.333254,after ford motor nyse f decided cut output its...
143529,WMT,Walmart Inc. (NYSE:WMT) announced on Tuesday e...,0.333254,walmart inc nyse wmt announced on tuesday even...


In [182]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
import re
from string import punctuation

In [183]:
def text_to_wordlist(text, remove_stop_words=True, stem_words=False):
    text = re.sub(r"[^A-Za-z0-9]", " ", text)
    text = ''.join([c for c in text if c not in punctuation])
    if remove_stop_words:
        text = text.split()
        text = [w for w in text if not w in stop_words]
        text = " ".join(text)
    
    # Optionally, shorten words to their stems
    if stem_words:
        text = text.split()
        stemmer = SnowballStemmer('english')
        stemmed_words = [stemmer.stem(word) for word in text]
        text = " ".join(stemmed_words)
    
    # Return a list of words
    return(text)

In [168]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(
    input="content",
    tokenizer=text_to_wordlist,
    token_pattern=None,
)

X_train, X_test, y_train, y_test = train_test_split(df_merged, df_merged["Pct Diff"], test_size=0.3)


X_train_body_vectorized = vectorizer.fit_transform(X_train["body"])
X_test_body_vectorized = vectorizer.transform(X_test["body"])

In [197]:
from sklearn.preprocessing import OneHotEncoder
from scipy.sparse import hstack

encoder = OneHotEncoder(drop='first', handle_unknown='ignore')

X_train_encoded = encoder.fit_transform(np.array(X_train['Symbol']).reshape(-1, 1))
X_train_proccesed = hstack([X_train_body_vectorized, X_train_encoded])

X_test_encoded = encoder.transform(np.array(X_test['Symbol']).reshape(-1, 1))
X_test_proccesed = hstack([X_test_body_vectorized, X_test_encoded])

In [203]:
from catboost import CatBoostRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error, r2_score

model = CatBoostRegressor()

model.fit(X_train_proccesed, y_train)

scores = {
    'mean_squared_error': mean_squared_error(y_test, model.predict(X_test_proccesed)),
    'mean_squared_error': mean_squared_error(y_test, model.predict(X_test_proccesed)),
    'mean_absolute_percentage_error': mean_absolute_percentage_error(y_test, model.predict(X_test_proccesed)),
    'r2': r2_score(y_test, model.predict(X_test_proccesed)),
}
scores

Learning rate set to 0.084821
0:	learn: 3.4954522	total: 5.5ms	remaining: 5.49s
1:	learn: 3.4760321	total: 9.86ms	remaining: 4.92s
2:	learn: 3.4597058	total: 14.4ms	remaining: 4.78s
3:	learn: 3.4455783	total: 19ms	remaining: 4.73s
4:	learn: 3.4337324	total: 23.4ms	remaining: 4.66s
5:	learn: 3.4238544	total: 28.1ms	remaining: 4.65s
6:	learn: 3.4152401	total: 32.6ms	remaining: 4.63s
7:	learn: 3.4075351	total: 37.7ms	remaining: 4.67s
8:	learn: 3.4008230	total: 42.7ms	remaining: 4.7s
9:	learn: 3.3954864	total: 47.2ms	remaining: 4.67s
10:	learn: 3.3911757	total: 51.5ms	remaining: 4.63s
11:	learn: 3.3865333	total: 55.7ms	remaining: 4.59s
12:	learn: 3.3816248	total: 60.7ms	remaining: 4.61s
13:	learn: 3.3780561	total: 65.3ms	remaining: 4.6s
14:	learn: 3.3746676	total: 69.7ms	remaining: 4.58s
15:	learn: 3.3713567	total: 74.4ms	remaining: 4.57s
16:	learn: 3.3682157	total: 79.1ms	remaining: 4.57s
17:	learn: 3.3661790	total: 83.6ms	remaining: 4.56s
18:	learn: 3.3639405	total: 88.6ms	remaining: 4.5

{'mean_squared_error': 11.046047151586857,
 'mean_absolute_percentage_error': 5779378004157.431,
 'r2': 0.1335667113316964}

In [179]:
from xgboost import XGBRegressor
model = XGBRegressor()

model.fit(X_train_proccesed, y_train)

scores = {
    'mean_squared_error': mean_squared_error(y_test, model.predict(X_test_proccesed)),
    'mean_absolute_percentage_error': mean_absolute_percentage_error(y_test, model.predict(X_test_proccesed)),
    'r2': r2_score(y_test, model.predict(X_test_proccesed)),
}
scores

{'mean_squared_error': 11.172709892059533,
 'mean_absolute_percentage_error': 6713571657369.681,
 'r2': 0.12363149982359456}

In [181]:
from lightgbm import LGBMRegressor
 
model = LGBMRegressor(metric='mse')

model.fit(X_train_proccesed, y_train)

scores = {
    'mean_squared_error': mean_squared_error(y_test, model.predict(X_test_proccesed)),
    'mean_absolute_percentage_error': mean_absolute_percentage_error(y_test, model.predict(X_test_proccesed)),
    'r2': r2_score(y_test, model.predict(X_test_proccesed)),
}
scores

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004141 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 10373
[LightGBM] [Info] Number of data points in the train set: 100471, number of used features: 506
[LightGBM] [Info] Start training from score 0.100895


{'mean_squared_error': 11.275614513568575,
 'mean_absolute_percentage_error': 5153113998003.363,
 'r2': 0.11555983505431655}