In [1]:
import warnings
import pathlib
import os
import sys
import concurrent.futures
import re
import timeit
import inflect 
import nltk
import json

import pandas as pd
import numpy as np


from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.tokenize import word_tokenize 
from benzinga import news_data
from datetime import datetime
from bs4 import BeautifulSoup, MarkupResemblesLocatorWarning
from string import punctuation
from ta.trend import MACD
from dotenv import load_dotenv
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from catboost import CatBoostRegressor
from category_encoders import MEstimateEncoder
from sentence_transformers import SentenceTransformer

project_folder = pathlib.Path(os.path.abspath("__file__")).resolve().parent.parent
sys.path.insert(1, str(project_folder))

from classes.yahoo_parser import SP500Parser
from classes.benzinga_parser import BenzingaNewsParser

warnings.filterwarnings("ignore")

nltk.download('stopwords', quiet=True)
nltk.download('punkt', quiet=True)

load_dotenv()

True

## Блок функций

NLP

In [103]:
q = inflect.engine() 
  
def is_digit(string):
    try:
        float(string)
        return True
    except ValueError:
        return False 
        
def convert_num(text): 
    new_str = [] 
  
    for word in text: 
        if is_digit(word): 
            temp = q.number_to_words(word) 
            new_str.append(temp) 
        else: 
            new_str.append(word) 
            
    temp_str = ' '.join(new_str) 
    return temp_str 



def text_to_wordlist(text, remove_stop_words=True, stem_words=False, convert_numbers=True): 
    text = text.replace('\n', '')
    text = text.replace('\r', '')
    text = re.sub(r"[^A-Za-z0-9]", " ", text).lower()
    text = re.sub(r'\d+', '', text)
    

    text = ''.join([c for c in text if c not in punctuation])
    
    if remove_stop_words:
        stop_words = set(stopwords.words("english")) 
        word_tokens = word_tokenize(text) 
        text = [word for word in word_tokens if word not in stop_words] 
    
    if stem_words:
        stemmer = SnowballStemmer('english')
        text = [stemmer.stem(word) for word in text]
        
    if convert_numbers:
        text = convert_num(text)
        
    return text


Для получения признаков из финансовых данных

In [2]:
def apply_features(group):
    group.index = pd.to_datetime(group.index)

    for lag in range(1, 4):
        group[f'lag_{lag}'] = group['Close'].shift(lag)
    group['weekly_return'] = group['Close'].pct_change(5)
    group['5_day_MA'] = group['Close'].rolling(window=5).mean()
    group['20_day_MA'] = group['Close'].rolling(window=20).mean()
    group['5_day_volatility'] = group['Close'].rolling(window=5).std()
    group['momentum'] = group['Close'] - group['Close'].shift(1)
    

    macd = MACD(close=group['Close'], window_slow=26, window_fast=12, window_sign=9)
    group['MACD'] = macd.macd()
    group['MACD_signal'] = macd.macd_signal()
    group['MACD_histogram'] = macd.macd_diff()

    group['week_of_year'] = group.index.isocalendar().week
    group['month'] = group.index.month
    
    return group.dropna()


## Загрузка финансовых показателей **SP500**

In [88]:
yahoo_parser = SP500Parser()
start_date = '2013-01-01'
end_date = datetime.today().strftime('%Y-%m-%d')
stocks = yahoo_parser.download_sp500_data(start_date, end_date)

[*********************100%%**********************]  503 of 503 completed

2 Failed downloads:
['BRK.B']: Exception('%ticker%: No timezone found, symbol may be delisted')
['BF.B']: Exception('%ticker%: No price data found, symbol may be delisted (1d 2013-01-01 -> 2024-03-05)')


In [89]:
df_stocks = stocks.groupby('Symbol').apply(apply_features)
df_stocks.index = df_stocks.index.droplevel()
df_stocks.reset_index(inplace=True)
df_stocks.head()

Price,Date,Symbol,Adj Close,Close,High,Low,Open,Volume,lag_1,lag_2,...,weekly_return,5_day_MA,20_day_MA,5_day_volatility,momentum,MACD,MACD_signal,MACD_histogram,week_of_year,month
0,2013-02-20,A,27.47147,30.214592,30.65093,30.200287,30.643778,5410260.0,30.765379,30.221745,...,-0.053339,31.020028,31.796495,0.878207,-0.550787,-0.056695,0.209343,-0.266038,8,2
1,2013-02-21,A,27.074749,29.778255,30.143063,29.663805,30.143063,4774450.0,30.214592,30.765379,...,-0.069721,30.573677,31.701001,0.81406,-0.436337,-0.177837,0.131907,-0.309744,8,2
2,2013-02-22,A,27.18531,29.899857,30.092991,29.742489,29.921316,4690150.0,29.778255,30.214592,...,-0.06236,30.175965,31.598712,0.382559,0.121601,-0.261022,0.053321,-0.314343,8,2
3,2013-02-25,A,26.853622,29.535049,30.200287,29.535049,30.107296,5064255.0,29.899857,29.778255,...,-0.022722,30.038626,31.455651,0.474282,-0.364807,-0.352323,-0.027808,-0.324515,9,2
4,2013-02-26,A,26.64551,29.306152,29.535049,28.748213,29.055794,8647888.0,29.535049,29.899857,...,-0.047431,29.746781,31.305079,0.347222,-0.228897,-0.438099,-0.109866,-0.328233,9,2


## Загрузка новостей по **SP500**

In [6]:
benzinga_api_key = os.getenv('BENZINGA_API_KEY')
tickers = yahoo_parser.get_sp500_tickers()

In [7]:
benzinga_parser = BenzingaNewsParser(benzinga_api_key, tickers, start_date, end_date)
benzinga_parser.run_concurrent()
df_news = benzinga_parser.get_df_news()
df_news.head()

Unnamed: 0,id,author,created,updated,title,teaser,body,url,image,channels,stocks,tags
0,36497786,Benzinga Insights,"Thu, 04 Jan 2024 15:30:51 -0400",2024-01-04,$100 Invested In This Stock 20 Years Ago Would...,,Arch Capital Group (NASDAQ:ACGL) has outperfor...,https://www.benzinga.com/news/24/01/36497786/1...,"[{'size': 'thumb', 'url': 'https://cdn.benzing...","[{'name': 'News'}, {'name': 'Trading Ideas'}]",ACGL,[{'name': 'BZI-POD'}]
1,36574531,Benzinga Insights,"Wed, 10 Jan 2024 09:00:18 -0400",2024-01-10,Looking Into Arch Capital Group's Recent Short...,,Arch Capital Group's (NYSE:ACGL) short percent...,https://www.benzinga.com/short-sellers/24/01/3...,"[{'size': 'thumb', 'url': 'https://cdn.benzing...","[{'name': 'Short Sellers'}, {'name': 'Short Id...",ACGL,[{'name': 'BZI-SHORTHIST'}]
2,36577127,Benzinga Newsdesk,"Wed, 10 Jan 2024 10:36:12 -0400",2024-01-10,"UBS Maintains Buy on Arch Capital Group, Lower...",UBS analyst Brian Meredith maintains Arch C...,UBS analyst Brian Meredith maintains Arch C...,https://www.benzinga.com/news/24/01/36577127/u...,[],"[{'name': 'News'}, {'name': 'Price Target'}, {...",ACGL,[]
3,36582924,Benzinga Insights,"Wed, 10 Jan 2024 16:00:13 -0400",2024-01-10,A Closer Look at 9 Analyst Recommendations For...,,Ratings for Arch Capital Group (NASDAQ:ACGL) w...,https://www.benzinga.com/analyst-ratings/24/01...,"[{'size': 'thumb', 'url': 'https://cdn.benzing...",[{'name': 'Analyst Ratings'}],ACGL,[{'name': 'BZI-AAR'}]
4,36673096,Avi Kapoor,"Thu, 18 Jan 2024 10:36:18 -0400",2024-01-18,"AbbVie, Arch Capital, Gilead Sciences And A Te...","On CNBC’s ""Halftime Report Final Trades,"" Rob ...","On CNBC’s ""Halftime Report Final Trades,"" Rob ...",https://www.benzinga.com/trading-ideas/long-id...,"[{'size': 'large', 'url': 'https://cdn.benzing...","[{'name': 'Long Ideas'}, {'name': 'News'}, {'n...",ABBV,"[{'name': 'Expert Ideas'}, {'name': 'Final Tra..."


In [90]:
df_news = pd.read_csv('datasets/news_sp_500.csv').iloc[:,2:][['updated', 'stocks', 'body']].dropna()

df_news['stocks'] = df_news['stocks'].apply(eval).apply(lambda x: [entry['name'] for entry in x])
df_news = df_news.explode('stocks')
df_news['updated'] = pd.to_datetime(df_news['updated']).dt.tz_localize(None)
df_news['updated'] = df_news['updated'].dt.date
df_news['updated'] = pd.to_datetime(df_news['updated'])

df_news = df_news.groupby([df_news['updated'], 'stocks'])['body'].apply(lambda x: '\n'.join(x)).reset_index()

df_news.head()

Unnamed: 0,updated,stocks,body
0,2013-01-02,AAPL,Futures Up Strong on Fiscal Cliff Deal\nU.S. e...
1,2013-01-02,AMAT,Shares of the Market Vectors Semiconductor ETF...
2,2013-01-02,ARX,Shares of the Market Vectors Semiconductor ETF...
3,2013-01-02,AVGO,Shares of the Market Vectors Semiconductor ETF...
4,2013-01-02,BA,Boeing (NYSE: BA) announced today an\r\norder ...


## Объединение финансовых данных и новостей

In [91]:
df_merged = df_news.merge(df_stocks,
                          how='inner',
                          left_on=['updated', 'stocks'],
                          right_on=['Date', 'Symbol'])
df_merged.columns = [i.lower() for i in df_merged]
df_merged.head()

Unnamed: 0,updated,stocks,body,date,symbol,adj close,close,high,low,open,...,weekly_return,5_day_ma,20_day_ma,5_day_volatility,momentum,macd,macd_signal,macd_histogram,week_of_year,month
0,2013-02-20,AAPL,Shares of Apple (NASDAQ: AAPL) are trading dow...,2013-02-20,AAPL,13.771607,16.030357,16.346071,16.02857,16.346071,...,-0.040714,16.447143,16.481375,0.262181,-0.397856,-0.427553,-0.48938,0.061827,8,2
1,2013-02-20,AMZN,Shares of Apple (NASDAQ: AAPL) are trading dow...,2013-02-20,AMZN,13.3205,13.3205,13.715,13.3185,13.51,...,0.029803,13.3996,13.330875,0.105347,-0.167,0.012436,-0.008985,0.021421,8,2
2,2013-02-20,BA,Boeing (NYSE: BA) and its negotiations team ar...,2013-02-20,BA,63.067375,74.779999,76.25,74.75,75.639999,...,-0.015923,74.834,75.1515,0.147749,0.129997,-0.21109,-0.116235,-0.094855,8,2
3,2013-02-20,CI,\r\n\tAlthough Congress is in recess for the w...,2013-02-20,CI,56.092812,59.27,60.580002,59.220001,60.57,...,-0.043724,60.824001,59.8435,1.020284,-1.16,1.135656,1.35866,-0.223004,8,2
4,2013-02-20,COST,"With the fiscal cliff having been averted, “bu...",2013-02-20,COST,80.679817,101.080002,102.629997,101.050003,102.18,...,-0.006682,101.759999,102.346,0.428543,-0.82,-0.034452,0.111248,-0.145701,8,2


## Только финансовые признаки без новостей и заголовков

In [171]:
X_train, X_test, y_train, y_test = train_test_split(
    df_stocks.drop(['Date', 'weekly_return'], axis=1),
    df_stocks["weekly_return"],
    test_size=0.3,
    random_state=42
)

cat_features = ['Symbol', 'week_of_year', 'month']

In [172]:
preprocessor = ColumnTransformer(
    transformers=[
        ('category', MEstimateEncoder(m=5.0), cat_features),
    ],
    remainder='passthrough'
)

pipeline = Pipeline([
    ('preprocessor', preprocessor),
])

X_train_processed = pipeline.fit_transform(X_train,  y_train)
X_test_processed = pipeline.transform(X_test)

In [173]:
params = {'n_estimators': 277, 'max_depth': 9, 'learning_rate': 0.030116927907760483, 'subsample': 0.6}

In [174]:
start_learn = timeit.default_timer()

model = CatBoostRegressor(**params)
model.fit(X_train_processed, y_train)

end_learn = timeit.default_timer()

y_pred = model.predict(X_test_processed)

0:	learn: 0.0430627	total: 64.6ms	remaining: 17.8s
1:	learn: 0.0424793	total: 94.7ms	remaining: 13s
2:	learn: 0.0419141	total: 126ms	remaining: 11.5s
3:	learn: 0.0413718	total: 157ms	remaining: 10.7s
4:	learn: 0.0408522	total: 184ms	remaining: 9.99s
5:	learn: 0.0403493	total: 212ms	remaining: 9.6s
6:	learn: 0.0398664	total: 239ms	remaining: 9.23s
7:	learn: 0.0394078	total: 265ms	remaining: 8.91s
8:	learn: 0.0389669	total: 298ms	remaining: 8.87s
9:	learn: 0.0385402	total: 331ms	remaining: 8.85s
10:	learn: 0.0381322	total: 364ms	remaining: 8.79s
11:	learn: 0.0377374	total: 392ms	remaining: 8.65s
12:	learn: 0.0373570	total: 423ms	remaining: 8.59s
13:	learn: 0.0369909	total: 453ms	remaining: 8.51s
14:	learn: 0.0366443	total: 481ms	remaining: 8.41s
15:	learn: 0.0363015	total: 510ms	remaining: 8.32s
16:	learn: 0.0359818	total: 537ms	remaining: 8.22s
17:	learn: 0.0356709	total: 564ms	remaining: 8.11s
18:	learn: 0.0353709	total: 591ms	remaining: 8.02s
19:	learn: 0.0350820	total: 624ms	remainin

In [175]:
scores = {
    'mean_squared_error': mean_squared_error(y_test, y_pred),
    'mean_absolute_error': mean_absolute_error(y_test, y_pred),
    'r2': r2_score(y_test, y_pred),
    'train_size': X_train_processed.shape[0],
    'full_train_min': (end_learn - start_learn) // 60,
    'full_train_sec': (end_learn - start_learn) % 60,
}
scores

{'mean_squared_error': 0.0006397531684000877,
 'mean_absolute_error': 0.016941635846497695,
 'r2': 0.6651099361243257,
 'train_size': 946211,
 'full_train_min': 0.0,
 'full_train_sec': 10.017572625074536}

In [187]:
model.save_model('../models/financial_data_only.cbm')

with open('../models/metrics_financial_data_only.json', 'w') as json_file:
    json.dump(scores, json_file)

# NLP обработка новостей Lemmatization + TF-IDF

In [10]:
start_lemmatization = timeit.default_timer()
df_merged['body_preprocessed'] = df_merged['body'].apply(lambda x: text_to_wordlist(x, stem_words=True))
end_lemmatization = timeit.default_timer()

df_merged.drop("body", axis=1, inplace=True)

In [11]:
X_train, X_test, y_train, y_test = train_test_split(
    df_merged.drop(['updated', 'stocks', 'date', 'weekly_return'], axis=1),
    df_merged["weekly_return"],
    test_size=0.3,
    random_state=42
)

cat_features = ['symbol', 'week_of_year', 'month']

In [12]:
preprocessor = ColumnTransformer(
    transformers=[
        ('text', TfidfVectorizer(), 'body_preprocessed'),
        ('category', MEstimateEncoder(m=5.0), cat_features),
    ],
    remainder='passthrough'
)

pipeline = Pipeline([
    ('preprocessor', preprocessor),
])

start_tfidf = timeit.default_timer()
X_train_processed = pipeline.fit_transform(X_train,  y_train)
end_tfidf = timeit.default_timer()

X_test_processed = pipeline.transform(X_test)

## Обучение градиентного бустинга

In [13]:
params = {'n_estimators': 277, 'max_depth': 9, 'learning_rate': 0.030116927907760483, 'subsample': 0.6}

In [14]:
start_learn = timeit.default_timer()

model = CatBoostRegressor(**params)
model.fit(X_train_processed, y_train)

end_learn = timeit.default_timer()

y_pred = model.predict(X_test_processed)

0:	learn: 0.0611508	total: 11.2s	remaining: 51m 24s
1:	learn: 0.0601287	total: 22.2s	remaining: 50m 46s
2:	learn: 0.0591953	total: 32.7s	remaining: 49m 46s
3:	learn: 0.0582912	total: 43.5s	remaining: 49m 26s
4:	learn: 0.0573870	total: 55.2s	remaining: 50m 1s
5:	learn: 0.0564901	total: 1m 6s	remaining: 49m 45s
6:	learn: 0.0556333	total: 1m 17s	remaining: 49m 35s
7:	learn: 0.0548419	total: 1m 28s	remaining: 49m 25s
8:	learn: 0.0541131	total: 1m 39s	remaining: 49m 10s
9:	learn: 0.0533430	total: 1m 50s	remaining: 48m 58s
10:	learn: 0.0526442	total: 2m 1s	remaining: 48m 46s
11:	learn: 0.0519452	total: 2m 11s	remaining: 48m 28s
12:	learn: 0.0513158	total: 2m 22s	remaining: 48m 6s
13:	learn: 0.0507246	total: 2m 32s	remaining: 47m 50s
14:	learn: 0.0500696	total: 2m 43s	remaining: 47m 38s
15:	learn: 0.0494702	total: 2m 54s	remaining: 47m 22s
16:	learn: 0.0488534	total: 3m 4s	remaining: 47m 2s
17:	learn: 0.0482659	total: 3m 15s	remaining: 46m 48s
18:	learn: 0.0477099	total: 3m 26s	remaining: 46m

ValueError: If using all scalar values, you must pass an index

In [73]:
scores = {
    'mean_squared_error': mean_squared_error(y_test, y_pred),
    'mean_absolute_error': mean_absolute_error(y_test, y_pred),
    'r2': r2_score(y_test, y_pred),
    'train_size': X_train_processed.shape[0],
    'full_train_min': (end_learn - start_learn) // 60,
    'full_train_sec': (end_learn - start_learn) % 60,
    'lemmatization_news_min': (end_lemmatization - start_lemmatization) // 60,
    'lemmatization_news_sec': (end_lemmatization - start_lemmatization) % 60,
    'tf_idf_news_min': (end_tfidf - start_tfidf) // 60,
    'tf_idf_news_sec': (end_tfidf - start_tfidf) % 60,
}
scores

{'mean_squared_error': 0.0008929964909068492,
 'mean_absolute_error': 0.01914771398272385,
 'r2': 0.7727123286756394,
 'train_size': 60358,
 'full_train_min': 43.0,
 'full_train_sec': 19.204212832963094,
 'lemmatization_news_min': 15.0,
 'lemmatization_news_sec': 53.56387616600841,
 'tf_idf_news_min': 0.0,
 'tf_idf_news_sec': 22.487894125049934}

In [75]:
model.save_model('../models/nlp_news_financial_data.cbm')

with open('../models/metrics_nlp_news_financial_data.json', 'w') as json_file:
    json.dump(scores, json_file)

# Эмбеддинги заголовков новостей

In [189]:
from sentence_transformers import SentenceTransformer

df_news = pd.read_csv('datasets/news_sp_500.csv').iloc[:,2:][['updated', 'body', 'stocks', 'title']].dropna()
df_news.reset_index(inplace=True)
df_news.dropna(inplace=True)

model = SentenceTransformer("all-MiniLM-L6-v2")

start_embeddings = timeit.default_timer()
sentence_embeddings = model.encode(df_news['title'])
end_embeddings = timeit.default_timer()

In [190]:
df_news['title_embeddings'] = sentence_embeddings.tolist()

df_news.drop('title', axis=1, inplace=True)

In [191]:
df_news['stocks'] = df_news['stocks'].apply(eval).apply(lambda x: [entry['name'] for entry in x])

df_embedded_news = df_news.join(pd.DataFrame(df_news['title_embeddings'].tolist(), columns=[f'title_embedding_{i}' for i in range(384)]))

df_embedded_news = df_embedded_news.explode('stocks')

df_embedded_news['updated'] = pd.to_datetime(df_embedded_news['updated']).dt.date

df_embedded_news.head()

Unnamed: 0,index,updated,body,stocks,title_embeddings,title_embedding_0,title_embedding_1,title_embedding_2,title_embedding_3,title_embedding_4,...,title_embedding_374,title_embedding_375,title_embedding_376,title_embedding_377,title_embedding_378,title_embedding_379,title_embedding_380,title_embedding_381,title_embedding_382,title_embedding_383
0,1,2013-01-02,Futures Up Strong on Fiscal Cliff Deal\nU.S. e...,AAPL,"[-0.09893397986888885, 0.03140946477651596, -0...",-0.098934,0.031409,-0.01048,0.008883,0.051896,...,0.033901,0.102746,-0.10758,0.019103,-0.011611,-0.11292,0.050958,-0.038548,-0.1088,0.078721
0,1,2013-01-02,Futures Up Strong on Fiscal Cliff Deal\nU.S. e...,BROAD,"[-0.09893397986888885, 0.03140946477651596, -0...",-0.098934,0.031409,-0.01048,0.008883,0.051896,...,0.033901,0.102746,-0.10758,0.019103,-0.011611,-0.11292,0.050958,-0.038548,-0.1088,0.078721
0,1,2013-01-02,Futures Up Strong on Fiscal Cliff Deal\nU.S. e...,BZSUM,"[-0.09893397986888885, 0.03140946477651596, -0...",-0.098934,0.031409,-0.01048,0.008883,0.051896,...,0.033901,0.102746,-0.10758,0.019103,-0.011611,-0.11292,0.050958,-0.038548,-0.1088,0.078721
0,1,2013-01-02,Futures Up Strong on Fiscal Cliff Deal\nU.S. e...,CAR,"[-0.09893397986888885, 0.03140946477651596, -0...",-0.098934,0.031409,-0.01048,0.008883,0.051896,...,0.033901,0.102746,-0.10758,0.019103,-0.011611,-0.11292,0.050958,-0.038548,-0.1088,0.078721
0,1,2013-01-02,Futures Up Strong on Fiscal Cliff Deal\nU.S. e...,EARLY,"[-0.09893397986888885, 0.03140946477651596, -0...",-0.098934,0.031409,-0.01048,0.008883,0.051896,...,0.033901,0.102746,-0.10758,0.019103,-0.011611,-0.11292,0.050958,-0.038548,-0.1088,0.078721


In [192]:
df_embedded_news = df_embedded_news.groupby(['updated', 'stocks']).agg({
    'body': '\n'.join,
    **{f'title_embedding_{i}': 'sum' for i in range(384)}
}).reset_index()

df_embedded_news.head()

Unnamed: 0,updated,stocks,body,title_embedding_0,title_embedding_1,title_embedding_2,title_embedding_3,title_embedding_4,title_embedding_5,title_embedding_6,...,title_embedding_374,title_embedding_375,title_embedding_376,title_embedding_377,title_embedding_378,title_embedding_379,title_embedding_380,title_embedding_381,title_embedding_382,title_embedding_383
0,2013-01-02,AAPL,Futures Up Strong on Fiscal Cliff Deal\nU.S. e...,-0.019994,0.018245,0.008015,-0.027542,-0.085193,0.019731,0.050116,...,-0.000599,0.094991,-0.087186,-0.002892,0.046942,-0.117861,0.053491,-0.084149,-0.139026,0.103073
1,2013-01-02,AMAT,Shares of the Market Vectors Semiconductor ETF...,-0.096256,-0.053875,0.022681,0.053231,-0.007122,-0.068544,0.010291,...,-0.042933,0.015831,-0.110235,0.105988,0.018843,-0.012126,0.102385,-0.09862,0.032747,0.046961
2,2013-01-02,ARX,Shares of the Market Vectors Semiconductor ETF...,-0.096256,-0.053875,0.022681,0.053231,-0.007122,-0.068544,0.010291,...,-0.042933,0.015831,-0.110235,0.105988,0.018843,-0.012126,0.102385,-0.09862,0.032747,0.046961
3,2013-01-02,AVGO,Shares of the Market Vectors Semiconductor ETF...,-0.096256,-0.053875,0.022681,0.053231,-0.007122,-0.068544,0.010291,...,-0.042933,0.015831,-0.110235,0.105988,0.018843,-0.012126,0.102385,-0.09862,0.032747,0.046961
4,2013-01-02,BA,Boeing (NYSE: BA) announced today an\r\norder ...,0.001937,-0.030179,-0.002562,-0.006971,-0.032365,0.068107,-0.05838,...,-0.010652,-0.031298,-0.058106,0.080881,-0.007483,0.070982,0.086416,-0.158374,0.000312,0.034754


In [193]:
df_embedded_news['updated'] = pd.to_datetime(df_embedded_news['updated'])

In [194]:
df_merged = df_embedded_news.merge(df_stocks,
                                how='inner',
                                left_on=['updated', 'stocks'],
                                right_on=['Date', 'Symbol'])

df_merged.columns = df_merged.columns.str.lower()

In [195]:
df_merged.head()

Unnamed: 0,updated,stocks,body,title_embedding_0,title_embedding_1,title_embedding_2,title_embedding_3,title_embedding_4,title_embedding_5,title_embedding_6,...,weekly_return,5_day_ma,20_day_ma,5_day_volatility,momentum,macd,macd_signal,macd_histogram,week_of_year,month
0,2013-02-20,AAPL,Shares of Apple (NASDAQ: AAPL) are trading dow...,-0.019157,-0.238011,0.092994,0.007629,-0.055334,-0.134213,-0.078134,...,-0.040714,16.447143,16.481375,0.262181,-0.397856,-0.427553,-0.48938,0.061827,8,2
1,2013-02-20,AMZN,Shares of Apple (NASDAQ: AAPL) are trading dow...,0.016204,-0.113836,0.114636,0.014135,0.02712,-0.05671,-0.136462,...,0.029803,13.3996,13.330875,0.105347,-0.167,0.012436,-0.008985,0.021421,8,2
2,2013-02-20,BA,Boeing (NYSE: BA) and its negotiations team ar...,0.055105,-0.014958,0.022888,0.063149,-0.082238,0.110724,-0.082824,...,-0.015923,74.834,75.1515,0.147749,0.129997,-0.21109,-0.116235,-0.094855,8,2
3,2013-02-20,CI,\r\n\tAlthough Congress is in recess for the w...,0.002382,0.040935,0.010315,0.002876,0.034303,0.064144,0.023652,...,-0.043724,60.824001,59.8435,1.020284,-1.16,1.135656,1.35866,-0.223004,8,2
4,2013-02-20,COST,"With the fiscal cliff having been averted, “bu...",0.017649,0.013712,0.031954,-0.036274,0.003851,0.005587,0.066122,...,-0.006682,101.759999,102.346,0.428543,-0.82,-0.034452,0.111248,-0.145701,8,2


In [196]:
X_train, X_test, y_train, y_test = train_test_split(
    df_merged.drop(['updated', 'stocks', 'date', 'weekly_return', 'body'], axis=1),
    df_merged["weekly_return"],
    test_size=0.3,
    random_state=42
)

cat_features = ['symbol', 'week_of_year', 'month']

In [197]:
preprocessor = ColumnTransformer(
    transformers=[
        ('category', MEstimateEncoder(m=5.0), cat_features),
    ],
    remainder='passthrough'
)

pipeline = Pipeline([
    ('preprocessor', preprocessor),
])

X_train_processed = pipeline.fit_transform(X_train,  y_train)
X_test_processed = pipeline.transform(X_test)

## Обучение градиентного бустинга

In [203]:
start_learn = timeit.default_timer()

model = CatBoostRegressor()
model.fit(X_train_processed, y_train)

end_learn = timeit.default_timer()

y_pred = model.predict(X_test_processed)


Learning rate set to 0.078259
0:	learn: 0.0597739	total: 33.8ms	remaining: 33.7s
1:	learn: 0.0576176	total: 58.7ms	remaining: 29.3s
2:	learn: 0.0557050	total: 77.8ms	remaining: 25.8s
3:	learn: 0.0539697	total: 100ms	remaining: 25s
4:	learn: 0.0523586	total: 119ms	remaining: 23.7s
5:	learn: 0.0509467	total: 138ms	remaining: 22.8s
6:	learn: 0.0495178	total: 160ms	remaining: 22.7s
7:	learn: 0.0481824	total: 180ms	remaining: 22.3s
8:	learn: 0.0471661	total: 199ms	remaining: 21.9s
9:	learn: 0.0460849	total: 224ms	remaining: 22.1s
10:	learn: 0.0450736	total: 243ms	remaining: 21.9s
11:	learn: 0.0441490	total: 265ms	remaining: 21.8s
12:	learn: 0.0432806	total: 285ms	remaining: 21.6s
13:	learn: 0.0424819	total: 305ms	remaining: 21.5s
14:	learn: 0.0417815	total: 327ms	remaining: 21.5s
15:	learn: 0.0411320	total: 346ms	remaining: 21.3s
16:	learn: 0.0405294	total: 372ms	remaining: 21.5s
17:	learn: 0.0399776	total: 392ms	remaining: 21.4s
18:	learn: 0.0394275	total: 415ms	remaining: 21.4s
19:	learn:

In [204]:
scores = {
    'mean_squared_error': mean_squared_error(y_test, y_pred),
    'mean_absolute_error': mean_absolute_error(y_test, y_pred),
    'r2': r2_score(y_test, y_pred),
    'train_size': X_train_processed.shape[0],
    'full_train_min': (end_learn - start_learn) // 60,
    'full_train_sec': (end_learn - start_learn) % 60,
    'titel_embedding_min': (end_embeddings - start_embeddings) // 60,
    'titel_embedding_sec': (end_embeddings - start_embeddings) % 60,
}
scores

{'mean_squared_error': 0.000688798721103898,
 'mean_absolute_error': 0.01710555349111017,
 'r2': 0.8246852491302417,
 'train_size': 60358,
 'full_train_min': 0.0,
 'full_train_sec': 20.25413379096426,
 'titel_embedding_min': 0.0,
 'titel_embedding_sec': 26.955965125001967}

In [205]:
model.save_model('../models/titles_embedding_financial_data.cbm')

with open('../models/metrics_titles_embedding_financial_data.json', 'w') as json_file:
    json.dump(scores, json_file)

# Эмбеддинги заголовков новостей + TF-IDF + финансовые признаки

In [98]:
df_embedded_news = df_embedded_news.groupby(['updated', 'stocks']).agg({
    'body': '\n'.join,
    **{f'title_embedding_{i}': 'sum' for i in range(384)}
}).reset_index()

df_embedded_news.head()

Unnamed: 0,updated,stocks,body,title_embedding_0,title_embedding_1,title_embedding_2,title_embedding_3,title_embedding_4,title_embedding_5,title_embedding_6,...,title_embedding_374,title_embedding_375,title_embedding_376,title_embedding_377,title_embedding_378,title_embedding_379,title_embedding_380,title_embedding_381,title_embedding_382,title_embedding_383
0,2013-01-02,AAPL,Futures Up Strong on Fiscal Cliff Deal\nU.S. e...,-0.019994,0.018245,0.008015,-0.027542,-0.085193,0.019731,0.050116,...,-0.000599,0.094991,-0.087186,-0.002892,0.046942,-0.117861,0.053491,-0.084149,-0.139026,0.103073
1,2013-01-02,AMAT,Shares of the Market Vectors Semiconductor ETF...,-0.096256,-0.053875,0.022681,0.053231,-0.007122,-0.068544,0.010291,...,-0.042933,0.015831,-0.110235,0.105988,0.018843,-0.012126,0.102385,-0.09862,0.032747,0.046961
2,2013-01-02,ARX,Shares of the Market Vectors Semiconductor ETF...,-0.096256,-0.053875,0.022681,0.053231,-0.007122,-0.068544,0.010291,...,-0.042933,0.015831,-0.110235,0.105988,0.018843,-0.012126,0.102385,-0.09862,0.032747,0.046961
3,2013-01-02,AVGO,Shares of the Market Vectors Semiconductor ETF...,-0.096256,-0.053875,0.022681,0.053231,-0.007122,-0.068544,0.010291,...,-0.042933,0.015831,-0.110235,0.105988,0.018843,-0.012126,0.102385,-0.09862,0.032747,0.046961
4,2013-01-02,BA,Boeing (NYSE: BA) announced today an\r\norder ...,0.001937,-0.030179,-0.002562,-0.006971,-0.032365,0.068107,-0.05838,...,-0.010652,-0.031298,-0.058106,0.080881,-0.007483,0.070982,0.086416,-0.158374,0.000312,0.034754


In [99]:
df_embedded_news['updated'] = pd.to_datetime(df_embedded_news['updated'])

In [100]:
df_merged = df_embedded_news.merge(df_stocks,
                                how='inner',
                                left_on=['updated', 'stocks'],
                                right_on=['Date', 'Symbol'])

df_merged.columns = df_merged.columns.str.lower()

In [101]:
df_merged.head()

Unnamed: 0,updated,stocks,body,title_embedding_0,title_embedding_1,title_embedding_2,title_embedding_3,title_embedding_4,title_embedding_5,title_embedding_6,...,weekly_return,5_day_ma,20_day_ma,5_day_volatility,momentum,macd,macd_signal,macd_histogram,week_of_year,month
0,2013-02-20,AAPL,Shares of Apple (NASDAQ: AAPL) are trading dow...,-0.019157,-0.238011,0.092994,0.007629,-0.055334,-0.134213,-0.078134,...,-0.040714,16.447143,16.481375,0.262181,-0.397856,-0.427553,-0.48938,0.061827,8,2
1,2013-02-20,AMZN,Shares of Apple (NASDAQ: AAPL) are trading dow...,0.016204,-0.113836,0.114636,0.014135,0.02712,-0.05671,-0.136462,...,0.029803,13.3996,13.330875,0.105347,-0.167,0.012436,-0.008985,0.021421,8,2
2,2013-02-20,BA,Boeing (NYSE: BA) and its negotiations team ar...,0.055105,-0.014958,0.022888,0.063149,-0.082238,0.110724,-0.082824,...,-0.015923,74.834,75.1515,0.147749,0.129997,-0.21109,-0.116235,-0.094855,8,2
3,2013-02-20,CI,\r\n\tAlthough Congress is in recess for the w...,0.002382,0.040935,0.010315,0.002876,0.034303,0.064144,0.023652,...,-0.043724,60.824001,59.8435,1.020284,-1.16,1.135656,1.35866,-0.223004,8,2
4,2013-02-20,COST,"With the fiscal cliff having been averted, “bu...",0.017649,0.013712,0.031954,-0.036274,0.003851,0.005587,0.066122,...,-0.006682,101.759999,102.346,0.428543,-0.82,-0.034452,0.111248,-0.145701,8,2


### TF-IDF

In [104]:
start_lemmatization = timeit.default_timer()
df_merged['body_preprocessed'] = df_merged['body'].apply(lambda x: text_to_wordlist(x, stem_words=True))
end_lemmatization = timeit.default_timer()

df_merged.drop("body", axis=1, inplace=True)

In [105]:
X_train, X_test, y_train, y_test = train_test_split(
    df_merged.drop(['updated', 'stocks', 'date', 'weekly_return'], axis=1),
    df_merged["weekly_return"],
    test_size=0.3,
    random_state=42
)

cat_features = ['symbol', 'week_of_year', 'month']

In [106]:
preprocessor = ColumnTransformer(
    transformers=[
        ('text', TfidfVectorizer(), 'body_preprocessed'),
        ('category', MEstimateEncoder(m=5.0), cat_features),
    ],
    remainder='passthrough'
)

pipeline = Pipeline([
    ('preprocessor', preprocessor),
])

start_tfidf = timeit.default_timer()
X_train_processed = pipeline.fit_transform(X_train,  y_train)
end_tfidf = timeit.default_timer()

X_test_processed = pipeline.transform(X_test)

## Обучение градиентного бустинга

In [107]:
params = {'n_estimators': 277, 'max_depth': 9, 'learning_rate': 0.030116927907760483, 'subsample': 0.6}

In [108]:
start_learn = timeit.default_timer()

model = CatBoostRegressor(**params)
model.fit(X_train_processed, y_train)

end_learn = timeit.default_timer()

y_pred = model.predict(X_test_processed)

0:	learn: 0.0611412	total: 9.85s	remaining: 45m 17s
1:	learn: 0.0601603	total: 19.6s	remaining: 44m 57s
2:	learn: 0.0591671	total: 29.5s	remaining: 44m 51s
3:	learn: 0.0582234	total: 39.2s	remaining: 44m 37s
4:	learn: 0.0573557	total: 49s	remaining: 44m 22s
5:	learn: 0.0564876	total: 58.6s	remaining: 44m 8s
6:	learn: 0.0556650	total: 1m 8s	remaining: 43m 54s
7:	learn: 0.0548564	total: 1m 17s	remaining: 43m 39s
8:	learn: 0.0541052	total: 1m 27s	remaining: 43m 29s
9:	learn: 0.0533463	total: 1m 37s	remaining: 43m 19s
10:	learn: 0.0526185	total: 1m 47s	remaining: 43m 8s
11:	learn: 0.0519064	total: 1m 57s	remaining: 43m 4s
12:	learn: 0.0512625	total: 2m 7s	remaining: 43m 3s
13:	learn: 0.0506011	total: 2m 17s	remaining: 42m 56s
14:	learn: 0.0499544	total: 2m 27s	remaining: 42m 48s
15:	learn: 0.0493853	total: 2m 37s	remaining: 42m 45s
16:	learn: 0.0487741	total: 2m 47s	remaining: 42m 40s
17:	learn: 0.0482175	total: 2m 57s	remaining: 42m 35s
18:	learn: 0.0476383	total: 3m 7s	remaining: 42m 27s

In [111]:
scores = {
    'mean_squared_error': mean_squared_error(y_test, y_pred),
    'mean_absolute_error': mean_absolute_error(y_test, y_pred),
    'r2': r2_score(y_test, y_pred),
    'train_size': X_train_processed.shape[0],
    'full_train_min': (end_learn - start_learn) // 60,
    'full_train_sec': (end_learn - start_learn) % 60,
    'lemmatization_news_min': (end_lemmatization - start_lemmatization) // 60,
    'lemmatization_news_sec': (end_lemmatization - start_lemmatization) % 60,
    'tf_idf_news_min': (end_tfidf - start_tfidf) // 60,
    'tf_idf_news_sec': (end_tfidf - start_tfidf) % 60,
    'title_embedding_min': (end_embeddings - start_embeddings) // 60,
    'title_embedding_min': (end_embeddings - start_embeddings) % 60,
}
scores

{'mean_squared_error': 0.0008828281941214841,
 'mean_absolute_error': 0.019027973239809894,
 'r2': 0.7753003886749947,
 'train_size': 60358,
 'full_train_min': 45.0,
 'full_train_sec': 42.973428290802985,
 'lemmatization_news_min': 15.0,
 'lemmatization_news_sec': 36.70095237507485,
 'tf_idf_news_min': 0.0,
 'tf_idf_news_sec': 21.85177983297035,
 'title_embedding_min': 33.74547725007869}

In [112]:
model.save_model('../models/news_tfidf_title_embedding_financial_data.cbm')

with open('../models/metrics_news_tfidf_title_embedding_financial_data.json', 'w') as json_file:
    json.dump(scores, json_file)

# Сравнение всех моделей:

In [220]:
models_directory = '../models'

models_metrics = os.listdir(models_directory)

json_files = [model_metrics for model_metrics in models_metrics if model_metrics.endswith('.json')]

metrics_list = []

for file in json_files:
    with open(os.path.join(models_directory, file), 'r') as f:
        data = json.load(f)
        model_name = file.split('.')[0]
        df = pd.DataFrame.from_dict(data, orient='index', columns=['value'])
        df['model'] = model_name
        metrics_list.append(df)
        
result_df = pd.concat(metrics_list)
result_df.reset_index(inplace=True)
result_df.columns = ['metric', 'value', 'model']

In [221]:
result_df = result_df.pivot(columns='metric', index='model', values='value')[['mean_absolute_error',
                                                                              'mean_squared_error',
                                                                              'r2', 
                                                                              'full_train_min',
                                                                              'full_train_sec',
                                                                              'train_size',
                                                                             ]].sort_values('mean_absolute_error')
result_df['train_size'] = result_df['train_size'].astype('int')

result_df

metric,mean_absolute_error,mean_squared_error,r2,full_train_min,full_train_sec,train_size
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
metrics_financial_data_only,0.016942,0.00064,0.66511,0.0,10.017573,946211
metrics_titles_embedding_financial_data,0.017106,0.000689,0.824685,0.0,20.254134,60358
metrics_news_tfidf_title_embedding_financial_data,0.019028,0.000883,0.7753,45.0,42.973428,60358
metrics_nlp_news_financial_data,0.019148,0.000893,0.772712,43.0,19.204213,60358
