In [1]:
import yfinance as yf
import pandas as pd
import numpy as np
from transformers import pipeline
from sklearn.preprocessing import MinMaxScaler
from keras.models import Sequential
from keras.layers import LSTM, Dense, Dropout
from sklearn.model_selection import train_test_split




In [2]:
# Load financial news dataset
news_data = pd.read_csv('financialNews.csv')  # Replace with your actual dataset path

In [3]:
# Extract unique tickers
tickers = news_data['ticker'].unique()

In [4]:
# Define function to get historical data for all tickers
def get_historical_data(tickers, start="2022-01-01", end="2023-12-31"):
    historical_data = {}
    for ticker in tickers:
        try:
            data = yf.download(ticker, start=start, end=end)
            data['Ticker'] = ticker
            historical_data[ticker] = data
        except Exception as e:
            print(f"Error fetching data for {ticker}: {e}")
    return historical_data

In [5]:
# Fetch Yahoo Finance historical data
historical_data = get_historical_data(tickers)

[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%********

In [6]:
# Combine all ticker data into a single DataFrame
historical_df = pd.concat(historical_data.values())
historical_df.reset_index(inplace=True)

In [7]:
# Preprocess data for LSTM
def preprocess_data(data, target_col):
    scaler = MinMaxScaler()
    scaled_data = scaler.fit_transform(data[[target_col]].values)
    x, y = [], []
    for i in range(60, len(scaled_data)):
        x.append(scaled_data[i-60:i, 0])
        y.append(scaled_data[i, 0])
    return np.array(x), np.array(y), scaler

In [8]:
# Apply FinBERT for sentiment analysis
finbert = pipeline("sentiment-analysis", model="ProsusAI/finbert")

def calculate_sentiment_scores(news_df):
    max_len = 512  # FinBERT max token limit
    news_df['sentiment'] = news_df['description'].apply(lambda x: finbert(x[:max_len])[0]['label'])
    news_df['sentiment_score'] = news_df['sentiment'].map({"POSITIVE": 1, "NEGATIVE": -1, "NEUTRAL": 0})
    return news_df


news_data = calculate_sentiment_scores(news_data)




All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at ProsusAI/finbert and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Unexpected exception formatting exception. Falling back to standard exception


Traceback (most recent call last):
  File "c:\Users\astev\AppData\Local\Programs\Python\Python310\lib\site-packages\IPython\core\interactiveshell.py", line 3553, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "C:\Users\astev\AppData\Local\Temp\ipykernel_24108\3013397276.py", line 11, in <module>
    news_data = calculate_sentiment_scores(news_data)
  File "C:\Users\astev\AppData\Local\Temp\ipykernel_24108\3013397276.py", line 6, in calculate_sentiment_scores
    news_df['sentiment'] = news_df['description'].apply(lambda x: finbert(x[:max_len])[0]['label'])
  File "c:\Users\astev\AppData\Local\Programs\Python\Python310\lib\site-packages\pandas\core\series.py", line 4764, in apply
    ).apply()
  File "c:\Users\astev\AppData\Local\Programs\Python\Python310\lib\site-packages\pandas\core\apply.py", line 1209, in apply
    return self.apply_standard()
  File "c:\Users\astev\AppData\Local\Programs\Python\Python310\lib\site-packages\pandas\core\apply.py", line 1289, 

                       datetime  \
0      Fri 24 Nov 2023, 12:00AM   
1      Thu 23 Nov 2023, 07:00PM   
2      Thu 23 Nov 2023, 05:43PM   
3      Thu 23 Nov 2023, 04:47PM   
4      Thu 23 Nov 2023, 03:25PM   
...                         ...   
95454  Mon 02 May 2022, 04:56AM   
95455  Mon 02 May 2022, 04:56AM   
95456  Mon 02 May 2022, 04:56AM   
95457  Mon 02 May 2022, 04:56AM   
95458  Mon 02 May 2022, 04:00AM   

                                                   title  \
0      OpenAI turmoil exposes threat to Microsoft’s i...   
1      10 Can’t Miss Black Friday Electronics Deals a...   
2      UPDATE 1-German union Verdi calls for strikes ...   
3      Corrections & Amplifications - The success of ...   
4      EU mulls wider scope for cybersecurity certifi...   
...                                                  ...   
95454  Zacks Investment Ideas feature highlights: Alp...   
95455  Zacks Investment Ideas feature highlights: Alp...   
95456  Zacks Investment Ideas feature h

In [None]:
# Merge sentiment data with historical data
def merge_data(historical, news, ticker):
    historical = historical[historical['Ticker'] == ticker]
    news = news[news['ticker'] == ticker]
    merged_data = pd.merge(historical, news, left_on='Date', right_on='datetime', how='left')
    merged_data.fillna(0, inplace=True)
    return merged_data

# Example for one ticker (loop through all for full model creation)
example_ticker = tickers[0]
merged_data = merge_data(historical_df, news_data, example_ticker)

# Prepare datasets for LSTM models
x_hist, y_hist, scaler = preprocess_data(merged_data, "Close")

# Add sentiment scores for second model
x_with_sentiment = np.hstack([x_hist, merged_data['sentiment_score'].values.reshape(-1, 1)])

# Split data into train and test sets
x_train_hist, x_test_hist, y_train_hist, y_test_hist = train_test_split(x_hist, y_hist, test_size=0.2, random_state=42)
x_train_sent, x_test_sent, y_train_sent, y_test_sent = train_test_split(x_with_sentiment, y_hist, test_size=0.2, random_state=42)

# Build LSTM model
def build_lstm(input_shape):
    model = Sequential()
    model.add(LSTM(50, return_sequences=True, input_shape=input_shape))
    model.add(Dropout(0.2))
    model.add(LSTM(50, return_sequences=False))
    model.add(Dropout(0.2))
    model.add(Dense(1))
    model.compile(optimizer="adam", loss="mean_squared_error")
    return model

# Model 1: Historical data only
model_hist = build_lstm((x_train_hist.shape[1], 1))
model_hist.fit(x_train_hist, y_train_hist, epochs=10, batch_size=32, validation_data=(x_test_hist, y_test_hist))

# Model 2: Historical data + Sentiment scores
model_sent = build_lstm((x_train_sent.shape[1], 1))
model_sent.fit(x_train_sent, y_train_sent, epochs=10, batch_size=32, validation_data=(x_test_sent, y_test_sent))

# Save models
model_hist.save("lstm_hist_model.h5")
model_sent.save("lstm_sent_model.h5")
