In [18]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from transformers import BertTokenizer, BertForSequenceClassification
from scipy.special import softmax
import yfinance as yf
import requests
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from eventregistry import EventRegistry, QueryArticlesIter

# Load Stock Data
def load_stock_data(ticker, start_date, end_date):
    df = yf.download(ticker, start=start_date, end=end_date)
    df = df[['Close']]
    df.columns = ['close']
    df['trend'] = df['close'].diff().apply(lambda x: 'up' if x > 0 else ('down' if x < 0 else 'stable'))
    df.dropna(inplace=True)
    return df

# Load Sentiment Model
def load_finbert():
    tokenizer = BertTokenizer.from_pretrained("yiyanghkust/finbert-tone")
    model = BertForSequenceClassification.from_pretrained("yiyanghkust/finbert-tone")
    return tokenizer, model

# Get Sentiment Scores
def get_sentiment(text, tokenizer, model):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs).logits.numpy()[0]
    scores = softmax(outputs)
    return scores[2] - scores[0]  # Positive - Negative Sentiment Score

# Fetch Financial News from EventRegistry
def fetch_news(api_key, company_uri):
    er = EventRegistry(apiKey=api_key)
    query = {
        "$query": {
            "conceptUri": company_uri
        },
        "$filter": {
            "forceMaxDataTimeWindow": "31"
        }
    }
    q = QueryArticlesIter.initWithComplexQuery(query)
    articles = [article["title"] for article in q.execQuery(er, maxItems=100)]
    return articles

# Prepare LSTM Data
def prepare_data(df, sentiment_scores, sequence_length=10):
    scaler = MinMaxScaler()

    # Ensure sentiment_scores has the same length as df
    if len(sentiment_scores) == 0:
        sentiment_scores = [0.0] * len(df)  # Default neutral sentiment
    elif len(sentiment_scores) < len(df):
        sentiment_scores = np.pad(sentiment_scores, (0, len(df) - len(sentiment_scores)), mode='edge')
    else:
        sentiment_scores = sentiment_scores[:len(df)]

    df["sentiment"] = sentiment_scores  # Assign adjusted sentiment scores
    data = scaler.fit_transform(df[['close', 'sentiment']])
    
    label_encoder = LabelEncoder()
    df['trend_encoded'] = label_encoder.fit_transform(df['trend'])
    
    sequences, targets = [], []
    for i in range(len(data) - sequence_length):
        sequences.append(data[i:i+sequence_length])
        targets.append(df['trend_encoded'].iloc[i+sequence_length])  # Predicting trend
    
    X = np.array(sequences)
    Y = np.array(targets)
    return X, Y, scaler, label_encoder


# Define LSTM Model
class StockLSTM(nn.Module):
    def __init__(self, input_size=2, hidden_size=50, num_layers=2, output_size=3):
        super(StockLSTM, self).__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)
    
    def forward(self, x):
        lstm_out, _ = self.lstm(x)
        return self.fc(lstm_out[:, -1, :])

# Train LSTM Model
def train_model(train_loader, input_size=2, epochs=20, lr=0.001):
    model = StockLSTM(input_size)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)
    
    for epoch in range(epochs):
        for seqs, targets in train_loader:
            optimizer.zero_grad()
            outputs = model(seqs)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()
        print(f"Epoch {epoch+1}, Loss: {loss.item():.6f}")
    
    return model

# Main Execution
if __name__ == "__main__":
    ticker = "AAPL"
    start_date = "2024-01-01"
    end_date = "2025-01-01"
    api_key = "17b19eac-bdd8-4730-beda-a2f58443ad43"  # Replace with your EventRegistry API key
    company_uri = "http://en.wikipedia.org/wiki/Apple_Inc."
    
    df = load_stock_data(ticker, start_date, end_date)
    tokenizer, model = load_finbert()
    
    news_headlines = fetch_news(api_key, company_uri)
    sentiment_scores = [get_sentiment(text, tokenizer, model) for text in news_headlines]
    
    # Split stock data and sentiment into train/test
    train_size = int(0.8 * len(df))
    df_train, df_test = df.iloc[:train_size], df.iloc[train_size:]
    sentiment_train, sentiment_test = sentiment_scores[:train_size], sentiment_scores[train_size:]
    
    seq_length = 10
    X_train, Y_train, scaler, label_encoder = prepare_data(df_train, sentiment_train, seq_length)
    X_test, Y_test, _, _ = prepare_data(df_test, sentiment_test, seq_length)
    
    train_data = TensorDataset(torch.tensor(X_train, dtype=torch.float32), 
                               torch.tensor(Y_train, dtype=torch.long))
    train_loader = DataLoader(train_data, batch_size=16, shuffle=True)
    
    model = train_model(train_loader)
    print("Training complete!")
    
    # Evaluate on Test Data
    with torch.no_grad():
        test_inputs = torch.tensor(X_test, dtype=torch.float32)
        test_outputs = model(test_inputs)
        predictions = torch.argmax(test_outputs, dim=1).numpy()
        accuracy = np.mean(predictions == Y_test)
        print(f"Test Accuracy: {accuracy:.2f}")


[*********************100%***********************]  1 of 1 completed
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["sentiment"] = sentiment_scores  # Assign adjusted sentiment scores
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['trend_encoded'] = label_encoder.fit_transform(df['trend'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df

Epoch 1, Loss: 0.952931
Epoch 2, Loss: 0.733387
Epoch 3, Loss: 0.698653
Epoch 4, Loss: 0.698015
Epoch 5, Loss: 0.693927
Epoch 6, Loss: 0.676571
Epoch 7, Loss: 0.693214
Epoch 8, Loss: 0.719585
Epoch 9, Loss: 0.672143
Epoch 10, Loss: 0.701457
Epoch 11, Loss: 0.708492
Epoch 12, Loss: 0.746761
Epoch 13, Loss: 0.682691
Epoch 14, Loss: 0.697453
Epoch 15, Loss: 0.661455
Epoch 16, Loss: 0.674350
Epoch 17, Loss: 0.693382
Epoch 18, Loss: 0.644918
Epoch 19, Loss: 0.674579
Epoch 20, Loss: 0.671891
Training complete!
Test Accuracy: 0.63
