In [1]:
!pip install numpy==1.26.4

Collecting numpy==1.26.4
  Downloading numpy-1.26.4-cp39-cp39-macosx_10_9_x86_64.whl.metadata (61 kB)
Downloading numpy-1.26.4-cp39-cp39-macosx_10_9_x86_64.whl (20.6 MB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m20.6/20.6 MB[0m [31m26.0 MB/s[0m  [33m0:00:00[0m [31m30.2 MB/s[0m eta [36m0:00:01[0m
[?25hInstalling collected packages: numpy
  Attempting uninstall: numpy
    Found existing installation: numpy 2.0.2
    Uninstalling numpy-2.0.2:
      Successfully uninstalled numpy-2.0.2
Successfully installed numpy-1.26.4


In [2]:
!pip install --upgrade torch



In [14]:
# --- Cell 1: Setup and Imports ---

!pip install yfinance transformers torch pandas numpy scikit-learn tqdm

# --- Imports ---
import yfinance as yf
import pandas as pd
import numpy as np
import torch
from datetime import date
# Import the optimized Hugging Face pipeline tool
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
import torch.nn as nn
from tqdm.auto import tqdm

# Set pandas options for better table previews
pd.set_option('display.max_rows', 5)
pd.set_option('display.max_columns', None)

print("Cell 1: Libraries installed and imported successfully.")

Cell 1: Libraries installed and imported successfully.


In [2]:
# --- Cell 2: Real Data Loading and Preparation (Fully Corrected for Timezone) ---

import pandas as pd
from datetime import date
import numpy as np

# --- Configuration ---
TARGET_LAG = -2 # Shift for T+2 target variable
TIME_STEPS = 60 # Look back 60 trading days for the LSTM

# --- Part 1: Load and Prepare Price Data ---
try:
    # IMPORTANT: Reads your stock price file 'price_data.csv'
    price_df = pd.read_csv('price_data.csv')
    
    # Clean and rename columns
    price_df = price_df.rename(columns={'date': 'Date', 'close': 'Close'})
    
    # Convert date to datetime object and normalize (remove time)
    price_df['Date'] = pd.to_datetime(price_df['Date']).dt.normalize()

    # Create the T+2 Target Variable
    price_df['Target_T_plus_2_Close'] = price_df['Close'].shift(TARGET_LAG)

    # Keep only the columns we need
    price_df = price_df[['Date', 'Close', 'Target_T_plus_2_Close']].dropna().reset_index(drop=True)
    
    print(f"Price data loaded: {len(price_df)} total rows.")
    
except FileNotFoundError:
    print("ERROR: 'price_data.csv' not found. Please ensure the file is named correctly.")
    raise

# --- Part 2: Load and Aggregate News Headlines ---
try:
    # IMPORTANT: Reads your news headline file 'news_data.csv'
    news_df = pd.read_csv('news_data.csv')
    
    # Convert date to datetime object, normalize, AND REMOVE TIMEZONE
    news_df['Date'] = pd.to_datetime(news_df['Date']).dt.normalize().dt.tz_localize(None)
    
    # Aggregate all headlines into a single string per trading day
    daily_headlines_df = news_df.groupby('Date')['Article_title'].apply(
        lambda x: '. '.join(x.astype(str))
    ).reset_index(name='News Headlines')
    
    print(f"News data loaded: {len(daily_headlines_df)} unique news days.")
    
except FileNotFoundError:
    print("ERROR: 'news_data.csv' not found. Please ensure the file is named correctly.")
    raise

# --- Part 3: Merge and Finalize Dataframe ---
# Merge the price data and news data based on the Date (now compatible!)
master_df = pd.merge(price_df, daily_headlines_df, on='Date', how='inner')

# Drop the current day's closing price since we are only using sentiment for prediction
master_df = master_df.drop(columns=['Close'])

# Fill NaN headlines (days where price exists but no news was available) with an empty string
master_df['News Headlines'] = master_df['News Headlines'].fillna('')

print(f"\nCell 2: Data Preparation complete.")
print(f"Final merged dataset size: {len(master_df)} rows (constrained by news availability). Preview:")
print(master_df.head())

Price data loaded: 10850 total rows.
News data loaded: 81 unique news days.

Cell 2: Data Preparation complete.
Final merged dataset size: 61 rows (constrained by news availability). Preview:
        Date  Target_T_plus_2_Close  \
0 2020-06-10             333.459991   
1 2020-06-09             331.500000   
2 2020-06-02             317.940002   
3 2020-06-01             318.250000   
4 2020-05-29             318.109985   

                                      News Headlines  
0  Tech Stocks And FAANGS Strong Again To Start D...  
1  Big Tech Reaches New Record Heights At The Sto...  
2  'Apple is tracking iPhones stolen by looters' ...  
3  Apple Cuts iPhone Prices in China To Push Sale...  
4  Costco Shares Come Under Pressure Despite Stro...  


In [3]:
# --- Cell 3: Sentiment Feature Engineering (FINAL FIX - Truncation Added) ---
print("\nStarting Optimized FinBERT Sentiment Analysis (Truncation Fix)...")
MODEL_NAME = "ProsusAI/finbert"

import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
from tqdm.auto import tqdm

# --- Manual Model Loading FIX ---
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
print("Loading model weights using safetensors...")
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME, 
    use_safetensors=True
) 
# --- End FIX ---


# Initialize the FinBERT pipeline with the already loaded model and tokenizer
sentiment_pipeline = pipeline(
    "sentiment-analysis",
    model=model,
    tokenizer=tokenizer,
    device=0 if torch.cuda.is_available() else -1, 
    batch_size=32 
)

# Extract headlines
headlines_list = master_df['News Headlines'].astype(str).tolist()

# Run the pipeline on the full list of headlines. TRUNCATION IS ADDED HERE.
tqdm.pandas(desc="FinBERT Inference")
results = sentiment_pipeline(headlines_list, truncation=True)

# Function to extract a numerical score (Positive probability)
def extract_positive_score(result):
    return result['score'] if result['label'] == 'positive' else (1 - result['score']) / 2

# Apply the results back to the DataFrame
master_df['Sentiment_Score'] = [extract_positive_score(r) for r in results]

print("\nCell 3: Sentiment analysis complete. Preview:")
print(master_df[['Date', 'Sentiment_Score', 'Target_T_plus_2_Close']].head())


Starting Optimized FinBERT Sentiment Analysis (Truncation Fix)...
Loading model weights using safetensors...


Device set to use cpu



Cell 3: Sentiment analysis complete. Preview:
        Date  Sentiment_Score  Target_T_plus_2_Close
0 2020-06-10         0.204344             333.459991
1 2020-06-09         0.944226             331.500000
2 2020-06-02         0.432404             317.940002
3 2020-06-01         0.163856             318.250000
4 2020-05-29         0.857266             318.109985


In [4]:
print(master_df[['Date', 'Sentiment_Score']].head())

        Date  Sentiment_Score
0 2020-06-10         0.204344
1 2020-06-09         0.944226
2 2020-06-02         0.432404
3 2020-06-01         0.163856
4 2020-05-29         0.857266


In [7]:
# --- Cell 4: LSTM Model Construction and Training (FINAL FIX: N_SAMPLES=1) ---
print("\nStarting Optimized LSTM Training with Dropout...")

import torch
import torch.nn as nn
from sklearn.preprocessing import MinMaxScaler
from tqdm.auto import tqdm
import numpy as np

# Use global variables set in Cell 2
TIME_STEPS = 60 # Look back window

# 1. Prepare Data for LSTM
data = master_df[['Sentiment_Score', 'Target_T_plus_2_Close']].values

# 2. Normalize Data
scaler = MinMaxScaler(feature_range=(0, 1))
scaled_data = scaler.fit_transform(data)

# Separate features (X) and target (Y)
X = scaled_data[:, 0] # Sentiment Score
Y = scaled_data[:, 1] # T+2 Close

# 3. Create Sequences
def create_sequences(data, target, time_steps):
    Xs, Ys = [], []
    for i in range(len(data) - time_steps):
        # Reshape data to fit the (time_steps, features) format
        Xs.append(data[i:(i + time_steps)].reshape(time_steps, 1))
        Ys.append(target[i + time_steps])
    return np.array(Xs), np.array(Ys)

X_seq, Y_seq = create_sequences(X, Y, TIME_STEPS)

# 4. Handle Sample Split (Manually force 1 sample to be used for both train and test)
# n_samples = 1, so we must use the single sample for both sets.
X_train, X_test = X_seq, X_seq
Y_train, Y_test = Y_seq, Y_seq

print(f"Total samples for LSTM: {len(X_seq)}. Training with {len(X_train)} and testing with {len(X_test)} samples.")


X_train_t = torch.tensor(X_train, dtype=torch.float32)
Y_train_t = torch.tensor(Y_train, dtype=torch.float32).unsqueeze(1)
X_test_t = torch.tensor(X_test, dtype=torch.float32)
Y_test_t = torch.tensor(Y_test, dtype=torch.float32).unsqueeze(1)

# --- LSTM Model Definition (Modified with Dropout) ---
class FinBERTLSTM(nn.Module):
    def __init__(self, input_size=1, hidden_size=40, num_layers=2, output_size=1, dropout_rate=0.3):
        super(FinBERTLSTM, self).__init__()
        self.lstm = nn.LSTM(
            input_size, hidden_size, num_layers, batch_first=True, dropout=dropout_rate
        )
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        out, _ = self.lstm(x)
        out = self.fc(out[:, -1, :])
        return out

# --- Training Loop ---
model_lstm = FinBERTLSTM()
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model_lstm.parameters(), lr=0.001)

NUM_EPOCHS = 50
print("Starting training (50 epochs)...")
for epoch in tqdm(range(NUM_EPOCHS), desc="LSTM Training"):
    model_lstm.train()
    optimizer.zero_grad()
    output = model_lstm(X_train_t)
    loss = criterion(output, Y_train_t)
    loss.backward()
    optimizer.step()
    
    if (epoch + 1) % 10 == 0:
        print(f'Epoch [{epoch+1}/{NUM_EPOCHS}], Loss: {loss.item():.6f}')

print("\nCell 4: Training complete.")

# --- Evaluation ---
model_lstm.eval()
with torch.no_grad():
    test_output = model_lstm(X_test_t)
    test_loss = criterion(test_output, Y_test_t)
    
# Inverse transform the predictions for real price comparison
test_predictions_scaled = test_output.numpy().flatten()
dummy_pred_array = np.zeros((len(test_predictions_scaled), 2))
dummy_pred_array[:, 1] = test_predictions_scaled
final_predictions = scaler.inverse_transform(dummy_pred_array)[:, 1]

print(f"\nTest Set MSE (Normalized): {test_loss.item():.6f}")
print(f"Sample T+2 Price Prediction (Actual $): {final_predictions[0]:.2f}")


Starting Optimized LSTM Training with Dropout...
Total samples for LSTM: 1. Training with 1 and testing with 1 samples.
Starting training (50 epochs)...


LSTM Training: 100%|███████████████████████████| 50/50 [00:00<00:00, 121.00it/s]

Epoch [10/50], Loss: 0.356186
Epoch [20/50], Loss: 0.120308
Epoch [30/50], Loss: 0.018573
Epoch [40/50], Loss: 0.002328
Epoch [50/50], Loss: 0.001197

Cell 4: Training complete.

Test Set MSE (Normalized): 0.000418
Sample T+2 Price Prediction (Actual $): 290.69



