In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import random
from sklearn.model_selection import train_test_split
import wandb

# Seed every random number generator used in this notebook with one shared
# value so that repeated runs start from the same state.
random_state = 42
for _seed_rng in (np.random.seed, random.seed, torch.manual_seed):
    _seed_rng(random_state)
# The CUDA generators are only seeded when a GPU is actually available.
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(random_state)

# ----- Load Stock and Sentiment Data -----
# Both CSV files carry a 'date' column that is parsed into datetimes on load.
stock_data = pd.read_csv('AAPL_trend.csv', parse_dates=['date'])
# Force the closing price to float so it can be used directly as a
# regression feature and target.
stock_data['closingValue'] = stock_data['closingValue'].astype(float)
sentiment_data = pd.read_csv('sentiment_scores_title.csv', parse_dates=['date'])
stock_data['date'] = pd.to_datetime(stock_data['date'])
# Left join: every stock row is kept, with sentiment columns attached by date.
# NOTE(review): stock dates without a matching sentiment row will carry NaN
# in the sentiment columns -- confirm the sentiment file covers every date.
merged_data = stock_data.merge(sentiment_data, on='date', how='left')

# ----- Define Date Range -----
# Both bounds are inclusive.
start_date = pd.Timestamp("2024-02-12")
end_date = pd.Timestamp("2025-02-19")
selected_stock_data = merged_data[merged_data['date'].between(start_date, end_date)]

# ----- Feature Engineering -----
X_days = 7  # Number of preceding closing prices in each sample
Y_days = 1  # Number of preceding sentiment scores in each sample
# Windows are positional, so the rows must be in chronological order first.
selected_stock_data = selected_stock_data.sort_values('date')
closing_values = selected_stock_data['closingValue'].values
sentiment_scores = selected_stock_data['sentiment_score'].values

# The first usable target is the earliest row that has a full look-back
# window for both the price series and the sentiment series.
first_target = max(X_days, Y_days)
target_positions = range(first_target, len(selected_stock_data))

# Each sample is the X_days prices followed by the Y_days sentiment scores
# that immediately precede the target row.
features = np.array([
    np.concatenate([closing_values[t - X_days:t], sentiment_scores[t - Y_days:t]])
    for t in target_positions
])
# Regression target: the closing value at the target row itself.
labels = np.array([closing_values[t] for t in target_positions])

# Train-Test Split (chronological, no stratification for regression).
# The samples are overlapping windows built from date-sorted rows, so a
# shuffled split would place the closing prices of test-period days inside
# the input windows of training samples. With shuffle=False the first 80% of
# the windows are used for training and the most recent 20% for testing.
# random_state is not passed because it only controls shuffling.
X_train, X_test, y_train, y_test = train_test_split(
    features, labels, test_size=0.2, shuffle=False
)

# Define LSTM Model for Regression
class StockPriceLSTM(nn.Module):
    """Stacked LSTM followed by a linear head that outputs one value per sequence.

    Args:
        input_size: Number of features per time step.
        hidden_size: Size of the hidden state of every LSTM layer.
        num_layers: Number of stacked LSTM layers.
    """

    def __init__(self, input_size, hidden_size, num_layers):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, 1)  # Output a single value

    def forward(self, x):
        """Return a (batch, 1) prediction for a (batch, seq_len, input_size) input."""
        # No explicit (h0, c0): nn.LSTM then starts from zero states that it
        # creates itself to match the input, so the module keeps working when
        # the model and data are moved to another device or dtype. Hand-built
        # torch.zeros(...) states are always CPU float32.
        out, _ = self.lstm(x)
        # Only the last time step's output feeds the regression head.
        return self.fc(out[:, -1, :])

# Prepare data for LSTM
# Every scalar in a feature vector becomes one time step with a single
# feature, i.e. (samples, features) -> (samples, features, 1).
input_size = 1
X_train_seq = X_train[:, :, np.newaxis]
X_test_seq = X_test[:, :, np.newaxis]
X_train_tensor = torch.tensor(X_train_seq, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test_seq, dtype=torch.float32)
# Regression targets are float column vectors of shape (samples, 1), which
# matches the model's output shape.
y_train_tensor = torch.tensor(y_train, dtype=torch.float32).reshape(-1, 1)
y_test_tensor = torch.tensor(y_test, dtype=torch.float32).reshape(-1, 1)

# ----- Define W&B Sweep Configuration for Regression -----
sweep_config = {
    "method": "bayes",  # Can also be "random" or "grid"
    # The metric name must be a key that train() passes to wandb.log;
    # train() reports the held-out error as "test_mse". A name that is never
    # logged leaves the Bayesian search without an objective to minimise.
    "metric": {"name": "test_mse", "goal": "minimize"},
    "parameters": {
        "hidden_size": {"values": [16, 32, 64]},
        "num_layers": {"values": [1, 2, 3]},
        "lr": {"values": [0.01, 0.001, 0.0001]},
        "num_epochs": {"value": 10000}  # Fixed for tuning purposes
    }
}

# Register the sweep described by sweep_config under the W&B project
# "stock_price_prediction". The returned sweep ID is what wandb.agent is
# given later in this cell to run the trials.
sweep_id = wandb.sweep(sweep_config, project="stock_price_prediction")

# ----- Training Function for Regression -----
def train() -> None:
    """Run one sweep trial: train a StockPriceLSTM and log its metrics to W&B.

    Hyperparameters (hidden_size, num_layers, lr, num_epochs) are read from
    wandb.config. Each epoch is a single full-batch Adam step on
    X_train_tensor / y_train_tensor. Every 100 epochs the model is evaluated
    on X_test_tensor / y_test_tensor and the results are logged and printed.
    """
    wandb.init()
    config = wandb.config  # Load sweep parameters

    model = StockPriceLSTM(
        input_size=1,
        hidden_size=config.hidden_size,
        num_layers=config.num_layers
    )

    criterion = nn.MSELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=config.lr)

    for epoch in range(config.num_epochs):
        model.train()
        optimizer.zero_grad()
        loss = criterion(model(X_train_tensor), y_train_tensor)
        loss.backward()
        optimizer.step()

        if (epoch + 1) % 100 == 0:
            model.eval()
            with torch.no_grad():
                test_mse = criterion(model(X_test_tensor), y_test_tensor).item()
            # The held-out error is reported under "test_mse" and also under
            # "mse", so a sweep or dashboard that refers to the metric by
            # either name receives a value.
            wandb.log({
                "epoch": epoch + 1,
                "train_loss": loss.item(),
                "test_mse": test_mse,
                "mse": test_mse,
                "hidden_size": config.hidden_size,
                "num_layers": config.num_layers,
                "lr": config.lr
            })
            print(f"Epoch {epoch + 1}: Test MSE = {test_mse:.4f}")

    wandb.finish()

# ----- Run Sweep with Multiple Configurations -----
# Start an agent for the sweep identified by sweep_id. It calls train() once
# per trial and stops after `count` trials.
wandb.agent(sweep_id, train, count=10)  # Run 10 different configurations


wandb: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


Create sweep with ID: 7pu82bg8
Sweep URL: https://wandb.ai/its_mrpsycho/stock_price_prediction/sweeps/7pu82bg8


wandb: Agent Starting Run: 27t6r7fd with config:
wandb: 	hidden_size: 64
wandb: 	lr: 0.0001
wandb: 	num_epochs: 10000
wandb: 	num_layers: 2
wandb: Currently logged in as: its_mrpsycho to https://api.wandb.ai. Use `wandb login --relogin` to force relogin


Epoch 100: Test MSE = 44683.3164
Epoch 200: Test MSE = 43332.8633
Epoch 300: Test MSE = 42662.8438
Epoch 400: Test MSE = 42242.6094
Epoch 500: Test MSE = 41875.2148
Epoch 600: Test MSE = 41547.3750
Epoch 700: Test MSE = 41240.2109
Epoch 800: Test MSE = 40945.5820
Epoch 900: Test MSE = 40659.6445
Epoch 1000: Test MSE = 40380.2109
Epoch 1100: Test MSE = 40105.9102
Epoch 1200: Test MSE = 39835.8164
Epoch 1300: Test MSE = 39569.2617
Epoch 1400: Test MSE = 39305.7734
Epoch 1500: Test MSE = 39044.9805
Epoch 1600: Test MSE = 38786.5938
Epoch 1700: Test MSE = 38530.3828
Epoch 1800: Test MSE = 38276.1719
Epoch 1900: Test MSE = 38023.8086
Epoch 2000: Test MSE = 37773.1562
Epoch 2100: Test MSE = 37524.1172
Epoch 2200: Test MSE = 37276.6016
Epoch 2300: Test MSE = 37030.5312
Epoch 2400: Test MSE = 36785.8320
Epoch 2500: Test MSE = 36542.4570
Epoch 2600: Test MSE = 36300.3477
Epoch 2700: Test MSE = 36059.4609
Epoch 2800: Test MSE = 35819.7578
Epoch 2900: Test MSE = 35581.1992
Epoch 3000: Test MSE = 