## Stock Prediction Notebook (Baseline)
This notebook demonstrates a baseline for stock movement prediction that uses price data only, without sentiment analysis. It covers data preparation, model definition, training, and evaluation.

In [26]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import numpy as np
import requests
import zipfile
import os
from io import BytesIO

### Data Preparation
Classes and functions for loading and preprocessing data.

In [None]:

# Loading data section
# ZIP archive of the StockNet dataset repository on GitHub
url = 'https://github.com/yumoxu/stocknet-dataset/archive/refs/heads/master.zip'

# Download the ZIP archive into memory.
# The timeout keeps the cell from hanging indefinitely, and raise_for_status()
# reports HTTP failures here instead of as a confusing BadZipFile error below.
response = requests.get(url, timeout=60)
response.raise_for_status()
zip_file = zipfile.ZipFile(BytesIO(response.content))

# Folder inside the archive that holds the raw per-stock price CSV files
price_folder_path = 'stocknet-dataset-master/price/raw/'

# Read every price CSV and tag its rows with the stock name
data_frames = []
for file_name in zip_file.namelist():
    if file_name.startswith(price_folder_path) and file_name.endswith('.csv'):
        # Extract stock name from filename
        # (e.g., 'stocknet-dataset-master/price/raw/AAPL.csv' -> 'AAPL')
        stock_name = os.path.splitext(os.path.basename(file_name))[0]

        # Read CSV file and convert to DataFrame
        with zip_file.open(file_name) as file:
            df = pd.read_csv(file)
            # Add stock name column
            df['Stock'] = stock_name
            df['Date'] = pd.to_datetime(df['Date'])
            data_frames.append(df)

# Fail with a specific message if the archive layout changed and nothing matched
if not data_frames:
    raise ValueError(f"No CSV files found under '{price_folder_path}' in the downloaded archive")

# Combine all DataFrames into one.
# ignore_index=False keeps each file's own row index, so index labels repeat
# across stocks; use the 'Stock' and 'Date' columns to identify a row.
combined_df = pd.concat(data_frames, ignore_index=False)

# Check the combined DataFrame
print(combined_df.head())


        Date       Open       High        Low      Close  Adj Close  \
0 2012-09-04  95.108574  96.448570  94.928574  96.424286  87.121140   
1 2012-09-05  96.510002  96.621429  95.657143  95.747147  86.509338   
2 2012-09-06  96.167145  96.898575  95.828575  96.610001  87.288956   
3 2012-09-07  96.864288  97.497147  96.538574  97.205711  87.827171   
4 2012-09-10  97.207146  97.612854  94.585716  94.677139  85.542564   

        Volume Stock  
0   91973000.0  AAPL  
1   84093800.0  AAPL  
2   97799100.0  AAPL  
3   82416600.0  AAPL  
4  121999500.0  AAPL  


In [None]:
class StockDataset(Dataset):
    """Sliding-window dataset over the rows of a price DataFrame.

    Sample ``i`` pairs the ``seq_length`` consecutive feature rows starting at
    row ``i`` with the ``movement`` label of the row right after that window.

    Args:
        data: DataFrame with the columns 'Open', 'High', 'Low', 'Volume',
            'Adj Close', 'Close' and 'movement'. Rows are used in the order given.
        seq_length: Number of consecutive rows in each input window.
    """

    def __init__(self, data, seq_length):
        self.data = data
        self.seq_length = seq_length
        self.features = data[['Open', 'High', 'Low', 'Volume', 'Adj Close', 'Close']].values
        self.labels = data['movement'].values

    def __len__(self):
        # Clamp at zero: with seq_length rows or fewer there is no complete
        # (window, next-row label) pair, and a negative value would make
        # len() raise ValueError.
        return max(0, len(self.data) - self.seq_length)

    def __getitem__(self, idx):
        """Return ``(window, label)`` as a float32 tensor and a long tensor.

        Raises:
            IndexError: If ``idx`` is outside ``[-len(self), len(self))``.
        """
        size = len(self)
        # Support Python-style negative indices; without this a negative idx
        # would slice an empty window and pair it with an unrelated label.
        if idx < 0:
            idx += size
        if not 0 <= idx < size:
            raise IndexError(f'index out of range for StockDataset of length {size}')

        x = self.features[idx:idx + self.seq_length]
        y = self.labels[idx + self.seq_length]
        return torch.tensor(x, dtype=torch.float32), torch.tensor(y, dtype=torch.long)


### Model Definition
Definitions of the recurrent models (a vanilla RNN and an LSTM) used for stock movement prediction.

In [None]:

class BaseRNN(nn.Module):
    """Stacked RNN followed by a linear layer applied to the last time step.

    Args:
        input_size: Number of features per time step.
        hidden_size: Width of the recurrent hidden state.
        output_size: Number of values produced per sequence.
        num_layers: Number of stacked recurrent layers.
    """

    def __init__(self, input_size, hidden_size, output_size, num_layers):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.rnn = nn.RNN(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Map ``x`` of shape (batch, seq, input_size) to (batch, output_size)."""
        # new_zeros inherits both device and dtype from x, so the initial hidden
        # state also matches when the model runs in float64 or half precision.
        h0 = x.new_zeros(self.num_layers, x.size(0), self.hidden_size)
        out, _ = self.rnn(x, h0)
        # Only the hidden output at the final time step feeds the linear layer.
        out = self.fc(out[:, -1, :])
        return out

class BaseLSTM(nn.Module):
    """Stacked LSTM followed by a linear layer applied to the last time step.

    Args:
        input_size: Number of features per time step.
        hidden_size: Width of the hidden and cell states.
        output_size: Number of values produced per sequence.
        num_layers: Number of stacked LSTM layers.
    """

    def __init__(self, input_size, hidden_size, output_size, num_layers):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Map ``x`` of shape (batch, seq, input_size) to (batch, output_size)."""
        # new_zeros inherits both device and dtype from x, so the initial states
        # also match when the model runs in float64 or half precision.
        state_shape = (self.num_layers, x.size(0), self.hidden_size)
        h0 = x.new_zeros(state_shape)
        c0 = x.new_zeros(state_shape)
        out, _ = self.lstm(x, (h0, c0))
        # Only the hidden output at the final time step feeds the linear layer.
        out = self.fc(out[:, -1, :])
        return out



### Logging Setup and Utilities

In [34]:
import logging

# Send log records at INFO level and above to a text file.
# NOTE(review): logging.basicConfig is documented to do nothing when the root
# logger already has handlers -- confirm the file is actually written when this
# cell runs inside the notebook kernel.
logging.basicConfig(filename='model_training_log_baseline.txt', level=logging.INFO)

class ModelHandler:
    """Bundle a model with its loss and optimizer and run training and evaluation.

    Args:
        model: Module called as ``model(inputs)`` to produce per-class scores.
        criterion: Loss called as ``criterion(outputs, labels)``.
        optimizer: Optimizer that updates the model's parameters.
        num_epochs: Number of passes over the training loader.
    """

    def __init__(self, model, criterion, optimizer, num_epochs):
        self.model = model
        self.criterion = criterion
        self.optimizer = optimizer
        self.num_epochs = num_epochs

    def train(self, train_loader, company_id, model_name):
        """Train the model for ``num_epochs`` epochs.

        The mean batch loss is logged and printed on every 10th epoch.
        ``company_id`` and ``model_name`` are used only in those messages.
        """
        self.model.train()
        for epoch in range(self.num_epochs):
            total_loss = 0.0
            num_batches = 0
            for inputs, labels in train_loader:
                self.optimizer.zero_grad()
                outputs = self.model(inputs)
                loss = self.criterion(outputs, labels)
                loss.backward()
                self.optimizer.step()

                total_loss += loss.item()
                num_batches += 1

            if (epoch + 1) % 10 == 0:
                # An empty loader yields no batches to average over; report NaN
                # instead of raising ZeroDivisionError.
                avg_loss = total_loss / num_batches if num_batches else float('nan')
                log_message = f'Company: {company_id}, Model: {model_name}, Epoch [{epoch+1}/{self.num_epochs}], Loss: {avg_loss:.4f}'
                logging.info(log_message)
                print(log_message)

    def evaluate(self, test_loader, company_id, model_name):
        """Evaluate the model and return ``(accuracy, mcc)``.

        The predicted class is the index of the largest output per sample.
        Confusion counts treat label 1 as positive and label 0 as negative, and
        are logged and printed. An empty loader gives an accuracy of 0.0.
        """
        self.model.eval()
        correct = 0
        total = 0
        tp = tn = fp = fn = 0  # Confusion-matrix counters
        with torch.no_grad():
            for inputs, labels in test_loader:
                outputs = self.model(inputs)
                predicted = outputs.argmax(dim=1)
                total += labels.size(0)
                correct += (predicted == labels).sum().item()

                # Accumulate TP, TN, FP, FN for this batch
                tp += ((predicted == 1) & (labels == 1)).sum().item()
                tn += ((predicted == 0) & (labels == 0)).sum().item()
                fp += ((predicted == 1) & (labels == 0)).sum().item()
                fn += ((predicted == 0) & (labels == 1)).sum().item()

        # Guard against an empty test loader, which would otherwise divide by zero.
        accuracy = correct / total if total else 0.0
        macc = self.calculate_mcc(tp, tn, fp, fn)

        # Log and print TP, TN, FP, FN
        log_message = f'Company: {company_id}, Model: {model_name}, TP: {tp}, TN: {tn}, FP: {fp}, FN: {fn}'
        logging.info(log_message)
        print(log_message)
        return accuracy, macc

    def calculate_mcc(self, tp, tn, fp, fn):
        """Return the Matthews correlation coefficient for the given counts.

        Returns 0.0 when the denominator is zero, i.e. when any row or column
        of the confusion matrix sums to zero.
        """
        numerator = (tp * tn) - (fp * fn)
        denominator = ((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn)) ** 0.5
        if denominator == 0:
            return 0.0
        return numerator / denominator


In [None]:
# Main script
def run_models_on_companies(companies, seq_length, num_epochs, data=None):
    """Train and evaluate every baseline model on each company's price data.

    For each company the rows dated 2014-01-01 up to (not including) 2015-10-01
    are used for training and the rows dated 2015-10-01 up to (not including)
    2016-01-01 for testing. Per-company results are written to
    'company88_PriceBase_result.csv' and per-model averages to
    'average_PriceBase_result.csv'; the averages are also printed.

    Args:
        companies: Iterable of values to match against the 'Stock' column.
        seq_length: Window length passed to StockDataset.
        num_epochs: Number of training epochs per (company, model) pair.
        data: DataFrame with 'Stock', 'Date', 'movement' and the price feature
            columns. Defaults to the notebook-level ``filtered`` DataFrame.
            NOTE(review): the date filters compare 'Date' against ISO date
            strings, which assumes datetimes or 'YYYY-MM-DD' text -- confirm.
    """
    if data is None:
        # Backward-compatible default: fall back to the notebook-level frame.
        data = filtered

    company_results = {}
    models = [BaseRNN, BaseLSTM]  # Add more models as needed

    # Dictionary to store cumulative accuracy and macc for each model
    model_metrics = {model_cls.__name__: {'accuracy': 0, 'macc': 0, 'count': 0} for model_cls in models}

    for company_id in companies:
        company_data = data[data['Stock'] == company_id]

        # Train and test split
        train_data = company_data[(company_data['Date'] >= '2014-01-01') & (company_data['Date'] < '2015-10-01')]
        test_data = company_data[(company_data['Date'] >= '2015-10-01') & (company_data['Date'] < '2016-01-01')]

        # A window needs seq_length rows plus one more row for its label, so a
        # split with seq_length rows or fewer cannot produce a single sample.
        if len(train_data) <= seq_length or len(test_data) <= seq_length:
            skip_message = (f'Company: {company_id}, skipped: not enough rows '
                            f'(train={len(train_data)}, test={len(test_data)}, seq_length={seq_length})')
            logging.warning(skip_message)
            print(skip_message)
            continue

        train_dataset = StockDataset(train_data, seq_length)
        test_dataset = StockDataset(test_data, seq_length)

        # shuffle=False keeps the windows in row order
        train_loader = DataLoader(train_dataset, batch_size=32, shuffle=False)
        test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

        for model_cls in models:
            model_name = model_cls.__name__

            # Instantiate model, criterion, and optimizer
            model = model_cls(input_size=6, hidden_size=32, output_size=2, num_layers=1)
            criterion = nn.CrossEntropyLoss()
            optimizer = optim.Adam(model.parameters(), lr=0.001)

            handler = ModelHandler(model, criterion, optimizer, num_epochs)

            # Pass the company_id and model_name when calling train and evaluate
            handler.train(train_loader, company_id, model_name)
            accuracy, macc = handler.evaluate(test_loader, company_id, model_name)

            company_results[(company_id, model_name)] = {'Accuracy': accuracy, 'MACC': macc}

            # Add accuracy and macc to the cumulative sum for the model
            model_metrics[model_name]['accuracy'] += accuracy
            model_metrics[model_name]['macc'] += macc
            model_metrics[model_name]['count'] += 1  # Track number of companies processed

    # Calculate average accuracy and macc for each model
    avg_results = {}
    for model_name, metrics in model_metrics.items():
        count = metrics['count']
        # No evaluated company (empty input or all skipped): report NaN instead
        # of dividing by zero.
        avg_accuracy = metrics['accuracy'] / count if count else float('nan')
        avg_macc = metrics['macc'] / count if count else float('nan')
        avg_results[model_name] = {'Average Accuracy': avg_accuracy, 'Average MACC': avg_macc}

    # Save results of all companies
    results_df = pd.DataFrame.from_dict(company_results, orient='index')
    results_df.to_csv('company88_PriceBase_result.csv')

    # Save average results of models
    avg_results_df = pd.DataFrame.from_dict(avg_results, orient='index')
    avg_results_df.to_csv('average_PriceBase_result.csv')

    print("Evaluation complete")
    print(avg_results_df)


# Load the filtered dataset
# NOTE(review): this CSV is not created by the cells shown in this notebook; it
# must already exist and provide the 'Stock', 'Date', 'movement' and price
# feature columns used below -- confirm which step produces it.
filtered = pd.read_csv('data/processed/price_scaled.csv')
companies = filtered['Stock'].unique().tolist()

# Seed PyTorch so the random weight initialisation is the same on every
# Restart & Run All.
torch.manual_seed(42)

# Run the models
run_models_on_companies(companies, seq_length=5, num_epochs=120)

Company: AAPL, Model: BaseRNN, Epoch [10/120], Loss: 0.6875
Company: AAPL, Model: BaseRNN, Epoch [20/120], Loss: 0.6868
Company: AAPL, Model: BaseRNN, Epoch [30/120], Loss: 0.6860
Company: AAPL, Model: BaseRNN, Epoch [40/120], Loss: 0.6852
Company: AAPL, Model: BaseRNN, Epoch [50/120], Loss: 0.6843
Company: AAPL, Model: BaseRNN, Epoch [60/120], Loss: 0.6831
Company: AAPL, Model: BaseRNN, Epoch [70/120], Loss: 0.6816
Company: AAPL, Model: BaseRNN, Epoch [80/120], Loss: 0.6807
Company: AAPL, Model: BaseRNN, Epoch [90/120], Loss: 0.6800
Company: AAPL, Model: BaseRNN, Epoch [100/120], Loss: 0.6795
Company: AAPL, Model: BaseRNN, Epoch [110/120], Loss: 0.6789
Company: AAPL, Model: BaseRNN, Epoch [120/120], Loss: 0.6784
Company: AAPL, Model: BaseRNN, TP: 15, TN: 20, FP: 15, FN: 9
Company: AAPL, Model: BaseLSTM, Epoch [10/120], Loss: 0.6872
Company: AAPL, Model: BaseLSTM, Epoch [20/120], Loss: 0.6860
Company: AAPL, Model: BaseLSTM, Epoch [30/120], Loss: 0.6850
Company: AAPL, Model: BaseLSTM, E