## Stock Prediction Notebook(Main Models)
This notebook demonstrates stock price prediction using data with sentiment analysis. It covers data preparation, model definition, training, and evaluation.

In [26]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import numpy as np
import requests
import zipfile
import os
from io import BytesIO

### Data Preparation
Classes and functions for loading and preprocessing data.

In [None]:
url = 'https://github.com/yumoxu/stocknet-dataset/archive/refs/heads/master.zip'

response = requests.get(url)
zip_file = zipfile.ZipFile(BytesIO(response.content))

price_folder_path = 'stocknet-dataset-master/price/raw/'

data_frames = []
for file_name in zip_file.namelist():
    if file_name.startswith(price_folder_path) and file_name.endswith('.csv'):
        # 주식 이름을 파일명에서 추출 (예: 'raw/prices/appl.csv' -> 'appl')
        stock_name = os.path.splitext(os.path.basename(file_name))[0]

        # CSV 파일을 읽어 데이터프레임으로 변환
        with zip_file.open(file_name) as file:  
            df = pd.read_csv(file)
            # 주식 이름 열 추가
            df['Stock'] = stock_name
            df['Date'] = pd.to_datetime(df['Date'])
            # df.set_index('Date', inplace=True)
            data_frames.append(df)

combined_df = pd.concat(data_frames, ignore_index=False)

class AddStockDataset(Dataset):
    def __init__(self, data, seq_length, sentiment_data):
        self.data = data
        self.seq_length = seq_length
        self.features = data[['Open', 'High', 'Low', 'Volume', 'Adj Close', 'Close']].values
        self.labels = data['movement'].values
        self.sentiment_data = sentiment_data[['positive', 'neutral', 'negative']].values

    def __len__(self):
        return len(self.data) - self.seq_length

    def __getitem__(self, idx):
        x = self.features[idx:idx+self.seq_length]
        y = self.labels[idx+self.seq_length]
        sentiment = self.sentiment_data[idx+self.seq_length]

        x = torch.tensor(x, dtype=torch.float32)
        y = torch.tensor(y, dtype=torch.long)
        sentiment = torch.tensor(sentiment, dtype=torch.float32)



### Model Definition
Definitions for stock price prediction models using Stock Price + Sentiment Model

In [None]:
class StockRNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, sentiment_size, num_layers, dropout=0.0):
        super(StockRNN, self).__init__()
        self.rnn = nn.RNN(input_size, hidden_size, num_layers, batch_first=True, dropout=dropout)
        self.dropout = nn.Dropout(dropout)  # Dropout layer
        self.fc = nn.Linear(hidden_size + sentiment_size, output_size)

    def forward(self, x, sentiment_data):
        h0 = torch.zeros(self.rnn.num_layers, x.size(0), self.rnn.hidden_size).to(x.device)
        out, hn = self.rnn(x, h0)
        last_hidden = hn[-1]
        last_hidden = self.dropout(last_hidden)  # Apply dropout
        sentiment_data = sentiment_data.view(sentiment_data.size(0), -1)
        combined = torch.cat((last_hidden, sentiment_data), dim=1)
        output = self.fc(combined)
        return output


class StockLSTM(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, sentiment_size, num_layers, dropout=0.0):
        super(StockLSTM, self).__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True, dropout=dropout)
        self.dropout = nn.Dropout(dropout)  # Dropout layer
        self.fc = nn.Linear(hidden_size + sentiment_size, output_size)

    def forward(self, x, sentiment_data):
        h0 = torch.zeros(self.lstm.num_layers, x.size(0), self.lstm.hidden_size).to(x.device)
        c0 = torch.zeros(self.lstm.num_layers, x.size(0), self.lstm.hidden_size).to(x.device)
        out, (hn, _) = self.lstm(x, (h0, c0))
        
        # Apply dropout to the LSTM outputs (last hidden state)
        last_hidden = self.dropout(hn[-1])

        sentiment_data = sentiment_data.view(sentiment_data.size(0), -1)
        combined = torch.cat((last_hidden, sentiment_data), dim=1)

        output = self.fc(combined)
        return output
    
class SentimentRNN(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, dropout=0.0):
        super(SentimentRNN, self).__init__()
        self.rnn = nn.RNN(input_size, hidden_size, num_layers, batch_first=True, dropout=dropout)
        self.layer_norm = nn.LayerNorm(hidden_size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        h0 = torch.zeros(self.rnn.num_layers, x.size(0), self.rnn.hidden_size).to(x.device)
        out, hn = self.rnn(x, h0)
        out = self.layer_norm(out)
        out = self.dropout(out)  # Apply dropout after normalization
        return hn[-1]  # Return the last hidden state


class StockSentimentRNN(nn.Module):
    def __init__(self, stock_input_size, sentiment_input_size, hidden_size, output_size, num_layers, dropout=0.0):
        super(StockSentimentRNN, self).__init__()
        self.stock_rnn = nn.RNN(stock_input_size, hidden_size, num_layers, batch_first=True, dropout=dropout)
        self.stock_layer_norm = nn.LayerNorm(hidden_size)
        self.sentiment_rnn = SentimentRNN(sentiment_input_size, hidden_size, num_layers, dropout=dropout)
        self.dropout = nn.Dropout(dropout)  # Add dropout after concatenation
        self.fc = nn.Linear(hidden_size * 2, output_size)  # No Sigmoid for CrossEntropyLoss

    def forward(self, data, sentiment_data):
        # Stock data processing
        h0_stock = torch.zeros(self.stock_rnn.num_layers, data.size(0), self.stock_rnn.hidden_size).to(data.device)
        stock_out, stock_hn = self.stock_rnn(data, h0_stock)
        stock_out = self.stock_layer_norm(stock_out)  # Normalize stock RNN output
        stock_out = self.dropout(stock_out)  # Apply dropout to the stock output
        stock_hidden = stock_hn[-1]  # Get the last hidden state from the stock RNN

        # Sentiment data processing
        seq_length = data.size(1)  # Get sequence length from stock data
        sentiment_data = sentiment_data.unsqueeze(1).repeat(1, seq_length, 1)  # Repeat sentiment data
        sentiment_out = self.sentiment_rnn(sentiment_data)  # Process sentiment data and get last hidden state

        # Concatenate hidden states from stock and sentiment RNNs
        combined_hidden = torch.cat((stock_hidden, sentiment_out), dim=1)

        # Apply dropout to the combined hidden state
        combined_hidden = self.dropout(combined_hidden)

        # Pass the combined hidden state through a fully connected layer
        output = self.fc(combined_hidden)
        return output

### Logging Setup and Utilities

In [62]:
import logging
logging.basicConfig(filename='sentimodel_training_log.txt', level=logging.INFO)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

class ModelHandler:
    def __init__(self, model, criterion, optimizer, num_epochs, device):
        self.model = model
        self.criterion = criterion
        self.optimizer = optimizer
        self.num_epochs = num_epochs
        self.device = device

    def train(self, train_loader, company_id, model_name):
        self.model.to(self.device)
        self.model.train()
        
        for epoch in range(self.num_epochs):
            total_loss = 0.0
            for data, labels, sentiment_data in train_loader:
                # Move data to the appropriate device
                data = data.to(self.device)
                labels = labels.to(self.device)
                sentiment_data = sentiment_data.to(self.device)
                
                # Clear gradients
                self.optimizer.zero_grad()

                # Forward pass
                outputs = self.model(data, sentiment_data)
                
                # Compute loss
                loss = self.criterion(outputs, labels)

                # Backward pass and optimization
                loss.backward()
                self.optimizer.step()
                
                total_loss += loss.item()
            
            avg_loss = total_loss / len(train_loader)
            if (epoch + 1) % 10 == 0:
                log_message = (f'Company: {company_id}, Model: {model_name}, '
                               f'Epoch [{epoch+1}/{self.num_epochs}], '
                               f'Average Loss: {avg_loss:.4f}')
                logging.info(log_message)
                print(log_message)

    def evaluate(self, test_loader, company_id, model_name):
        self.model.to(self.device)
        self.model.eval()
        correct = 0
        total = 0
        tp = tn = fp = fn = 0  # Initialize counters
        
        with torch.no_grad():
            for data, labels, sentiment_data in test_loader:
                # Move data to the appropriate device
                data = data.to(self.device)
                labels = labels.to(self.device)
                sentiment_data = sentiment_data.to(self.device)
                
                # Forward pass
                outputs = self.model(data, sentiment_data)
                _, predicted = torch.max(outputs, 1)
                total += labels.size(0)
                correct += (predicted == labels).sum().item()

                # Calculate TP, TN, FP, FN
                tp += ((predicted == 1) & (labels == 1)).sum().item()
                tn += ((predicted == 0) & (labels == 0)).sum().item()
                fp += ((predicted == 1) & (labels == 0)).sum().item()
                fn += ((predicted == 0) & (labels == 1)).sum().item()

        accuracy = correct / total
        macc = self.calculate_mcc(tp, tn, fp, fn)

        # Log and print evaluation results
        log_message = (f'Company: {company_id}, Model: {model_name}, '
                       f'Accuracy: {accuracy:.4f}, MCC: {macc:.4f}, '
                       f'TP: {tp}, TN: {tn}, FP: {fp}, FN: {fn}')
        logging.info(log_message)
        print(log_message)
        return accuracy, macc

    def calculate_mcc(self, tp, tn, fp, fn):
        numerator = (tp * tn) - (fp * fn)
        denominator = ((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn)) ** 0.5
        if denominator == 0:
            return 0
        else:
            return numerator / denominator

### Training and Hyperparameter Tuning
Training scripts and hyperparameter optimization using Optuna.

In [None]:
pip install optuna
import optuna
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import accuracy_score

# Load your data
filtered = pd.read_csv(r'C:\Users\dhjrz\nlp\ionlab\price_scaled.csv')
senti = pd.read_csv(r"C:\Users\dhjrz\nlp\ionlab\all_companies_sentiment.csv")
selected_columns = ['filename', 'label', 'positive', 'negative', 'neutral', 'sentiment_score']
sentiment = senti[selected_columns]
sentiment.rename(columns={'filename': 'Date'}, inplace=True)
sentiment['Date'] = pd.to_datetime(sentiment['Date'])
data = filtered.copy()
sentiment_data = sentiment.copy()
companies_senti = filtered['Stock'].unique().tolist()


import optuna
import torch
import torch.nn as nn
import torch.optim as optim

# Make sure to define your Dataset, Models, and ModelHandler as before

def get_data_loaders(company_id, seq_length, sentiment_data):
    # Function to create data loaders based on the company_id
    # Replace with your actual implementation
    train_data = filtered[(filtered['Stock'] == company_id) & 
                          (filtered['Date'] >= '2014-01-01') & 
                          (filtered['Date'] < '2015-08-01')]
    
    valid_data = filtered[(filtered['Stock'] == company_id) & 
                          (filtered['Date'] >= '2015-08-01') & 
                          (filtered['Date'] < '2015-10-01')]
    
    test_data = filtered[(filtered['Stock'] == company_id) & 
                         (filtered['Date'] >= '2015-10-01') & 
                         (filtered['Date'] < '2016-01-01')]

    train_dataset = AddStockDataset(train_data, seq_length, sentiment_data)
    valid_dataset = AddStockDataset(valid_data, seq_length, sentiment_data)
    test_dataset = AddStockDataset(test_data, seq_length, sentiment_data)
    
    train_loader = DataLoader(train_dataset, batch_size=32, shuffle=False)
    test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

    return train_loader, test_loader

def objective(trial):
    hidden_size = trial.suggest_categorical('hidden_size', [32, 64])
    num_layers = trial.suggest_int('num_layers', 1, 2)
    dropout = trial.suggest_uniform('dropout', 0.0, 0.5)
    learning_rate = trial.suggest_loguniform('learning_rate', 1e-5, 1e-1)
    model_type = trial.suggest_categorical('model_type', ['StockRNN', 'StockLSTM', 'StockSentimentRNN'])
    
    # You can also include model initialization and hyperparameter selection based on your requirement
    if model_type == 'StockRNN':
        model = StockRNN(input_size=6, hidden_size=hidden_size, output_size=2, sentiment_size=3, num_layers=num_layers, dropout=dropout)
    elif model_type == 'StockLSTM':
        model = StockLSTM(input_size=6, hidden_size=hidden_size, output_size=2, sentiment_size=3, num_layers=num_layers, dropout=dropout)
    else:
        model = StockSentimentRNN(stock_input_size=6, sentiment_input_size=3, hidden_size=hidden_size, output_size=2, num_layers=num_layers, dropout=dropout)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    # Example of using multiple companies; adjust the list as needed
    companies = companies_senti.copy()
    avg_accuracy = 0
    avg_mcc = 0
    num_companies = len(companies)

    for company_id in companies:
        train_loader, test_loader = get_data_loaders(company_id, seq_length=5, sentiment_data=sentiment)
        
        # Initialize ModelHandler with only the current trial's model, criterion, and optimizer
        model_handler = ModelHandler(models=[model], criteria=[criterion], optimizers=[optimizer], num_epochs=10, device=device)
        
        # Train and evaluate
        model_handler.train_and_evaluate(train_loader, test_loader, company_id)
        accuracy, mcc = model_handler.evaluate(test_loader, company_id, model_type, model)

        avg_accuracy += accuracy
        avg_mcc += mcc

    # Calculate average metrics
    avg_accuracy /= num_companies
    avg_mcc /= num_companies

    # Define weights for accuracy and MCC
    accuracy_weight = 0.3
    mcc_weight = 0.7

    # Combine them into a single objective
    combined_score = accuracy_weight * avg_accuracy + mcc_weight * avg_mcc

    return combined_score

def run_optuna_study():
    study = optuna.create_study(direction='maximize')
    study.optimize(objective, n_trials=50)  # Number of trials can be adjusted

    print("Best hyperparameters: ", study.best_params)
    print("Best value: ", study.best_value)

if __name__ == '__main__':
    run_optuna_study()


In [63]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
import pandas as pd

filtered = pd.read_csv(r'C:\Users\dhjrz\nlp\ionlab\price_scaled.csv')
companies = filtered['Stock'].unique().tolist()

senti = pd.read_csv(r'C:\Users\dhjrz\nlp\ionlab\all_companies_sentiment.csv')
selected_columns = ['filename','label','positive','negative','neutral', 'sentiment_score']
sentiment = senti[selected_columns]
sentiment.rename(columns={'filename':'Date'}, inplace=True)
sentiment['Date'] = pd.to_datetime(sentiment['Date'])

data = filtered.copy()
sentiment_data = sentiment.copy()

num_layers = 2
input_size = 6
hidden_size = 64
output_size = 2  # 1 또는 0
sentiment_size = 3
dropout = 0.1
stock_input_size = 6       # 주가 데이터의 feature 수
seq_length = 5 #5일치로 하루 예측
num_epochs = 120
lr = 0.00001


def run_sentimodels_on_companies(companies_senti, seq_length, num_epochs):
    company_results = {}
    models = [StockRNN, StockLSTM, StockSentimentRNN]  # Add your models here
    
    # Dictionary to store cumulative accuracy and macc for each model
    model_metrics = {model_cls.__name__: {'accuracy': 0, 'macc': 0, 'count': 0} for model_cls in models}


    # Initialize device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    for company_id in companies_senti:
        print(f"Processing company: {company_id}")

        # Assuming train_data and test_data are available for each company
        train_data = filtered[(filtered['Stock'] == company_id) & 
                              (filtered['Date'] >= '2014-01-01') & 
                              (filtered['Date'] < '2015-08-01')]
        
        valid_data = filtered[(filtered['Stock'] == company_id) & 
                              (filtered['Date'] >= '2015-08-01') & 
                              (filtered['Date'] < '2015-10-01')]
        
        test_data = filtered[(filtered['Stock'] == company_id) & 
                             (filtered['Date'] >= '2015-10-01') & 
                             (filtered['Date'] < '2016-01-01')]

        train_dataset = AddStockDataset(train_data, seq_length, sentiment_data)
        valid_dataset = AddStockDataset(valid_data, seq_length, sentiment_data)
        test_dataset = AddStockDataset(test_data, seq_length, sentiment_data)
        
        train_loader = DataLoader(train_dataset, batch_size=32, shuffle=False)
        valid_loader = DataLoader(valid_dataset, batch_size=32, shuffle=False)
        test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

        for model_cls in models:
            print(f"Training model: {model_cls.__name__}")

            if model_cls == StockRNN or model_cls == StockLSTM:
                # These models don't require sentiment_input_size
                model = model_cls(input_size=6, hidden_size=hidden_size, output_size=2, sentiment_size=3, num_layers=num_layers, dropout=dropout)
                
            elif model_cls == StockSentimentRNN:
                # StockSentimentRNN requires both stock and sentiment input sizes
                model = model_cls(stock_input_size=6, sentiment_input_size=3, hidden_size=hidden_size, output_size=2, num_layers=num_layers, dropout=dropout)
            
            model.to(device)  # Move model to the device
            criterion = nn.CrossEntropyLoss()
            optimizer = optim.Adam(model.parameters(), lr=0.001)

            handler = ModelHandler(model, criterion, optimizer, num_epochs, device)

            # Train and evaluate
            handler.train(train_loader, company_id, model_cls.__name__)
            accuracy, macc = handler.evaluate(test_loader, company_id, model_cls.__name__)

            model_name = model_cls.__name__
            company_results[(company_id, model_name)] = {'Accuracy': accuracy, 'MACC': macc}

            # Add accuracy and macc to the cumulative sum for the model
            model_metrics[model_name]['accuracy'] += accuracy
            model_metrics[model_name]['macc'] += macc
            model_metrics[model_name]['count'] += 1  # Track number of companies processed

            # Store results
            company_results[(company_id, model_cls.__name__)] = {'Accuracy': accuracy, 'MACC': macc}

            # Add accuracy and macc to the cumulative sum for the model
            model_metrics[model_name]['accuracy'] += accuracy
            model_metrics[model_name]['macc'] += macc
            model_metrics[model_name]['count'] += 1  # Track number of companies processed

            
             # Calculate average accuracy and macc for each model

    avg_results = {}
    for model_name, metrics in model_metrics.items():
        avg_accuracy = metrics['accuracy'] / metrics['count']
        avg_macc = metrics['macc'] / metrics['count']
        avg_results[model_name] = {'Average Accuracy': avg_accuracy, 'Average MACC': avg_macc}
    
    # Save results of all companies
    results_df = pd.DataFrame.from_dict(company_results, orient='index')
    results_df.to_csv('company88_Senti_results.csv')

    # Save average results of models
    avg_results_df = pd.DataFrame.from_dict(avg_results, orient='index')
    avg_results_df.to_csv('average_Senti_results.csv')

    print("Evaluation complete")
    print(avg_results_df)


    
# Assuming `companies` is defined
companies_senti = companies.copy()

run_sentimodels_on_companies(companies_senti, seq_length=5, num_epochs=120)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sentiment.rename(columns={'filename':'Date'}, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sentiment['Date'] = pd.to_datetime(sentiment['Date'])


Processing company: AAPL
Training model: StockRNN
Company: AAPL, Model: StockRNN, Epoch [10/120], Average Loss: 0.6889
Company: AAPL, Model: StockRNN, Epoch [20/120], Average Loss: 0.6851
Company: AAPL, Model: StockRNN, Epoch [30/120], Average Loss: 0.6833
Company: AAPL, Model: StockRNN, Epoch [40/120], Average Loss: 0.6806
Company: AAPL, Model: StockRNN, Epoch [50/120], Average Loss: 0.6798
Company: AAPL, Model: StockRNN, Epoch [60/120], Average Loss: 0.6740
Company: AAPL, Model: StockRNN, Epoch [70/120], Average Loss: 0.6751
Company: AAPL, Model: StockRNN, Epoch [80/120], Average Loss: 0.6697
Company: AAPL, Model: StockRNN, Epoch [90/120], Average Loss: 0.6692
Company: AAPL, Model: StockRNN, Epoch [100/120], Average Loss: 0.6669
Company: AAPL, Model: StockRNN, Epoch [110/120], Average Loss: 0.6676
Company: AAPL, Model: StockRNN, Epoch [120/120], Average Loss: 0.6656
Company: AAPL, Model: StockRNN, Accuracy: 0.6102, MCC: 0.2818, TP: 19, TN: 17, FP: 18, FN: 5
Training model: StockLSTM
