# Strategy Notebook

In [None]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt

# Load the dataset (semicolon-delimited CSV)
file_path = 'Data/final/aapl_data.csv'
data = pd.read_csv(file_path, delimiter=';')

# Calculate the weighted sentiment score: average sentiment scaled by log(1 + submission count)
data['weighted_sentiment'] = data['avg_sentiment'] * np.log1p(data['submission_count'])

# Sort the data by date to maintain the chronological order
# NOTE(review): the sort uses the 'date' column exactly as loaded; if it is stored as a
# non-ISO string the resulting order is lexical rather than chronological - confirm the format.
data = data.sort_values(by='date')

# Split the data into 80% training, 10% validation, and the remainder (about 10%) for testing.
# Both sizes are rounded down, so any leftover rows end up in the test set.
train_size = int(0.8 * len(data))
valid_size = int(0.1 * len(data))

# Take independent copies: new columns are assigned onto these splits further down
# (the 'outcome' label and the predicted actions), and assigning onto a bare iloc slice
# of `data` is chained assignment (SettingWithCopyWarning).
train_data = data.iloc[:train_size].copy()
valid_data = data.iloc[train_size:train_size + valid_size].copy()
test_data = data.iloc[train_size + valid_size:].copy()

def compute_optimal_threshold(train_data, valid_data, test_data):
    """
    Compute the optimal weighted-sentiment threshold for a trading decision using logistic regression and ROC curve analysis.

    A one-feature logistic regression (next-day price increase as a function of weighted sentiment) is fitted
    on the training set. The ROC curve of its predicted probabilities is computed and plotted on the validation
    set, and the probability cut-off maximising (TPR - FPR) is selected. Because callers compare the returned
    value against a weighted sentiment score, that probability cut-off is mapped back through the fitted model
    to the equivalent weighted-sentiment value before it is returned.

    Side effect: an integer 'outcome' column (1 if the next row's 'Close' is higher, else 0) is added in place
    to all three DataFrames. The last row of each DataFrame has no following close, so its label is a
    placeholder 0; such rows are excluded from fitting and from the ROC analysis.

    Parameters:
    train_data (pd.DataFrame): Training dataset containing 'weighted_sentiment' and 'Close' columns.
    valid_data (pd.DataFrame): Validation dataset containing 'weighted_sentiment' and 'Close' columns.
    test_data (pd.DataFrame): Testing dataset containing 'weighted_sentiment' and 'Close' columns. It is only
        validated and labelled here; it does not influence the threshold.

    Returns:
    float: The weighted-sentiment threshold corresponding to the optimal ROC operating point. It is +inf when
        the best operating point is "never predict an increase" (or the fitted slope is exactly zero).

    Raises:
    ValueError: If any of the DataFrames lacks one of the required columns.
    """
    import warnings

    # Ensure data contains necessary columns
    required_columns = ['weighted_sentiment', 'Close']
    splits = [train_data, valid_data, test_data]
    for df in splits:
        if not all(col in df.columns for col in required_columns):
            raise ValueError(f"DataFrame must include {required_columns} columns")

    # Create the outcome column based on price increase (1 for increase, 0 for decrease).
    # Also remember which rows really have a next-day move: where it is NaN the comparison
    # below yields False, i.e. a fabricated 0 label that must not be learned from.
    known_masks = []
    for df in splits:
        next_day_change = df['Close'].diff().shift(-1)
        df['outcome'] = (next_day_change > 0).astype(int)
        known_masks.append(next_day_change.notna().values)
    train_known, valid_known, _ = known_masks

    # Prepare features and labels for training (rows with a known next-day move only)
    train_rows = train_data[train_known]
    X_train = train_rows['weighted_sentiment'].values.reshape(-1, 1)
    y_train = train_rows['outcome']

    # Fit the logistic regression model
    model = LogisticRegression()
    model.fit(X_train, y_train)

    # Predict probabilities on the validation set
    valid_rows = valid_data[valid_known]
    X_valid = valid_rows['weighted_sentiment'].values.reshape(-1, 1)
    y_valid = valid_rows['outcome']
    probs = model.predict_proba(X_valid)[:, 1]

    # Compute ROC curve
    fpr, tpr, thresholds = roc_curve(y_valid, probs)
    roc_auc = auc(fpr, tpr)

    # Plot ROC curve
    plt.figure()
    plt.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = %0.2f)' % roc_auc)
    plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver Operating Characteristic')
    plt.legend(loc="lower right")
    plt.show()

    # Select the probability cut-off that maximises Youden's J statistic (TPR - FPR)
    optimal_idx = np.argmax(tpr - fpr)
    prob_threshold = thresholds[optimal_idx]

    # Convert the probability cut-off into weighted-sentiment units by inverting
    # p = sigmoid(slope * x + intercept)  =>  x = (logit(p) - intercept) / slope
    slope = model.coef_[0][0]
    intercept = model.intercept_[0]
    if prob_threshold >= 1 or slope == 0:
        # roc_curve's first threshold is a sentinel above every score ("predict no positives");
        # a zero slope means sentiment carries no signal. Either way no finite sentiment level
        # should trigger a buy.
        optimal_threshold = float('inf')
    else:
        with np.errstate(divide='ignore'):
            log_odds = np.log(prob_threshold) - np.log1p(-prob_threshold)
        optimal_threshold = float((log_odds - intercept) / slope)
        if slope < 0:
            # With a negative slope the model predicts an increase for sentiment BELOW the
            # boundary, which a "buy above the threshold" rule cannot express.
            warnings.warn(
                "Fitted sentiment coefficient is negative: higher weighted sentiment lowers the "
                "predicted probability of a price increase, so buying above the returned "
                "threshold runs against the fitted model."
            )
    print('Optimal Threshold:', optimal_threshold)

    # Return the optimal threshold
    return optimal_threshold

def predict_trading_action(sentiment_score, comment_volume, optimal_threshold, buy_threshold=0.5, sell_threshold=-0.5):
    """
    Choose the next day's trading action from the current day's sentiment score and comment volume.

    The sentiment score is multiplied by log(1 + comment_volume) and the result is compared with
    optimal_threshold: strictly above gives 'Buy', strictly below gives 'Short', anything else 'Hold'.

    Parameters:
    sentiment_score (float): The sentiment score from Reddit comments for the stock on the current day.
    comment_volume (int): The number of comments related to the stock on the current day.
    optimal_threshold (float): The threshold the weighted sentiment score is compared against.
    buy_threshold (float): Accepted for interface compatibility; not consulted by the current logic.
    sell_threshold (float): Accepted for interface compatibility; not consulted by the current logic.

    Returns:
    str: The trading action for the next day ('Buy', 'Short', or 'Hold').
    """
    # Volume enters through log1p, so zero comments give a weight of 0
    volume_weight = np.log1p(comment_volume)
    signal = sentiment_score * volume_weight

    if signal > optimal_threshold:
        return 'Buy'
    if signal < optimal_threshold:
        return 'Short'
    # Neither comparison held: the signal equals the threshold (or is NaN)
    return 'Hold'

def test_predict_trading_action(test_data, optimal_threshold):
    """
    Run predict_trading_action over every row of a DataFrame.

    Parameters:
    test_data (pd.DataFrame): Rows to score; each row's 'avg_sentiment' and 'submission_count' are read.
    optimal_threshold (float): Threshold forwarded unchanged to predict_trading_action.

    Returns:
    list: One predicted action per row, in row order.
    """
    return [
        predict_trading_action(record['avg_sentiment'], record['submission_count'], optimal_threshold)
        for _, record in test_data.iterrows()
    ]

# Compute the optimal threshold using the given datasets
# (the call also adds the 'outcome' label column to each split in place)
optimal_threshold = compute_optimal_threshold(train_data, valid_data, test_data)

# Predict a trading action for every row of the test set
predicted_actions = test_predict_trading_action(test_data, optimal_threshold)

# Add predicted actions to the test data; assign() returns a new frame, so nothing is
# written onto a slice of `data` (which is chained assignment)
test_data = test_data.assign(predicted_action=predicted_actions)

# Rows without a following close (the final row) have no real next-day outcome - their
# 'outcome' value is only a placeholder 0 - so they are left out of the evaluation
has_next_day_move = test_data['Close'].diff().shift(-1).notna()

# Filter for only the "Buy" predictions that can actually be checked
buy_predictions = test_data[(test_data['predicted_action'] == 'Buy') & has_next_day_move]

# Keep the "Buy" predictions that were followed by a price increase
correct_buy_predictions = buy_predictions[buy_predictions['outcome'] == 1]

# Calculate the number and percentage of correctly predicted price increases
# (the percentage falls back to 0 when no "Buy" was predicted, avoiding a division by zero)
num_correct_increases = len(correct_buy_predictions)
total_buy_predictions = len(buy_predictions)
percentage_correct_increases = (num_correct_increases / total_buy_predictions) * 100 if total_buy_predictions > 0 else 0

# Notebook display of the summary tuple
num_correct_increases, total_buy_predictions, percentage_correct_increases