In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
import pandas as pd
import numpy as np
from google.colab import drive
from sklearn.metrics import roc_auc_score
from sklearn.metrics import precision_score

# Mount Google Drive so the data files stored there can be read.
# The mount point matches the path reported in this cell's recorded output
# ("Drive already mounted at /content/drive").
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# 3 years: one logistic-regression model per ticker

# Load the combined 3-year data and parse 'date' as datetimes
data3 = pd.read_csv('path_to_combined_3year_data.csv')
data3['date'] = pd.to_datetime(data3['date'])

# Label a row 1 ("high volatility") when its Target is strictly above that
# ticker's own 70th-percentile Target, else 0.
# NOTE(review): the quantile is computed over each ticker's full history, so
# the threshold also sees rows that later land in the test split -- confirm
# this is intended.
data3['high_volatility'] = data3.groupby('ticker')['Target'].transform(
    lambda target: np.where(target > target.quantile(0.7), 1, 0)
)

# Containers for per-ticker test results and confusion matrices
results3 = []
ticker_confusion_matrices_3 = {}

# Feature columns shared by every per-ticker model
features_3 = ['RealizedVol_3d', 'comment_volume', 'average_sentiment']

for ticker in data3['ticker'].unique():

    # Rows of the current ticker in chronological order
    ticker_df_3 = (
        data3.loc[data3['ticker'] == ticker]
        .sort_values('date')
        .reset_index(drop=True)
    )

    # Temporal split: rows dated up to the 80% date quantile are used for
    # training, strictly later rows are held out for testing
    split_date_3 = ticker_df_3['date'].quantile(0.8)
    train_3 = ticker_df_3.loc[ticker_df_3['date'] <= split_date_3]
    test_3 = ticker_df_3.loc[ticker_df_3['date'] > split_date_3]

    # Feature matrices and label vectors for both splits
    X_train_3, y_train_3 = train_3[features_3], train_3['high_volatility']
    X_test_3, y_test_3 = test_3[features_3], test_3['high_volatility']

    # Fit the scaler on the training rows only, then apply it to both splits
    scaler = StandardScaler().fit(X_train_3)
    X_train_scaled_3 = scaler.transform(X_train_3)
    X_test_scaled_3 = scaler.transform(X_test_3)

    # Train the classifier on the standardized training features
    model3 = LogisticRegression(random_state=42).fit(X_train_scaled_3, y_train_3)

    # Hard class predictions and probability of class 1 on the test split
    y_pred_3 = model3.predict(X_test_scaled_3)
    y_prob_3 = model3.predict_proba(X_test_scaled_3)[:, 1]

    # Collect this ticker's test rows alongside the model outputs
    test_results_3 = pd.DataFrame({
        'date': test_3['date'].values,
        'ticker': ticker,
        'target': test_3['Target'].values,
        'high_volatility_actual': y_test_3.values,
        'high_volatility_pred': y_pred_3,
        'probability': y_prob_3
    })
    results3.append(test_results_3)

# Stack the per-ticker frames into one frame with a fresh 0..N-1 index
results_df_3 = pd.concat(results3, ignore_index=True)

# Count total misclassified samples (actual label differs from predicted label)
num_misclassified_3 = (results_df_3['high_volatility_actual'] != results_df_3['high_volatility_pred']).sum()

# Generate high-confidence trading signals: flag rows whose predicted
# probability of high volatility is strictly above the threshold
confidence_thresh = 0.9
results_df_3["signal"] = (results_df_3["probability"] > confidence_thresh).astype(int)
n_signals = results_df_3["signal"].sum()
print(f"Trade Signals Triggered: {n_signals} out of {len(results_df_3)} samples")

# Calculate precision of high-confidence signals.
# With no signals at all, precision is undefined: zero_division=0 keeps the
# stored value at 0.0 without the undefined-metric warning, and the printed
# line says so explicitly instead of showing a misleading 0.000.
precision = precision_score(
    results_df_3["high_volatility_actual"],
    results_df_3["signal"],
    zero_division=0,
)
if n_signals == 0:
    print("Precision of High-Confidence Strategy: undefined (no signals triggered)")
else:
    print(f"Precision of High-Confidence Strategy: {precision:.3f}")
print()

# Identify the top 5 high-confidence predictions for every ticker.
# The rows are taken from results_df_3 rather than from variables left over
# by the training loop, which only hold the last ticker processed.
for ticker in results_df_3['ticker'].unique():
    ticker_rows_3 = results_df_3[results_df_3['ticker'] == ticker]
    top_signals = pd.DataFrame({
        'probability': ticker_rows_3['probability'].values,
        'true_label': ticker_rows_3['high_volatility_actual'].values,
        'date': ticker_rows_3['date'].values
    })
    print(f"\nTop 5 high-confidence predictions for {ticker}:")
    print(top_signals.sort_values("probability", ascending=False).head(5))
    print()

# Per-ticker confusion matrices: compute each one with a fixed label order
# [0, 1], store it under its ticker, and print it
for ticker in results_df_3['ticker'].unique():
    subset_3 = results_df_3.loc[results_df_3['ticker'] == ticker]
    cm3 = confusion_matrix(
        subset_3['high_volatility_actual'],
        subset_3['high_volatility_pred'],
        labels=[0, 1],
    )
    ticker_confusion_matrices_3[ticker] = cm3
    print(f"Confusion Matrix for {ticker}:")
    print(cm3)
    print()

# Label columns pooled over all tickers, reused by the metrics below
actual_3 = results_df_3['high_volatility_actual']
predicted_3 = results_df_3['high_volatility_pred']

# ROC AUC of the predicted probabilities over the pooled test rows
overall_auc = roc_auc_score(actual_3, results_df_3['probability'])
print(f"Overall ROC AUC Score: {overall_auc:.3f}", end="\n\n")

# Pooled confusion matrix and classification report
print("Overall Model Performance:")
print(confusion_matrix(actual_3, predicted_3), end="\n\n")
print(classification_report(actual_3, predicted_3), end="\n\n")

# First 10 rows of the combined results
print(results_df_3.head(10), end="\n\n")

# Total number of misclassified samples
print(num_misclassified_3, end="\n\n")

In [None]:
# 1 year: one logistic-regression model per ticker

# Import 1 year of combined data
data1 = pd.read_csv('path_to_combined_1year_data.csv')

# Convert the 1-year frame's own 'date' column to datetime type
data1['date'] = pd.to_datetime(data1['date'])

# Define threshold for classifying "high volatility" (top 30% of Target per ticker).
# The labels are derived from data1 itself so they belong to the rows being
# modeled: 1 when Target is strictly above the ticker's 70th percentile, else 0.
data1['high_volatility'] = data1.groupby('ticker')['Target'].transform(lambda x: np.where(x > x.quantile(0.7), 1, 0))

# Initialize lists to store results and confusion matrices per ticker
results1 = []
ticker_confusion_matrices_1 = {}

# Define features for the 1-year model (same for every ticker, so set once)
features_1 = ['RealizedVol_3d', 'reddit_sentiment_lag1', 'reddit_volume_lag1', 'news_sentiment_lag1', 'news_volume_lag1']

# Loop through each ticker to train and evaluate model
for ticker in data1['ticker'].unique():

    # Select data for the current ticker and sort by date
    ticker_df_1 = data1[data1['ticker'] == ticker].sort_values('date').reset_index(drop=True)

    # Split data into train/test based on 80% quantile date:
    # rows up to that date train the model, strictly later rows test it
    split_date_1 = ticker_df_1['date'].quantile(0.8)
    train_1 = ticker_df_1[ticker_df_1['date'] <= split_date_1]
    test_1 = ticker_df_1[ticker_df_1['date'] > split_date_1]

    # Define features and target
    X_train_1 = train_1[features_1]
    y_train_1 = train_1['high_volatility']
    X_test_1 = test_1[features_1]
    y_test_1 = test_1['high_volatility']

    # Standardize features; the scaler is fitted on the training rows only
    # and then applied unchanged to the test rows
    scaler = StandardScaler()
    X_train_scaled_1 = scaler.fit_transform(X_train_1)
    X_test_scaled_1 = scaler.transform(X_test_1)

    # Initialize and fit logistic regression model
    model1 = LogisticRegression(random_state=42)
    model1.fit(X_train_scaled_1, y_train_1)

    # Make predictions on the test set
    y_pred_1 = model1.predict(X_test_scaled_1)

    # Store results for this ticker in a DataFrame
    test_results_1 = pd.DataFrame({
        'date': test_1['date'].values,
        'ticker': ticker,
        'target': test_1['Target'].values,
        'high_volatility_actual': y_test_1.values,
        'high_volatility_pred': y_pred_1,
    })

    # Append ticker results to the list
    results1.append(test_results_1)


# Concatenate results from all tickers
results_df_1 = pd.concat(results1).reset_index(drop=True)

# Label columns pooled over all tickers, reused by the metrics below
actual_1 = results_df_1['high_volatility_actual']
predicted_1 = results_df_1['high_volatility_pred']

# Number of test rows whose predicted label differs from the actual label
num_misclassified_1 = (actual_1 != predicted_1).sum()

# Per-ticker confusion matrices: compute each one with a fixed label order
# [0, 1], store it under its ticker, and print it
for ticker in results_df_1['ticker'].unique():
    subset = results_df_1.loc[results_df_1['ticker'] == ticker]
    cm1 = confusion_matrix(
        subset['high_volatility_actual'],
        subset['high_volatility_pred'],
        labels=[0, 1],
    )
    ticker_confusion_matrices_1[ticker] = cm1
    print(f"Confusion Matrix for {ticker}:")
    print(cm1)
    print()  # Blank line for readability

# Pooled confusion matrix and classification report
print("Overall Model Performance:")
print(confusion_matrix(actual_1, predicted_1))
print(classification_report(actual_1, predicted_1))

# First 10 rows of the combined results
print(results_df_1.head(10))

# Total number of misclassified samples
print(num_misclassified_1)

Confusion Matrix for AAPL:
[[49  0]
 [16  0]]

Confusion Matrix for AMZN:
[[43  0]
 [19  0]]

Confusion Matrix for BRK-B:
[[44  0]
 [16  0]]

Confusion Matrix for COST:
[[55  0]
 [16  0]]

Confusion Matrix for GOOG:
[[48  0]
 [17  0]]

Confusion Matrix for GOOGL:
[[51  1]
 [17  0]]

Confusion Matrix for HD:
[[45  3]
 [15  0]]

Confusion Matrix for KO:
[[53  0]
 [14  0]]

Confusion Matrix for META:
[[60  0]
 [ 8  0]]

Confusion Matrix for MSFT:
[[50  1]
 [11  1]]

Confusion Matrix for NVDA:
[[55  0]
 [ 8  0]]

Confusion Matrix for PG:
[[53  0]
 [14  0]]

Confusion Matrix for PLTR:
[[46  0]
 [13  1]]

Confusion Matrix for PM:
[[53  0]
 [15  0]]

Confusion Matrix for TSLA:
[[57  0]
 [12  0]]

Overall Model Performance:
[[762   5]
 [211   2]]
              precision    recall  f1-score   support

           0       0.78      0.99      0.88       767
           1       0.29      0.01      0.02       213

    accuracy                           0.78       980
   macro avg       0.53      0.50