## Sentiment to Strategy: Leveraging Forums Discussions to Guide Automated Trading Decisions 

---

### 0. Importing libraries

In [120]:
# Import necessary libraries
from pprint import pprint
import pandas as pd
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import os
import praw
import csv
import time
from datetime import datetime
from tqdm import tqdm
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
import torch
import string
import yfinance as yf   
from pathlib import Path

---

### 1. Reddit API Data Extraction and Manipulation

In [121]:
# Initialize PRAW (Python Reddit API Wrapper)
user_agent = "StockMarket Sentiment Analysis"
reddit = praw.Reddit(
    client_id="SDD6BO5WvswKQ2DOZrioiQ",
    client_secret="BoGTdq9xg1Wu4kjy3zj_WBLbTInQvQ",
    user_agent=user_agent
)

# Initialize the subreddit
subreddit = reddit.subreddit("wallstreetbets")

In [122]:
# List to hold data for each post
posts = []

# Fetch posts from the subreddit
for submission in subreddit.new(limit=None):
    post_date = datetime.fromtimestamp(submission.created_utc)
    posts.append({
        'id': submission.id,
        'title': submission.title,
        'selftext': submission.selftext,
        'score': submission.score,
        'upvote_ratio': submission.upvote_ratio,       
        'created_date': post_date,
        'permalink': f"https://redd.it/{submission.id}"
    })

# Convert the list to a DataFrame
raw_df = pd.DataFrame(posts)
raw_df

Unnamed: 0,id,title,selftext,score,upvote_ratio,created_date,permalink
0,1kul488,I love puts,"Hello, I felt market was overpriced this week ...",6,1.00,2025-05-24 22:32:39,https://redd.it/1kul488
1,1kul2g6,Posted May 14th 📉 📈,Genius or Delulu?,0,0.49,2025-05-24 22:30:23,https://redd.it/1kul2g6
2,1kukrol,My Futures Levels for Next Week,I calculate these levels for futures market ba...,1,0.60,2025-05-24 22:16:31,https://redd.it/1kukrol
3,1kuk2iy,Come back story,"I put in about 10k in the past 3 months, and g...",26,0.88,2025-05-24 21:44:01,https://redd.it/1kuk2iy
4,1kuje0q,I wish Klarna’s IPO happened before they repor...,So everyone who’s been active knows the memes ...,51,0.91,2025-05-24 21:12:48,https://redd.it/1kuje0q
...,...,...,...,...,...,...,...
854,1kd6mfo,200K in QQQ puts,,210,0.94,2025-05-02 19:24:42,https://redd.it/1kd6mfo
855,1kd6bdf,Lost 63k but still up $40 for the year lol,I yolo'd my Robinhood port into 0DTE Tesla put...,133,0.97,2025-05-02 19:12:07,https://redd.it/1kd6bdf
856,1kd64r9,Fuck!! Bear can’t win!,Fml,86,0.94,2025-05-02 19:04:28,https://redd.it/1kd64r9
857,1kd5vit,After 90-day tarrif pause,How do you guys think the market will be affec...,308,0.93,2025-05-02 18:54:07,https://redd.it/1kd5vit


In [123]:
# Create a csv file with the data
if os.path.exists('wsb_posts.csv'):
    existing_df = pd.read_csv('wsb_posts.csv')
    combined_df = pd.concat([existing_df, raw_df]).drop_duplicates(subset=['id'])
    combined_df.to_csv('wsb_posts.csv', index=False)
else:
    raw_df.to_csv('wsb_posts.csv', index=False)

In [124]:
# Load the data from the csv file
df = pd.read_csv('wsb_posts.csv')
df

Unnamed: 0,id,title,selftext,score,upvote_ratio,created_date,permalink,text
0,1kqotlw,Whats your opinion about Rocket Lab?,I see many ppl hyping rocket lab and also boug...,1,1.00,2025-05-20 00:12:16,https://redd.it/1kqotlw,Whats your opinion about Rocket Lab? I see man...
1,1kqoadm,Fed's Powell has sounded the alarm for years a...,,125,0.94,2025-05-19 23:49:38,https://redd.it/1kqoadm,Fed's Powell has sounded the alarm for years a...
2,1kqo5u3,Big beautiful bill,Am I screwed with my sp500 ETFs as a French in...,0,0.25,2025-05-19 23:44:20,https://redd.it/1kqo5u3,Big beautiful bill Am I screwed with my sp500 ...
3,1kqo1xz,Question on UNH Stock,"Should I sell UNH calls, covered calls or hold...",11,0.87,2025-05-19 23:39:50,https://redd.it/1kqo1xz,"Question on UNH Stock Should I sell UNH calls,..."
4,1kqmxov,Is this gonna print tomorrow,"Held these over the weekend, at its peak today...",0,0.44,2025-05-19 22:54:47,https://redd.it/1kqmxov,Is this gonna print tomorrow Held these over t...
...,...,...,...,...,...,...,...,...
1134,1ktvo5o,"OK Gains, I Guess",How'd i do guys?,28,0.82,2025-05-23 23:53:28,https://redd.it/1ktvo5o,
1135,1kuk2iy,Come back story,"I put in about 10k in the past 3 months, and g...",12,0.93,2025-05-24 21:44:01,https://redd.it/1kuk2iy,
1136,1kukrol,My Futures Levels for Next Week,I calculate these levels for futures market ba...,2,0.75,2025-05-24 22:16:31,https://redd.it/1kukrol,
1137,1kul488,I love puts,"Hello, I felt market was overpriced this week ...",1,1.00,2025-05-24 22:32:39,https://redd.it/1kul488,


In [125]:
# Read the S&P 500 companies list from the CSV file
csv_file = 'stocks.csv'
stocks_df = pd.read_csv(csv_file)

# Extract the ticker symbols (the column is usually named "Symbol")
tickers = stocks_df['Symbol'].tolist()

# Extract the names of the companies
companies = stocks_df['Name'].tolist()

In [126]:
# From company names remove the stock part and everything after it
def remove_stock_part(company_name):
    # Remove common stock suffixes (suffix + any following text)
    stock_suffixes = [' Inc.', ' Corp.', ' Ltd.', ' PLC', ' Co.', ' Class']
    for suffix in stock_suffixes:
        if suffix in company_name:
            return company_name.split(suffix)[0].strip()
    return company_name

# Apply the function to the companies list
companies = [remove_stock_part(company) for company in companies]

In [127]:
# Combine title and selftext into one column to search both
raw_df['text'] = raw_df['title'].fillna('') + " " + raw_df['selftext'].fillna('')

# Filter only the needed columns
filtered_reddit = raw_df.copy()
filtered_reddit.drop(columns=['title', 'selftext', 'score', 'upvote_ratio'], axis=1, inplace=True)
filtered_reddit

Unnamed: 0,id,created_date,permalink,text
0,1kul488,2025-05-24 22:32:39,https://redd.it/1kul488,"I love puts Hello, I felt market was overprice..."
1,1kul2g6,2025-05-24 22:30:23,https://redd.it/1kul2g6,Posted May 14th 📉 📈 Genius or Delulu?
2,1kukrol,2025-05-24 22:16:31,https://redd.it/1kukrol,My Futures Levels for Next Week I calculate th...
3,1kuk2iy,2025-05-24 21:44:01,https://redd.it/1kuk2iy,Come back story I put in about 10k in the past...
4,1kuje0q,2025-05-24 21:12:48,https://redd.it/1kuje0q,I wish Klarna’s IPO happened before they repor...
...,...,...,...,...
854,1kd6mfo,2025-05-02 19:24:42,https://redd.it/1kd6mfo,200K in QQQ puts
855,1kd6bdf,2025-05-02 19:12:07,https://redd.it/1kd6bdf,Lost 63k but still up $40 for the year lol I y...
856,1kd64r9,2025-05-02 19:04:28,https://redd.it/1kd64r9,Fuck!! Bear can’t win! Fml
857,1kd5vit,2025-05-02 18:54:07,https://redd.it/1kd5vit,After 90-day tarrif pause How do you guys thin...


In [128]:
# Create a mapping from cleaned company names (lowercase) to their corresponding ticker
company_mapping = {company.lower(): ticker for company, ticker in zip(companies, tickers)}

# Find matching tickers in the text, replacing company name matches with their stock codes
def matching_tickers(text):
    matched = [str(ticker) for ticker in tickers if str(ticker) in text]
    for company in companies:
        if company.lower() in text.lower():
            ticker_code = company_mapping.get(company.lower())
            if ticker_code:
                matched.append(ticker_code)
    return matched

# Apply the matching function to the 'text' column and remove duplicates
filtered_reddit['tickers'] = filtered_reddit['text'].apply(matching_tickers)
filtered_reddit['tickers'] = filtered_reddit['tickers'].apply(lambda x: list(set(x)))
filtered_reddit

Unnamed: 0,id,created_date,permalink,text,tickers
0,1kul488,2025-05-24 22:32:39,https://redd.it/1kul488,"I love puts Hello, I felt market was overprice...","[W, G, H]"
1,1kul2g6,2025-05-24 22:30:23,https://redd.it/1kul2g6,Posted May 14th 📉 📈 Genius or Delulu?,"[M, G, D]"
2,1kukrol,2025-05-24 22:16:31,https://redd.it/1kukrol,My Futures Levels for Next Week I calculate th...,"[L, F, W, M]"
3,1kuk2iy,2025-05-24 21:44:01,https://redd.it/1kuk2iy,Come back story I put in about 10k in the past...,"[R, C, T, DOW, S, V]"
4,1kuje0q,2025-05-24 21:12:48,https://redd.it/1kuje0q,I wish Klarna’s IPO happened before they repor...,"[F, B, IP, C, PR, S, ING, R, A, G, PL, W, J, P..."
...,...,...,...,...,...
854,1kd6mfo,2025-05-02 19:24:42,https://redd.it/1kd6mfo,200K in QQQ puts,[K]
855,1kd6bdf,2025-05-02 19:12:07,https://redd.it/1kd6bdf,Lost 63k but still up $40 for the year lol I y...,"[R, TSLA, OLO, H, DTE, L, T, D, DOW, TE, DT, E]"
856,1kd64r9,2025-05-02 19:04:28,https://redd.it/1kd64r9,Fuck!! Bear can’t win! Fml,"[F, B]"
857,1kd5vit,2025-05-02 18:54:07,https://redd.it/1kd5vit,After 90-day tarrif pause How do you guys thin...,"[W, A, H, T, O, E]"


In [129]:
# Remove substring tickers
def remove_substring_tickers(tickers):
    # Remove duplicates
    tickers = list(set(tickers))
    # Sort tickers by length in descending order
    tickers.sort(key=len, reverse=True)
    # Create a new list to store the filtered tickers
    filtered_tickers = []
    for ticker in tickers:
        # Check if the ticker is a substring of any other ticker in the list
        if not any(ticker in other_ticker for other_ticker in tickers if ticker != other_ticker):
            filtered_tickers.append(ticker)
    return filtered_tickers

# Apply the function to the filtered_reddit['tickers'] column
filtered_reddit['tickers'] = filtered_reddit['tickers'].apply(remove_substring_tickers)
filtered_reddit

Unnamed: 0,id,created_date,permalink,text,tickers
0,1kul488,2025-05-24 22:32:39,https://redd.it/1kul488,"I love puts Hello, I felt market was overprice...","[W, G, H]"
1,1kul2g6,2025-05-24 22:30:23,https://redd.it/1kul2g6,Posted May 14th 📉 📈 Genius or Delulu?,"[M, D, G]"
2,1kukrol,2025-05-24 22:16:31,https://redd.it/1kukrol,My Futures Levels for Next Week I calculate th...,"[L, F, W, M]"
3,1kuk2iy,2025-05-24 21:44:01,https://redd.it/1kuk2iy,Come back story I put in about 10k in the past...,"[DOW, R, T, V, S, C]"
4,1kuje0q,2025-05-24 21:12:48,https://redd.it/1kuje0q,I wish Klarna’s IPO happened before they repor...,"[ING, PRI, nan, IP, PL, BN, F, C, S, A, W, J, ..."
...,...,...,...,...,...
854,1kd6mfo,2025-05-02 19:24:42,https://redd.it/1kd6mfo,200K in QQQ puts,[K]
855,1kd6bdf,2025-05-02 19:12:07,https://redd.it/1kd6bdf,Lost 63k but still up $40 for the year lol I y...,"[TSLA, OLO, DTE, DOW, R, H]"
856,1kd64r9,2025-05-02 19:04:28,https://redd.it/1kd64r9,Fuck!! Bear can’t win! Fml,"[F, B]"
857,1kd5vit,2025-05-02 18:54:07,https://redd.it/1kd5vit,After 90-day tarrif pause How do you guys thin...,"[W, A, H, T, O, E]"


In [130]:
# Remove ambiguous tickers
ambiguous_tickers = (
    list(string.ascii_uppercase)  # single-letter symbols
    + [
        # two-letter everyday words / abbreviations
        "AI", "EV", "IT", "ON", "OR", "NO", "OK", "GO", "SO", "DO", "UP", "EU", "US", "IP", 
        # three- to four-letter common words
        "ALL", "FUN", "BIG", "RUN", "NEW", "TOP", "YOU", "CAT", "DOG", "CAR", "SUN", "USA", 
        "IPO", "CAN", "NOW", "AND", "FOR", "OUT", "BUY", "WIN", "LOW", "HIGH", 
        "ONE", "TWO", "THREE", "FOUR", "FIVE", "SIX", "SEVEN", "EIGHT", "NINE", "TEN",
        # tech buzz-terms
        "CPU", "GPU", "VR", "AR", "NFT", "ML",
        # meme & slang tickers
        "YOLO", "HODL", "MOON", "LMAO", "LOL", "FOMO", "FUD", "WTF",
        # other 
        "Reddit", "WSB", "nan", "NAN"
    ])

# Remove ambiguous tickers from the filtered_reddit['tickers'] column
def remove_ambiguous_tickers(tickers):
    # Remove duplicates
    tickers = list(set(tickers))
    # Filter out ambiguous tickers
    tickers = [ticker for ticker in tickers if ticker not in ambiguous_tickers]
    return tickers

# Apply the function to the filtered_reddit['tickers'] column
filtered_reddit['tickers'] = filtered_reddit['tickers'].apply(remove_ambiguous_tickers)
filtered_reddit

Unnamed: 0,id,created_date,permalink,text,tickers
0,1kul488,2025-05-24 22:32:39,https://redd.it/1kul488,"I love puts Hello, I felt market was overprice...",[]
1,1kul2g6,2025-05-24 22:30:23,https://redd.it/1kul2g6,Posted May 14th 📉 📈 Genius or Delulu?,[]
2,1kukrol,2025-05-24 22:16:31,https://redd.it/1kukrol,My Futures Levels for Next Week I calculate th...,[]
3,1kuk2iy,2025-05-24 21:44:01,https://redd.it/1kuk2iy,Come back story I put in about 10k in the past...,[DOW]
4,1kuje0q,2025-05-24 21:12:48,https://redd.it/1kuje0q,I wish Klarna’s IPO happened before they repor...,"[ING, PRI, BN, PL]"
...,...,...,...,...,...
854,1kd6mfo,2025-05-02 19:24:42,https://redd.it/1kd6mfo,200K in QQQ puts,[]
855,1kd6bdf,2025-05-02 19:12:07,https://redd.it/1kd6bdf,Lost 63k but still up $40 for the year lol I y...,"[TSLA, OLO, DTE, DOW]"
856,1kd64r9,2025-05-02 19:04:28,https://redd.it/1kd64r9,Fuck!! Bear can’t win! Fml,[]
857,1kd5vit,2025-05-02 18:54:07,https://redd.it/1kd5vit,After 90-day tarrif pause How do you guys thin...,[]


In [131]:
def keep_standalone_tickers(row):
    sentence = row["text"]
    cleaned = []
    for tkr in row["tickers"]:
        # look for the ticker as an independent token, optionally prefixed with $
        # (?<![A-Za-z0-9])  → left boundary is NOT a letter/number
        # (?![A-Za-z0-9])   → right boundary is NOT a letter/number
        pattern = rf'(?<![A-Za-z0-9])\$?{re.escape(tkr)}(?![A-Za-z0-9])'
        if re.search(pattern, sentence):
            cleaned.append(tkr)
    return cleaned

filtered_reddit["tickers"] = filtered_reddit.apply(keep_standalone_tickers, axis=1)
filtered_reddit

Unnamed: 0,id,created_date,permalink,text,tickers
0,1kul488,2025-05-24 22:32:39,https://redd.it/1kul488,"I love puts Hello, I felt market was overprice...",[]
1,1kul2g6,2025-05-24 22:30:23,https://redd.it/1kul2g6,Posted May 14th 📉 📈 Genius or Delulu?,[]
2,1kukrol,2025-05-24 22:16:31,https://redd.it/1kukrol,My Futures Levels for Next Week I calculate th...,[]
3,1kuk2iy,2025-05-24 21:44:01,https://redd.it/1kuk2iy,Come back story I put in about 10k in the past...,[]
4,1kuje0q,2025-05-24 21:12:48,https://redd.it/1kuje0q,I wish Klarna’s IPO happened before they repor...,[]
...,...,...,...,...,...
854,1kd6mfo,2025-05-02 19:24:42,https://redd.it/1kd6mfo,200K in QQQ puts,[]
855,1kd6bdf,2025-05-02 19:12:07,https://redd.it/1kd6bdf,Lost 63k but still up $40 for the year lol I y...,[]
856,1kd64r9,2025-05-02 19:04:28,https://redd.it/1kd64r9,Fuck!! Bear can’t win! Fml,[]
857,1kd5vit,2025-05-02 18:54:07,https://redd.it/1kd5vit,After 90-day tarrif pause How do you guys thin...,[]


In [132]:
# Remove rows with empty 'tickers' list
filtered_reddit = filtered_reddit[filtered_reddit['tickers'].str.len() > 0]
filtered_reddit

Unnamed: 0,id,created_date,permalink,text,tickers
6,1kui8i4,2025-05-24 20:21:29,https://redd.it/1kui8i4,Snorting MSTR Puts I like BTC... and I kinda l...,[MSTR]
7,1kugymb,2025-05-24 19:25:41,https://redd.it/1kugymb,Realistically what is going to happen with TSL...,[TSLA]
8,1kue7b7,2025-05-24 17:26:17,https://redd.it/1kue7b7,I recently discovered NVDA options are a hell ...,[NVDA]
10,1kubxyz,2025-05-24 15:43:42,https://redd.it/1kubxyz,Nvidia to launch cheaper Blackwell AI chip for...,[NVDA]
11,1ku4auu,2025-05-24 07:38:14,https://redd.it/1ku4auu,Is it too late to post UNH gains? Got into put...,[UNH]
...,...,...,...,...,...
839,1kdcv86,2025-05-02 23:50:26,https://redd.it/1kdcv86,The war is over Thanks to TSM and NFLX big pul...,"[TSM, NFLX]"
843,1kdbcow,2025-05-02 22:44:25,https://redd.it/1kdbcow,I don’t even know wtf I’m doing at this point....,[AAPL]
844,1kdbcbf,2025-05-02 22:43:57,https://redd.it/1kdbcbf,"Nearly 800% gain, thanks PLTR! Couldn't take t...",[PLTR]
845,1kdaghu,2025-05-02 22:06:19,https://redd.it/1kdaghu,Still not tired of winning Withdrew a bit and ...,[JD]


---

### 2. Sentiment Analysis

In [133]:
device = 0 if torch.cuda.is_available() else -1      # -1 ⇒ CPU
model_id = "ProsusAI/finBERT"

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForSequenceClassification.from_pretrained(
    model_id,
    torch_dtype=torch.float16 if device==0 else torch.float32
)

sent_pipe1 = pipeline(
    task="sentiment-analysis",
    model=model,
    tokenizer=tokenizer,
    device=device,
    return_all_scores=True,    # **preferred** to top_k=None
    truncation=True
)

Device set to use cuda:0


In [134]:
LABEL_TO_POLARITY = {"positive": +1, "neutral": 0, "negative": -1}

def finbert_score(text: str) -> float:
    scores = sent_pipe1(text, batch_size=1)[0]           # list[dict]
    # Convert to dict {label: prob}
    scores = {d["label"].lower(): d["score"] for d in scores}

    # Simple +/− difference (prob_pos − prob_neg)
    return scores["positive"] - scores["negative"]

filtered_reddit["sentiment"] = filtered_reddit["text"].apply(finbert_score)
filtered_reddit

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_reddit["sentiment"] = filtered_reddit["text"].apply(finbert_score)


Unnamed: 0,id,created_date,permalink,text,tickers,sentiment
6,1kui8i4,2025-05-24 20:21:29,https://redd.it/1kui8i4,Snorting MSTR Puts I like BTC... and I kinda l...,[MSTR],0.039581
7,1kugymb,2025-05-24 19:25:41,https://redd.it/1kugymb,Realistically what is going to happen with TSL...,[TSLA],-0.011499
8,1kue7b7,2025-05-24 17:26:17,https://redd.it/1kue7b7,I recently discovered NVDA options are a hell ...,[NVDA],-0.129551
10,1kubxyz,2025-05-24 15:43:42,https://redd.it/1kubxyz,Nvidia to launch cheaper Blackwell AI chip for...,[NVDA],0.635806
11,1ku4auu,2025-05-24 07:38:14,https://redd.it/1ku4auu,Is it too late to post UNH gains? Got into put...,[UNH],0.172667
...,...,...,...,...,...,...
839,1kdcv86,2025-05-02 23:50:26,https://redd.it/1kdcv86,The war is over Thanks to TSM and NFLX big pul...,"[TSM, NFLX]",0.880135
843,1kdbcow,2025-05-02 22:44:25,https://redd.it/1kdbcow,I don’t even know wtf I’m doing at this point....,[AAPL],0.088753
844,1kdbcbf,2025-05-02 22:43:57,https://redd.it/1kdbcbf,"Nearly 800% gain, thanks PLTR! Couldn't take t...",[PLTR],0.035552
845,1kdaghu,2025-05-02 22:06:19,https://redd.it/1kdaghu,Still not tired of winning Withdrew a bit and ...,[JD],0.545319


In [135]:
# Create a dictionary with dates as keys and tickers with their sentiment scores as values
def create_sentiment_dict(df):
    sentiment_dict = {}
    for i, row in df.iterrows():
        date = row['created_date'].date()
        tickers = row['tickers']
        sentiment = row['sentiment']
        
        if date not in sentiment_dict:
            sentiment_dict[date] = {}
        
        for ticker in tickers:
            if ticker not in sentiment_dict[date]:
                sentiment_dict[date][ticker] = 0
            sentiment_dict[date][ticker] += sentiment
    
    return sentiment_dict

sentiment_dict = create_sentiment_dict(filtered_reddit)
sentiment_dict

{datetime.date(2025, 5, 24): {'MSTR': 0.039581332355737686,
  'TSLA': -0.011498726904392242,
  'NVDA': 0.8628268577158451,
  'UNH': 0.172666996717453,
  'GOOG': 0.3565719835460186},
 datetime.date(2025, 5, 23): {'CRM': -0.7369182333350182,
  'INFA': -0.7369182333350182,
  'DKNG': -0.19664626568555832,
  'SMCI': -0.014379844069480896,
  'VG': -0.06710822880268097,
  'IMO': -0.06710822880268097,
  'LNG': -0.06710822880268097,
  'PFE': 0.8902494953945279,
  'RSI': 0.8902494953945279,
  'AES': 0.8902494953945279,
  'AAPL': 0.4636433869600296,
  'GOOGL': 0.2755922246724367,
  'TSLA': 0.05564112588763237,
  'UNH': 0.38799571990966797,
  'MVST': 0.01325051300227642,
  'AMC': 0.2243385650217533,
  'LCID': 0.2243385650217533,
  'NVDA': 0.1536855585873127,
  'META': 0.1536855585873127,
  'QBTS': -0.5453298389911652,
  'WLKP': 0.06674850359559059},
 datetime.date(2025, 5, 22): {'IONQ': 1.008111972361803,
  'SMA': 0.6583618260920048,
  'ADX': 0.6583618260920048,
  'RSI': 0.6583618260920048,
  'HIV

In [136]:
# Convert sentiment_dict into a DataFrame and fill missing values with 0
sentiment_df = pd.DataFrame.from_dict(sentiment_dict, orient='index').fillna(0)

# Ensure the index is sorted (dates in ascending order)
sentiment_df.sort_index(inplace=True)
sentiment_df

Unnamed: 0,MSTR,TSLA,NVDA,UNH,GOOG,CRM,INFA,DKNG,SMCI,VG,...,WBD,PDD,RCL,NET,SU,SOFI,CC,PB,QQQX,TSM
2025-05-02,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.880135
2025-05-03,0.0,-0.035686,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,-0.262786,0.049127,0.049127,0.0
2025-05-04,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2025-05-05,0.023025,0.023025,0.023025,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.022129,-0.860469,0.16159,0.085657,0.240592,-0.052041,0.0,0.0,0.0,0.0
2025-05-06,0.0,0.0,-1.205385,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2025-05-07,0.0,-0.015507,0.400313,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2025-05-08,-0.171449,0.30787,0.0,0.0,0.216432,0.113937,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2025-05-09,-0.414338,-0.434242,0.106228,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2025-05-10,-0.504716,0.0,0.0,0.132281,0.070603,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2025-05-11,-0.444315,-0.077974,0.007467,0.041576,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


---

### 3. Stock Market Comparison

In [137]:
# Drop columns from sentiment_df where all entries are between -0.1 and 0.1
comparison_df = sentiment_df.copy()
comparison_df

Unnamed: 0,MSTR,TSLA,NVDA,UNH,GOOG,CRM,INFA,DKNG,SMCI,VG,...,WBD,PDD,RCL,NET,SU,SOFI,CC,PB,QQQX,TSM
2025-05-02,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.880135
2025-05-03,0.0,-0.035686,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,-0.262786,0.049127,0.049127,0.0
2025-05-04,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2025-05-05,0.023025,0.023025,0.023025,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.022129,-0.860469,0.16159,0.085657,0.240592,-0.052041,0.0,0.0,0.0,0.0
2025-05-06,0.0,0.0,-1.205385,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2025-05-07,0.0,-0.015507,0.400313,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2025-05-08,-0.171449,0.30787,0.0,0.0,0.216432,0.113937,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2025-05-09,-0.414338,-0.434242,0.106228,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2025-05-10,-0.504716,0.0,0.0,0.132281,0.070603,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2025-05-11,-0.444315,-0.077974,0.007467,0.041576,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [138]:
# Create a dataframe with the same index comparison_df but with a column that contains the number of non-zero entries in each row and another with a list of tickers that have a non-zero sentiment score
def count_non_zero_entries(df):
    non_zero_count = df.ne(0).sum(axis=1)
    non_zero_tickers = df.apply(lambda row: list(row[row != 0].index), axis=1)
    
    return pd.DataFrame({
        'non_zero_count': non_zero_count,
        'non_zero_tickers': non_zero_tickers
    })

non_zero_info = count_non_zero_entries(comparison_df)
non_zero_info

Unnamed: 0,non_zero_count,non_zero_tickers
2025-05-02,6,"[AAPL, PLTR, NFLX, NKE, JD, TSM]"
2025-05-03,6,"[TSLA, PLTR, RKLB, CC, PB, QQQX]"
2025-05-04,2,"[AMC, AMD]"
2025-05-05,19,"[MSTR, TSLA, NVDA, AMC, PT, HIMS, PLTR, AMZN, ..."
2025-05-06,21,"[NVDA, PLTR, DD, MSFT, AMD, SNAP, RDDT, COIN, ..."
2025-05-07,22,"[TSLA, NVDA, AAPL, GOOGL, HIMS, PLTR, MSFT, LL..."
2025-05-08,26,"[MSTR, TSLA, GOOG, CRM, AAPL, META, BULL, HIMS..."
2025-05-09,18,"[MSTR, TSLA, NVDA, HIMS, PLTR, ARE, RKLB, CEP,..."
2025-05-10,16,"[MSTR, UNH, GOOG, QBTS, HIMS, PLTR, DD, WMT, U..."
2025-05-11,29,"[MSTR, TSLA, NVDA, UNH, PLTR, CRWV, AMZN, NBIS..."


In [139]:
# Setting up trading parameters
TOP_N = 4  # long TOP_N, short BOTTOM_N
capital  = 100_000
signals = comparison_df.copy() 
TRANSACTION_COST  = 0.0

In [140]:
# Create a weights DataFrame initialized with zeros
weights = pd.DataFrame(0.0, index=signals.index, columns=signals.columns)

# Fill the weights DataFrame with long and short positions based on the signals
for date, row in signals.iterrows():
    # keep only valid positives for longs and negatives for shorts
    pos_scores = row[row > 0]
    neg_scores = row[row < 0]

    long_tickers  = pos_scores.nlargest(TOP_N).index
    short_tickers = neg_scores.nsmallest(TOP_N).index   # most negative

    n_long, n_short = len(long_tickers), len(short_tickers)

    # allocate 50 % of capital to each side (if it exists)
    if n_long:
        long_weight = 0.5 / n_long
        weights.loc[date, long_tickers] =  long_weight
    if n_short:
        short_weight = 0.5 / n_short
        weights.loc[date, short_tickers] = -short_weight
weights

Unnamed: 0,MSTR,TSLA,NVDA,UNH,GOOG,CRM,INFA,DKNG,SMCI,VG,...,WBD,PDD,RCL,NET,SU,SOFI,CC,PB,QQQX,TSM
2025-05-02,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.125
2025-05-03,0.0,-0.166667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,-0.166667,0.166667,0.166667,0.0
2025-05-04,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2025-05-05,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,-0.125,0.0,0.0,0.125,0.0,0.0,0.0,0.0,0.0
2025-05-06,0.0,0.0,-0.125,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2025-05-07,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2025-05-08,-0.125,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2025-05-09,-0.125,-0.125,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2025-05-10,-0.125,0.0,0.0,0.125,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2025-05-11,-0.125,0.0,0.125,0.125,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [141]:
def count_weights(df):
    positive_count = df.apply(lambda row: (row > 0).sum(), axis=1)
    negative_count = df.apply(lambda row: (row < 0).sum(), axis=1)
    positive_tickers = df.apply(lambda row: list(row[row > 0].index), axis=1)
    negative_tickers = df.apply(lambda row: list(row[row < 0].index), axis=1)
    
    return pd.DataFrame({
        'positive_count': positive_count,
        'negative_count': negative_count,
        'positive_tickers': positive_tickers,
        'negative_tickers': negative_tickers
    })

weights_info = count_weights(weights)
weights_info

Unnamed: 0,positive_count,negative_count,positive_tickers,negative_tickers
2025-05-02,4,1,"[AAPL, NFLX, JD, TSM]",[NKE]
2025-05-03,3,3,"[RKLB, PB, QQQX]","[TSLA, PLTR, CC]"
2025-05-04,1,1,[AMD],[AMC]
2025-05-05,4,4,"[HIMS, PLTR, AMD, SU]","[AMZN, RKLB, UPS, PDD]"
2025-05-06,3,4,"[PLTR, UK, AVGO]","[NVDA, MSFT, SONY, INTC]"
2025-05-07,4,4,"[ARE, HALO, NVMI, TRUE]","[PLTR, OKLO, ARM, BE]"
2025-05-08,4,4,"[HIMS, CVNA, NI, ROOT]","[MSTR, META, LLY, NYT]"
2025-05-09,4,4,"[CELH, VFC, CPS, MARA]","[MSTR, TSLA, ARE, TTD]"
2025-05-10,4,4,"[UNH, QBTS, DD, ES]","[MSTR, WMT, MELI, EOD]"
2025-05-11,4,4,"[NVDA, UNH, FT, GDDY]","[MSTR, CRWV, OKLO, HOOD]"


---

### 4. Visualization 