# Adding Sentiment Scores to Reddit Data Collection

## Part 0: Setup

#### Setup basic utilities

In [7]:
# Import Packages
import pyarrow, os, re, ollama, requests, json, time, subprocess
from IPython.display import display
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from tqdm import tqdm

# Check if running in Google Colab
def is_colab():
    """
    Reports whether the notebook is running inside Google Colab.

    Returns:
        bool: True when the active IPython shell class is defined in the
        "google.colab._shell" module; False otherwise, including when IPython
        is not installed or no interactive shell is active.
    """
    try:
        from IPython import get_ipython
    except ImportError:
        # Without IPython there is no shell to inspect, so this cannot be Colab.
        return False

    shell = get_ipython()
    if shell is None:
        # get_ipython() returns None outside an interactive IPython session.
        return False
    return shell.__class__.__module__ == "google.colab._shell"

## Part 1: Read Collected Reddit Data

In [8]:
# Location of the collected Reddit data, stored in parquet format
MERGED_PARQUET_PATH = './data/merged-reddit-wsb.parquet'

# Fail fast with a clear message instead of printing and then continuing
# into the read below when the input file is missing.
if not os.path.exists(MERGED_PARQUET_PATH):
    raise FileNotFoundError(f"Error: The file {MERGED_PARQUET_PATH} does not exist.")

# Create a pyarrow schema for the data types.
merged_schema = pyarrow.schema([
    ('title', pyarrow.string()),
    ('upvote_ratio', pyarrow.float64()),
    ('id', pyarrow.string()),
    ('permalink', pyarrow.string()),
    ('num_comments', pyarrow.int64()),
    ('created_utc', pyarrow.float64()),
    ('selftext', pyarrow.string())
])

# Load the merged submissions into a DataFrame using the schema above.
merged_collection = pd.read_parquet(MERGED_PARQUET_PATH, engine='pyarrow', schema=merged_schema)

In [9]:
# Preview the top five rows of the merged collection.
display(merged_collection.head(n=5))

Unnamed: 0,title,upvote_ratio,id,permalink,num_comments,created_utc,selftext
0,Nivea Along,0.67,1k0t4jk,/r/wallstreetbets/comments/1k0t4jk/nivea_along/,5,1744832000.0,After -7% yesterday and -10% today
1,Powell to Volatile Stock Market: You’re on You...,0.86,1k0unbq,/r/wallstreetbets/comments/1k0unbq/powell_to_v...,2,1744836000.0,
2,Made back the last Wendy’s paycheck I lost,0.94,1k0tv2y,/r/wallstreetbets/comments/1k0tv2y/made_back_t...,6,1744834000.0,
3,After market observation. When I finished buyi...,0.72,1k0tnqx,/r/wallstreetbets/comments/1k0tnqx/after_marke...,8,1744833000.0,https://preview.redd.it/41ilvj6f39ve1.png?widt...
4,Ominous,0.85,1k0thnd,/r/wallstreetbets/comments/1k0thnd/ominous/,110,1744833000.0,NVIDIA 2024 is starting to rhyme like Cisco 20...


## Part 2: Extract Ticker Symbols with a Regular Expression

In [10]:
def attribute_ticker_raw(post: str) -> str:
    """
    Extracts the ticker symbol from the submission title.
    """
    match = re.search(r'([A-Z]{3,5})', post) #([A-Za-z]{1,5})(-[A-Za-z]{1,2})?
    if match:
        return match.group(0)
    else:
        return None

In [11]:
# Apply the function to the entire collection
tqdm.pandas()
# Double quotes delimit the f-string so the single-quoted column keys inside
# it also parse on Python versions older than 3.12.
merged_collection['re_ticker'] = merged_collection.progress_apply(
    lambda x: attribute_ticker_raw(f"{x['title']} {x['selftext']}"), axis=1
)

100%|██████████| 54577/54577 [00:00<00:00, 127339.59it/s]


In [12]:
# Summarise how many submissions received a regex-derived ticker
ticker_column = merged_collection['re_ticker']
ticker_present = ticker_column.notnull()
print(f"Number of submissions with ticker: {ticker_present.sum()}")
print(f"Number of submissions without ticker: {ticker_column.isnull().sum()}")
print(f"Proportion of submissions with ticker: {ticker_present.sum() / len(merged_collection) * 100:.2f}%")
print(f"Number of unique tickers: {ticker_column.nunique()}")

# Show titles next to their extracted tickers for the first ten rows
display(merged_collection[['title', 're_ticker']].head(10))

Number of submissions with ticker: 36060
Number of submissions without ticker: 18517
Proportion of submissions with ticker: 66.07%
Number of unique tickers: 3700


Unnamed: 0,title,re_ticker
0,Nivea Along,
1,Powell to Volatile Stock Market: You’re on You...,
2,Made back the last Wendy’s paycheck I lost,
3,After market observation. When I finished buyi...,
4,Ominous,NVIDI
5,"Monday Boeing, Tuesday Nivida, Today AMD, Tomo...",AMD
6,Before and After,MSTR
7,"Before, After, and Dr*gs?",MSTR
8,Uranium Yolo,
9,IBKR $185 C 6/20. What are your thoughts regar...,IBKR


## Part 3: Create functionality to use ollama

In [13]:
def check_ollama_serve(base_url: str = "http://localhost:11434", timeout: float = 5.0) -> str | None:
    """
    Checks if the Ollama server is running and returns the base URL.

    Args:
        base_url: Address probed with an HTTP GET request.
        timeout: Seconds to wait for a response before treating the server
            as not running.

    Returns:
        The base URL when the server answers with HTTP 200, otherwise None.
    """
    try:
        # A timeout keeps the notebook from hanging on an unresponsive endpoint.
        response = requests.get(base_url, timeout=timeout)
    except requests.exceptions.RequestException:
        # Covers refused connections as well as timeouts and other transport errors.
        print("Ollama server is not running.")
        return None

    if response.status_code == 200:
        print("Ollama server is running.")
        return base_url

    print("Ollama server is not running.")
    return None
    

def start_ollama_serve(startup_timeout: float = 30.0):
    """
    Starts the Ollama server and returns the base URL.

    If a server already answers on the local port, nothing is launched.
    Otherwise "ollama serve" is spawned and polled until it responds, exits,
    or startup_timeout seconds have passed.

    Args:
        startup_timeout: Maximum number of seconds to wait for the newly
            launched server to answer HTTP requests.

    Returns:
        tuple: (process, base_url), where process is the subprocess.Popen
        handle of the server launched here, or None if a server was already
        running.
    """
    base_url = "http://localhost:11434"
    try:
        # Check if the server is already running
        requests.get(base_url, timeout=5)
        print("Ollama server is already running.")
        return None, base_url
    except requests.exceptions.ConnectionError:
        # Nothing is listening yet, so fall through and launch the server.
        pass

    # Discard the server's output: PIPEs that are never read can fill up and
    # block the child process.
    process = subprocess.Popen(
        ["ollama", "serve"],
        stdout=subprocess.DEVNULL,
        stderr=subprocess.DEVNULL,
    )

    # Poll for readiness instead of sleeping a fixed time and assuming success.
    deadline = time.monotonic() + startup_timeout
    while time.monotonic() < deadline:
        if process.poll() is not None:
            print(f"Ollama server exited during startup (exit code {process.returncode}).")
            return process, base_url
        try:
            requests.get(base_url, timeout=1)
            print("Ollama server started.")
            return process, base_url
        except requests.exceptions.RequestException:
            time.sleep(0.25)

    print(f"Ollama server did not respond within {startup_timeout} seconds.")
    return process, base_url
         
def stop_ollama_service(process=None):
    """
    Stops the Ollama server if it is running.

    Args:
        process: Optional subprocess.Popen handle returned by
            start_ollama_serve(). When given and still alive, that process is
            terminated directly; otherwise the server is looked up with pkill.

    Returns:
        None. The outcome is reported through printed messages.
    """
    try:
        # Check if the server is running
        response = requests.get("http://localhost:11434", timeout=5)
    except requests.exceptions.RequestException:
        print("Ollama server is not running.")
        return

    if response.status_code != 200:
        print("Ollama server is not running.")
        return

    if process is not None and process.poll() is None:
        # Prefer the handle we own: it targets exactly the server we launched.
        process.terminate()
        try:
            process.wait(timeout=10)
        except subprocess.TimeoutExpired:
            process.kill()
            process.wait()
    else:
        # Match "ollama serve" rather than any command line that merely
        # contains "ollama", which could hit unrelated processes.
        result = subprocess.run(["pkill", "-f", "ollama serve"], check=False)
        if result.returncode != 0:
            print("Could not find an Ollama server process to stop.")
            return

    print("Ollama server stopped.")

In [14]:
# Start the Ollama server (a server that is already listening is reused,
# in which case ollama_process is None)
ollama_process, base_url = start_ollama_serve()

# Check if the server is running; as the last expression of the cell, the
# returned URL (or None) becomes the cell output
check_ollama_serve()

Ollama server started.
Ollama server is running.


'http://localhost:11434'

In [15]:
# Stop the Ollama server
stop_ollama_service()

# Check if the server is running; after the stop above this is expected to
# report that it is not
check_ollama_serve()

Ollama server stopped.
Ollama server is not running.


In [None]:
def attribute_ticker_ollama(post: str) -> str:
    """
    Extracts the ticker symbol from the submission text.
    """
    response = ollama.chat(
        model='smollm:360m', # Subject to change.
        messages=[{
            'role': 'user', 
            'content': f'what is the stock ticker for this text "{post}" you have to answer in one word with no period. If you cant determine it just ourput None. Also if there is no text provided output None.'
        }]
    )
    if len(response['message']['content']) > 5:
        response['message']['content'] = 'None'
    return response['message']['content']

## Part 4: Test the ollama attribution

In [17]:
# Draw a reproducible random sample of ten submissions from the collection
sample_submissions = merged_collection.sample(n=10, random_state=42)
display(sample_submissions)

Unnamed: 0,title,upvote_ratio,id,permalink,num_comments,created_utc,selftext,re_ticker
32995,My prediction for the future of Wallstreet bets.,16.0,ldwcr9,https://www.reddit.com/r/wallstreetbets/commen...,15,1612642000.0,**TL;DR: the some media-companys will probably...,
31935,Damn. What a fucked up world eh guys?,31.0,ldiwjk,https://i.redd.it/5cqbkc81iqf61.jpg,25,1612592000.0,,
46909,My Predictions for NIO Q1 Earnings and Results,41.0,n15j46,https://www.reddit.com/r/wallstreetbets/commen...,40,1619735000.0,"I previously posted this, with typos and other...",NIO
47671,Monster Crash,53689.0,n5lboc,https://v.redd.it/shekh7yq5cx61,1156,1620264000.0,,
11993,So,1.0,l71b7v,https://www.reddit.com/r/wallstreetbets/commen...,7,1611880000.0,How much money have ya’ll lost in the last hour?,
33797,TDC - deeper DD as promised,236.0,lfulsm,https://www.reddit.com/r/wallstreetbets/commen...,174,1612874000.0,**TLDR**; TDC is a better Snowflake worth $70 ...,TDC
41293,What could be better for GME transition to onl...,3296.0,m0oxdf,https://www.reddit.com/r/wallstreetbets/commen...,194,1615263000.0,"Who buys more video games, a hedge fund or ten...",GME
49098,Buy the rumor sell the news...,70.0,nmhjn3,https://i.redd.it/2oh0ye679q171.jpg,27,1622178000.0,,
24634,"Losses are mounting$GME, $AMC but I might as w...",140.0,layr40,https://i.redd.it/p26msql063f61.jpg,8,1612310000.0,,GME
19224,Feel the Squeeze,117.0,l8t5sw,https://i.redd.it/ihybwj6drie61.gif,20,1612063000.0,,


In [18]:
## Provide example usage
# Start the Ollama server
ollama_process, base_url = start_ollama_serve()

try:
    # Iterate over the titles directly; a Series has a length, so tqdm can
    # show real progress instead of an open-ended counter.
    for post_text in tqdm(sample_submissions['title'], total=len(sample_submissions)):
        print('Post:', post_text)
        print('Ticker:', attribute_ticker_ollama(post_text))
        print('-' * 50)
finally:
    # Stop the Ollama server even when a model call fails part-way through
    stop_ollama_service()

# Check if the server is running
check_ollama_serve()

Ollama server started.


0it [00:00, ?it/s]

Post: My prediction for the future of Wallstreet bets.


1it [00:02,  2.28s/it]

model='smollm:360m' created_at='2025-05-02T19:59:43.817306376Z' done=True done_reason='stop' total_duration=2275621462 load_duration=800649808 prompt_eval_count=60 prompt_eval_duration=184490507 eval_count=340 eval_duration=1288989602 message=Message(role='assistant', content='I think I can help you with that! Let\'s break down what you\'re looking for:\n\n1. **My prediction for the future of Wallstreet bets**: This question asks about your personal opinion or forecast on the future of Wallstreet bets, which could be a stock market-related topic.\n2. **What is the stock ticker for this text "My prediction for the future of Wallstreet bets"?**: This is a specific part of the text that you\'re looking for. The ticker refers to the stock exchange\'s ticker symbol or abbreviation used to identify different stocks on the platform.\n3. **None**: This option indicates that there isn\'t any information available about the stock, so I\'ll assume it means "no" or "unknown".\n\nNow, let\'s analyz

2it [00:02,  1.25s/it]

model='smollm:360m' created_at='2025-05-02T19:59:44.34925804Z' done=True done_reason='stop' total_duration=528030269 load_duration=4820362 prompt_eval_count=63 prompt_eval_duration=7811572 eval_count=139 eval_duration=514773310 message=Message(role='assistant', content='I think I can help you with that!\n\nThe stock ticker for this text "Damn. What a fucked up world eh guys?" would be:\n\n"Damn. What a fucked up world eh guys."\n\nYou could also use the following code to generate the text:\n\n```python\nimport random\n\ndef generate_text(length):\n    words = ["Damn", "What a fucked up world", "ehh", "fuck", "World", "is"]\n    random.shuffle(words)\n    return " ".join([random.choice(words) for _ in range(length)])\n```', images=None, tool_calls=None)
Ticker: None
--------------------------------------------------
Post: My Predictions for NIO Q1 Earnings and Results


4it [00:04,  1.17it/s]

model='smollm:360m' created_at='2025-05-02T19:59:45.734386739Z' done=True done_reason='stop' total_duration=1382798036 load_duration=4805453 prompt_eval_count=63 prompt_eval_duration=7970941 eval_count=353 eval_duration=1369179852 message=Message(role='assistant', content='The text "My Predictions for NIO Q1 Earnings and Results" can be interpreted as follows:\n\n* "My predictions for NIO Q1 earnings and results": This sentence indicates that the author has made some predictions about the company\'s performance in 2022, but they haven\'t provided any specific details or metrics.\n* "and results": This phrase suggests that the authors have mentioned a potential outcome or result of their predictions.\n* "NIO Q1 earnings and results": This sentence refers to the quarterly earnings and results of NIO (a company) in 2022, which are being discussed by the authors.\n\nTo answer this question, I would need to provide some information about the text or context that led to the author\'s predict

5it [00:06,  1.28s/it]

model='smollm:360m' created_at='2025-05-02T19:59:47.916956171Z' done=True done_reason='stop' total_duration=2017770512 load_duration=5017080 prompt_eval_count=52 prompt_eval_duration=7029872 eval_count=545 eval_duration=2005087616 message=Message(role='assistant', content='I think I can help you with that! Let\'s break down what you\'re looking for:\n\n* A stock ticker, which indicates the current price of a company\'s shares on a specific date.\n* A word or phrase, which could be anything from a simple sentence to a more complex expression.\n* A text without any punctuation marks (e.g., "So" is not part of a sentence).\n\nTo help you determine what you\'re looking for, I can provide some general guidance and examples:\n\n1. **Stock ticker**: If you see the word "stock" or "ticker," it\'s likely referring to a company\'s shares on a specific date. The stock ticker is usually displayed in a ticker tape format, with prices listed as numbers (e.g., 20-minute price).\n2. **Word or phrase**

6it [00:07,  1.23s/it]

model='smollm:360m' created_at='2025-05-02T19:59:49.049968288Z' done=True done_reason='stop' total_duration=1130354753 load_duration=4409487 prompt_eval_count=58 prompt_eval_duration=7133001 eval_count=324 eval_duration=1118129340 message=Message(role='assistant', content='The stock ticker for "TDC - deeper DD as promised" would be:\n\nTDC - deeper DD as promised\n\nYou can use the following steps to find the answer:\n\n1. Identify the words and phrases in the question.\n2. Determine the part of speech (noun, verb, adjective) for each word.\n3. Identify the parts of speech that are used in the sentence "TDC - deeper DD as promised".\n4. Use the parts of speech to determine the answer.\n\nHere\'s how you can approach this step:\n\n1. Identify the words and phrases in the question.\nIn this case, the words are "TDC" (tender-hearted), "Deeper DD" (deepest deep), and "as promised".\n2. Determine the part of speech for each word.\nThe parts of speech for "TDC" are "noun", "verb", and "adjec

8it [00:10,  1.24s/it]

model='smollm:360m' created_at='2025-05-02T19:59:51.92127703Z' done=True done_reason='stop' total_duration=2869072828 load_duration=5064372 prompt_eval_count=84 prompt_eval_duration=8265401 eval_count=772 eval_duration=2854903751 message=Message(role='assistant', content='I\'d like to suggest that the text could be "What could be better for GME transition to online retailer than a massive story and the loyalty of tens of thousands of Redditors having a life-changing experience with them?"\n\nThe word "better" is not present in the text, but it\'s implied through the use of "than" or "more". The phrase could be translated as "What could be better for GME transition to online retailer than a massive story and the loyalty of tens of thousands of Redditors having a life-changing experience with them?"\n\nThe word "majestic" is present in the text, but it\'s not used in the context. The phrase could be translated as "What could be more majestic for GME transition to online retailer than a m

9it [00:10,  1.08it/s]

model='smollm:360m' created_at='2025-05-02T19:59:52.27264957Z' done=True done_reason='stop' total_duration=238101408 load_duration=5668228 prompt_eval_count=99 prompt_eval_duration=11593983 eval_count=61 eval_duration=220294051 message=Message(role='assistant', content='The stock ticker for this text is:\n\n```\nGME, AMC but I might as well hold - if nothing else I can gift it to my son in March 💎🙏🏼🚀🚀🚀🚀\n```', images=None, tool_calls=None)
Ticker: None
--------------------------------------------------
Post: Feel the Squeeze


10it [00:11,  1.10s/it]

model='smollm:360m' created_at='2025-05-02T19:59:52.544639456Z' done=True done_reason='stop' total_duration=269747253 load_duration=4747124 prompt_eval_count=56 prompt_eval_duration=13323586 eval_count=69 eval_duration=250983377 message=Message(role='assistant', content='I think I can help you with that!\n\nThe stock ticker for "Feel the Squeeze" would be:\n\n```\nFEL-SQUEZ-1234567890\n```\n\nIf there is no text provided, we can assume it\'s a generic text.', images=None, tool_calls=None)
Ticker: None
--------------------------------------------------
Ollama server stopped.
Ollama server is not running.





## Part 5: Use the ollama attribution

In [19]:
# NOTE: This cell is currently disabled (every line is commented out).
# Uncommented, it would run attribute_ticker_ollama over every row of
# merged_collection and store the result in the 'llm_ticker' column.
# # Start the Ollama server
# ollama_process, base_url = start_ollama_serve()

# # Apply the function to the entire collection
# tqdm.pandas()
# merged_collection['llm_ticker'] = merged_collection.progress_apply(lambda x: attribute_ticker_ollama(f'{x['title']} {x['selftext']}'), axis=1)

# # Stop the Ollama server
# stop_ollama_service()

# # Check if the server is running
# check_ollama_serve()

In [20]:
# NOTE: This cell is currently disabled (every line is commented out).
# It reads the 'llm_ticker' column, which only exists once the disabled
# model-based attribution cell before it has been run.
# # Provide some details about the ticker extraction
# print(f"Number of submissions with ticker: {merged_collection['llm_ticker'].notnull().sum()}")
# print(f"Number of submissions without ticker: {merged_collection['llm_ticker'].isnull().sum()}")
# print(f"Proportion of submissions with ticker: {merged_collection['llm_ticker'].notnull().sum() / len(merged_collection) * 100:.2f}%")
# print(f"Number of unique tickers: {merged_collection['llm_ticker'].nunique()}")

# # Display the first few rows of the collection with tickers
# display(merged_collection[['title', 'llm_ticker']].head(10))