# Adding Sentiment Scores to Reddit Data Collection

## Part 0: Setup

#### Setup basic utilities

In [122]:
# Import Packages
import pyarrow
from IPython.display import display
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

# Check if running in Google Colab
def is_colab():
    from IPython import get_ipython
    return get_ipython().__class__.__module__ == "google.colab._shell"

#### Setup NLTK (Natural Language Tool Kit) utilities

In [123]:
import os, re

# Set the NLTK data directory
NLTK_DOWNLOAD_DIR = './NLTK_DATA'
os.environ["NLTK_DATA"]=NLTK_DOWNLOAD_DIR

# Then, import NLTK and download the necessary data.
import nltk

# Do not download this data without understanding the implications.
nltk.download(['punkt',
               'punkt_tab',
               'stopwords',
               'vader_lexicon',
               'names',
               'averaged_perceptron_tagger',
               'wordnet'], download_dir=NLTK_DOWNLOAD_DIR)

from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

[nltk_data] Downloading package punkt to ./NLTK_DATA...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to ./NLTK_DATA...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to ./NLTK_DATA...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package vader_lexicon to ./NLTK_DATA...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package names to ./NLTK_DATA...
[nltk_data]   Package names is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     ./NLTK_DATA...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to ./NLTK_DATA...
[nltk_data]   Package wordnet is already up-to-date!


## Part 1: Read Collected Reddit Data

In [124]:
# Save the collected data to parquet format
SUBMISSION_PARQUET_PATH = './data/wallstreetbets-collection.parquet'

# Create a pyarrow schema for the data types.
submission_schema = pyarrow.schema([
    ('title', pyarrow.string()),
    ('created_utc', pyarrow.float64()),
    ('id', pyarrow.string()),
    ('is_original_content', pyarrow.bool_()),
    ('link_flair_text', pyarrow.string()),
    ('locked', pyarrow.bool_()),
    ('name', pyarrow.string()),
    ('num_comments', pyarrow.int64()),
    ('over_18', pyarrow.bool_()),
    ('permalink', pyarrow.string()),
    ('selftext', pyarrow.string()),
    ('spoiler', pyarrow.bool_()),
    ('upvote_ratio', pyarrow.float64()),
])

submission_collection = pd.read_parquet(SUBMISSION_PARQUET_PATH, engine='pyarrow', schema=submission_schema)
# Print some details about the submission collection.
print(f"Submission collection shape: {submission_collection.shape}")

Submission collection shape: (798, 13)


In [125]:
# Display the first few rows of the submission collection.
display(submission_collection.head())

Unnamed: 0,title,created_utc,id,is_original_content,link_flair_text,locked,name,num_comments,over_18,permalink,selftext,spoiler,upvote_ratio
0,Nivea Along,1744832000.0,1k0t4jk,False,YOLO,False,t3_1k0t4jk,5,False,/r/wallstreetbets/comments/1k0t4jk/nivea_along/,After -7% yesterday and -10% today,False,0.67
1,Powell to Volatile Stock Market: You’re on You...,1744836000.0,1k0unbq,False,News,False,t3_1k0unbq,2,False,/r/wallstreetbets/comments/1k0unbq/powell_to_v...,,False,0.86
2,Super sick timing,1744835000.0,1k0umm6,False,Loss,False,t3_1k0umm6,3,False,/r/wallstreetbets/comments/1k0umm6/super_sick_...,,False,0.75
3,My second week trading options. SPY BAC and BABA,1744835000.0,1k0ugf8,False,Gain,False,t3_1k0ugf8,1,False,/r/wallstreetbets/comments/1k0ugf8/my_second_w...,This is my second—and hopefully last—week trad...,False,0.75
4,Don’t see this too often,1744835000.0,1k0ubo1,False,Discussion,False,t3_1k0ubo1,11,False,/r/wallstreetbets/comments/1k0ubo1/dont_see_th...,,False,0.86


In [126]:
# Save the collected data to parquet format
COMMENT_PARQUET_PATH = './data/wallstreetbets-comment-collection.parquet'

# Create a pyarrow schema for the comment data
comment_schema = pyarrow.schema([
    ('parent_post_id', pyarrow.string()),
    ('parent_comment_id', pyarrow.string()),
    ('comment_id', pyarrow.string()),
    ('author', pyarrow.string()),
    ('created_utc', pyarrow.float64()),
    ('score', pyarrow.int64()),
    ('body', pyarrow.string())
])

comment_collection = pd.read_parquet(COMMENT_PARQUET_PATH, engine='pyarrow', schema=comment_schema)
print(f"Comment collection shape: {comment_collection.shape}")

Comment collection shape: (151805, 7)


In [127]:
# Display the first few rows of the comment collection.
display(comment_collection.head())

Unnamed: 0,parent_post_id,parent_comment_id,comment_id,author,created_utc,score,body
0,1jwqbs7,t1_mmq5ys9,mmr2q1q,JazzlikePackage5128,1744474000.0,1,Ty
1,1jwqbs7,t1_mmumxfs,mn0wa66,shmoopdoop6969,1744615000.0,1,why
2,1jwqbs7,t1_mn0gfl4,mnavkz2,diggin-the-doge,1744752000.0,1,I take it all back. Tim Dillon special just re...
3,1jwqbs7,t1_mmxdo0h,mmzwh9l,Hugheston987,1744597000.0,1,![img](emote|t5_2th52|58355)
4,1jwqbs7,t1_mmplbah,mmsxb6r,markHart99,1744496000.0,1,![img](emote|t5_2th52|4258)


## Part 2: Initial Analysis

In [128]:
# Initialize the Sentiment Intensity Analyzer
sia = SentimentIntensityAnalyzer()
stop_words = set(stopwords.words('english'))

# Function to analyze sentiment of a single comment
def analyze_sentiment(comment):
    # Tokenize the comment
    tokens = word_tokenize(comment.lower())

    # Remove stop words
    filtered_tokens = [word for word in tokens if word not in stop_words]

    # Get sentiment scores
    sentiment_scores = sia.polarity_scores(' '.join(filtered_tokens))

    return sentiment_scores

# Function to clean text
def clean_text(text):
    # Remove URLs
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)

    # Remove special characters
    text = re.sub(r'\@\w+|\#', '', text)

    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))

    return text

In [129]:
# Get the first submssion from the collection
submission = submission_collection.iloc[0]
print(f"Submission Details:")
print(f"Dumping submission details: {submission.to_dict()}")
print("-" * 80)

# Show the results of the analysis.
sentiment_scores = analyze_sentiment(submission.title)
print(f"Submission: {submission.title}")
print(f"Sentiment Scores: {sentiment_scores}")
print("-" * 80)

Submission Details:
Dumping submission details: {'title': 'Nivea Along', 'created_utc': 1744831733.0, 'id': '1k0t4jk', 'is_original_content': False, 'link_flair_text': 'YOLO', 'locked': False, 'name': 't3_1k0t4jk', 'num_comments': 5, 'over_18': False, 'permalink': '/r/wallstreetbets/comments/1k0t4jk/nivea_along/', 'selftext': 'After -7% yesterday and -10% today ', 'spoiler': False, 'upvote_ratio': 0.67}
--------------------------------------------------------------------------------
Submission: Nivea Along
Sentiment Scores: {'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}
--------------------------------------------------------------------------------


## Part 3: Sentiment Score for Collection

In [130]:
# Add the sentiment scores to the submission collection.
print("Adding sentiment scores to the submission collection...")
print("Provided the analysis for fields: selftext.")
print("-" * 80)
submission_collection['sentiment'] = submission_collection['selftext'].apply(analyze_sentiment)
print("Sentiment scores added to the submission collection.")
print("-" * 80)
print("Example of the submission collection with sentiment scores:")
display(submission_collection[['title', 'selftext', 'sentiment']].head(10))
print("-" * 80)
print("Parse the sentiment scores into separate columns.")
print("Add four columns to the submission collection.")
submission_collection[['neg', 'neu', 'pos', 'compound']] = submission_collection['sentiment'].apply(pd.Series)
print("Sentiment scores parsed into separate columns.")
print("-" * 80)
print("Add a predicted (ss_) to the sentiment scores columns.")
submission_collection.rename(columns={'neg': 'ss_neg', 'neu': 'ss_neu', 'pos': 'ss_pos', 'compound': 'ss_compound'}, inplace=True)
print("Sentiment scores columns renamed.")
print("-" * 80)
print("Example of the submission collection with sentiment scores parsed into separate columns:")
display(submission_collection[['title', 'selftext', 'ss_neg', 'ss_neu', 'ss_pos']].head(10))

Adding sentiment scores to the submission collection...
Provided the analysis for fields: selftext.
--------------------------------------------------------------------------------
Sentiment scores added to the submission collection.
--------------------------------------------------------------------------------
Example of the submission collection with sentiment scores:


Unnamed: 0,title,selftext,sentiment
0,Nivea Along,After -7% yesterday and -10% today,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound..."
1,Powell to Volatile Stock Market: You’re on You...,,"{'neg': 0.0, 'neu': 0.0, 'pos': 0.0, 'compound..."
2,Super sick timing,,"{'neg': 0.0, 'neu': 0.0, 'pos': 0.0, 'compound..."
3,My second week trading options. SPY BAC and BABA,This is my second—and hopefully last—week trad...,"{'neg': 0.0, 'neu': 0.671, 'pos': 0.329, 'comp..."
4,Don’t see this too often,,"{'neg': 0.0, 'neu': 0.0, 'pos': 0.0, 'compound..."
5,I was about to rope and then +97k,,"{'neg': 0.0, 'neu': 0.0, 'pos': 0.0, 'compound..."
6,Powell says Federal Reserve can wait on any in...,,"{'neg': 0.0, 'neu': 0.0, 'pos': 0.0, 'compound..."
7,Made back the last Wendy’s paycheck I lost,,"{'neg': 0.0, 'neu': 0.0, 'pos': 0.0, 'compound..."
8,"What Are Your Moves Tomorrow, April 17, 2025",This post contains content not supported on ol...,"{'neg': 0.0, 'neu': 0.827, 'pos': 0.173, 'comp..."
9,After market observation. When I finished buyi...,https://preview.redd.it/41ilvj6f39ve1.png?widt...,"{'neg': 0.034, 'neu': 0.882, 'pos': 0.085, 'co..."


--------------------------------------------------------------------------------
Parse the sentiment scores into separate columns.
Add four columns to the submission collection.
Sentiment scores parsed into separate columns.
--------------------------------------------------------------------------------
Add a predicted (ss_) to the sentiment scores columns.
Sentiment scores columns renamed.
--------------------------------------------------------------------------------
Example of the submission collection with sentiment scores parsed into separate columns:


Unnamed: 0,title,selftext,ss_neg,ss_neu,ss_pos
0,Nivea Along,After -7% yesterday and -10% today,0.0,1.0,0.0
1,Powell to Volatile Stock Market: You’re on You...,,0.0,0.0,0.0
2,Super sick timing,,0.0,0.0,0.0
3,My second week trading options. SPY BAC and BABA,This is my second—and hopefully last—week trad...,0.0,0.671,0.329
4,Don’t see this too often,,0.0,0.0,0.0
5,I was about to rope and then +97k,,0.0,0.0,0.0
6,Powell says Federal Reserve can wait on any in...,,0.0,0.0,0.0
7,Made back the last Wendy’s paycheck I lost,,0.0,0.0,0.0
8,"What Are Your Moves Tomorrow, April 17, 2025",This post contains content not supported on ol...,0.0,0.827,0.173
9,After market observation. When I finished buyi...,https://preview.redd.it/41ilvj6f39ve1.png?widt...,0.034,0.882,0.085


In [131]:
# Add the sentiment scores to the submission collection.
print("Adding sentiment scores to the comment collection...")
print("Provided the analysis for fields: body.")
print("-" * 80)
comment_collection['sentiment'] = comment_collection['body'].apply(analyze_sentiment)
print("Sentiment scores added to the comment collection.")
print("-" * 80)
print("Example of the comment collection with sentiment scores:")
display(comment_collection[['body', 'sentiment']].head(10))
print("-" * 80)
print("Parse the sentiment scores into separate columns.")
print("Add four columns to the comment collection.")
comment_collection[['neg', 'neu', 'pos', 'compound']] = comment_collection['sentiment'].apply(pd.Series)
print("Sentiment scores parsed into separate columns.")
print("-" * 80)
print("Add a predicted (ss_) to the sentiment scores columns.")
comment_collection.rename(columns={'neg': 'ss_neg', 'neu': 'ss_neu', 'pos': 'ss_pos', 'compound': 'ss_compound'}, inplace=True)
print("Sentiment scores columns renamed.")
print("-" * 80)
print("Example of the submission collection with sentiment scores parsed into separate columns:")
display(comment_collection[['body', 'ss_neg', 'ss_neu', 'ss_pos']].head(10))

Adding sentiment scores to the comment collection...
Provided the analysis for fields: body.
--------------------------------------------------------------------------------
Sentiment scores added to the comment collection.
--------------------------------------------------------------------------------
Example of the comment collection with sentiment scores:


Unnamed: 0,body,sentiment
0,Ty,"{'neg': 0.0, 'neu': 0.0, 'pos': 1.0, 'compound..."
1,why,"{'neg': 0.0, 'neu': 0.0, 'pos': 0.0, 'compound..."
2,I take it all back. Tim Dillon special just re...,"{'neg': 0.0, 'neu': 0.69, 'pos': 0.31, 'compou..."
3,![img](emote|t5_2th52|58355),"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound..."
4,![img](emote|t5_2th52|4258),"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound..."
5,yes,"{'neg': 0.0, 'neu': 0.0, 'pos': 1.0, 'compound..."
6,Little retards are legitimately in shock ![img...,"{'neg': 0.345, 'neu': 0.655, 'pos': 0.0, 'comp..."
7,Feels like a horrible time to look for a diffe...,"{'neg': 0.424, 'neu': 0.424, 'pos': 0.152, 'co..."
8,AMD and NVDA are down 10% ![img](emote|t5_2th5...,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound..."
9,This is a classic Powell talks and shit hits t...,"{'neg': 0.211, 'neu': 0.409, 'pos': 0.38, 'com..."


--------------------------------------------------------------------------------
Parse the sentiment scores into separate columns.
Add four columns to the comment collection.
Sentiment scores parsed into separate columns.
--------------------------------------------------------------------------------
Add a predicted (ss_) to the sentiment scores columns.
Sentiment scores columns renamed.
--------------------------------------------------------------------------------
Example of the submission collection with sentiment scores parsed into separate columns:


Unnamed: 0,body,ss_neg,ss_neu,ss_pos
0,Ty,0.0,0.0,1.0
1,why,0.0,0.0,0.0
2,I take it all back. Tim Dillon special just re...,0.0,0.69,0.31
3,![img](emote|t5_2th52|58355),0.0,1.0,0.0
4,![img](emote|t5_2th52|4258),0.0,1.0,0.0
5,yes,0.0,0.0,1.0
6,Little retards are legitimately in shock ![img...,0.345,0.655,0.0
7,Feels like a horrible time to look for a diffe...,0.424,0.424,0.152
8,AMD and NVDA are down 10% ![img](emote|t5_2th5...,0.0,1.0,0.0
9,This is a classic Powell talks and shit hits t...,0.211,0.409,0.38


## Part 4: Update Reddit Collection

In [132]:
# Save the collected data to parquet format
COMMENT_PARQUET_PATH = './data/wallstreetbets-comment-collection-wss.parquet'

# Create a pyarrow schema for the comment data
comment_schema = pyarrow.schema([
    ('parent_post_id', pyarrow.string()),
    ('parent_comment_id', pyarrow.string()),
    ('comment_id', pyarrow.string()),
    ('author', pyarrow.string()),
    ('created_utc', pyarrow.float64()),
    ('score', pyarrow.int64()),
    ('body', pyarrow.string()),
    ('ss_neg', pyarrow.float64()),
    ('ss_neu', pyarrow.float64()),
    ('ss_pos', pyarrow.float64()),
    ('ss_compound', pyarrow.float64())
])

# Save the collected data to parquet format
comment_collection.to_parquet(COMMENT_PARQUET_PATH, engine='pyarrow', schema=comment_schema)

In [133]:
# Save the collected data to parquet format
SUBMISSION_PARQUET_PATH = './data/wallstreetbets-collection-wss.parquet'

# Create a pyarrow schema for the data types.
submission_schema = pyarrow.schema([
    ('title', pyarrow.string()),
    ('created_utc', pyarrow.float64()),
    ('id', pyarrow.string()),
    ('is_original_content', pyarrow.bool_()),
    ('link_flair_text', pyarrow.string()),
    ('locked', pyarrow.bool_()),
    ('name', pyarrow.string()),
    ('num_comments', pyarrow.int64()),
    ('over_18', pyarrow.bool_()),
    ('permalink', pyarrow.string()),
    ('selftext', pyarrow.string()),
    ('spoiler', pyarrow.bool_()),
    ('upvote_ratio', pyarrow.float64()),
    ('ss_neg', pyarrow.float64()),
    ('ss_neu', pyarrow.float64()),
    ('ss_pos', pyarrow.float64()),
    ('ss_compound', pyarrow.float64())
])

# Save the collected data to parquet format
submission_collection.to_parquet(SUBMISSION_PARQUET_PATH, engine='pyarrow', schema=submission_schema)