In [None]:
# Main .py
!pip install pandas_market_calendars
!pip install praw
!pip install vaderSentiment
import praw
import pandas as pd
from datetime import datetime, timedelta
import json
import pandas_market_calendars as mcal
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer


In [None]:


# Top 15 S&P tickers, currently ranked by weight. Some tickers (e.g. "T", "V", "GS",
# "GE", "LLY", "NOW", "DIS") were removed because scraping comments for them
# returned incorrect pairings.
# Data wasn't found for all 50 tickers.
# Symbols tracked by this notebook; the trailing comment names the company.
tickers = [
    'NVDA',   # Nvidia Corp.
    'AAPL',   # Apple
    'COST',   # Costco Wholesale Corp.
    'TSLA',   # Tesla, Inc.
    'GOOGL',  # Alphabet Inc. Class A
    'PM',     # Philip Morris International Inc.
    'MSFT',   # Microsoft
    'META',   # Meta Platforms, Inc. Class A
    'KO',     # Coca-Cola Company
    'AMZN',   # Amazon.com Inc.
    'PG',     # Procter & Gamble Company
    'BRK.B',  # Berkshire Hathaway Class B
    'HD',     # Home Depot, Inc.
    'GOOG',   # Alphabet Inc. Class C
    'PLTR',   # Palantir
]

# Lower-case search terms (the symbol itself plus company aliases) per ticker.
# Key order matters: match_ticker walks this dict in insertion order and
# returns the first ticker whose terms are found.
ticker_to_company = {
    'AAPL': ['aapl', 'apple'],
    'MSFT': ['msft', 'microsoft'],
    'NVDA': ['nvda', 'nvidia'],
    'AMZN': ['amzn', 'amazon'],
    'META': ['meta', 'facebook', 'meta platforms'],
    'BRK.B': ['brk.b', 'berkshire', 'buffett', 'berkshire hathaway'],
    'GOOGL': ['googl', 'alphabet', 'google'],
    'GOOG': ['goog', 'alphabet', 'google'],
    'TSLA': ['tsla', 'tesla', 'elon'],
    'COST': ['cost', 'costco', 'costco wholesale'],
    'PG': ['pg', 'procter', 'procter and gamble'],
    'HD': ['hd', 'home depot'],
    'KO': ['ko', 'coca cola', 'coke'],
    'PM': ['pm', 'philip morris'],
    'PLTR': ['pltr', 'palantir'],
}






In [None]:
# Initialize the Reddit API client using credentials taken from the environment.
def initialize_reddit_client():
    """Create the PRAW ``Reddit`` client used by the scraper.

    The API credentials are read from the ``REDDIT_CLIENT_ID`` and
    ``REDDIT_CLIENT_SECRET`` environment variables instead of being written
    into the notebook, so sharing or committing the notebook does not leak
    them. Any secret that was previously stored in this cell should be
    revoked and regenerated.

    Returns:
        The configured ``praw.Reddit`` instance, or ``None`` when either
        environment variable is unset or empty (a message naming the missing
        variables is printed). Callers in this notebook already treat a
        falsy client as "not initialized".
    """
    import os  # local import so this cell does not depend on the import cell

    credentials = {
        "REDDIT_CLIENT_ID": os.environ.get("REDDIT_CLIENT_ID"),
        "REDDIT_CLIENT_SECRET": os.environ.get("REDDIT_CLIENT_SECRET"),
    }
    missing = [name for name, value in credentials.items() if not value]
    if missing:
        print(f"Reddit credentials not configured; set {', '.join(missing)}.")
        return None

    reddit = praw.Reddit(
        client_id=credentials["REDDIT_CLIENT_ID"],
        client_secret=credentials["REDDIT_CLIENT_SECRET"],
        user_agent="stock-volatility-sentiment-scraper by u/Osamabeenliftin69/Osamabeenliftin69",
    )
    print("Logged in.")
    return reddit

In [None]:
import pandas as pd
from datetime import datetime, timedelta
import concurrent.futures

def match_ticker(comment, mapping):
    """
    Return the first ticker in ``mapping`` that is mentioned in ``comment``.

    A ticker counts as mentioned when its symbol, or any of its associated
    search terms, occurs in the comment as a whole word or phrase, ignoring
    case. Whole-word matching stops short terms from firing inside unrelated
    words (e.g. "pg" in "upgrade", "pm" in "development", "cost" in "costs"),
    while still accepting forms such as "$AAPL" or "Apple's".

    Args:
        comment: Comment text to search. ``None`` or an empty string never
            matches.
        mapping: Dict of ticker symbol -> search term, or list/tuple of search
            terms. Entries are tried in the dict's iteration order, so
            earlier entries take priority.

    Returns:
        The matching key exactly as it appears in ``mapping``, or ``None``
        when nothing matches.
    """
    import re  # local import so this cell does not depend on the import cell

    if not comment:
        return None

    for ticker, company in mapping.items():
        # A mapping value may be a single term or a collection of terms.
        terms = list(company) if isinstance(company, (list, tuple)) else [company]
        for term in [ticker, *terms]:
            if not term:
                # An empty term would otherwise match every comment.
                continue
            # Lookarounds rather than \b so that symbols containing
            # punctuation (e.g. "BRK.B") are still bounded correctly.
            pattern = r"(?<!\w)" + re.escape(term) + r"(?!\w)"
            if re.search(pattern, comment, flags=re.IGNORECASE):
                return ticker

    return None

def process_post(post, two_years_ago, tickers):
    """Collect the ticker-matching comments of one Reddit submission.

    Args:
        post: Submission object exposing ``created_utc``, ``title`` and a
            ``comments`` forest.
        two_years_ago: Cut-off ``datetime``; the post and any comment older
            than this are ignored.
        tickers: Accepted for interface compatibility but not read here;
            matching uses the module-level ``ticker_to_company`` mapping.

    Returns:
        A list with one dict per matched comment (keys: platform, ticker,
        post_title, comment, comment_date, comment_ups, comment_score).
        The list is empty when the post is too old, is not a discussion
        thread, or has no matching comments.
    """
    posted_at = datetime.fromtimestamp(post.created_utc)

    # Guard 1: the submission itself is older than the cut-off.
    if posted_at < two_years_ago:
        return []

    # Guard 2: the title must contain one of the discussion-thread phrases.
    wanted_phrases = ("daily discussion", "stock discussion", "market discussion")
    is_discussion = False
    for phrase in wanted_phrases:
        if phrase in post.title.lower():
            is_discussion = True
            break
    if not is_discussion:
        return []

    print(f"Processing post: {post.title} on {posted_at.date()}")

    # NOTE(review): per the PRAW docs, limit=0 appears to drop the unloaded
    # "more comments" placeholders rather than fetch them - confirm whether a
    # full expansion was intended.
    post.comments.replace_more(limit=0)

    rows = []
    for item in post.comments.list():
        # Entries without a creation timestamp cannot be dated; skip them.
        if not hasattr(item, "created_utc"):
            continue

        written_at = datetime.fromtimestamp(item.created_utc)
        # Drop comments before the cut-off and those written on Sat/Sun.
        if written_at < two_years_ago or written_at.weekday() >= 5:
            continue

        body = item.body
        # Only the first matching ticker is recorded for a comment.
        symbol = match_ticker(body, ticker_to_company)
        if not symbol:
            continue

        rows.append({
            "platform": "reddit",
            "ticker": symbol,
            "post_title": post.title,
            "comment": body,
            "comment_date": written_at.date(),
            "comment_ups": getattr(item, "ups", None),
            "comment_score": getattr(item, "score", None)
        })

    return rows

def scrape_reddit_data(reddit, tickers, limit, lookback_days=437, max_workers=4):
    """
    Search discussion threads in several subreddits and collect matching comments.

    Each subreddit is searched for discussion-style posts, and every post is
    handed to ``process_post`` on a thread pool. A failure while searching one
    subreddit or processing one post is reported and skipped, so rows that
    were already collected are still returned instead of being discarded.

    Args:
        reddit: PRAW ``Reddit`` client, or ``None``.
        tickers: Passed through unchanged to ``process_post``.
        limit: Maximum number of search results requested per subreddit.
        lookback_days: Posts/comments older than this many days are ignored.
            Defaults to 437, the window the notebook has always used.
        max_workers: Number of worker threads used to process posts.
            NOTE(review): PRAW's documentation describes ``Reddit`` instances
            as not thread-safe; pass ``max_workers=1`` to process posts one
            at a time if concurrent access causes problems.

    Returns:
        A DataFrame with one row per matched comment. It is empty when the
        client is ``None`` or nothing was collected.
    """
    if reddit is None:
        print("Reddit client is not initialized.")
        return pd.DataFrame()

    results = []
    cutoff = datetime.now() - timedelta(days=lookback_days)
    subreddits = ['wallstreetbets', 'stocks', 'investing']

    # Expanded query to capture more than just "Daily Discussion"
    query = '("Daily Discussion" OR "Stock Discussion" OR "Market Discussion" OR "DD")'

    for subreddit_name in subreddits:
        try:
            subreddit = reddit.subreddit(subreddit_name)
            print(f"Searching in r/{subreddit_name}...")
            posts = list(subreddit.search(query, limit=limit, time_filter='all'))
        except Exception as e:
            # A failed search only costs this subreddit, not the whole run.
            print(f"Error scraping Reddit data: {e}")
            continue

        # Use a ThreadPoolExecutor to process posts concurrently
        with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
            futures = [executor.submit(process_post, post, cutoff, tickers) for post in posts]
            for future in concurrent.futures.as_completed(futures):
                try:
                    results.extend(future.result())
                except Exception as e:
                    # Keep what the other posts produced; report and move on.
                    print(f"Error scraping Reddit data: {e}")

    return pd.DataFrame(results)



In [None]:
# Build the client, then scrape up to 1000 search results per subreddit.
reddit_client = initialize_reddit_client()

# `df` is only assigned when a client was created.
if reddit_client:
    df = scrape_reddit_data(reddit_client, tickers, limit=1000)
    if df.empty:
        print("No posts or comments found.")
    else:
        # Quick look at the result: first rows, total cell count, (rows, cols).
        for preview in (df.head(), df.size, df.shape):
            print(preview)

In [None]:
# 1. Ticker mentions per day: number of matched comments on each comment date
tickers_per_day = df.groupby("comment_date")["ticker"].count().reset_index(name="ticker_mentions")
print("Ticker Mentions Per Day:")
print(tickers_per_day.to_string(index=False))

# 2. Analyze Number of Different Tickers Total
# Count the number of unique tickers that appear in the dataset
unique_ticker_count = df["ticker"].nunique()
unique_tickers = df["ticker"].unique()
print("\nTotal Number of Different Tickers:", unique_ticker_count)
print("List of Unique Tickers:")
print(unique_tickers)


# 3. Additional Analysis: Count of Comments Per Ticker
comments_per_ticker = df.groupby("ticker")["comment"].count().reset_index(name="comment_count")
print("\nComment Count per Ticker:")
print(comments_per_ticker)

# 4. Spot-check the raw comments matched to one ticker.
# The ticker must be a key of ticker_to_company; a symbol outside that mapping
# (such as 'CRWD') is never assigned to a comment, so the filter would be empty.
testticker = 'NVDA'
preview_count = 20  # single source of truth for both the label and the slice
filtered_df = df[df['ticker'] == testticker]
# Show full comment text instead of pandas' truncated column display.
pd.set_option('display.max_colwidth', None)
print(f"First {preview_count} comments for {testticker}:")
print(filtered_df['comment'].head(preview_count))

In [None]:
# VADER analyzer used to score every scraped comment.
sia = SentimentIntensityAnalyzer()

# One compound sentiment score per comment.
df["sentiment"] = df["comment"].apply(lambda text: sia.polarity_scores(text)["compound"])

# Normalise comment_date to pandas datetimes, then keep a plain-date copy
# in "date" for grouping.
df["comment_date"] = pd.to_datetime(df["comment_date"])
df["date"] = df["comment_date"].dt.date

# Per (date, ticker): how many comments there were and their mean sentiment.
per_day_and_ticker = df.groupby(["date", "ticker"])
daily_summary = per_day_and_ticker.agg(
    comment_volume=("ticker", "size"),
    average_sentiment_score=("sentiment", "mean"),
).reset_index()

print(daily_summary.shape)
print(daily_summary.head(100))

# Number of distinct days with data for each ticker, most-covered first.
days_per_ticker = daily_summary.groupby("ticker").size().sort_values(ascending=False)
print(days_per_ticker)


In [None]:
# Persist the per-date, per-ticker summary as CSV (relative path, so it lands
# in the current working directory); index=False leaves the row index out.
daily_summary.to_csv('daily_reddit_summaryasdf.csv', index=False)