In [3]:
pip install pandas openpyxl




In [9]:
import json
import math  # For NaN checks
import os  # For reading API credentials from the environment
import re  # For regex extraction
from datetime import datetime, timezone, timedelta

import praw
import yfinance as yf  # For fetching stock data
from textblob import TextBlob  # For sentiment analysis

# Step 1: Reddit Authentication
# Credentials are read from the environment so that secrets never live in the
# notebook (or in its version-control history). Export REDDIT_CLIENT_ID and
# REDDIT_CLIENT_SECRET before starting the kernel; a missing variable raises
# KeyError immediately instead of failing later with an opaque auth error.
# NOTE: any key that was ever committed in plain text should be revoked and
# regenerated in the Reddit app settings.
reddit = praw.Reddit(
    client_id=os.environ['REDDIT_CLIENT_ID'],
    client_secret=os.environ['REDDIT_CLIENT_SECRET'],
    user_agent=os.environ.get('REDDIT_USER_AGENT', 'StockScrapper/1.0 by WaitBrilliant1758'),
)

# Step 2: Define the Subreddit and Query
subreddit_name = 'stocks'  # Target subreddit
stock_symbols = [  # Stock symbols to track
    'TSLA', 'AAPL', 'AMZN', 'GOOG', 'MSFT',
    'JPM', 'BAC', 'GS', 'V', 'MA',
    'JNJ', 'PFE', 'MRNA', 'UNH',
    'KO', 'PEP', 'PG',
]
limit = 100  # Maximum number of posts to fetch per symbol search

# Step 3: Function to fetch and cache stock data
def get_stock_data_cached(stock_symbol, start_date, end_date):
    """
    Download the price history of one ticker for a date range.

    The function itself keeps no cache; the caller stores the returned frame
    and reuses it for every post about the symbol.

    Args:
        stock_symbol: Ticker symbol passed to yfinance.
        start_date: Start of the range; formatted as YYYY-MM-DD for the request.
        end_date: End of the range; formatted as YYYY-MM-DD for the request.

    Returns:
        The history returned by yfinance, or None when the result is empty or
        the request raised an exception (a message is printed in both cases).
    """
    ticker = yf.Ticker(stock_symbol)
    day_format = '%Y-%m-%d'
    try:
        history = ticker.history(
            start=start_date.strftime(day_format),
            end=end_date.strftime(day_format),
        )
        if not history.empty:
            return history
        print(f"No data available for {stock_symbol} during this period.")
    except Exception as err:
        # Best effort: one failing symbol must not abort the whole scrape.
        print(f"Error fetching data for {stock_symbol}: {err}")
    return None

# Step 4: Function to get stock price and change for a specific date
def get_stock_price_for_date(stock_info, post_date):
    """
    Get the closing price and daily percentage change nearest to a post date.

    Args:
        stock_info: DataFrame with a datetime index and a 'Close' column, or
            None when no price history is available.
        post_date: datetime of the post. It may be timezone-aware or naive;
            it is aligned with the index before comparison (a naive value is
            assumed to be UTC - confirm against the caller if that changes).

    Returns:
        A (stock_price, price_change) tuple for the row whose index is closest
        to post_date. price_change is the percentage change of 'Close' versus
        the previous row and is NaN for the first row. Returns (None, None)
        when stock_info is None/empty or when any error occurs.
    """
    try:
        if stock_info is None or stock_info.empty:
            return None, None

        available_dates = stock_info.index

        # Subtracting a tz-aware datetime from a tz-naive index (or vice versa)
        # raises TypeError, which previously turned every lookup into
        # (None, None). Bring both sides to the same awareness first.
        target = post_date
        index_is_aware = getattr(available_dates, 'tz', None) is not None
        if target.tzinfo is not None and not index_is_aware:
            target = target.astimezone(timezone.utc).replace(tzinfo=None)
        elif target.tzinfo is None and index_is_aware:
            target = target.replace(tzinfo=timezone.utc)

        # Vectorised nearest-row search; argmin returns the first minimum,
        # matching the tie-breaking of a plain min() over the index.
        nearest_position = abs(available_dates - target).argmin()

        # Positional access stays scalar even if the index has duplicate labels.
        closes = stock_info['Close']
        stock_price = closes.iloc[nearest_position]
        price_change = closes.pct_change().iloc[nearest_position] * 100

        return stock_price, price_change
    except Exception as e:
        print(f"Error finding stock price for date {post_date}: {e}")
        return None, None

# Step 5: Function to extract stock price and change from text
def extract_price_and_change_from_text(text):
    """
    Extract the first dollar price and the first percentage change from text.

    A price is a number written with a leading dollar sign, optionally with
    thousands separators and decimals ("$1000", "$3,000", "$12.50"). Bare
    numbers (years, share counts, percentages) are deliberately not treated
    as prices, and abbreviations such as "$1T" are skipped. A change is a
    signed or unsigned number immediately followed by '%' ("5%", "-2.5%").

    Args:
        text: Post text to scan.

    Returns:
        A (price, change) tuple of floats; either element is None when no
        matching value is found.
    """
    # The whole number sits in ONE capturing group and the inner groups are
    # non-capturing. With the earlier `(\.\d+)?` group, re.findall returned
    # only the fractional part, so "5.5%" yielded 0.5 and integers never matched.
    price_pattern = r'\$(\d+(?:,\d{3})*(?:\.\d+)?)\b'
    # A look-behind replaces the leading \b, which could never match in front
    # of a sign character and therefore silently dropped the minus sign.
    change_pattern = r'(?<![\w.])([-+]?\d+(?:\.\d+)?)%'

    price_match = re.search(price_pattern, text)
    change_match = re.search(change_pattern, text)

    price = float(price_match.group(1).replace(',', '')) if price_match else None
    change = float(change_match.group(1)) if change_match else None
    return price, change

# Step 6: Main logic to scrape data
data = []  # One dict per (symbol, post) pair; serialised to JSON afterwards
subreddit = reddit.subreddit(subreddit_name)

# Fetch stock data for all symbols once (last 5 years) so that every post
# about a symbol reuses the same price history instead of re-downloading it.
end_date = datetime.today()
start_date = end_date - timedelta(days=5 * 365)
cached_stock_data = {
    symbol: get_stock_data_cached(symbol, start_date, end_date)
    for symbol in stock_symbols
}

for symbol in stock_symbols:
    # Count the ticker only as a standalone token (case-insensitive, "$TSLA"
    # included). Plain substring counting made short tickers such as "V",
    # "MA" or "GS" match inside ordinary words and inflated the counts.
    mention_pattern = re.compile(rf'(?<!\w){re.escape(symbol)}(?!\w)', re.IGNORECASE)
    stock_info = cached_stock_data.get(symbol)

    for post in subreddit.search(symbol, limit=limit):
        post_date = datetime.fromtimestamp(post.created_utc, timezone.utc)

        # Title plus body (when the post has one); built once and reused.
        text_content = post.title + " " + post.selftext if post.selftext else post.title

        sentiment_score = TextBlob(text_content).sentiment.polarity
        sentiment = "Positive" if sentiment_score > 0 else "Negative" if sentiment_score < 0 else "Neutral"

        mention_count = len(mention_pattern.findall(text_content))
        extracted_price, extracted_change = extract_price_and_change_from_text(text_content)

        # Fall back to market data unless the post states BOTH values itself
        if extracted_price is None or extracted_change is None:
            extracted_price, extracted_change = get_stock_price_for_date(stock_info, post_date)

        data.append({
            "post_id": post.id,
            "post_text": text_content,
            "post_date": post_date.strftime('%Y-%m-%d %H:%M:%S'),
            "user_handle": post.author.name if post.author else "Unknown",
            "sentiment": sentiment,
            "sentiment_score": sentiment_score,
            "mention_count": mention_count,
            "stock_symbol": symbol,
            # NaN is not valid JSON and None means "no data": both become a marker string
            "stock_price": extracted_price if extracted_price is not None and not math.isnan(extracted_price) else "Unavailable",
            "price_change": extracted_change if extracted_change is not None and not math.isnan(extracted_change) else "Unavailable"
        })


# Step 7: Save Data to JSON
# Write all collected records as pretty-printed JSON in the working directory.
output_file = "reddit_stock_data_final.json"
with open(output_file, "w") as file:
    file.write(json.dumps(data, indent=4))

print(f"Data saved to {output_file}")


Data saved to reddit_stock_data_final.json


In [3]:
import pandas as pd

# Load the JSON file this notebook writes into the working directory. A
# relative path keeps the notebook runnable on any machine; the previous
# value pointed at a browser download inside one user's home folder.
file_path = "reddit_stock_data_final.json"
# The name `data` is kept as-is in case later cells rely on it - TODO confirm
# and consider a more specific name such as `posts_df`.
data = pd.read_json(file_path)
data

Unnamed: 0,post_id,post_text,post_date,user_handle,sentiment,sentiment_score,mention_count,stock_symbol,stock_price,price_change
0,qfmugy,TSLA hits $1000. Makes it a first $1T Auto com...,2021-10-25 18:16:12,daynightcase,Positive,0.284375,3,TSLA,339.476654,-0.627405
1,vxi1zm,Was the TWTR bid by Elon just a way to hide a ...,2022-07-12 18:28:34,phatelectribe,Positive,0.024511,2,TSLA,237.039993,1.703345
2,qowk4z,The absurdity of Elon Musk's poll to decide to...,2021-11-07 20:03:49,vytasmike,Positive,0.025000,2,TSLA,387.646667,-4.840071
3,h0ehgw,"Every day I don't buy TSLA, is a day I wish I ...",2020-06-10 16:37:29,Estate4reaL,Positive,0.500000,2,TSLA,64.856003,-5.09341
4,pjng0j,"Elon Musk says Tesla (TSLA) is worth $3,000 a ...",2021-09-07 13:50:26,rugerapatt,Positive,0.199210,4,TSLA,250.973328,2.637784
...,...,...,...,...,...,...,...,...,...,...
1695,w5kbvd,Keep or Trade Inherited Stocks My husband and ...,2022-07-22 20:20:38,EColli93,Positive,0.320000,1,PG,135.132065,1.598336
1696,y9jbmq,Will rates have to rise to equal the rate of i...,2022-10-21 04:06:55,AmericanSahara,Positive,0.060000,1,PG,122.350662,1.252068
1697,u1qljf,Recession stocks with opportunity for scale? I...,2022-04-12 04:36:48,r2002,Positive,0.057177,0,PG,148.438324,-0.300971
1698,pihj57,If you like to swing-trade here's a list of st...,2021-09-05 17:39:36,LegendaryHODLer,Positive,0.208333,1,PG,132.428421,-0.374884
