## Part 1: Setup

### Install Packages

In [4]:
# Install the packages if they don't exist.
!pip install praw
!pip install pyarrow
!pip install python-dotenv

# Import Packages
import string
import praw, time, os, pyarrow
from IPython.display import display
from dotenv import load_dotenv, dotenv_values
from requests import Session
import pandas as pd
from IPython import get_ipython

# Read the local .env file: load its entries into the process environment
# and keep a dictionary of the parsed values in `config`.
load_dotenv(dotenv_path='.env')
config = dotenv_values()

# Get config from colab or other environment.
def is_colab():
    """Return True when the active IPython shell class comes from Google Colab."""
    shell_module = type(get_ipython()).__module__
    return shell_module == "google.colab._shell"

# Names of the secrets read from Colab's user data store.
COLAB_SECRET_NAMES = ('CLIENT_SECRET', 'CLIENT_ID', 'NAME',
                      'REDIRECT_URI', 'USERNAME', 'PASSWORD')

# Keys that the Reddit login step reads from `config`.
REDDIT_CREDENTIAL_KEYS = ('CLIENT_ID', 'CLIENT_SECRET', 'USERNAME', 'PASSWORD')

if is_colab():
    # On Colab the credentials come from the notebook's user data store.
    from google.colab import userdata
    config = {name: userdata.get(name) for name in COLAB_SECRET_NAMES}
else:
    # Everywhere else the credentials come from a local .env file.
    load_dotenv('.env')
    config = dotenv_values()

# Report absent credentials here, next to where they are loaded, instead of
# letting them surface later as a bare KeyError during the Reddit login.
missing_keys = [key for key in REDDIT_CREDENTIAL_KEYS if not config.get(key)]
if missing_keys:
    print(f"Warning: missing configuration values: {', '.join(missing_keys)}. "
          "Add them to your .env file (or Colab secrets) before connecting to Reddit.")




## Part 2: Collecting Data from Reddit

### Open Reddit Connection

In [5]:
# Maximum number of seconds to wait for a single Reddit API response.
REQUEST_TIMEOUT_SECONDS = 10

# Create a custom HTTP session for PRAW to use.
# NOTE: requests sessions have no session-wide timeout setting, so assigning
# `session.timeout` has no effect. The timeout is passed to PRAW instead.
session = Session()
session.headers.update({'User-Agent': 'praw'})

# Login to Reddit using PRAW
reddit = praw.Reddit(
    client_id=config['CLIENT_ID'],
    client_secret=config['CLIENT_SECRET'],
    requestor_kwargs={"session": session},
    username=config['USERNAME'],
    password=config['PASSWORD'],
    user_agent="CS470 ML Project Access by u/GregorybLafayetteML",
    timeout=REQUEST_TIMEOUT_SECONDS,
)

# Add some peripheral config data
reddit.config.log_requests = 1
reddit.config.store_json_result = True

# Test the connection: asking for the logged-in user forces an authenticated request.
try:
    username = reddit.user.me()
    print("Successfully logged in to Reddit!")
    print(f"Logged in as: u/{username}")
except Exception as e:
    # Best-effort check: report the failure but let the notebook keep running.
    print(f"Failed to log in: {e}")

KeyError: 'CLIENT_ID'

### Accessing Reddit Data

To access Reddit posts, we need to send a request specifying the number of posts we want. The following example finds the 10 hottest posts on the r/wallstreetbets subreddit. We'll show the post title, score, flair, and URL.

In [None]:
# Request the ten hottest submissions and print a one-line summary of each.
top_posts = reddit.subreddit('wallstreetbets').hot(limit=10)
print("Top 10 hot posts from r/wallstreetbets:")
for post in top_posts:
    summary = f"Title: {post.title}, Score: {post.score}, Flair: {post.link_flair_text}, URL: {post.url}"
    print(summary)

For this project, we'll need far more than ten posts at a time. The Reddit API will limit our access to 100 posts at a time. Fortunately, the API uses a ListingGenerator, which allows us to access our metered connection in sequential blocks. The following example shows how we can utilize this behavior, grabbing blocks of 100 posts at a time. In our example, we'll grab blocks of posts until we reach 5000 posts or our access times out. Notice that the procedure ends early with around 750-800 posts collected. The results are sparse because our connection either timed out or was metered down by Reddit. The latter option is more likely.

In [None]:
# Access the subreddit
subreddit = reddit.subreddit("wallstreetbets")

# Pagination settings and running state
batch_size = 50      # posts requested per call
total_posts = 5000   # stop once this many posts have been collected
all_posts = []       # every submission retrieved so far
after = None         # fullname of the last submission seen (pagination cursor)

# Keep requesting pages until the target is reached or a page comes back empty.
while len(all_posts) < total_posts:
    # Request the next page, starting just after the cursor.
    submissions = subreddit.new(limit=batch_size, params={"after": after})
    batch_posts = list(submissions)

    # An empty page means there is nothing left to fetch.
    if not batch_posts:
        print("No more posts to fetch.")
        break

    # Store the page and advance the cursor to its last submission.
    all_posts.extend(batch_posts)
    after = batch_posts[-1].fullname

    # Pause between requests to stay clear of rate limits.
    time.sleep(5)

# Report how many posts were collected overall.
print(f"Fetched {len(all_posts)} posts in total.")

Now that we have collected a large portion of posts/submissions, we'll parse the results and construct a dataframe with this data. We're going to collect more fields than we might need right now, to avoid data limitations in the future.

In [None]:
# Attributes to copy out of each collected submission object.
fields = (
    'title',
    'created_utc',
    'id',
    'is_original_content',
    'link_flair_text',
    'locked',
    'name',
    'num_comments',
    'over_18',
    'permalink',
    'selftext',
    'spoiler',
    'upvote_ratio',
)

# Reduce every submission to a plain dictionary holding only the listed fields.
list_of_submissions = [
    {field: attributes[field] for field in fields}
    for attributes in map(vars, all_posts)
]

# Turn the records into a dataframe and show it.
collected_data = pd.DataFrame.from_records(list_of_submissions)
display(collected_data)

### Saving Reddit Data

In [None]:
# Save the collected data to parquet format
PARQUET_PATH = './data/wallstreetbets-collection.parquet'

# Create a pyarrow schema for the data types.
schema = pyarrow.schema([
    ('title', pyarrow.string()),
    ('created_utc', pyarrow.float64()),
    ('id', pyarrow.string()),
    ('is_original_content', pyarrow.bool_()),
    ('link_flair_text', pyarrow.string()),
    ('locked', pyarrow.bool_()),
    ('name', pyarrow.string()),
    ('num_comments', pyarrow.int64()),
    ('over_18', pyarrow.bool_()),
    ('permalink', pyarrow.string()),
    ('selftext', pyarrow.string()),
    ('spoiler', pyarrow.bool_()),
    ('upvote_ratio', pyarrow.float64()),
])

# Make sure the target directory exists; writing the parquet file fails otherwise.
os.makedirs(os.path.dirname(PARQUET_PATH), exist_ok=True)

# If the parquet file does not exist, create it.
if not os.path.exists(PARQUET_PATH):
    collected_data.to_parquet(PARQUET_PATH, engine='pyarrow', schema=schema)

# If the data file already exists, merge the new data with the existing one.
# On duplicate keys the most recently collected row wins (keep='last').
else:
    old_parquet = pd.read_parquet(PARQUET_PATH, engine='pyarrow', schema=schema)
    new_parquet = pd.concat([old_parquet, collected_data])
    new_parquet = new_parquet.drop_duplicates(subset=['id','title','created_utc','name','permalink'], keep='last').reset_index(drop=True)
    new_parquet.to_parquet(PARQUET_PATH, engine='pyarrow', schema=schema)

# Reload the full stored collection; the comment-gathering steps work from it.
submission_collection = pd.read_parquet(PARQUET_PATH, engine='pyarrow', schema=schema)
display(submission_collection)

## Part _: Getting Comment Data

### Creating a database of reddit threads

In [None]:
# Use the same methodology which we used to collect submissions, but each
# record also carries the parent submission id and the parent comment id.
# Comment trees can be very large, so only a bounded number of "load more
# comments" placeholders are expanded per submission. This may still be a
# lot more comments than we need for larger discussions.
def extract_comments_for(submission_id: str):
    """Fetch the comments of one submission as a list of flat dictionaries.

    Args:
        submission_id: Reddit id of the submission whose comments are wanted.

    Returns:
        A list with one dict per kept comment, holding the keys
        'parent_post_id', 'parent_comment_id', 'comment_id', 'author',
        'created_utc', 'score' and 'body'. An empty list is returned when
        anything goes wrong while fetching (the error is printed, not raised).
    """
    # NOTE: It looks like the top comment may be a user report. Comments
    # containing this marker text are ignored.
    SKIPTEXT = '**User Report**'

    try:
        submission = reddit.submission(id=submission_id)

        # `limit` caps how many MoreComments placeholders get expanded (each
        # expansion is one extra API request); it is not a depth limit.
        submission.comments.replace_more(limit=10)
        comments = []

        for comment in submission.comments.list():
            # Defensive: skip any placeholder that is still in the flattened list.
            if isinstance(comment, praw.models.MoreComments):
                continue

            if SKIPTEXT in comment.body:
                continue

            # Append the comment data to the list
            comments.append({
                'parent_post_id': submission_id,
                'parent_comment_id': comment.parent_id,
                'comment_id': comment.id,
                # str() turns a missing author (None) into the text 'None'.
                'author': str(comment.author),
                'created_utc': comment.created_utc,
                'score': comment.score,
                'body': comment.body
            })

        return comments

    except Exception as e:
        # Report the HTTP status code when the exception carries a response.
        response = getattr(e, 'response', None)
        if response is not None:
            print(f"HTTP Error {response.status_code} while fetching comments for submission {submission_id}")

        # Print an error message and return nothing, so that one failing
        # submission does not abort a whole collection run.
        print(f"Error fetching comments for submission {submission_id}: {e}")
        return []



In [None]:
# Use the first stored submission as a worked example.
submission_id = submission_collection['id'].iloc[0]

# Look the submission up so its reported comment count can be compared
# against what the extractor returns.
submission = reddit.submission(id=submission_id)
print(
    f"Submission ID: {submission_id}",
    f"Title: {submission.title}",
    f"Number of comments: {submission.num_comments}",
    sep="\n",
)

# Extract the comments and show them as a dataframe.
results = extract_comments_for(submission_id)
comments_df = pd.DataFrame(results)
display(comments_df)

In [None]:
# Column types used when the comment records are stored as parquet.
schema = pyarrow.schema([
    ('parent_post_id', pyarrow.string()),
    ('parent_comment_id', pyarrow.string()),
    ('comment_id', pyarrow.string()),
    ('author', pyarrow.string()),
    ('created_utc', pyarrow.float64()),
    ('score', pyarrow.int64()),
    ('body', pyarrow.string()),
])

# Walk over every stored submission id and gather its comments,
# pausing briefly between submissions to stay clear of rate limits.
all_comments = []
for post_id in submission_collection['id']:
    all_comments += extract_comments_for(post_id)
    time.sleep(1)

# Build one dataframe from all gathered comment records and show it.
comments_df = pd.DataFrame.from_records(all_comments)
display(comments_df)

In [None]:
# Save the parquet file.
PARQUET_PATH = './data/wallstreetbets-comment-collection.parquet'

# Make sure the target directory exists; writing the parquet file fails otherwise.
os.makedirs(os.path.dirname(PARQUET_PATH), exist_ok=True)

# Write the comments to the parquet file. If it exists, merge with its contents.
if not os.path.exists(PARQUET_PATH):
    comments_df.to_parquet(PARQUET_PATH, engine='pyarrow', schema=schema)
else:
    old_parquet = pd.read_parquet(PARQUET_PATH, engine='pyarrow', schema=schema)
    new_parquet = pd.concat([old_parquet, comments_df])
    # De-duplicate on the comment's own id. Keying on author/parent/timestamp
    # stores a comment twice when its author string changes between runs, and
    # can merge distinct comments that share those values.
    # keep='last' retains the most recently collected version of each comment.
    new_parquet = new_parquet.drop_duplicates(subset=['comment_id'], keep='last').reset_index(drop=True)
    new_parquet.to_parquet(PARQUET_PATH, engine='pyarrow', schema=schema)

## Part _: Analysis of Reddit Data

## Part _: Sentiment Analysis

### Setup Tools

In [None]:
# Install packages
!pip install nltk
!pip install re

# Import packages
import nltk, re
nltk.download(['punkt',
               'punkt_tab',
               'stopwords',
               'vader_lexicon',
               'names',
               'averaged_perceptron_tagger',
               'wordnet'])

from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

### Create some basic tools.

In [None]:
# Shared sentiment scorer and the set of English stop words.
sia = SentimentIntensityAnalyzer()
stop_words = {*stopwords.words('english')}

# Function to analyze sentiment of a single comment
def analyze_sentiment(comment, keep_words=("no", "nor", "not")):
    """Score the sentiment of a piece of text after stop-word filtering.

    Args:
        comment: The text to score.
        keep_words: Lower-case words that are never filtered out, even when
            they appear in the stop-word list. The default keeps the negation
            words, because dropping them turns "not good" into "good" and
            flips the resulting polarity.

    Returns:
        The dictionary of scores produced by `sia.polarity_scores`.
    """
    # Tokenize the lower-cased comment so tokens compare against the stop words.
    tokens = word_tokenize(comment.lower())

    # Remove stop words, except the ones explicitly preserved.
    preserved = set(keep_words)
    filtered_tokens = [
        word for word in tokens
        if word in preserved or word not in stop_words
    ]

    # Score the filtered text.
    return sia.polarity_scores(' '.join(filtered_tokens))

# Function to clean text
def clean_text(text):
    """Strip URLs, @mentions, '#' signs and punctuation from a string.

    Args:
        text: The raw text to clean.

    Returns:
        The cleaned text. Whitespace is left untouched, so removed pieces
        can leave double or leading spaces behind.
    """
    # Remove URLs ('http\S+' already covers https links).
    text = re.sub(r'http\S+|www\S+', '', text)

    # Remove @mentions entirely and drop the '#' from hashtags.
    text = re.sub(r'\@\w+|\#', '', text)

    # Remove every remaining ASCII punctuation character.
    text = text.translate(str.maketrans('', '', string.punctuation))

    return text

### Analyse a single submission

In [None]:
# Score the title of one hand-picked submission.
# It is a Discussion post with the id '1jpxza5', talking about Robinhood's security.
submission = reddit.submission(id='1jpxza5')
sentiment_scores = analyze_sentiment(submission.title)

# Print the title, its scores, and a divider line.
divider = "-" * 80
print(f"Submission: {submission.title}")
print(f"Sentiment Scores: {sentiment_scores}")
print(divider)