In [None]:
### Extract a PRAW post and visualize it using indents. ###

import os
import praw

# Define the subreddit
subreddit_name = 'news'

# Setup PRAW credentials
reddit = praw.Reddit(
    client_id=os.getenv("PRAW_CLIENT_ID"),
    client_secret=os.getenv("PRAW_CLIENT_SECRET"),
    user_agent=os.getenv("PRAW_USER_AGENT"),
)

def visualize_post(comments, depth=1, max_depth=3):
    if depth > max_depth:
        return
    indent = '    ' * depth  # Increase indentation for each depth level
    for comment in comments:
        if isinstance(comment, praw.models.MoreComments):
            continue
        print(f"{indent}{comment.score} upvotes: {comment.body[:100].replace("\n", "")}")  # Truncate long comments
        visualize_post(comment.replies, depth + 1, max_depth)

# Get the top post from a subreddit
subreddit = reddit.subreddit(subreddit_name)
top_post = next(subreddit.top(time_filter='day', limit=1))

top_post.comments.replace_more(limit=0)  # Load all comments
print(f"{top_post.score} upvotes: {top_post.title}\n{top_post.selftext}")
visualize_post(top_post.comments)


In [62]:
### Extract top posts from a specific subreddit and save as BSON files, while limiting tree depth and width. ###

import os
import praw
import bson

# Configure PRAW credentials
reddit = praw.Reddit(
    client_id=os.getenv("PRAW_CLIENT_ID"),
    client_secret=os.getenv("PRAW_CLIENT_SECRET"),
    user_agent=os.getenv("PRAW_USER_AGENT"),
)

# Define the subreddit
subreddit_name = 'all'

# Create dir to save bson files
directory = f"./{subreddit_name}"
if not os.path.exists(directory):
    os.makedirs(directory)

# Define limit for the number of posts
limit_posts = 10

subreddit = reddit.subreddit(subreddit_name)

def fetch_comments(submission, max_depth=3, max_top_level_comments=10, max_replies_per_comment=3):
    """Fetch comments from a submission with separate controls for depth, top-level comments, and replies."""
    submission.comments.replace_more(limit=0)  # Limit expansion of MoreComments

    def fetch(comment_list, depth):
        if depth > max_depth:
            return []  # Stop recursion beyond max depth
        local_comments = []
        comment_limit = max_top_level_comments if depth == 1 else max_replies_per_comment
        for comment in comment_list[:comment_limit]:  # Limit the number of comments processed at each depth
            if isinstance(comment, praw.models.MoreComments):
                continue  # Skip 'MoreComments' if any left
            comment_data = {
                'id': comment.id,
                'author': str(comment.author),
                'body': comment.body,
                'created_utc': comment.created_utc,
                'score': comment.score,
                'replies': fetch(comment.replies, depth + 1)  # Recursive call to process replies
            }
            local_comments.append(comment_data)
        return local_comments

    # Start fetching comments from the top level
    comments = fetch(submission.comments, 1)  # Start with depth 1
    return comments

for index, submission in enumerate(subreddit.top('day', limit=limit_posts)):
    post_data = {
        "title": submission.title,
        "text": submission.selftext,
        "author": str(submission.author),
        "score": submission.score,
        "created_utc": submission.created_utc,
        "num_comments": submission.num_comments,
        "id": submission.id,
        "url": submission.url,
        "comments": fetch_comments(submission)  # Fetch and store comments
    }

    # Serialize data to BSON
    bson_data = bson.BSON.encode(post_data)
    
    # Save BSON data to a file
    file_path = os.path.join(directory, f'post{index + 1}.bson')
    with open(file_path, 'wb') as file:
        file.write(bson_data)

print("Data saved successfully to BSON files.")


Call this function with 'time_filter' as a keyword argument.
  for index, submission in enumerate(subreddit.top('day', limit=limit_posts)):


Data saved successfully to BSON files.


In [9]:
import datetime
today = datetime.date.today().isoformat()
print(today)

2024-07-05


In [3]:
import os
import praw

# Configure PRAW credentials
reddit = praw.Reddit(
    client_id=os.getenv("PRAW_CLIENT_ID"),
    client_secret=os.getenv("PRAW_CLIENT_SECRET"),
    user_agent=os.getenv("PRAW_USER_AGENT"),
)

reddit.auth.limits # Check current PRAW auth limits

{'remaining': None, 'reset_timestamp': None, 'used': None}

In [63]:
### BSON to JSON ###

import bson
import json

# file path
bson_file_path = 'all/post1.bson'
json_file_path = 'all/post1.json'

# Read the BSON file
with open(bson_file_path, 'rb') as file:
    bson_data = file.read()
    data_dict = bson.BSON.decode(bson_data)  # Decode BSON data to dict

# Write data to JSON file
with open(json_file_path, 'w') as json_file:
    json.dump(data_dict, json_file, indent=4)  # Serialize dictionary to JSON and save it

print(f"Converted {bson_file_path} to JSON and saved as {json_file_path}")


Converted all/post1.bson to JSON and saved as all/post1.json


In [65]:
### BSON to dict ###

import bson

def bson_to_dict(bson_file_path):
    """Read a BSON file and convert it to a Python dictionary."""
    with open(bson_file_path, 'rb') as file:
        bson_data = file.read()
        data_dict = bson.BSON.decode(bson_data)
        return data_dict
        
bson_file_path = 'all/post1.bson'
post_data = bson_to_dict(bson_file_path)
print(post_data)

{'title': "Claudia Sheinbaum becomes Mexico's first ever female president. ", 'text': '', 'author': 'MorningStarZ99', 'score': 107851, 'created_utc': 1717416176.0, 'num_comments': 4510, 'id': '1d72q9w', 'url': 'https://i.redd.it/1s12rmk1kc4d1.jpeg', 'comments': [{'id': 'l6wdoa0', 'author': 'AutoModerator', 'body': 'It looks like this post is about Politics. Various methods of filtering out content relating to Politics can be found [here](https://www.reddit.com/r/pics/wiki/v2/resources/filter/politics).\n\n*I am a bot, and this action was performed automatically. Please [contact the moderators of this subreddit](/message/compose/?to=/r/pics) if you have any questions or concerns.*', 'created_utc': 1717416178.0, 'score': 1, 'replies': []}, {'id': 'l6wl0y9', 'author': 'KuntaWuKnicks', 'body': 'When I read the headline\n\n “Number of assassinated candidates was 37 before the vote” I triple read it and thought one the headline can’t be right and two the story can’t be right\n\nIt was. \n\nW