second: rotations

In [3]:
import praw
import json
from datetime import datetime, timedelta
import os
import time
from collections import OrderedDict

# Initialize Reddit API
reddit = praw.Reddit(
    client_id=' ',
    client_secret=' ',
    user_agent=' '
)

# Constants
SUBREDDIT = "StartUpIndia"
# SUBREDDIT = "indianstartups"
POST_LIMIT = 200
COMMENT_LIMIT = 3
DATA_FILE = "startupindia_posts_comments.json"
MAX_POSTS_TO_CHECK = 10000  # Increased from 1000
TIME_FILTERS = ['year', 'month', 'week', 'day', 'all', 'hour']  # Multiple time filters

def load_existing_data():
    """Load existing data with OrderedDict for efficient pruning"""
    if os.path.exists(DATA_FILE):
        with open(DATA_FILE, 'r', encoding='utf-8') as f:
            data = json.load(f)
            # Convert posts to OrderedDict to maintain insertion order
            data['posts'] = OrderedDict(
                sorted(data['posts'].items(), 
                      key=lambda x: x[1]['fetched_at'], 
                      reverse=True)
            )
            return data
    return {"posts": OrderedDict(), "last_updated": None}

def get_top_comments(submission):
    """Fetch top comments with error handling"""
    try:
        submission.comments.replace_more(limit=0)
        return [{
            'id': comment.id,
            'author': str(comment.author),
            'body': comment.body,
            'score': comment.score,
            'created_utc': comment.created_utc,
            'permalink': f"https://reddit.com{comment.permalink}"
        } for comment in submission.comments[:COMMENT_LIMIT]]
    except Exception as e:
        print(f"Error getting comments for {submission.id}: {str(e)}")
        return []

def fetch_with_time_filter(time_filter):
    """Generator that yields posts with a given time filter"""
    for post in reddit.subreddit(SUBREDDIT).top(time_filter=time_filter, limit=None):
        yield post

def main():
    print(f"Fetching up to {POST_LIMIT} new posts from r/{SUBREDDIT}...")
    
    # Load existing data
    existing_data = load_existing_data()
    existing_posts = existing_data['posts']
    existing_ids = set(existing_posts.keys())
    print(f"Database contains {len(existing_posts)} existing posts")
    
    new_posts = OrderedDict()
    total_processed = 0
    found_new = 0
    
    # Strategy: Try different time filters to find new posts
    for time_filter in TIME_FILTERS:
        if found_new >= POST_LIMIT:
            break
            
        print(f"\nChecking 'top {time_filter}' posts...")
        try:
            for post in fetch_with_time_filter(time_filter):
                total_processed += 1
                
                if post.id not in existing_ids:
                    post_data = {
                        'id': post.id,
                        'title': post.title,
                        'author': str(post.author),
                        'score': post.score,
                        'upvote_ratio': post.upvote_ratio,
                        'num_comments': post.num_comments,
                        'created_utc': post.created_utc,
                        'url': post.url,
                        'selftext': post.selftext,
                        'permalink': f"https://reddit.com{post.permalink}",
                        'fetched_at': datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S'),
                        'time_filter': time_filter,
                        'comments': get_top_comments(reddit.submission(id=post.id))
                    }
                    new_posts[post.id] = post_data
                    existing_ids.add(post.id)
                    found_new += 1
                    
                    if found_new % 10 == 0:
                        print(f"Found {found_new} new posts (processed {total_processed} total)")
                    
                    if found_new >= POST_LIMIT or total_processed >= MAX_POSTS_TO_CHECK:
                        break
                
                # Rate limiting
                if total_processed % 100 == 0:
                    time.sleep(2)
                
        except Exception as e:
            print(f"Error processing {time_filter} posts: {str(e)}")
    
    if not new_posts:
        print(f"No new posts found after checking {total_processed} posts across {len(TIME_FILTERS)} time periods")
        return
    
    # Merge and limit total posts to reasonable size
    merged_posts = OrderedDict(list(new_posts.items()) + list(existing_posts.items()))
    # if len(merged_posts) > 10000:  # Keep only most recent 5000 posts
    #     merged_posts = OrderedDict(list(merged_posts.items())[:5000])
    
    # Prepare final data
    updated_data = {
        "posts": merged_posts,
        "last_updated": datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S'),
        "stats": {
            "total_posts": len(merged_posts),
            "new_posts_added": len(new_posts),
            "posts_processed": total_processed,
            "time_filters_used": TIME_FILTERS
        }
    }
    
    # Save data
    with open(DATA_FILE, 'w', encoding='utf-8') as f:
        json.dump(updated_data, f, ensure_ascii=False, indent=2)
    
    print(f"\nSuccessfully added {len(new_posts)} new posts")
    print(f"Total posts in database: {len(merged_posts)}")
    print(f"Posts processed this run: {total_processed}")
    print(f"Data saved to {DATA_FILE}")

if __name__ == "__main__":
    main()

Fetching up to 200 new posts from r/StartUpIndia...
Database contains 5909 existing posts

Checking 'top year' posts...

Checking 'top month' posts...


  'fetched_at': datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S'),



Checking 'top week' posts...
Found 10 new posts (processed 2101 total)
Found 20 new posts (processed 2215 total)

Checking 'top day' posts...

Checking 'top all' posts...

Checking 'top hour' posts...


  "last_updated": datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S'),



Successfully added 22 new posts
Total posts in database: 5931
Posts processed this run: 3242
Data saved to startupindia_posts_comments.json


third: hot, new, top