In [None]:
from dotenv import load_dotenv
import os
import datetime
import csv
import praw

# Pull variables from a local .env file into the process environment.
load_dotenv()

# Read the three Reddit API settings from the environment; os.getenv yields
# None for any variable that is not set.
client_id, client_secret, user_agent = (
    os.getenv(env_key)
    for env_key in (
        "REDDIT_CLIENT_ID",
        "REDDIT_CLIENT_SECRET",
        "REDDIT_USER_AGENT",
    )
)

# Build the PRAW client from those settings.
reddit = praw.Reddit(client_id=client_id, client_secret=client_secret, user_agent=user_agent)


In [11]:
import requests
import datetime
import csv
import time
import os

def get_posts_by_week(subreddit, start_date, end_date, timeout=30):
    """Fetch submissions of one subreddit for a time window via the Pushshift API.

    Exactly one HTTP request is issued, so results beyond the requested
    ``size`` are not fetched.

    Args:
        subreddit: Subreddit name, sent as the ``subreddit`` query parameter.
        start_date: ``datetime`` marking the start of the window; sent as
            ``after`` in whole epoch seconds.
        end_date: ``datetime`` marking the end of the window; sent as
            ``before`` in whole epoch seconds.
        timeout: Seconds passed to ``requests.get`` as its timeout, so a
            stalled connection cannot block the caller forever.

    Returns:
        The list stored under the ``data`` key of the JSON response, or an
        empty list when the request fails, the body is not valid JSON, or no
        ``data`` is present.
    """
    url = "https://api.pushshift.io/reddit/search/submission"

    params = {
        'subreddit': subreddit,
        'after': int(start_date.timestamp()),
        'before': int(end_date.timestamp()),
        'size': 500,  # at most 500 per request
        'sort_type': 'created_utc',
        'sort': 'desc'
    }

    try:
        response = requests.get(url, params=params, timeout=timeout)
        response.raise_for_status()
        payload = response.json()
    except (requests.RequestException, ValueError) as e:
        # RequestException covers connection errors, timeouts and HTTP error
        # statuses; ValueError covers a body that cannot be decoded as JSON.
        print(f"Fehler bei API Call: {e}")
        return []

    # Callers iterate over the result, so always hand back a list.
    if not isinstance(payload, dict):
        return []
    return payload.get('data') or []

# Define the time range: from 365 days ago up to now (UTC).
end_date = datetime.datetime.now(datetime.UTC)
start_date = end_date - datetime.timedelta(days=365)

subreddits = [
    "wallstreetbets", "stocks", "investing", "StockMarket", "wallstreetbetsGER"
]

# Create the output directory up front; opening the file for writing fails
# with FileNotFoundError when the directory is missing.
os.makedirs("./data", exist_ok=True)

with open("./data/reddit_posts.csv", mode="w", newline="", encoding="utf-8") as file:
    writer = csv.writer(file)
    writer.writerow(["subreddit", "title", "body", "created_utc", "upvotes"])

    for subreddit_name in subreddits:
        print(f"--- Posts aus r/{subreddit_name} ---")
        total_posts = 0

        # Walk through the year one week at a time.
        current_date = start_date
        week_count = 0

        while current_date < end_date:
            # Clamp the final window so it never extends past end_date.
            week_end = min(current_date + datetime.timedelta(days=7), end_date)
            week_count += 1

            print(f"Woche {week_count}: {current_date.strftime('%Y-%m-%d')} bis {week_end.strftime('%Y-%m-%d')}")

            posts = get_posts_by_week(subreddit_name, current_date, week_end)

            for post in posts:
                # Only row construction is guarded: a malformed post is
                # skipped, while real file-write errors still propagate.
                try:
                    row = [
                        subreddit_name,
                        post.get('title', ''),
                        # `or ''` guards against a selftext that is present
                        # but None, which would otherwise drop the post.
                        (post.get('selftext') or '').replace('\n', ' ').strip(),
                        datetime.datetime.fromtimestamp(post['created_utc'], datetime.UTC).isoformat(),
                        post.get('score', 0),
                    ]
                except (AttributeError, KeyError, TypeError, ValueError, OverflowError, OSError) as e:
                    print(f"Fehler beim Schreiben eines Posts: {e}")
                    continue

                writer.writerow(row)
                total_posts += 1

            print(f"  {len(posts)} Posts gefunden")

            # Rate limiting - the original author notes Pushshift recommends
            # one request per second.
            time.sleep(1)

            current_date = week_end

        print(f"Gesamt: {total_posts} Posts aus r/{subreddit_name}")

print("Scraping abgeschlossen!")

--- Posts aus r/wallstreetbets ---
Woche 1: 2024-06-13 bis 2024-06-20
Fehler bei API Call: 403 Client Error: Forbidden for url: https://api.pushshift.io/reddit/search/submission?subreddit=wallstreetbets&after=1718273693&before=1718878493&size=500&sort_type=created_utc&sort=desc
  0 Posts gefunden
Woche 2: 2024-06-20 bis 2024-06-27
Fehler bei API Call: 403 Client Error: Forbidden for url: https://api.pushshift.io/reddit/search/submission?subreddit=wallstreetbets&after=1718878493&before=1719483293&size=500&sort_type=created_utc&sort=desc
  0 Posts gefunden
Woche 3: 2024-06-27 bis 2024-07-04
Fehler bei API Call: 403 Client Error: Forbidden for url: https://api.pushshift.io/reddit/search/submission?subreddit=wallstreetbets&after=1719483293&before=1720088093&size=500&sort_type=created_utc&sort=desc
  0 Posts gefunden
Woche 4: 2024-07-04 bis 2024-07-11
Fehler bei API Call: 403 Client Error: Forbidden for url: https://api.pushshift.io/reddit/search/submission?subreddit=wallstreetbets&after=17

KeyboardInterrupt: 