In [29]:
import requests
import time
import pandas as pd

# Rutgers subreddit URL (public JSON view of the listing)
rutgers_url = 'https://www.reddit.com/r/rutgers/.json'
# User-agent header to mimic a browser
headers = {'User-agent': 'Mozilla/5.0'}

# Accumulates the raw 'children' entries from every page fetched below.
rutgers_posts = []
# Pagination cursor returned by the previous page; None means "start at page one".
after = None

for i in range(10000):  # Hard upper bound on pages; the loop normally stops earlier
    print(f"Fetching page {i + 1}")
    params = {'after': after} if after else {}

    try:
        # The timeout keeps a stalled connection from hanging the cell forever.
        response = requests.get(rutgers_url, params=params, headers=headers, timeout=30)
    except requests.RequestException as exc:
        # Keep whatever pages were already collected instead of aborting the cell.
        print('Request failed:', exc)
        break

    if response.status_code != 200:
        print('Status Code Error', response.status_code)
        break

    json_data = response.json()
    rutgers_posts.extend(json_data['data']['children'])
    after = json_data['data'].get('after')  # Update the 'after' for pagination

    if not after:
        # No cursor means the listing is exhausted. Without this stop, the next
        # request would omit 'after', silently restart at page one, and append
        # duplicate posts on every remaining iteration.
        break

    time.sleep(1)  # Sleep to avoid hitting rate limit

# Flatten each raw listing entry into one record per post.
# The inner generator unwraps the 'data' payload once so every field below
# reads from the same dict instead of repeating the lookup.
post_data = [
    {
        'title': fields['title'],
        'score': fields['score'],  # Net karma (upvotes - downvotes)
        'ups': fields['ups'],  # Upvote count as reported by the API
        # Derived value: whatever part of 'ups' is not reflected in 'score'.
        # NOTE(review): if the API reports ups == score this is always 0 — confirm.
        'downvotes': fields['ups'] - fields['score'],
        'id': fields['id'],
        'url': fields['url'],
        'created': fields['created_utc'],  # UTC creation timestamp
        'body': fields.get('selftext', ''),  # Falls back to '' when 'selftext' is absent
    }
    for fields in (entry['data'] for entry in rutgers_posts)
]


# Turn the list of per-post records into a table (one row per post).
df = pd.DataFrame(data=post_data)

# Sanity check: print the first five rows.
print(df.head(n=5))



Fetching page 1
Fetching page 2
Fetching page 3


KeyboardInterrupt: 

In [28]:
# Show the (rows, columns) dimensions of the posts DataFrame.
df.shape

(2477, 6)

In [26]:
# Save DataFrame to CSV
df.to_csv('rutgers_posts.csv', index=False)
print("Data saved to rutgers_posts.csv")

Data saved to rutgers_posts.csv
