# Get The Top Posts From The Top Subreddits
Get the top 200 posts of the past year from the top 500 subreddits and store their metadata as a CSV.

Data stored in: `../data/top_posts.csv`

In [1]:
import praw
import json
from config import *
from csv import writer
from datetime import datetime
from prawcore.exceptions import Forbidden

In [2]:
# Load the subreddit listing and keep only its keys, which are later used as
# subreddit names. The encoding is given explicitly so the read does not depend
# on the platform's default locale encoding.
with open('../data/top_subreddits.json', 'r', encoding='utf-8') as file:
    subreddit_index = json.load(file)

# Only the key order matters from here on; the values are not used.
top_subreddits = list(subreddit_index.keys())

In [3]:
# Create the PRAW client used for every Reddit request below.
# The upper-case names are presumably supplied by the star import from config.
reddit = praw.Reddit(
    # Reddit account
    username=REDDIT_USERNAME,
    password=REDDIT_PASSWORD,
    # API application
    client_id=CLIENT_ID,
    client_secret=CLIENT_SECRET,
    user_agent=USER_AGENT,
)

In [4]:
# import pprint
# # assume you have a Reddit instance bound to variable `reddit`
# submission = reddit.submission(url='https://www.reddit.com/r/wallstreetbets/comments/qyth8s/this_is_how_you_invest/')
# print(submission.title)  # to make it non-lazy
# pprint.pprint(vars(submission))

In [5]:
# Accumulator for the per-submission rows (one list of values per submission)
metadata = list()

## Values that will be stored per submission (in order):
* image_id,
* unixtime,
* rawtime,
* title,
* total_votes,
* reddit_id,
* number_of_upvotes,
* subreddit,
* number_of_downvotes,
* localtime,
* score,
* number_of_comments,
* username

In [6]:
# Collect one metadata row per top submission of every subreddit.
#
# Each row holds, in this order:
#   image_id, unixtime, rawtime, title, total_votes, reddit_id,
#   number_of_upvotes, subreddit, number_of_downvotes, localtime, score,
#   number_of_comments, username

# Number of top submissions requested per subreddit.
# NOTE(review): the notebook introduction mentions 200 posts per subreddit,
# while the code requests 5 -- confirm which value is intended.
POSTS_PER_SUBREDDIT = 5

# Layout shared by the UTC ("rawtime") and machine-local ("localtime") columns.
# NOTE(review): there is no separator between the date and the time part
# (e.g. "2021-11-2114:30:00"). Kept unchanged so existing consumers of the CSV
# keep working -- confirm whether a separator was intended.
TIME_FORMAT = '%Y-%m-%d%H:%M:%S'


def _field(container, name, default=None):
    """Read ``name`` from a dict (by key) or from any other object (by attribute).

    Returns ``default`` when ``container`` is None or does not provide ``name``.
    Nested Reddit data appears to arrive as plain dicts in PRAW (TODO confirm);
    supporting both access styles keeps the lookup correct either way.
    """
    if container is None:
        return default
    if isinstance(container, dict):
        return container.get(name, default)
    return getattr(container, name, default)


def _first_image_id(submission):
    """Return the id of the submission's first preview image, or 0 if there is none."""
    images = _field(_field(submission, 'preview'), 'images')
    try:
        first_image = images[0]
    except (TypeError, IndexError, KeyError):
        # No preview at all, an empty image list, or an unexpected shape.
        return 0
    return _field(first_image, 'id', 0)


def _award_value(awardings):
    """Sum ``coin_price * count`` over all awardings.

    Awardings that lack either value, or whose values cannot be multiplied,
    are skipped instead of aborting the whole row.
    """
    total = 0
    for award in awardings or ():
        price = _field(award, 'coin_price')
        count = _field(award, 'count')
        if price is None or count is None:
            continue
        try:
            total += price * count
        except TypeError:
            continue
    return total


def _estimate_downvotes(upvotes, upvote_ratio):
    """Estimate the number of downvotes from the upvote count and the upvote ratio.

    The ratio is turned into a percentage and capped at 100. A ratio of zero
    (or below) gives no basis for an estimate, so 0 is returned rather than
    dividing by zero.
    """
    percent_upvoted = upvote_ratio * 100 if upvote_ratio < 1 else 100
    if percent_upvoted <= 0:
        return 0
    return int((upvotes * 100) / percent_upvoted - upvotes)


def _submission_row(submission, subreddit):
    """Build the list of values stored for one submission (see the order above)."""
    upvotes = submission.score
    downvotes = _estimate_downvotes(upvotes, submission.upvote_ratio)
    created_utc = submission.created_utc
    return [
        _first_image_id(submission),
        created_utc,
        datetime.utcfromtimestamp(created_utc).strftime(TIME_FORMAT),
        submission.title,
        upvotes + downvotes,
        submission.id,
        upvotes,
        subreddit,
        downvotes,
        # Converted with the timezone of the machine running the notebook.
        datetime.fromtimestamp(created_utc).strftime(TIME_FORMAT),
        _award_value(getattr(submission, 'all_awardings', None)),
        submission.num_comments,
        # The author may be None (no account attached); store None then.
        getattr(submission.author, 'name', None),
    ]


# Start from an empty list so that re-running this cell does not append
# duplicate rows to the results of a previous run.
metadata = []

for subreddit in top_subreddits:
    try:
        for submission in reddit.subreddit(subreddit).top('year', limit=POSTS_PER_SUBREDDIT):
            metadata.append(_submission_row(submission, subreddit))
    except Forbidden:
        # Rows collected before the error are kept, hence "partially".
        print("Subreddit Partially or Fully Skipped: {}".format(subreddit))

BlackPeopleTwitter
ImGoingToHellForThis


In [7]:
# Column names, in the same order as the values inside each row of `metadata`
header = [ 'image_id', 'unixtime', 'rawtime', 'title', 'total_votes',
          'reddit_id', 'number_of_upvotes', 'subreddit', 'number_of_downvotes',
          'localtime', 'score', 'number_of_comments', 'username' ]

# newline='' hands line-ending control to the csv module (otherwise blank lines
# can appear between rows on Windows). UTF-8 is set explicitly because titles
# are free text and the platform default encoding may not cover them.
with open('../data/top_posts.csv', 'w', newline='', encoding='utf-8') as file:
    # Use a separate name for the writer object: rebinding the imported
    # `writer` function would make this cell fail when it is run a second time.
    csv_writer = writer(file)

    csv_writer.writerow(header)
    csv_writer.writerows(metadata)