# Get The Top Posts From The Top Subreddits
Get up to the top 500 posts of the past year from each of the top 500 public subreddits and store their metadata as a CSV. NSFW posts are included.

Data Stored In: ../data/top_posts.csv

In [1]:
import praw
import json
from config import *
from csv import writer

In [2]:
# Load the subreddit names; they are the keys of the JSON object on disk
with open('../data/top_subreddits.json', 'r') as file:
    top_subreddits = [name for name in json.load(file).keys()]

In [3]:
# Create the PRAW client used for every Reddit request in this notebook.
# The constants below are not defined in the notebook itself; presumably
# they are supplied by `from config import *`.
reddit = praw.Reddit(
    # Identifies this script to Reddit
    user_agent=USER_AGENT,
    # Application credentials
    client_id=CLIENT_ID,
    client_secret=CLIENT_SECRET,
    # Account credentials
    username=REDDIT_USERNAME,
    password=REDDIT_PASSWORD,
)

In [4]:
# Empty list intended to collect the metadata of each submission
metadata = list()

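## Values that will be stored per submission (in order):
* unixtime: creation time of the submission (`created_utc`)
* title: title of the submission
* total_votes: number_of_upvotes + number_of_downvotes
* reddit_id: id of the submission
* number_of_upvotes: taken from `submission.score`
* subreddit: name of the subreddit the submission was listed under
* number_of_downvotes: estimated from `submission.score` and `submission.upvote_ratio`
* score: total coin value of the submission's awards (`coin_price * count` summed over all awards), not the Reddit vote score
* number_of_comments: number of comments on the submission
* username: name of the author, left empty when it cannot be read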

In [5]:
# Column names for the CSV header row, listed in the same order in which
# the per-submission values are appended to each row
header = [
    'unixtime',
    'title',
    'total_votes',
    'reddit_id',
    'number_of_upvotes',
    'subreddit',
    'number_of_downvotes',
    'score',
    'number_of_comments',
    'username',
]

In [6]:
def _estimate_downvotes(upvotes, upvote_ratio):
    """Estimate the number of downvotes from an upvote count and ratio.

    Treats `upvotes` as the share `upvote_ratio` of all votes and returns
    the remaining votes, truncated to an int. A ratio of 1 or more yields
    0. A ratio of 0 or less gives no basis for an estimate, so 0 is
    returned instead of dividing by zero.

    NOTE(review): this assumes `submission.score` is the upvote count; if
    it is the net score (upvotes minus downvotes) the estimate is off.
    Confirm against the PRAW/Reddit API documentation.
    """
    if upvote_ratio <= 0:
        return 0
    # Same arithmetic as before (percent scale) so existing results and
    # their int() truncation are unchanged.
    percent = upvote_ratio * 100 if upvote_ratio < 1 else 100
    return int((upvotes * 100) / percent - upvotes)


def _award_value(awardings):
    """Return the summed `coin_price * count` over all award entries.

    Entries that lack either key, or hold non-numeric values, are skipped.
    """
    total = 0
    for award in awardings:
        try:
            total += award['coin_price'] * award['count']
        except (KeyError, TypeError):
            # Best effort: a malformed award must not drop the submission
            continue
    return total


def _submission_row(submission, subreddit):
    """Build one CSV row for `submission`, in the column order of `header`."""
    upvotes = submission.score
    downvotes = _estimate_downvotes(upvotes, submission.upvote_ratio)

    return [
        submission.created_utc,                        # unixtime
        submission.title,                              # title
        upvotes + downvotes,                           # total_votes
        submission.id,                                 # reddit_id
        upvotes,                                       # number_of_upvotes
        subreddit,                                     # subreddit
        downvotes,                                     # number_of_downvotes
        _award_value(submission.all_awardings),        # score (award coins)
        submission.num_comments,                       # number_of_comments
        # username: None (written as an empty field) when there is no author
        getattr(submission.author, 'name', None),
    ]


# newline='' is what the csv module expects for files it writes to, and an
# explicit encoding keeps non-ASCII titles from failing on platforms whose
# default encoding is not UTF-8.
with open('../data/top_posts.csv', 'w', newline='', encoding='utf-8') as file:
    # Use a distinct name so the imported csv `writer` function is not
    # overwritten; this cell can then be re-run.
    csv_writer = writer(file)

    csv_writer.writerow(header)

    for subreddit in top_subreddits:
        try:
            for submission in reddit.subreddit(subreddit).top(time_filter='year', limit=500):
                # Write this row to the output file
                csv_writer.writerow(_submission_row(submission, subreddit))
        except Exception as error:
            # Catch Exception rather than everything, so Ctrl-C / kernel
            # interrupt still stops the run. Rows already written for this
            # subreddit stay in the file.
            print("Subreddit fully/partially skipped: r/{}".format(subreddit))
            print("    reason: {!r}".format(error))