# Reddit scraping

Based on the instructions given at https://github.com/pushshift/api and https://praw.readthedocs.io/en/latest/getting_started/authentication.html

For personal use only.

In [None]:
import pandas as pd
import os
import requests
import json

import datetime as dt

import praw

In [None]:
# Subreddits to scrape, given as full URLs (placeholders to be filled in).
subreddits = [
    'https://www.reddit.com/r/...',
    'https://www.reddit.com/r/...',
    'https://www.reddit.com/r/...',
    'https://www.reddit.com/r/...',
]

# The subreddit name is whatever follows the last '/' of each URL.
subreddit_names = [url.rsplit('/', 1)[-1] for url in subreddits]

# Pushshift API root and the endpoint paths appended to it.
url_api = 'https://api.pushshift.io/reddit'

comment_id_ep = 'submission/comment_ids'
submisison_ep = 'search/submission'

In [None]:
# Get submissions

def get_submissions(subreddit_name):
    """Collect the submissions of one subreddit from the Pushshift search API.

    The history is walked backwards in fixed windows of ``period`` days:
    window ``i`` asks for submissions created between ``(i+1)*period`` and
    ``i*period`` days ago. At most ``n_max`` windows are requested, each
    asking for up to ``n`` submissions.

    Windows whose request fails (network error, non-200 status, body that is
    not a JSON object with a ``data`` key) are reported and skipped. The walk
    stops early once a second empty window is seen; the flag is never reset,
    so the two empty windows do not have to be adjacent.

    Args:
        subreddit_name: Value passed as the ``subreddit`` query parameter.

    Returns:
        A list holding the entries of the ``data`` field of every usable
        response, in the order the windows were requested.
    """
    # period in days
    period = 120
    n_max = 50

    n = 1000

    # Upper bound (seconds) for one HTTP request, so that a stalled
    # connection cannot block the whole run indefinitely.
    timeout = 60

    submissions = []
    bool_none = False

    for i in range(n_max):
        print(f'[{i+1}/{n_max}]: minus {(i+1)*period} days to {i*period}')
        url_request = f'{url_api}/{submisison_ep}/?subreddit={subreddit_name}&sort_type=created_utc&size={n}&after={(i+1)*period}d&before={i*period}d'

        try:
            response = requests.get(url_request, timeout=timeout)
        except requests.RequestException as err:
            # One failing window should not discard what was already fetched.
            print(f'Request failed: {err}')
            continue

        if response.status_code == 404:
            print('Not Found.')
            continue
        if response.status_code != 200:
            # Any other status (rate limiting, server errors, ...) may carry
            # a body that is not JSON, so do not try to parse it.
            print(f'Request failed with status {response.status_code}.')
            continue
        print('Success!')

        try:
            json_data = json.loads(response.text)
        except ValueError:
            # json.JSONDecodeError is a ValueError subclass.
            print('No data found')
            continue

        if not isinstance(json_data, dict) or 'data' not in json_data:
            print('No data found')
            continue

        window_submissions = json_data['data']

        if not window_submissions:
            print('No submissions found')
            if bool_none:
                break

            bool_none = True
            continue

        submissions.extend(window_submissions)

    return submissions


def get_reddit():
    """Create and return the PRAW ``Reddit`` client used for scraping.

    All credentials below are blank placeholders that have to be filled in.
    """
    # See https://praw.readthedocs.io/en/latest/getting_started/authentication.html
    credentials = {
        "client_id": "",
        "client_secret": "",
        "password": "",
        "user_agent": "",
        "username": "",
    }

    return praw.Reddit(**credentials)

def export_subreddit(subreddit_name, submission_ids, path_base):
    """Export the given submissions of a subreddit, with comments, to CSV.

    Two kinds of tab-separated files are written below
    ``<path_base>/<subreddit_name>``:

    * ``submissions/<subreddit>_<id>_<created>.csv`` - one file per
      submission with the columns ``text``, ``up``, ``down`` and ``created``.
      The first row is the submission itself (title and self text joined by
      ``' QQQ '``); the following rows are its comments. Comments whose body
      contains ``'[deleted]'`` are left out.
    * ``<subreddit>_<number of submissions>.csv`` - one summary file with the
      columns ``title``, ``text``, ``up``, ``down`` and ``created``, one row
      per submission.

    Timestamps are formatted as ``%Y%m%dT%H%M%S`` from the ``created_utc``
    seconds added to 1970-01-01.

    Args:
        subreddit_name: Name of the subreddit; also used as directory name
            and file name prefix.
        submission_ids: Sequence of submission ids to fetch through PRAW.
        path_base: Directory under which the subreddit directory is created.
    """
    reddit = get_reddit()

    subreddit = reddit.subreddit(subreddit_name)

    dt_unix = dt.datetime(1970, 1, 1)
    timestamp_format = '%Y%m%dT%H%M%S'

    # Both output directories are loop-invariant, so create them once here.
    path_subreddit_out = os.path.join(path_base, subreddit_name)
    path_submission_out = os.path.join(path_subreddit_out, 'submissions')
    os.makedirs(path_submission_out, exist_ok=True)

    print(subreddit.display_name)

    n_submissions = len(submission_ids)

    S = {}

    submissions_title = []
    submissions_text, submissions_up, submissions_down, submissions_created = [], [], [], []

    for i, submission_id in enumerate(submission_ids):
        print(f'Submission [{i+1}/{n_submissions}]')

        # Get
        submission = reddit.submission(submission_id)

        # Set
        submission_title = submission.title
        submission_text = submission.selftext
        submission_up = submission.ups
        submission_down = submission.downs
        submission_created = dt_unix + dt.timedelta(seconds=submission.created_utc)

        # Convert
        submission_created = submission_created.strftime(timestamp_format)

        # Append
        submissions_title.append(submission_title)
        submissions_text.append(submission_text)
        submissions_up.append(submission_up)
        submissions_down.append(submission_down)
        submissions_created.append(submission_created)

        # Initialize
        SC = {}

        comments_text, comments_up, comments_down, comments_created = [], [], [], []

        # Resolve every "load more comments" placeholder before flattening
        # the comment tree into a list.
        submission.comments.replace_more(limit=None)
        comments = submission.comments.list()
        n_comments = len(comments)
        for j, comment in enumerate(comments):
            print(f'\tComment [{j+1}/{n_comments}]')

            # Set
            comment_text = comment.body

            if '[deleted]' in comment_text:
                continue

            comment_created = dt_unix + dt.timedelta(seconds=comment.created_utc)

            # Append
            comments_text.append(comment_text)
            comments_up.append(comment.ups)
            comments_down.append(comment.downs)
            comments_created.append(comment_created.strftime(timestamp_format))

        # Add (submission first, then its comments)
        SC['text'] = [f'{submission_title} QQQ {submission_text}'] + comments_text
        SC['up'] = [submission_up] + comments_up
        SC['down'] = [submission_down] + comments_down
        SC['created'] = [submission_created] + comments_created

        # Create
        df = pd.DataFrame(SC)

        # Set
        filename_csv = f'{subreddit_name}_{submission_id}_{submission_created}.csv'
        file_csv = os.path.join(path_submission_out, filename_csv)

        # Export
        df.to_csv(file_csv, sep='\t', index=False)

    # Add
    S['title'] = submissions_title
    S['text'] = submissions_text
    S['up'] = submissions_up
    S['down'] = submissions_down
    S['created'] = submissions_created

    # Create
    df = pd.DataFrame(S)

    # Set
    filename_csv = f'{subreddit_name}_{n_submissions}.csv'
    file_csv = os.path.join(path_subreddit_out, filename_csv)

    # Export
    df.to_csv(file_csv, sep='\t', index=False)
    

In [None]:
# Root directory that receives one sub-directory per subreddit.
path_base = r'C:\path\to\output'

n_subreddits = len(subreddit_names)

# Scrape and export every configured subreddit in turn.
for k, subreddit_name in enumerate(subreddit_names):
    print(f'[{k+1}/{n_subreddits}]: {subreddit_name}')

    # Fetch the submissions, then keep only their ids for the export step.
    submissions = get_submissions(subreddit_name)
    submission_ids = [entry['id'] for entry in submissions]

    export_subreddit(subreddit_name, submission_ids, path_base)
