# Downloading data from Reddit

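This notebook uses the PRAW library to download the most recent submissions from the r/spinalfusion subreddit (up to the 1,000 requested from the `new` listing) and caches them locally.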

In [1]:
import os
from pprint import pprint

# Reddit API credentials come from the environment so that no secret is stored
# in the notebook itself; a variable that is not set resolves to None.
REDDIT_CLIENT_ID, REDDIT_SECRET, REDDIT_USERNAME, REDDIT_PASSWORD = (
    os.environ.get(variable)
    for variable in (
        "REDDIT_CLIENT_ID",
        "REDDIT_SECRET",
        "REDDIT_USERNAME",
        "REDDIT_PASSWORD",
    )
)

# User-agent string handed to the Reddit client.
USER_AGENT = "web:com.sysmed.quesmed:v1.0.0 (by /u/StefMitra)"

def exists(path):
    """Test whether a path exists.

    The check is performed with ``os.stat``, which follows symbolic links,
    so a broken symbolic link is reported as missing.

    Args:
        path: Filesystem path to test.

    Returns:
        bool: True if ``os.stat(path)`` succeeds, False if it raises
        ``OSError`` or ``ValueError``.
    """
    try:
        os.stat(path)
    except (OSError, ValueError):
        # ValueError is raised for paths the OS cannot represent (for example
        # an embedded NUL byte); such a path cannot exist, so report False
        # instead of letting the exception escape.
        return False
    return True

In [2]:
# import requests
# import requests.auth

# client_auth = requests.auth.HTTPBasicAuth(REDDIT_CLIENT_ID, REDDIT_SECRET)
# payload = {"grant_type": "password", "username": REDDIT_USERNAME, "password": REDDIT_PASSWORD}
# headers = {"User-Agent": USER_AGENT}
# response = requests.post("https://www.reddit.com/api/v1/access_token", auth=client_auth, params=payload, headers=headers)
# res_json = response.json()

# REDDIT_TOKEN = res_json['access_token']

In [3]:
import praw

# Only the application id, secret and user agent are passed to the client;
# REDDIT_USERNAME / REDDIT_PASSWORD are not used here.
reddit = praw.Reddit(client_id=REDDIT_CLIENT_ID, client_secret=REDDIT_SECRET, user_agent=USER_AGENT)

# Handle for the subreddit whose submissions the following cells read.
subreddit = reddit.subreddit("spinalfusion")

In [4]:
import pandas as pd

# https://www.reddit.com/dev/api/#fullnames
# params={'after':'t3_14bqehq'}

# Peek at the single newest submission: id, title and creation time, a blank
# separator line, then every attribute stored on the submission object.
for newest in subreddit.new(limit=1):
    created_at = pd.to_datetime(newest.created_utc, unit='s')
    print(newest.id, newest.title, created_at, "", sep="\n")
    pprint(vars(newest))

16ivyc1
Proud of you all x
2023-09-14 22:25:04

{'_comments_by_id': {},
 '_fetched': False,
 '_reddit': <praw.reddit.Reddit object at 0x104d349d0>,
 'all_awardings': [],
 'allow_live_comments': False,
 'approved_at_utc': None,
 'approved_by': None,
 'archived': False,
 'author': Redditor(name='Disney_lover_x'),
 'author_flair_background_color': None,
 'author_flair_css_class': None,
 'author_flair_richtext': [],
 'author_flair_template_id': None,
 'author_flair_text': None,
 'author_flair_text_color': None,
 'author_flair_type': 'text',
 'author_fullname': 't2_lud3aqqg',
 'author_is_blocked': False,
 'author_patreon_flair': False,
 'author_premium': False,
 'awarders': [],
 'banned_at_utc': None,
 'banned_by': None,
 'can_gild': False,
 'can_mod_post': False,
 'category': None,
 'clicked': False,
 'comment_limit': 2048,
 'comment_sort': 'confidence',
 'content_categories': None,
 'contest_mode': False,
 'created': 1694730304.0,
 'created_utc': 1694730304.0,
 'discussion_type': None,
 '

In [5]:
# from tqdm import tqdm

# keys = (
#     'id',
#     'created_utc',
#     'permalink',
#     'author',
#     'title',
#     'selftext',
#     'view_count',
#     'ups',
#     'downs',
#     'upvote_ratio',
#     'likes',
#     'all_awardings',
#     'media',
#     'media_embed'
# )

# stream_data = {k: [] for k in keys}

# pbar = tqdm()

# for post in reddit.subreddit("spinalfusion").stream.submissions():
#     pbar.update(1)
#     for k in keys:
#         stream_data[k].append(vars(post)[k])

# pbar.close()

In [6]:
# from tqdm import tqdm

# pbar = tqdm()
# stream_data = []

# getting = True
# kwargs = {'limit': 1000}
# last_id = None

# while getting:
#   if last_id:
#      kwargs['params'] = {'after': f"t3_{last_id}"}
#   print(kwargs)
#   if len(stream_data):
#      print(pd.to_datetime(stream_data[-1].created_utc, unit='s'))
#   for post in subreddit.new(**kwargs):
#       stream_data.append(post)
  
#   if last_id == stream_data[-1].id:
#      getting = False
#   else:
#     last_id = stream_data[-1].id
#   pbar.update(len(stream_data))

# pbar.close()

In [16]:
import pandas as pd
from tqdm import tqdm

# Submission attributes to keep; their order is the column order of `df`.
# NOTE(review): the vars() dump of a submission shows 'author' as a PRAW
# Redditor object, not a string, so that column holds Python objects —
# consider storing str(author) if the file must be portable.
keys = (
    'id',
    'created_utc',
    'permalink',
    'author',
    'title',
    'selftext',
    'view_count',
    'ups',
    'downs',
    'upvote_ratio',
    'likes',
    'all_awardings',
    'media',
    'media_embed'
)

# Local cache of the download: when it exists the Reddit API is not called.
data_file = 'data/reddit_raw_sep.h5'

if exists(data_file):
    df = pd.read_hdf(data_file, key='df')
else:
    # Create the cache directory before downloading, so a missing folder
    # cannot make the final write fail after all the posts were fetched.
    cache_dir = os.path.dirname(data_file)
    if cache_dir:
        os.makedirs(cache_dir, exist_ok=True)

    data = {k: [] for k in keys}

    # NOTE(review): the listing may return fewer posts than requested;
    # the bar total is therefore only an upper bound.
    limit = 1000

    # The context manager closes the progress bar even if the loop raises.
    with tqdm(total=limit) as pbar:
        for post in subreddit.new(limit=limit):
            pbar.update(1)
            # Read the attribute dict once per post rather than once per key.
            # A key absent from the submission raises KeyError.
            post_attributes = vars(post)
            for k in keys:
                data[k].append(post_attributes[k])

    df = pd.DataFrame.from_dict(data)
    df.to_hdf(data_file, key='df', mode='w')

df.shape

(997, 14)

In [17]:
# Convert the epoch-second values to pandas timestamps in a single vectorised
# call instead of invoking pd.to_datetime once per row through .apply.
df['created_utc'] = pd.to_datetime(df['created_utc'], unit='s')

In [18]:
# Address the column by name rather than by position, and take min/max so the
# reported range does not depend on the rows being ordered newest-first.
posted_at = df['created_utc']
f"Date range of posts: {posted_at.min()} to {posted_at.max()}"

'Date range of posts: 2023-02-15 19:15:35 to 2023-09-14 22:25:04'