In [7]:
import os
from pprint import pprint
import datetime as dt

import praw
import pandas as pd

In [8]:
COLUMNS = [
    'title', 'score', 'id', 'name', 'url',
    'author', 'selftext', 'approved_at_utc', 'banned_at_utc', 'created_utc'
]

In [13]:
reddit = praw.Reddit(
    client_id=os.getenv('CLIENT_ID'),
    client_secret=os.getenv('CLIENT_SECRET'),
    username=os.getenv('USERNAME'),
    password=os.getenv('PASSWORD'),
    user_agent=f"DataForGoodTest"
)

In [6]:
subreddit = reddit.subreddit('vancouver')

In [7]:
type(subreddit)

praw.models.reddit.subreddit.Subreddit

In [8]:
%%time
posts = []
for submission in subreddit.new(limit=1_000): #only retrieve 1000 submissions
    posts.append(
        (
            submission.title,
            submission.score, #upvotes
            submission.id,
            submission.name,
            submission.url,
            submission.author,
            submission.selftext,
            submission.approved_at_utc,
            submission.banned_at_utc,
            submission.created_utc,
        )
    )

CPU times: user 250 ms, sys: 32.9 ms, total: 283 ms
Wall time: 21 s


In [9]:
df = pd.DataFrame(
    posts, 
    columns=COLUMNS
)

In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 976 entries, 0 to 975
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   title            976 non-null    object 
 1   score            976 non-null    int64  
 2   id               976 non-null    object 
 3   name             976 non-null    object 
 4   url              976 non-null    object 
 5   author           972 non-null    object 
 6   selftext         976 non-null    object 
 7   approved_at_utc  0 non-null      object 
 8   banned_at_utc    0 non-null      object 
 9   created_utc      976 non-null    float64
dtypes: float64(1), int64(1), object(8)
memory usage: 76.4+ KB


In [11]:
df.to_csv('raw_result.csv', index=False)

Which variables does PRAW retrieve?

In [12]:
pprint(vars(submission).keys())

dict_keys(['comment_limit', 'comment_sort', '_reddit', 'approved_at_utc', 'subreddit', 'selftext', 'author_fullname', 'saved', 'mod_reason_title', 'gilded', 'clicked', 'title', 'link_flair_richtext', 'subreddit_name_prefixed', 'hidden', 'pwls', 'link_flair_css_class', 'downs', 'thumbnail_height', 'top_awarded_type', 'hide_score', 'name', 'quarantine', 'link_flair_text_color', 'upvote_ratio', 'author_flair_background_color', 'subreddit_type', 'ups', 'total_awards_received', 'media_embed', 'thumbnail_width', 'author_flair_template_id', 'is_original_content', 'user_reports', 'secure_media', 'is_reddit_media_domain', 'is_meta', 'category', 'secure_media_embed', 'link_flair_text', 'can_mod_post', 'score', 'approved_by', 'is_created_from_ads_ui', 'author_premium', 'thumbnail', 'edited', 'author_flair_css_class', 'author_flair_richtext', 'gildings', 'content_categories', 'is_self', 'mod_note', 'created', 'link_flair_type', 'wls', 'removed_by_category', 'banned_by', 'author_flair_type', 'domai

In [13]:
submission.title

'If anyone’s car was broken into Tuesday morning in south vancouver the vpd most likely has ur stuff'

In [14]:
submission.score

98

In [15]:
[key for key in vars(submission).keys() if 'date' in key or 'time' in key or 'utc' in key]

['approved_at_utc', 'banned_at_utc', 'created_utc']

In [16]:
df.tail()

Unnamed: 0,title,score,id,name,url,author,selftext,approved_at_utc,banned_at_utc,created_utc
971,Medical Emergency at Gateway Station,24,11ntm7q,t3_11ntm7q,https://www.reddit.com/r/vancouver/comments/11...,GenShibe,\n13:00 UPDATE: ALL CLEARED. \n\n========\n\n1...,,,1678466000.0
972,spotted: cat near 1770 Pendrell.,63,11ni8ak,t3_11ni8ak,https://www.reddit.com/gallery/11ni8ak,odditiesoflife,,,,1678433000.0
973,"Promotional Articles, etc",0,11ntgpq,t3_11ntgpq,https://www.reddit.com/r/vancouver/comments/11...,slappi01,Does anyone have any good recommendation for a...,,,1678465000.0
974,How to find out if a building has had seismic ...,10,11nt9tw,t3_11nt9tw,https://www.reddit.com/r/vancouver/comments/11...,sheepyshu,Is there a registry or some database for the c...,,,1678465000.0
975,If anyone’s car was broken into Tuesday mornin...,98,11nt8tq,t3_11nt8tq,https://www.reddit.com/r/vancouver/comments/11...,theaceofspades1191,A thief was breaking into multiple cars in sou...,,,1678465000.0


In [17]:
df.head()

Unnamed: 0,title,score,id,name,url,author,selftext,approved_at_utc,banned_at_utc,created_utc
0,Father stabbed to death outside Vancouver Star...,3,1259zb5,t3_1259zb5,https://vancouversun.com/news/local-news/fathe...,raulh,,,,1680056000.0
1,Squeaky the squirrel!,2,1259lzx,t3_1259lzx,https://v.redd.it/excgj4of3lqa1,Ok-Midnight-8732,,,,1680055000.0
2,Do 7-11 stores have a hard time surviving in V...,1,1258tjg,t3_1258tjg,https://www.reddit.com/r/vancouver/comments/12...,S-Wind,7-11 on Main Street @ 14 Avenue: Gone\n\n7-11 ...,,,1680053000.0
3,Curious what these little houses on top of thi...,2,1258re6,t3_1258re6,https://i.redd.it/42014o55xkqa1.jpg,No_Responsibility442,,,,1680053000.0
4,Rock Chip Dump Truck HWY 99 Near Vancouver Lan...,0,1258cxl,t3_1258cxl,https://www.reddit.com/r/vancouver/comments/12...,Suspro70,Seeking witnesses for a dump truck travelling ...,,,1680051000.0


In [18]:
dt.datetime.fromtimestamp(df.created_utc[0])

datetime.datetime(2023, 3, 28, 19, 7, 6)

In [19]:
df['created_utc'] = df['created_utc'].map(dt.datetime.fromtimestamp)

In [20]:
last_post = df.loc[df['created_utc'].idxmax()]

In [21]:
new_posts = []
for submission in subreddit.new(limit=1_000):
    if dt.datetime.fromtimestamp(submission.created_utc) <= last_post.created_utc:
        break
    new_posts.append(
        (
            submission.title,
            submission.score, #upvotes
            submission.id,
            submission.name,
            submission.url,
            submission.author,
            submission.selftext,
            submission.approved_at_utc,
            submission.banned_at_utc,
            submission.created_utc,
        )
    )

In [22]:
new_df = pd.DataFrame(new_posts, columns=COLUMNS)

In [23]:
new_df.info(b)

<class 'pandas.core.frame.DataFrame'>
Index: 0 entries
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   title            0 non-null      object
 1   score            0 non-null      object
 2   id               0 non-null      object
 3   name             0 non-null      object
 4   url              0 non-null      object
 5   author           0 non-null      object
 6   selftext         0 non-null      object
 7   approved_at_utc  0 non-null      object
 8   banned_at_utc    0 non-null      object
 9   created_utc      0 non-null      object
dtypes: object(10)
memory usage: 0.0+ bytes


In [1]:
import os
from pprint import pprint
import datetime as dt

import praw
import pandas as pd

In [2]:
reddit = praw.Reddit(
    client_id=os.getenv('CLIENT_ID'),
    client_secret=os.getenv('CLIENT_SECRET'),
    username=os.getenv('USERNAME'),
    password=os.getenv('PASSWORD'),
    user_agent=f"DataForGoodTest"
)

Version 7.6.1 of praw is outdated. Version 7.7.0 was released Saturday February 25, 2023.


In [3]:
from reddit_worker import SubRedditWorker, IncrementalCSVExporter

In [4]:
worker = SubRedditWorker(subreddit='vancouver', reddit_instance=reddit, exporter=IncrementalCSVExporter('result.csv'))

In [5]:
worker.run()

interval=300
interval=330
interval=330
interval=360
interval=360
interval=390
interval=420
interval=450
interval=480
interval=480
interval=480
interval=510
interval=510
interval=510
interval=540
interval=540
interval=540
interval=570


RequestException: error with request HTTPSConnectionPool(host='oauth.reddit.com', port=443): Max retries exceeded with url: /r/vancouver/new?limit=1000&raw_json=1 (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x13f7a52e0>: Failed to establish a new connection: [Errno 8] nodename nor servname provided, or not known'))