In [2]:
import praw
import os
import datetime as dt
import time
import pandas as pd
from psaw import PushshiftAPI

In [3]:
# retrieving info w environment variables
USERNAME = os.environ.get('REDDIT_NLP_USERNAME')
PASSWORD = os.environ.get('REDDIT_NLP_PASSWORD')
CLIENT_ID = os.environ.get('REDDIT_NLP_CLIENT_ID')
CLIENT_SECRET = os.environ.get('REDDIT_NLP_SECRET')
TARGET_SUBREDDIT_NAME = 'republican'

In [4]:
reddit= praw.Reddit(user_agent='Comment Extraction (by u/Reddit_nlp_pa)',
                    client_id=CLIENT_ID, client_secret=CLIENT_SECRET,
                    username=USERNAME, password=PASSWORD)


In [5]:
api = PushshiftAPI(reddit)

In [6]:
print('Fetching data')
start_fetching_data = time.time()
start_epoch=int(dt.datetime(2017, 1, 1).timestamp())

submissions = list(api.search_submissions(
                                                    after=start_epoch,
                                                    subreddit=TARGET_SUBREDDIT_NAME,
                                                    limit=1_000_000))
end_fetching_data = time.time()
print(f'Finished fetching data. Elapsed time: {end_fetching_data-start_fetching_data}')

Fetching data




Finished fetching data. Elapsed time: 2024.6309807300568


In [7]:
print(f'submission count: {len(submissions)}')
print(f'1st submissions content: {submissions[0].title}')

submission count: 56246
1st submissions content: There the same picture.


In [8]:
post_df = pd.DataFrame([[p.title, p.author, p.created, p.id, p.name, p.num_comments, p.score]  for p in submissions])
post_df.rename(columns={0: 'Title', 1: 'Author', 2: 'created', 3: 'id', 4: 'fullname', 5: 'n_comments', 6: 'score'})

Unnamed: 0,Title,Author,created,id,fullname,n_comments,score
0,There the same picture.,,1.600251e+09,itmnrx,t3_itmnrx,0,1
1,"Bill Barr Rails Against Mail-In Voting, Goes A...",tabbykat69,1.600251e+09,itmks1,t3_itmks1,1,2
2,Most Hilarious Anti-Biden Commercial Yet?! Mus...,Curious447,1.600250e+09,itmim6,t3_itmim6,1,1
3,"Ayaan Hirsi Ali: On September 11, here's what ...",Foubar,1.600250e+09,itmh6v,t3_itmh6v,1,3
4,Red states with Blue Governors,JudasBackstab,1.600249e+09,itm59v,t3_itm59v,0,1
...,...,...,...,...,...,...,...
56241,"Obama: ""The U.S. Must Give Up Some Of Its Free...",IIRC,1.483307e+09,5lf6fz,t3_5lf6fz,57,11
56242,hello,,1.483302e+09,5lexjy,t3_5lexjy,0,1
56243,Democrats’ year-end moves seem like revenge on...,Pazaj,1.483301e+09,5lex0q,t3_5lex0q,0,1
56244,"In 2017, GOP sees mandate to undo Obama's agenda",lawblogz,1.483284e+09,5le44q,t3_5le44q,47,26


In [9]:
posts_file_name = 'r_' + TARGET_SUBREDDIT_NAME + '_posts_data.csv'
post_df.to_csv(posts_file_name)

In [10]:
# eliminates posts with a negative score, posts with a score higher than a certain threshold (to prevent frontpage influence)
# and eliminates posts with low activity (< 5  comments)

# potential source of errors: controversial posts w low upvotes due to disagreement


SCORE_TOP_THRESHOLD = 400 #determined by analysing the frontpage posts --> see frontpage analysis file
SCORE_BOTTOM_THRESHOLD = 1

filtered_posts = []
index = 0
post_filtering_start = time.time()
print('Starting post filtering')
for post in submissions:
    score = post.score
    n_comments = post.num_comments
    if score >= SCORE_BOTTOM_THRESHOLD and score <= SCORE_TOP_THRESHOLD and n_comments >= 5:
        filtered_posts.append(post)
    index +=1
    if index % 10000 == 0:
        current_time = time.time()
        print(f"Filtered {index} posts. Elapsed time: {current_time-post_filtering_start}")
    
        
post_filtering_end = time.time()

print(f'Finished filtering posts. Elapsed time: {post_filtering_end-post_filtering_start}')
print(f'Filtered posts count: {len(filtered_posts)}')

Starting post filtering
Filtered 10000 posts. Elapsed time: 0.008028030395507812
Filtered 20000 posts. Elapsed time: 0.01499319076538086
Filtered 30000 posts. Elapsed time: 0.0239713191986084
Filtered 40000 posts. Elapsed time: 0.0329129695892334
Filtered 50000 posts. Elapsed time: 0.0409235954284668
Finished filtering posts. Elapsed time: 0.045914411544799805
Filtered posts count: 8603


In [11]:
clean_post_df = pd.DataFrame([[p.title, p.author, p.created, p.id, p.name, p.num_comments, p.score]  for p in filtered_posts])
clean_post_df.rename(columns={0: 'Title', 1: 'Author', 2: 'created', 3: 'id', 4: 'fullname', 5: 'n_comments', 6: 'score'})

Unnamed: 0,Title,Author,created,id,fullname,n_comments,score
0,"Sorry, solar panels won’t stop California’s fires",Foubar,1.600243e+09,itkkbv,t3_itkkbv,5,18
1,I’ve officially made the switch MAGA2020,Chef-James,1.600242e+09,itkdl9,t3_itkdl9,24,74
2,"United States Coronavirus: 200,097 Dead",grizeldadagrate,1.600238e+09,itjdc8,t3_itjdc8,20,3
3,A surprising Republican wave election could be...,Foubar,1.600238e+09,itjasz,t3_itjasz,5,23
4,Judge Stands Up To Rioters! Holds Them On $1 M...,Curious447,1.600232e+09,ithb8l,t3_ithb8l,12,183
...,...,...,...,...,...,...,...
8598,McMullin blasts 'authoritarian' Trump,,1.483370e+09,5lk1df,t3_5lk1df,20,21
8599,"""Just stop daddy"" children scream in 911 call ...",,1.483334e+09,5lh8pu,t3_5lh8pu,36,46
8600,Trump Promises a Revelation on Hacking,,1.483330e+09,5lgu1y,t3_5lgu1y,9,10
8601,"Obama: ""The U.S. Must Give Up Some Of Its Free...",IIRC,1.483307e+09,5lf6fz,t3_5lf6fz,57,11


In [12]:
filtered_posts_file_name = 'r_' + TARGET_SUBREDDIT_NAME + '_filtered_posts_data.csv'
post_df.to_csv(filtered_posts_file_name)

In [13]:
#fetching all comments from every post in the filtered list

comment_f_start = time.time()

subreddit_comments = []
index =0
for post in filtered_posts:
    current_comments = post.comments
    current_comments.replace_more(limit=None)
    current_comments_list = current_comments.list()
    
    #merging current comments with master comment list
    subreddit_comments += current_comments_list
    index += 1
    if index % 100 == 0:
        current_time = time.time()
        print(f'{index} posts parsed. Elapsed time: {current_time-comment_f_start}')
        print(f'\tCurrent comment count: {len(subreddit_comments)}')
    

100 posts parsed. Elapsed time: 29.25132393836975
	Current comment count: 1180
200 posts parsed. Elapsed time: 59.32662630081177
	Current comment count: 2410
300 posts parsed. Elapsed time: 124.42566227912903
	Current comment count: 3990
400 posts parsed. Elapsed time: 236.9513819217682
	Current comment count: 5500
500 posts parsed. Elapsed time: 343.9844217300415
	Current comment count: 7280
600 posts parsed. Elapsed time: 455.60459303855896
	Current comment count: 8745
700 posts parsed. Elapsed time: 565.1213011741638
	Current comment count: 10092
800 posts parsed. Elapsed time: 676.0801582336426
	Current comment count: 11568
900 posts parsed. Elapsed time: 784.2969586849213
	Current comment count: 12832
1000 posts parsed. Elapsed time: 907.405357837677
	Current comment count: 14427
1100 posts parsed. Elapsed time: 1014.7926177978516
	Current comment count: 15771
1200 posts parsed. Elapsed time: 1120.4579594135284
	Current comment count: 16987
1300 posts parsed. Elapsed time: 1229.23

In [14]:
dir(subreddit_comments[0])

['MISSING_COMMENT_MESSAGE',
 'STR_FIELD',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattr__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_extract_submission_id',
 '_fetch',
 '_fetch_data',
 '_fetch_info',
 '_fetched',
 '_kind',
 '_reddit',
 '_replies',
 '_reset_attributes',
 '_safely_add_arguments',
 '_submission',
 '_url_parts',
 '_vote',
 'all_awardings',
 'approved_at_utc',
 'approved_by',
 'archived',
 'associated_award',
 'author',
 'author_flair_background_color',
 'author_flair_css_class',
 'author_flair_richtext',
 'author_flair_template_id',
 'author_flair_text',
 'author_flair_text_color',
 'author_flair_type',
 'author_fullname',
 'author_patreon_flair',
 'author_premium',
 'awarders',
 

In [15]:
print(subreddit_comments[0].submission.title)

Sorry, solar panels won’t stop California’s fires


In [16]:
raw_comments_df = pd.DataFrame([[c.author, c.body, c.score, c.subreddit, c.created_utc, c.id, c.submission.title, c.submission.id]  for c in subreddit_comments])
raw_comments_df.rename(columns={0: 'Author', 1: 'Body', 2: 'Score', 3: 'Subreddit', 4: 'created', 5: 'id', 6: 'Post', 7: 'Post Title'})

Unnamed: 0,Author,Body,Score,Subreddit,created,id,Post,Post Title
0,AutoModerator,/r/Republican is a partisan subreddit. This i...,1,Republican,1.600214e+09,g5f2o3d,"Sorry, solar panels won’t stop California’s fires",itkkbv
1,LibertyLibertyBooya,They can if they’re used to scoop up 100 years...,1,Republican,1.600216e+09,g5f6yp0,"Sorry, solar panels won’t stop California’s fires",itkkbv
2,usernamesarehard1979,"But on the flip side, the fires are stopping m...",1,Republican,1.600218e+09,g5f9i24,"Sorry, solar panels won’t stop California’s fires",itkkbv
3,dingleberry1,Click baity title. Their argument is that clim...,1,Republican,1.600218e+09,g5f9nlx,"Sorry, solar panels won’t stop California’s fires",itkkbv
4,PatnarDannesman,Great article,1,Republican,1.600224e+09,g5fkepc,"Sorry, solar panels won’t stop California’s fires",itkkbv
...,...,...,...,...,...,...,...,...
157371,bloodyheart15,Heres the article about how North Carolina can...,10,Republican,1.483305e+09,dbvmdmt,"In 2017, GOP sees mandate to undo Obama's agenda",5le44q
157372,lawblogz,Especially considering California's political ...,-2,Republican,1.483382e+09,dbwstva,"In 2017, GOP sees mandate to undo Obama's agenda",5le44q
157373,artyfoul,>perfectly square boxes all roughly the same s...,1,Republican,1.483394e+09,dbx2pt7,"In 2017, GOP sees mandate to undo Obama's agenda",5le44q
157374,,"That is one political scientist, making a bold...",1,Republican,1.483305e+09,dbvn2pb,"In 2017, GOP sees mandate to undo Obama's agenda",5le44q


In [17]:
raw_comments_df_name = 'r_' + TARGET_SUBREDDIT_NAME + '_raw_comments.csv'
raw_comments_df.to_csv(raw_comments_df_name, index=False)

author, sub, body, score,  id, url,name, creation_time, subreddit