In [9]:
import praw
import os
import datetime as dt
import time
import pandas as pd
from psaw import PushshiftAPI

In [10]:
# Reddit credentials are read from environment variables so that no secret is
# stored in the notebook; a variable that is not set yields None.
USERNAME = os.getenv('REDDIT_NLP_USERNAME')
PASSWORD = os.getenv('REDDIT_NLP_PASSWORD')
CLIENT_ID = os.getenv('REDDIT_NLP_CLIENT_ID')
CLIENT_SECRET = os.getenv('REDDIT_NLP_SECRET')

# Subreddit whose posts and comments this notebook collects.
TARGET_SUBREDDIT_NAME = 'worldnews'


In [11]:
# PRAW client built from the credentials and app keys loaded from the environment.
reddit = praw.Reddit(
    client_id=CLIENT_ID,
    client_secret=CLIENT_SECRET,
    username=USERNAME,
    password=PASSWORD,
    user_agent='Comment Extraction (by u/Reddit_nlp_pa)',
)


In [8]:
# Smoke test for the PRAW client: look up the target subreddit and print its
# display name, title and description (in that order).
subreddit = reddit.subreddit(TARGET_SUBREDDIT_NAME)
for attribute_name in ('display_name', 'title', 'description'):
    print(getattr(subreddit, attribute_name))

worldnews
World News
>>> - **Other Subs:**

>>> - [Related](http://goo.gl/ztbbza)
>>>     - /r/News
>>>     - /r/PoliticalDiscussion
>>>     - /r/WorldEvents
>>>     - /r/GeoPolitics
>>>     - /r/GlobalTalk 
>>>     - /r/Breakingnews 
>>>     - /r/Business
>>>     - /r/Economics
>>>     - /r/Environment
>>>     - /r/History
>>>     - /r/HumanRights
>>>     - /r/Features
>>>     - /r/UpliftingNews
>>>     - /r/NewsOfTheWeird
>>>     - /r/FakeNews
>>>     - /r/ID_News 

>>> - [N. America](https://goo.gl/dkfVnB)
>>>     - /r/Politics
>>>     - /r/USA
>>>     - /r/USANews
>>>     - /r/Canada
>>>     - /r/CanadaPolitics
>>>     - /r/OnGuardForThee
>>>     - /r/Cuba
>>>     - /r/Mexico
>>>     - /r/PuertoRico

>>> - [S. America](https://goo.gl/DDaqmY)
>>>     - /r/Argentina
>>>     - /r/Brasil
>>>     - /r/Chile
>>>     - /r/Colombia
>>>     - /r/Ecuador
>>>     - /r/Guyana
>>>     - /r/Nicaragua
>>>     - /r/Suriname 
>>>     - /r/Uruguay
>>>     - [/r/Venezuela](/r/vzla)

>>> - [Europe](ht

In [71]:
# Pushshift client constructed with the PRAW client.
api = PushshiftAPI(
    reddit,
    max_results_per_request=100,  # necessary due to bug
)

In [72]:
# Download the submissions of the target subreddit created after 2017-01-01
# and materialise them into a list, timing the whole fetch.
#
# NOTE(review): the saved output of this cell shows
# `TypeError: can only concatenate str (not "int") to str` raised during the
# fetch; the cause is not visible in this notebook and still needs investigation.
print('Fetching data')
start_fetching_data = time.time()

# Pin the cutoff to UTC. A naive datetime's .timestamp() is interpreted in the
# machine's local timezone, so the resulting epoch would otherwise depend on
# where the notebook is executed.
start_epoch = int(dt.datetime(2017, 1, 1, tzinfo=dt.timezone.utc).timestamp())

submissions = list(api.search_submissions(
    after=start_epoch,
    subreddit=TARGET_SUBREDDIT_NAME,
    limit=10_000_000,
))
end_fetching_data = time.time()
print(f'Finished fetching data. Elapsed time: {end_fetching_data-start_fetching_data}')

Fetching data




TypeError: can only concatenate str (not "int") to str

In [None]:
# Report how many submissions the fetch returned.
submission_count = len(submissions)
print(f'submission count: {submission_count}')




In [None]:
# One row per fetched submission.
# Column names are passed straight to the constructor: the earlier
# DataFrame.rename() call returned a new frame that was never assigned, so
# post_df (and the CSV written from it) kept the integer labels 0..11.
POST_COLUMNS = [
    'Title', 'Author', 'created', 'id', 'n_comments', 'score',
    'subreddit', 'fullname', 'upvote_ratio', 'locked', 'stickied', 'url',
]
post_df = pd.DataFrame(
    [
        [p.title, p.author, p.created, p.id, p.num_comments, p.score,
         p.subreddit, p.fullname, p.upvote_ratio, p.locked, p.stickied, p.url]
        for p in submissions
    ],
    columns=POST_COLUMNS,
)
# Show a small preview as the cell output instead of the whole frame.
post_df.head()

In [None]:
# Write the full (unfiltered) post table to CSV without the row index.
posts_file_name = f'r_{TARGET_SUBREDDIT_NAME}_posts_data_massive_full.csv'
post_df.to_csv(posts_file_name, index=False)

In [None]:
# Keep only posts whose score lies inside [SCORE_BOTTOM_THRESHOLD, SCORE_TOP_THRESHOLD]
# and that have at least MIN_COMMENTS comments:
#   * scores below the bottom threshold are dropped (negative / minimal-activity posts),
#   * scores above the top threshold are dropped (to prevent frontpage influence),
#   * posts with fewer than MIN_COMMENTS comments are dropped (low activity).
#
# Potential source of errors: controversial posts with low upvotes due to disagreement.

# These thresholds were previously commented out, which made the loop below
# fail with NameError on a fresh kernel.
SCORE_TOP_THRESHOLD = 1000  # arbitrary, determined by analysing the frontpage posts --> see frontpage analysis file
SCORE_BOTTOM_THRESHOLD = 10  # minimal activity
MIN_COMMENTS = 5  # posts with fewer comments count as low activity

filtered_posts = []
post_filtering_start = time.time()
print('Starting post filtering')
for processed, post in enumerate(submissions, start=1):
    score_in_range = SCORE_BOTTOM_THRESHOLD <= post.score <= SCORE_TOP_THRESHOLD
    if score_in_range and post.num_comments >= MIN_COMMENTS:
        filtered_posts.append(post)
    # Progress report every 10,000 posts.
    if processed % 10000 == 0:
        current_time = time.time()
        print(f"Filtered {processed} posts. Elapsed time: {current_time-post_filtering_start}")

post_filtering_end = time.time()

print(f'Finished filtering posts. Elapsed time: {post_filtering_end-post_filtering_start}')
print(f'Filtered posts count: {len(filtered_posts)}')

In [None]:
# One row per post that survived filtering.
# Column names are set in the constructor: the earlier DataFrame.rename() call
# returned a new frame that was discarded, leaving integer column labels 0..6.
FILTERED_POST_COLUMNS = ['Title', 'Author', 'created', 'id', 'fullname', 'n_comments', 'score']
clean_post_df = pd.DataFrame(
    [
        [p.title, p.author, p.created, p.id, p.name, p.num_comments, p.score]
        for p in filtered_posts
    ],
    columns=FILTERED_POST_COLUMNS,
)
# Show a small preview as the cell output instead of the whole frame.
clean_post_df.head()

In [None]:
# Write the FILTERED post table to CSV.
# This previously wrote post_df (the unfiltered table), so the "filtered" file
# was just a copy of the full dump. index=False matches the other CSV exports
# in this notebook.
filtered_posts_file_name = 'r_' + TARGET_SUBREDDIT_NAME + '_filtered_posts_data.csv'
clean_post_df.to_csv(filtered_posts_file_name, index=False)

In [None]:
# Gather the comments of every filtered post into one flat master list.

comment_f_start = time.time()

subreddit_comments = []
index = 0
for index, post in enumerate(filtered_posts, start=1):
    # Expand the post's comment tree (replace_more with no limit), then flatten it.
    comment_forest = post.comments
    comment_forest.replace_more(limit=None)
    subreddit_comments.extend(comment_forest.list())

    # Progress report every 100 posts.
    if index % 100 == 0:
        elapsed = time.time() - comment_f_start
        print(f'{index} posts parsed. Elapsed time: {elapsed}')
        print(f'\tCurrent comment count: {len(subreddit_comments)}')
    

In [None]:
# Exploration aid: list the attribute names available on a fetched comment object.
first_comment = subreddit_comments[0]
dir(first_comment)

In [None]:
# Sanity check: print the title of the submission the first comment belongs to.
first_comment = subreddit_comments[0]
print(first_comment.submission.title)

In [None]:
# One row per collected comment.
# Two fixes compared with the previous version:
#   * column names are set in the constructor, because the result of
#     DataFrame.rename() was discarded and the frame kept integer labels 0..7;
#   * the last two labels now match their data: c.submission.title is
#     'Post Title' and c.submission.id is 'Post id' (the old mapping labelled
#     the title 'Post' and the id 'Post Title').
COMMENT_COLUMNS = ['Author', 'Body', 'Score', 'Subreddit', 'created', 'id', 'Post Title', 'Post id']
raw_comments_df = pd.DataFrame(
    [
        [c.author, c.body, c.score, c.subreddit, c.created_utc, c.id,
         c.submission.title, c.submission.id]
        for c in subreddit_comments
    ],
    columns=COMMENT_COLUMNS,
)
# Show a small preview as the cell output instead of the whole frame.
raw_comments_df.head()

In [None]:
# Write the raw comment table to CSV without the row index.
raw_comments_df_name = f'r_{TARGET_SUBREDDIT_NAME}_raw_comments_v2.csv'
raw_comments_df.to_csv(raw_comments_df_name, index=False)

author, sub, body, score, id, url, name, creation_time, subreddit