In [1]:
import datetime
import os
import pprint

import matplotlib.pyplot as plt
import pandas as pd
import praw

In [2]:
# Read the API credentials from the environment so that secrets never live in the notebook.
# Required variables: REDDIT_CLIENT_ID and REDDIT_CLIENT_SECRET (a KeyError here means one is not set).
# NOTE(review): any client id/secret that was ever committed with this notebook should be
# treated as leaked and regenerated in the Reddit app settings.
reddit = praw.Reddit(client_id = os.environ['REDDIT_CLIENT_ID'],
                     client_secret = os.environ['REDDIT_CLIENT_SECRET'],
                     user_agent = 'Chrome:AwardPredictor:v0.0.1 (by /u/drdnm)')

Version 7.0.0 of praw is outdated. Version 7.1.0 was released Tuesday June 23, 2020.


In [3]:
# One handle per subreddit of interest
aww, hist, askred = (reddit.subreddit(name) for name in ('aww', 'history', 'askreddit'))

# A single handle covering all three subreddits ('+' joins the names)
multi = reddit.subreddit('aww+history+askreddit')

In [4]:
# Listing of up to 100 "hot" submissions across the combined subreddits.
# NOTE(review): this looks like a lazy, one-shot iterator - presumably a cell that loops over
# it a second time resumes where the previous run stopped instead of starting over.
# Re-run this cell before re-running any cell that iterates `multi_hot`. TODO confirm.
multi_hot = multi.hot(limit = 100)

In [5]:
def get_toplevel_comment_info(subm, num_top_comments = 5):
    """Summarise how fast a submission's best top-level comments gain upvotes and replies.

    At most ``3*num_top_comments`` top-level comments are inspected, in the order the
    comment forest yields them. Of those, the ``num_top_comments`` with the most upvotes
    are kept, and their upvotes-per-minute and direct-replies-per-minute are averaged.

    Parameters
    ----------
    subm : submission-like object
        Must expose ``comments`` (iterable, with a ``replace_more`` method). Each comment
        must expose ``id``, ``ups``, ``created_utc`` (epoch seconds) and ``replies``
        (supporting ``len``).
    num_top_comments : int, default 5
        Number of highest-upvoted top-level comments to summarise.

    Returns
    -------
    tuple
        ``(Avg_up_rate, Std_up_rate, Avg_reply_rate, Std_reply_rate, t_feat, t_reply, t_del)``.
        The first four are statistics of the per-minute rates. They are all 0 when the
        submission has no top-level comments, and the two standard deviations are NaN
        when only one comment is summarised (sample standard deviation). The last three
        are wall-clock seconds spent extracting comment features, counting replies and
        removing "more comments" placeholders, respectively.
    """

    # ---- Remove "more comments" placeholders ---------------------------------------
    t_del_1 = datetime.datetime.now()
    # limit = 0 discards every placeholder instead of expanding it, which caps the
    # number of comments retrieved and avoids errors on placeholder objects
    subm.comments.replace_more(limit = 0)
    t_del = (datetime.datetime.now() - t_del_1).total_seconds()

    # List of features of interest to pull from the API
    api_feat = {'Gilded': 'gilded',
                'Gildings': 'gildings',
                'Upvotes': 'ups',
                'Downvotes': 'downs',
                'Distinguished': 'distinguished',
                'Edited': 'edited',
                'Controversiality': 'controversiality',
                'OP comment': 'is_submitter'
               }

    # ---- Per-comment features ------------------------------------------------------
    t_feat_1 = datetime.datetime.now()
    max_number_of_comments = 3*num_top_comments
    comment_features = {}
    for comment_number, comment in enumerate(subm.comments):
        # '>=' so that exactly max_number_of_comments comments are inspected
        if comment_number >= max_number_of_comments:
            break

        # Attribute access (with a None fallback) rather than poking into __dict__
        feats = {feat_name: getattr(comment, attr, None)
                 for feat_name, attr in api_feat.items()}

        # Age of the comment in minutes, computed on epoch seconds so that the result
        # does not depend on the local timezone or on daylight-saving changes
        now_ts = datetime.datetime.now(datetime.timezone.utc).timestamp()
        feats['Age'] = (now_ts - comment.created_utc)/60

        # Upvote rate (per minute)
        feats['Upvote rate'] = feats['Upvotes']/feats['Age']

        comment_features[comment.id] = feats
    t_feat = (datetime.datetime.now() - t_feat_1).total_seconds()

    # No top-level comments survived (e.g. everything was behind "more comments"):
    # report zero rates instead of failing on an empty table
    if not comment_features:
        return 0, 0, 0, 0, t_feat, 0.0, t_del

    # ---- Reply statistics for the highest-upvoted comments -------------------------
    t_reply_1 = datetime.datetime.now()

    # Order the inspected comments by upvotes and keep the top ones
    top_comment_performance = (pd.DataFrame([(ID, feats['Upvotes'], feats['Age'])
                                             for ID, feats in comment_features.items()],
                                            columns = ['Comment ID', 'Comment Ups', 'Age'])
                                 .sort_values(by = 'Comment Ups', ascending = False)
                                 .head(num_top_comments)
                                 .reset_index(drop = True)
                              )

    # Count the direct replies of each kept comment (ID set built once, not per comment)
    top_ids = set(top_comment_performance['Comment ID'])
    num_replies = {comment.id: len(comment.replies)
                   for comment in subm.comments
                   if comment.id in top_ids}
    top_comment_performance['Num Replies'] = top_comment_performance['Comment ID'].map(num_replies)

    # Upvotes per minute and replies per minute since each comment was created
    top_comment_performance['Upvote rate'] = (top_comment_performance['Comment Ups']
                                              / top_comment_performance['Age'])
    top_comment_performance['Reply rate'] = (top_comment_performance['Num Replies']
                                             / top_comment_performance['Age'])

    # Average and standard deviation of the rates
    Avg_up_rate = top_comment_performance['Upvote rate'].mean()
    Std_up_rate = top_comment_performance['Upvote rate'].std()
    Avg_reply_rate = top_comment_performance['Reply rate'].mean()
    Std_reply_rate = top_comment_performance['Reply rate'].std()

    t_reply = (datetime.datetime.now() - t_reply_1).total_seconds()

    # There is an opportunity to create more features out of the comments. These could include:
    #    explore the success of comments/replies made by the submitter of the original post
    #    look at the distribution of comment upvotes (or of comment replies)
    #    look at the average controversiality among comments or replies to comments
    #    calculate the rate of gildings among comments and/or comment replies

    return Avg_up_rate, Std_up_rate, Avg_reply_rate, Std_reply_rate, t_feat, t_reply, t_del



def submission_features(subm, num_top_comments = 5):
    """Build a flat dictionary of features describing one submission.

    Parameters
    ----------
    subm : submission-like object
        Source of the attributes named in ``api_feat`` below. ``created_utc`` must be
        epoch seconds and ``subreddit`` must expose ``display_name``.
    num_top_comments : int, default 5
        Passed on to ``get_toplevel_comment_info`` when the submission has comments.

    Returns
    -------
    tuple
        ``(features, t_feat, t_reply, t_del)``. ``features`` maps feature name to value.
        The three timings (seconds) are those reported by ``get_toplevel_comment_info``,
        or 0 when the submission has no comments and that function is not called.
    """

    # List of potentially informative features available from the API
    # TODO: hoist this mapping out of the function so it is not rebuilt for every submission
    api_feat = {'Title': 'title',
                'Author': 'author',
                'ID': 'id',
                'Gilded': 'gilded',
                'Gildings': 'gildings',
                'Upvotes': 'ups',
                'Upvote ratio': 'upvote_ratio',
                'Post time': 'created_utc',
                'Views': 'view_count',
                'Discussion type': 'discussion_type',
                'Distinguished': 'distinguished',
                'Contest mode': 'contest_mode',
                'Content categories': 'content_categories',
                'Edited': 'edited',
                'Hidden': 'hidden',
                'Crosspostable': 'is_crosspostable',
                'Crossposts': 'num_crossposts',
                'Meta': 'is_meta',
                'OC': 'is_original_content',
                'Reddit media': 'is_reddit_media_domain',
                'Robot indexable': 'is_robot_indexable',
                'Selfpost': 'is_self',
                'Video': 'is_video',
                'Likes': 'likes',
                'Comments': 'num_comments',
                'Adult content': 'over_18',
                'Subreddit': 'subreddit',
               }

    # Attribute access with a None fallback: an optional field that the API did not
    # return becomes a missing value instead of a KeyError
    features = {feat_name: getattr(subm, attr, None)
                for feat_name, attr in api_feat.items()}

    # Extract author and subreddit names as strings. The author may be None
    # (presumably a deleted account - TODO confirm), in which case it stays None.
    author = features['Author']
    features['Author'] = author.name if author is not None else None
    features['Subreddit'] = features['Subreddit'].display_name

    # Convert the timestamp to time of day in minutes since the beginning of the UTC day.
    # An explicit UTC timezone is required: without it the local time of day is returned.
    dtime_posted = datetime.datetime.fromtimestamp(features['Post time'], tz = datetime.timezone.utc)
    features['Post time'] = dtime_posted.hour*60 + dtime_posted.minute

    # Age of the post (in minutes)
    now_utc = datetime.datetime.now(datetime.timezone.utc)
    features['Post age'] = (now_utc - dtime_posted).total_seconds()/60

    # Upvotes per minute of age and comments per minute of age
    features['Upvote rate'] = features['Upvotes']/features['Post age']
    features['Comment rate'] = features['Comments']/features['Post age']

    # Extract and process comments. Timings default to 0 so they are always defined,
    # including for submissions without comments.
    t_feat = t_reply = t_del = 0
    if subm.num_comments == 0:
        Avg_up_rate, Std_up_rate, Avg_reply_rate, Std_reply_rate = (0, 0, 0, 0)
    else:
        (Avg_up_rate, Std_up_rate, Avg_reply_rate, Std_reply_rate,
         t_feat, t_reply, t_del) = get_toplevel_comment_info(subm, num_top_comments)

    features['Avg top comments up rate'] = Avg_up_rate
    features['Std top comments up rate'] = Std_up_rate
    features['Avg top comments reply rate'] = Avg_reply_rate
    features['Std top comments reply rate'] = Std_reply_rate

    return features, t_feat, t_reply, t_del

In [6]:
# Feature dictionaries of the processed submissions, keyed by submission id
all_submission_features = {}

# Running totals (in seconds) of the timings reported for each submission
t_feat = t_reply = t_del = 0

t_start = datetime.datetime.now()
for j, subm in enumerate(multi_hot):
    # Progress marker on every tenth submission
    if not j%10:
        print(j)

    ID = subm.id
    feats_j, t_feat_j, t_reply_j, t_del_j = submission_features(subm, 10)
    all_submission_features[ID] = feats_j

    t_feat, t_reply, t_del = t_feat + t_feat_j, t_reply + t_reply_j, t_del + t_del_j

    # Stop right after the submission with index 10 has been processed
    if j >= 10:
        break
t_end = datetime.datetime.now()

# Timing report
for template, seconds in (('Time spent processing comment features: {:.6f}\n', t_feat),
                          ('Time spent processing comment replies: {:.6f}\n', t_reply),
                          ('Time spent deleting more comments: {:.6f}\n', t_del)):
    print(template.format(seconds))

elapsed = (t_end - t_start).total_seconds()
print('Total time spent on all submissions: {:.5f}'.format(elapsed))


0
10
Time spent processing comment features: 16.988363

Time spent processing comment replies: 0.080727

Time spent deleting more comments: 0.000000

Total time spent on all submissions: 19.67003


In [7]:
# The dict-of-dicts constructor puts one submission per column; flip it so that each
# row is a submission and each column is a feature
df = pd.DataFrame(all_submission_features).T

#df.head()

In [9]:
# Show the last submission handled by the collection loop above.
# NOTE(review): this relies on the loop variable `subm` still being alive in the kernel.
subm

Submission(id='kahsgq')

In [35]:
# Look two submissions up by fullname and list the id of every comment their
# comment forests yield
submission_fullnames = ['t3_kahsgq', 't3_kaikjs']
for submission in reddit.info(submission_fullnames):
    for top_level_comment in submission.comments:
        print(top_level_comment.id)


gfao17x
gfb6gil
gfatx3e
gfaq7iq
gfb6fns
gfareaw
gfazj4a
gfb6fyg
gfbaens
gfbhnz0
gfayzqo
gfbflhe
gfb0o63
gfb57zp
gfbby0z
gfan1g1
gfbm3o6
gfaxt2t
gfaytu5
gfbb0qe
gfbd1tz
gfbi6eb
gfbin9k
gfaw0zc
gfb98wb
gfb9ssw
gfbdaeu
gfbeurl
gfbgm22
gfbi92d
gfbidr8
gfbjb24
gfbjeir
gfbjwzp
gfblm1k
gfb978t
gfap6eo
gfawsou
gfap4wj
gfaqce1
gfaxkez
gfaynmz
gfb44tq
gfax88g
gfaum06
gfaoxg6
gfb0wcz
gfb13ai
gfap3om
gfazln9
gfar1lv
gfawute
gfaybih
gfazn9e
gfbg6tb
gfaraej
gfazoc5
gfazzvy
gfb0028
gfb0keq
gfb1cic
gfb1vks
gfb1xrh
gfb22kx
gfb28kn
gfb4624
gfb5nak
gfb61g7
gfb6af8
gfb6pfw
gfb6rxn
gfb7dte
gfb7dup
gfb89oh
gfb8nu5
gfb8xyx
gfb9ipd
gfb9r1u
gfbabgd
gfbbce1
gfbbo55
gfbe7m1
gfbekgc
gfb6dq7
gfb6ndj
gfb6ofk
gfb7s5s
gfb87cx
gfb8dct
gfb8dk4
gfb9mm3
gfbabbd
gfbb1s3
gfbc8fo
gfbfg24
gfbfhm8
gfbfjkq
gfbfk2j
gfbfkar
gfbhaq9
gfaqcsq
gfbaux3
gfbboo6
gfbbse5
gfbcbgk
gfbcq6l
gfbcs6i
gfbcvpj
gfbcxd3
gfbcxsm
gfbczqq
gfbd027
gfbd1wv
gfbd215
gfbd49p
gfbd9kx
gfbd9vw
gfbdd7x
gfbdg7t
gfbdhae
gfbdilq
gfbdl0u
gfbdl8t
gfbdodr
gfbdx0c


In [45]:
# Look two comments up by fullname and show their text
comment_fullnames = ['t1_gfaw0zc', 't1_gfaytu5']
for looked_up_comment in reddit.info(comment_fullnames):
    print(looked_up_comment.body)

Seems like a good problem to have
An animal on the plain woooooot
