### Import Libraries

In [None]:
from dotenv import load_dotenv
import praw
import pandas as pd
from datetime import datetime
import pandas as pd
import time, os

In [None]:
# Load variables from a .env file (if present) into the process environment.
load_dotenv()

# Reddit API credentials; each is None when its environment variable is unset.
client_secrets = os.environ.get('REDDIT_CLIENT_SECRETS')
client_id = os.environ.get('REDDIT_CLIENT_ID')

### Get Reddit Data 

In [None]:
# Use praw to get reddit post data

# function to get reddit post details
def get_post_details(full_link):
    """Fetch metadata for one Reddit submission through PRAW.

    The lookup is attempted up to three times (e.g. in case the rate limit
    is exceeded). Each failed attempt is reported on stdout; it never raises
    for an ordinary fetch error.

    Args:
        full_link: Identifier handed to ``reddit.submission(id=...)``.
            Despite the name, it is used as a submission ID, not a URL.

    Returns:
        dict: On success, a record with the keys ``post_id``, ``post_type``,
        ``created_timestamp`` (UTC, ``'%Y-%m-%d %H:%M:%S'``),
        ``subreddit_id``, ``subreddit_name``, ``title``, ``author``,
        ``author_id``, ``comment_count``, ``vote_score`` and
        ``post_content``. If every attempt fails, a placeholder record
        carrying the requested ``post_id`` with every other field set to
        ``'Not Found'`` (the placeholder has no ``post_content`` key).
    """
    # Local import so the function does not depend on an extra file-level import.
    from datetime import timezone

    post_id = full_link
    max_attempts = 3

    for attempt in range(1, max_attempts + 1):
        try:
            reddit = praw.Reddit(
                client_id=client_id,            # read from the .env file
                client_secret=client_secrets,   # read from the .env file
                user_agent='my_reddit_scraper by u/yourusername')

            submission = reddit.submission(id=post_id)

            # Timezone-aware replacement for the deprecated
            # datetime.utcfromtimestamp(); the formatted string is identical.
            created = datetime.fromtimestamp(submission.created_utc, tz=timezone.utc)

            return {
                'post_id': submission.id,
                'post_type': 'NIL',
                'created_timestamp': created.strftime('%Y-%m-%d %H:%M:%S'),
                'subreddit_id': submission.subreddit_id,
                'subreddit_name': submission.subreddit.display_name_prefixed,
                'title': submission.title,
                # A falsy author (e.g. None) is recorded as 'Not Found'.
                'author': submission.author.name if submission.author else 'Not Found',
                'author_id': submission.author_fullname if submission.author else 'Not Found',
                'comment_count': submission.num_comments,
                'vote_score': submission.score,
                # Self posts store their text; other posts store the linked URL.
                'post_content': submission.selftext if submission.is_self else submission.url,
            }
        except Exception as exc:
            # The previous handler referenced an undefined name here, which
            # raised NameError and aborted the retries. Report and try again.
            print(f"FAIL: {post_id} (attempt {attempt}/{max_attempts}): {exc!r}")

    # All attempts failed: return a placeholder row so the caller can tell
    # the post was tried (and retry it on a later run).
    return {
        'post_id': post_id,
        'post_type': 'Not Found',
        'created_timestamp': 'Not Found',
        'subreddit_id': 'Not Found',
        'subreddit_name': 'Not Found',
        'title': 'Not Found',
        'author': 'Not Found',
        'author_id': 'Not Found',
        'comment_count': 'Not Found',
        'vote_score': 'Not Found',
    }



In [None]:
# Load the labelled dataset, keeping one row per unique post (link_id).
links = pd.read_csv('../data/deberta_v3_labelled_3_ALL.csv').drop_duplicates(subset='link_id')
# Keep only string IDs; this drops missing values (NaN is not a str).
links = [link for link in links['link_id'].tolist() if isinstance(link, str)]

# Collect the post_ids that were already fetched successfully in earlier runs.
existing_df = None
existing_ids = set()
if os.path.exists('../data/reddit_posts_data.csv'):
    try:
        existing_df = pd.read_csv('../data/reddit_posts_data.csv', low_memory=False)
    except pd.errors.EmptyDataError:
        # The results file exists but has no content: nothing processed yet.
        existing_df = None
    else:
        # Rows marked "Not Found" are excluded so those posts are tried again.
        existing_df = existing_df[existing_df['subreddit_id'] != "Not Found"]
        existing_ids = set(existing_df['post_id'].tolist())

# Filter out links that have already been processed.
links = [link for link in links if link not in existing_ids]
print(f"Processing a total of {len(links)} links")
results = []

Processing a total of 7445 links


In [None]:
def _save_checkpoint(rows, path):
    """Merge *rows* into the CSV at *path*, keeping the newest row per post_id."""
    if not rows:
        # Writing an empty frame would produce a CSV that pd.read_csv cannot parse.
        return
    merged = pd.DataFrame(rows)
    if os.path.exists(path):
        try:
            previous = pd.read_csv(path, low_memory=False)
        except pd.errors.EmptyDataError:
            previous = None
        if previous is not None:
            merged = pd.concat([previous, merged], ignore_index=True)
    # De-duplicate on the ID rather than on whole rows: values re-read from
    # CSV can differ in dtype from freshly fetched ones, so full-row
    # comparison is not reliable. keep='last' lets a new fetch replace an
    # older row for the same post.
    merged.drop_duplicates(subset='post_id', keep='last').to_csv(path, index=False)


start_time = time.time()
i = 0
for i, full_link in enumerate(links, start=1):
    try:
        results.append(get_post_details(full_link))
    except Exception as exc:
        # Best effort: report the failure and move on to the next post.
        print(f"Skipping {full_link}: {exc!r}")

    if i % 100 == 0:  # Save and print progress every 100 entries
        try:
            _save_checkpoint(results, '../data/reddit_posts_data.csv')
        except Exception as exc:
            # A failed checkpoint must not stop the run, but should be visible.
            print(f"Checkpoint save failed at {i}/{len(links)}: {exc!r}")
        print(f"{(time.time()-start_time)//60} min: {i}/{len(links)}, {i / len(links) * 100:.2f}%")

    time.sleep(0.5)  # Pause between requests

0.0 min: 2/7445, 0.03%
5.0 min: 102/7445, 1.37%
10.0 min: 202/7445, 2.71%
14.0 min: 302/7445, 4.06%
17.0 min: 402/7445, 5.40%
21.0 min: 502/7445, 6.74%
24.0 min: 602/7445, 8.09%
27.0 min: 702/7445, 9.43%
31.0 min: 802/7445, 10.77%
34.0 min: 902/7445, 12.12%
37.0 min: 1002/7445, 13.46%
41.0 min: 1102/7445, 14.80%
44.0 min: 1202/7445, 16.15%
47.0 min: 1302/7445, 17.49%
51.0 min: 1402/7445, 18.83%
54.0 min: 1502/7445, 20.17%
57.0 min: 1602/7445, 21.52%
61.0 min: 1702/7445, 22.86%
64.0 min: 1802/7445, 24.20%
68.0 min: 1902/7445, 25.55%
71.0 min: 2002/7445, 26.89%
74.0 min: 2102/7445, 28.23%
78.0 min: 2202/7445, 29.58%
82.0 min: 2302/7445, 30.92%
86.0 min: 2402/7445, 32.26%
90.0 min: 2502/7445, 33.61%
94.0 min: 2602/7445, 34.95%
98.0 min: 2702/7445, 36.29%
102.0 min: 2802/7445, 37.64%
105.0 min: 2902/7445, 38.98%
109.0 min: 3002/7445, 40.32%
113.0 min: 3102/7445, 41.67%
117.0 min: 3202/7445, 43.01%
120.0 min: 3302/7445, 44.35%
124.0 min: 3402/7445, 45.70%
128.0 min: 3502/7445, 47.04%
132.0 

In [None]:
# One final round of saving: merge everything collected in `results` into the
# results CSV, keeping a single (most recent) row per post_id.
existing_df = None
if os.path.exists('../data/reddit_posts_data.csv'):
    try:
        existing_df = pd.read_csv('../data/reddit_posts_data.csv', low_memory=False)
    except pd.errors.EmptyDataError:
        # File exists but is empty; treat it as having no previous results.
        existing_df = None

if results:
    final_df = pd.DataFrame(results)
    if existing_df is not None:
        final_df = pd.concat([existing_df, final_df], ignore_index=True)
    # Rows already written by earlier checkpoints are also present in
    # `results`; de-duplicating on post_id avoids writing them twice.
    final_df.drop_duplicates(subset='post_id', keep='last').to_csv('../data/reddit_posts_data.csv', index=False)
else:
    # Nothing new to write; avoid creating an empty, unparseable CSV.
    print("No new results to save")