# Reddit Data Scraping for Clinical Trial Recruitment

This notebook handles the ethical scraping of Reddit data to identify potential clinical trial participants.

OUTPUT:
- A json file containing the scraped data, with each post as a separate entry with its comments

INSTRUCTIONS:
- Adjust the constants in the Constants section as needed
- You will need to have a credentials.json file in the config directory, see config/credentials.json.example for the format
- Run the entire notebook to scrape the data and save it to a json file

## Setup and Configuration
1. Uses PRAW for Reddit API access
2. Scrapes data from Reddit using search functionality with keywords
3. Ensures user privacy
4. Saves data in structured format

In [10]:
# Constants
MAX_COMMENTS_PER_POST = 10
POST_LIMIT = 100

# Define target subreddits related to health conditions, this is just a starting point, we can add more subreddits if needed
SUBREDDITS = [
    'ChronicIllness',
    'ChronicPain',
    'medicine',
    'autoimmune'
]

# Keywords to filter relevant posts, similarly this is just a starting point, we can add more keywords if needed
KEYWORDS = [
    'clinical trial',
    'medical study',
    'research study',
    'treatment option',
    'new treatment',
    'experimental treatment',
]

In [11]:
import praw
import json
import pandas as pd
from datetime import datetime
import time
from pathlib import Path
import logging

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

In [12]:
# Load credentials from config file
def load_credentials():
    try:
        with open('../config/credentials.json', 'r') as file:
            return json.load(file)
    except FileNotFoundError:
        logger.error("credentials.json not found in config directory")
        raise

# Initialize Reddit API client
def init_reddit_client(credentials):
    """Initialize Reddit API client
    
    Sample credentials.json:
    {
        "reddit": {
            "client_id": "your_client_id",
            "client_secret": "your_client_secret",
            "user_agent": "your_user_agent"
        }
    }
    """
    return praw.Reddit(
        client_id=credentials['reddit']['client_id'],
        client_secret=credentials['reddit']['client_secret'],
        user_agent=credentials['reddit']['user_agent']
    )

## Scraping Functions

In [14]:
def anonymize_data(text):
    """Remove or mask potentially identifying information"""
    # TODO: Currently just a placeholder. Implement more sophisticated anonymization if needed
    return text

def scrape_subreddit(reddit, subreddit_name, post_limit=POST_LIMIT):
    """Scrape posts and comments from a subreddit"""
    LIMIT_PER_KEYWORD = post_limit // len(KEYWORDS)

    subreddit = reddit.subreddit(subreddit_name)
    posts_data = []
    
    try:
        # Search for each keyword
        for keyword in KEYWORDS:
            logger.info(f"Searching for '{keyword}' in r/{subreddit_name}...")
            # Use Reddit's search functionality
            for post in subreddit.search(keyword, limit=LIMIT_PER_KEYWORD, sort='new'):
                # Skip if we already have this post (from a different keyword search)
                if any(p['post_id'] == post.id for p in posts_data):
                    continue
                    
                post_data = {
                    'post_id': post.id,
                    'author': post.author.name if post.author else None,
                    'subreddit': subreddit_name,
                    'title': anonymize_data(post.title),
                    'text': anonymize_data(post.selftext),
                    'created_utc': datetime.fromtimestamp(post.created_utc).isoformat(),
                    'score': post.score,
                    'matching_keyword': keyword,  # Track which keyword matched
                    'comments': []
                }
                
                # Get comments
                post.comments.replace_more(limit=0)
                for comment in post.comments[:MAX_COMMENTS_PER_POST]:
                    comment_data = {
                        'comment_id': comment.id,
                        'text': anonymize_data(comment.body),
                        'score': comment.score,
                        'author': comment.author.name if comment.author else None,
                        'created_utc': datetime.fromtimestamp(comment.created_utc).isoformat()
                    }
                    post_data['comments'].append(comment_data)
                
                posts_data.append(post_data)
                # time.sleep(2)  # Rate limiting
                
    except Exception as e:
        logger.error(f"Error scraping subreddit {subreddit_name}: {str(e)}")
        
    return posts_data

In [15]:
def save_data(data, filename='../data/raw_reddit_data.json'):
    """Save scraped data to file"""
    Path('../data').mkdir(exist_ok=True)
    with open(filename, 'w', encoding='utf-8') as f:
        json.dump(data, f, indent=2)
    logger.info(f"Data saved to {filename}")

In [16]:
def main():
    # Load credentials and initialize Reddit client
    credentials = load_credentials()
    reddit = init_reddit_client(credentials)
    
    all_data = []
    
    # Scrape each subreddit
    for subreddit in SUBREDDITS:
        logger.info(f"Scraping r/{subreddit}...")
        subreddit_data = scrape_subreddit(reddit, subreddit)
        all_data.extend(subreddit_data)
        logger.info(f"Collected {len(subreddit_data)} posts from r/{subreddit}")
        
    # Save data
    save_data(all_data)

def load_saved_data():
    """Load the saved data and create a DataFrame for analysis preview"""
    with open('../data/raw_reddit_data.json', 'r', encoding='utf-8') as f:
        all_data = json.load(f)
    # Create DataFrame for analysis preview
    df = pd.json_normalize(
        all_data, 
        record_path='comments',
        meta=['post_id', 'subreddit', 'title', 'text', 'score', 
              'created_utc', 'matching_keyword'],
        record_prefix='comment_',  # Prefix for comment fields
        meta_prefix='post_'        # Prefix for post fields
    )
    
    return df


In [17]:
if __name__ == "__main__":
    main()
    df = load_saved_data()
    print(df.head())
    print("\nData Collection Summary:")
    print(f"Total posts collected: {df['post_post_id'].nunique()}")
    print(f"Total comments collected: {len(df)}")
    print("\nSubreddits distribution:")
    print(df['post_subreddit'].value_counts())

INFO:__main__:Scraping r/ChronicIllness...
INFO:__main__:Searching for 'clinical trial' in r/ChronicIllness...
INFO:__main__:Searching for 'medical study' in r/ChronicIllness...
INFO:__main__:Searching for 'research study' in r/ChronicIllness...
INFO:__main__:Searching for 'treatment option' in r/ChronicIllness...
INFO:__main__:Searching for 'new treatment' in r/ChronicIllness...
INFO:__main__:Searching for 'experimental treatment' in r/ChronicIllness...
INFO:__main__:Collected 84 posts from r/ChronicIllness
INFO:__main__:Scraping r/ChronicPain...
INFO:__main__:Searching for 'clinical trial' in r/ChronicPain...
INFO:__main__:Searching for 'medical study' in r/ChronicPain...
INFO:__main__:Searching for 'research study' in r/ChronicPain...
INFO:__main__:Searching for 'treatment option' in r/ChronicPain...
INFO:__main__:Searching for 'new treatment' in r/ChronicPain...
INFO:__main__:Searching for 'experimental treatment' in r/ChronicPain...
INFO:__main__:Collected 90 posts from r/ChronicP

  comment_comment_id                                       comment_text  \
0            ltxeqb4  Hey, I am so sorry you are dealing with this. ...   
1            ltxqenu  Yeah it’s always frustrating, I get the commen...   
2            loqe5s9                                          [deleted]   
3            lgh9vv6  You have to meet eligibility criteria for the ...   
4            lgieezt  You can also look at prestigious universities ...   

   comment_score  comment_author  comment_created_utc post_post_id  \
0              2  hotheadnchickn  2024-10-26T19:19:10      1gcwfy4   
1              2       stradamus  2024-10-26T20:33:06      1gcwfy4   
2              1            None  2024-09-24T14:42:35      1fohr4u   
3              1         podge91  2024-08-04T13:21:58      1ejxi7j   
4              1   Clawhands2022  2024-08-04T17:11:13      1ejxi7j   

   post_subreddit                                         post_title  \
0  ChronicIllness  I hate having to accept my life witho