In [1]:
"""
Reddit Full Community Scraper - Extracts complete subreddit information and saves to CSV
This script directly accesses the full communities search page for a keyword
"""

import logging
import csv
import time
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger('reddit_community_scraper')

class RedditCommunityScraper:
    def __init__(self):
        # Set up Chrome options
        self.options = Options()
        self.options.add_argument('--headless')
        self.options.add_argument('--disable-gpu')
        self.options.add_argument('--no-sandbox')
        self.options.add_argument('--disable-dev-shm-usage')
        self.options.add_argument('user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36')
        
        # Initialize the driver
        self.driver = webdriver.Chrome(options=self.options)
        self.communities = []
        
    def scrape_communities(self, keyword):
        """
        Directly access the full communities search page for a keyword and extract all data
        """
        # Reddit communities search URL
        search_url = f"https://www.reddit.com/search/?q={keyword}&type=communities"
        logger.info(f"Accessing full communities search for keyword: {keyword}")
        logger.info(f"Search URL: {search_url}")
        
        try:
            # Load the full communities search page
            self.driver.get(search_url)
            
            # Wait for community results to load
            WebDriverWait(self.driver, 10).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, "div[data-testid='search-community']"))
            )
            
            # Give the page a moment to fully load
            time.sleep(3)
            
            # Scroll down to load more communities
            last_height = self.driver.execute_script("return document.body.scrollHeight")
            scroll_count = 0
            max_scrolls = 5  # Limit scrolling to prevent infinite loops
            
            while scroll_count < max_scrolls:
                # Scroll down
                self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
                
                # Wait for new content to load
                time.sleep(2)
                
                # Calculate new scroll height
                new_height = self.driver.execute_script("return document.body.scrollHeight")
                
                # If heights are the same, we've reached the bottom or no more content is loading
                if new_height == last_height:
                    break
                    
                last_height = new_height
                scroll_count += 1
            
            # Find all community elements
            community_elements = self.driver.find_elements(By.CSS_SELECTOR, "div[data-testid='search-community']")
            logger.info(f"Found {len(community_elements)} communities in search results")
            
            # Process each community element
            for element in community_elements:
                try:
                    # Extract subreddit name
                    name_element = element.find_element(By.CSS_SELECTOR, "span[id^='search-community-title']")
                    name = name_element.text.strip() if name_element else "Unknown"
                    
                    # Extract description
                    try:
                        desc_element = element.find_element(By.CSS_SELECTOR, "p[data-testid='search-subreddit-desc-text']")
                        description = desc_element.text.strip()
                    except:
                        description = ""
                    
                    # Extract member count and online count
                    try:
                        members_text = "Unknown"
                        online_text = "Unknown"
                        
                        # Try to find the stats element with member counts
                        stats_element = element.find_element(By.CSS_SELECTOR, "div.text-12.text-neutral-content-weak")
                        if stats_element:
                            stats_text = stats_element.text.strip()
                            
                            # Example: "16M members · 4.5K online"
                            parts = stats_text.split('·')
                            
                            if len(parts) >= 1 and "members" in parts[0]:
                                members_text = parts[0].replace("members", "").strip()
                                
                            if len(parts) >= 2 and "online" in parts[1]:
                                online_text = parts[1].replace("online", "").strip()
                    except:
                        members_text = "Unknown"
                        online_text = "Unknown"
                    
                    # Extract subreddit URL
                    try:
                        url_element = element.find_element(By.TAG_NAME, "a")
                        url = url_element.get_attribute("href") if url_element else ""
                    except:
                        url = ""
                    
                    community_info = {
                        "name": name,
                        "description": description,
                        "members": members_text,
                        "online": online_text,
                        "url": url
                    }
                    
                    self.communities.append(community_info)
                    logger.info(f"Added community: {name}")
                    
                except Exception as e:
                    logger.error(f"Error extracting community data: {e}")
            
            return self.communities
            
        except Exception as e:
            logger.error(f"Error accessing or processing search page: {e}")
            return []
    
    def save_to_csv(self, filename=None):
        """Save community information to a CSV file."""
        if filename is None:
            filename = "reddit_communities.csv"
            
        try:
            with open(filename, 'w', newline='', encoding='utf-8') as csvfile:
                fieldnames = ['name', 'description', 'members', 'online', 'url']
                writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
                
                writer.writeheader()
                for community in self.communities:
                    writer.writerow(community)
            
            logger.info(f"Saved {len(self.communities)} communities to {filename}")
            return True
        except Exception as e:
            logger.error(f"Error saving to CSV: {e}")
            return False
    
    def print_results(self):
        """Print the extracted community information."""
        print(f"\nFound {len(self.communities)} communities related to the search:")
        print("-" * 60)
        
        for i, comm in enumerate(self.communities, 1):
            print(f"{i}. {comm['name']}")
            print(f"   Description: {comm['description'][:100]}..." if len(comm['description']) > 100 else f"   Description: {comm['description']}")
            print(f"   Members: {comm['members']}, Online: {comm['online']}")
            print(f"   URL: {comm['url']}")
            print("-" * 60)
    
    def close(self):
        """Close the web driver."""
        self.driver.quit()
        logger.info("Closed web driver")

def main():
    # Define the keyword to search
    keyword = "jobs"
    
    # Create the scraper
    scraper = RedditCommunityScraper()
    
    try:
        # Scrape communities information
        scraper.scrape_communities(keyword)
        
        # Print the results
        scraper.print_results()
        
        # Save the results to a CSV file
        scraper.save_to_csv(f"{keyword}_communities.csv")
        
        return scraper.communities
    
    finally:
        # Close the driver
        scraper.close()

if __name__ == "__main__":
    main()

2025-04-28 13:18:25,796 - INFO - Accessing full communities search for keyword: jobs
2025-04-28 13:18:25,796 - INFO - Search URL: https://www.reddit.com/search/?q=jobs&type=communities
2025-04-28 13:18:40,083 - INFO - Found 59 communities in search results
2025-04-28 13:18:40,167 - INFO - Added community: r/jobs
2025-04-28 13:18:40,226 - INFO - Added community: r/RemoteJobs
2025-04-28 13:18:40,278 - INFO - Added community: r/DesignJobs
2025-04-28 13:18:40,330 - INFO - Added community: r/AskReddit
2025-04-28 13:18:40,381 - INFO - Added community: r/antiwork
2025-04-28 13:18:40,438 - INFO - Added community: r/torontoJobs
2025-04-28 13:18:40,506 - INFO - Added community: r/CanadaJobs
2025-04-28 13:18:40,560 - INFO - Added community: r/recruitinghell
2025-04-28 13:18:40,612 - INFO - Added community: r/careerguidance
2025-04-28 13:18:40,664 - INFO - Added community: r/DubaiJobs
2025-04-28 13:18:40,717 - INFO - Added community: r/Germany_Jobs
2025-04-28 13:18:40,771 - INFO - Added community:


Found 59 communities related to the search:
------------------------------------------------------------
1. r/jobs
   Description: /r/jobs is the number one community for advice relating to your career. Head to our discord for live...
   Members: 2M, Online: 516
   URL: https://www.reddit.com/r/jobs/
------------------------------------------------------------
2. r/RemoteJobs
   Description: Remote jobs: the future of work! READ RULES BEFORE POSTING! :) This is a place to discuss remote wor...
   Members: 273K, Online: 28
   URL: https://www.reddit.com/r/RemoteJobs/
------------------------------------------------------------
3. r/DesignJobs
   Description: Some redditors are skilled professionals, some redditors need skilled professionals. Scroll down for...
   Members: 163K, Online: 32
   URL: https://www.reddit.com/r/DesignJobs/
------------------------------------------------------------
4. r/AskReddit
   Description: r/AskReddit is the place to ask and answer thought-provoking qu

2025-04-28 13:18:46,700 - INFO - Closed web driver
