In [14]:
"""
Enhanced Reddit Community Scraper - With Continuous Scrolling
This script directly accesses the full communities search page for a keyword
and scrolls continuously until no new content appears
"""

import logging
import csv
import time
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger('reddit_community_scraper')

class RedditCommunityScraper:
    """Scrape Reddit's community search results with a headless Chrome driver.

    Typical use: create an instance, call ``scrape_communities(keyword)``,
    then ``print_results()`` and/or ``save_to_csv()``, and finally ``close()``
    to shut the browser down.

    Each scraped community is a dict with the keys ``name``, ``description``,
    ``members``, ``online`` and ``url`` (all strings as shown on the page).
    """

    # CSS selector matching one community card on the search results page.
    COMMUNITY_SELECTOR = "div[data-testid='search-community']"

    def __init__(self):
        """Configure headless Chrome and start the WebDriver session."""
        self.options = Options()
        for argument in (
            '--headless',
            '--disable-gpu',
            '--no-sandbox',
            '--disable-dev-shm-usage',
            'user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        ):
            self.options.add_argument(argument)

        # Initialize the driver
        self.driver = webdriver.Chrome(options=self.options)
        # Result of the most recent successful scrape_communities() call.
        self.communities = []

    @staticmethod
    def _build_search_url(keyword):
        """Return the communities-search URL for *keyword*, URL-encoding it."""
        # Imported locally so the block does not depend on a file-level import.
        from urllib.parse import quote_plus

        # quote_plus keeps spaces, '&', '#', etc. from corrupting the query string.
        return f"https://www.reddit.com/search/?q={quote_plus(str(keyword))}&type=communities"

    @staticmethod
    def _parse_stats(stats_text):
        """Split a stats line into ``(members, online)`` strings.

        Example input: ``"16M members · 4.5K online"`` -> ``("16M", "4.5K")``.
        Either value is ``"Unknown"`` when its part is missing or unlabeled.
        """
        members_text = "Unknown"
        online_text = "Unknown"

        parts = stats_text.split('·')
        if "members" in parts[0]:
            members_text = parts[0].replace("members", "").strip()
        if len(parts) >= 2 and "online" in parts[1]:
            online_text = parts[1].replace("online", "").strip()

        return members_text, online_text

    def scrape_communities(self, keyword, max_scrolls=30, wait_time=3):
        """
        Directly access the full communities search page for a keyword and extract all data
        with continuous scrolling until no new content appears

        Parameters:
        - keyword: The search term to find communities (URL-encoded before use)
        - max_scrolls: Maximum number of scroll attempts (default: 30)
        - wait_time: Time to wait after each scroll in seconds (default: 3)

        Returns the list of community dicts (also stored in ``self.communities``),
        or an empty list if loading or processing the page raised an exception.
        """
        search_url = self._build_search_url(keyword)
        logger.info(f"Accessing full communities search for keyword: {keyword}")
        logger.info(f"Search URL: {search_url}")

        try:
            # Load the full communities search page
            self.driver.get(search_url)

            # Wait for community results to load
            WebDriverWait(self.driver, 10).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, self.COMMUNITY_SELECTOR))
            )

            # Give the page a moment to fully load
            time.sleep(wait_time)

            # Initial extraction to compare against
            current_community_count = len(self.extract_visible_communities())
            logger.info(f"Initially found {current_community_count} communities")

            # Scroll and extract until no new communities appear
            last_height = self.driver.execute_script("return document.body.scrollHeight")
            consecutive_no_new_communities = 0
            scroll_count = 0

            while scroll_count < max_scrolls and consecutive_no_new_communities < 3:
                # Scroll down
                self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
                scroll_count += 1

                # Wait for new content to load
                time.sleep(wait_time)

                new_height = self.driver.execute_script("return document.body.scrollHeight")

                if new_height == last_height:
                    # Unchanged page height: we might have reached the bottom.
                    consecutive_no_new_communities += 1
                    logger.info(f"No height change after scroll {scroll_count} - attempt {consecutive_no_new_communities}/3")
                else:
                    last_height = new_height

                    # The page grew; check whether that produced new communities.
                    found_count = len(self.extract_visible_communities())
                    if found_count > current_community_count:
                        new_communities = found_count - current_community_count
                        logger.info(f"Scroll {scroll_count}: Found {new_communities} new communities (total: {found_count})")
                        current_community_count = found_count
                        consecutive_no_new_communities = 0
                    else:
                        consecutive_no_new_communities += 1
                        logger.info(f"No new communities after scroll {scroll_count} - attempt {consecutive_no_new_communities}/3")

                # Three fruitless scrolls in a row: we're likely done.
                if consecutive_no_new_communities >= 3:
                    logger.info("No new communities found for 3 consecutive scrolls - stopping")
                    break

            # Final extraction of all communities
            self.communities = self.extract_visible_communities()
            logger.info(f"Total communities found after {scroll_count} scrolls: {len(self.communities)}")

            return self.communities

        except Exception as e:
            logger.error(f"Error accessing or processing search page: {e}")
            return []

    def extract_visible_communities(self):
        """
        Extract all community information currently present in the page
        Returns a list of community dictionaries (first occurrence of each name wins);
        returns an empty list if the lookup of community elements itself fails
        """
        communities = []
        seen_names = set()  # O(1) duplicate check instead of rescanning the list

        try:
            community_elements = self.driver.find_elements(By.CSS_SELECTOR, self.COMMUNITY_SELECTOR)

            for element in community_elements:
                try:
                    # find_element raises when the title is missing, which skips
                    # this card via the handler below.
                    name_element = element.find_element(By.CSS_SELECTOR, "span[id^='search-community-title']")
                    name = name_element.text.strip()

                    # Skip if we already have this community (by name)
                    if name in seen_names:
                        continue

                    # Description is optional; default to an empty string.
                    try:
                        desc_element = element.find_element(By.CSS_SELECTOR, "p[data-testid='search-subreddit-desc-text']")
                        description = desc_element.text.strip()
                    except Exception:
                        description = ""

                    # Member/online counts are optional; default to "Unknown".
                    try:
                        stats_element = element.find_element(By.CSS_SELECTOR, "div.text-12.text-neutral-content-weak")
                        members_text, online_text = self._parse_stats(stats_element.text.strip())
                    except Exception:
                        members_text = "Unknown"
                        online_text = "Unknown"

                    # The first link inside the card is taken as the subreddit URL.
                    try:
                        url = element.find_element(By.TAG_NAME, "a").get_attribute("href")
                    except Exception:
                        url = ""

                    communities.append({
                        "name": name,
                        "description": description,
                        "members": members_text,
                        "online": online_text,
                        "url": url
                    })
                    seen_names.add(name)

                except Exception as e:
                    logger.error(f"Error extracting community data: {e}")
                    continue

            return communities

        except Exception as e:
            logger.error(f"Error extracting visible communities: {e}")
            return []

    def save_to_csv(self, filename=None):
        """Save community information to a CSV file.

        Parameters:
        - filename: Output path (default: "reddit_communities.csv")

        Returns True on success, False if writing failed (the error is logged).
        """
        if filename is None:
            filename = "reddit_communities.csv"

        try:
            with open(filename, 'w', newline='', encoding='utf-8') as csvfile:
                fieldnames = ['name', 'description', 'members', 'online', 'url']
                writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

                writer.writeheader()
                writer.writerows(self.communities)

            logger.info(f"Saved {len(self.communities)} communities to {filename}")
            return True
        except Exception as e:
            logger.error(f"Error saving to CSV: {e}")
            return False

    def print_results(self):
        """Print the extracted community information, truncating long descriptions to 100 characters."""
        separator = "-" * 60
        print(f"\nFound {len(self.communities)} communities related to the search:")
        print(separator)

        for i, comm in enumerate(self.communities, 1):
            description = comm['description']
            if len(description) > 100:
                description = f"{description[:100]}..."

            print(f"{i}. {comm['name']}")
            print(f"   Description: {description}")
            print(f"   Members: {comm['members']}, Online: {comm['online']}")
            print(f"   URL: {comm['url']}")
            print(separator)

    def close(self):
        """Close the web driver."""
        self.driver.quit()
        logger.info("Closed web driver")

def main(keyword="jobs"):
    """Scrape, print and save the Reddit communities matching *keyword*.

    Parameters:
    - keyword: The search term (default: "jobs", the value previously hard-coded)

    Results are written to "<keyword>_communities.csv", where characters that
    are unsafe in file names are replaced with underscores. Returns the list
    of scraped community dicts. The browser is always closed, even on error.
    """
    # Imported locally so this block does not depend on a file-level import.
    import re

    # Create the scraper
    scraper = RedditCommunityScraper()

    try:
        # Scrape communities information with continuous scrolling
        scraper.scrape_communities(keyword)

        # Print the results
        scraper.print_results()

        # Keep the file name valid for keywords containing spaces, slashes, etc.
        # (a plain word such as "jobs" is left unchanged).
        safe_keyword = re.sub(r'[^\w.-]+', '_', str(keyword))
        scraper.save_to_csv(f"{safe_keyword}_communities.csv")

        return scraper.communities

    finally:
        # Close the driver
        scraper.close()

# Run the scraper only when this code is executed directly, not when it is imported.
if __name__ == "__main__":
    main()

2025-04-28 15:17:29,663 - INFO - Accessing full communities search for keyword: jobs
2025-04-28 15:17:29,663 - INFO - Search URL: https://www.reddit.com/search/?q=jobs&type=communities
2025-04-28 15:17:34,619 - INFO - Initially found 10 communities
2025-04-28 15:17:38,566 - INFO - Scroll 1: Found 9 new communities (total: 19)
2025-04-28 15:17:43,063 - INFO - Scroll 2: Found 10 new communities (total: 29)
2025-04-28 15:17:48,242 - INFO - Scroll 3: Found 10 new communities (total: 39)
2025-04-28 15:17:53,913 - INFO - Scroll 4: Found 10 new communities (total: 49)
2025-04-28 15:18:00,020 - INFO - Scroll 5: Found 10 new communities (total: 59)
2025-04-28 15:18:06,678 - INFO - Scroll 6: Found 10 new communities (total: 69)
2025-04-28 15:18:13,628 - INFO - Scroll 7: Found 10 new communities (total: 79)
2025-04-28 15:18:21,260 - INFO - Scroll 8: Found 10 new communities (total: 89)
2025-04-28 15:18:29,303 - INFO - Scroll 9: Found 10 new communities (total: 99)
2025-04-28 15:18:38,030 - INFO -


Found 243 communities related to the search:
------------------------------------------------------------
1. r/jobs
   Description: /r/jobs is the number one community for advice relating to your career. Head to our discord for live...
   Members: 2M, Online: 535
   URL: https://www.reddit.com/r/jobs/
------------------------------------------------------------
2. r/RemoteJobs
   Description: Remote jobs: the future of work! READ RULES BEFORE POSTING! :) This is a place to discuss remote wor...
   Members: 273K, Online: 46
   URL: https://www.reddit.com/r/RemoteJobs/
------------------------------------------------------------
3. r/DesignJobs
   Description: Some redditors are skilled professionals, some redditors need skilled professionals. Scroll down for...
   Members: 163K, Online: 26
   URL: https://www.reddit.com/r/DesignJobs/
------------------------------------------------------------
4. r/AskReddit
   Description: r/AskReddit is the place to ask and answer thought-provoking q

2025-04-28 15:22:16,450 - INFO - Closed web driver


In [15]:
# Load the CSV written by the community scraper so its contents can be inspected.
# NOTE(review): `pd` is not imported anywhere before this line in the visible source —
# presumably pandas was imported as `pd` earlier in the session; confirm.
pd.read_csv('jobs_communities.csv')

Unnamed: 0,name,description,members,online,url
0,r/jobs,/r/jobs is the number one community for advice...,2M,535,https://www.reddit.com/r/jobs/
1,r/RemoteJobs,Remote jobs: the future of work! READ RULES BE...,273K,46,https://www.reddit.com/r/RemoteJobs/
2,r/DesignJobs,"Some redditors are skilled professionals, some...",163K,26,https://www.reddit.com/r/DesignJobs/
3,r/AskReddit,r/AskReddit is the place to ask and answer tho...,55M,11K,https://www.reddit.com/r/AskReddit/
4,r/antiwork,"A subreddit for those who want to end work, ar...",2.9M,1.1K,https://www.reddit.com/r/antiwork/
...,...,...,...,...,...
238,r/VermontJobs,This subreddit has been created to post job op...,516,5,https://www.reddit.com/r/VermontJobs/
239,r/Canada_Jobs,Find Latest Jobs in Canada https://www.governm...,523,2,https://www.reddit.com/r/Canada_Jobs/
240,r/JobsAschaffenburg,Hier werden in Zukunft regionale Jobs aus Asch...,115,5,https://www.reddit.com/r/JobsAschaffenburg/
241,r/LakelandJobs,"Hiring for your business in Lakeland, Florida?...",392,2,https://www.reddit.com/r/LakelandJobs/


In [18]:
"""
Script to extract community names from sorted DataFrame without the 'r/' prefix
"""
import pandas as pd
import re

# Read the CSV file of scraped communities (same file name the community
# scraper's main() writes for the "jobs" keyword)
df = pd.read_csv('jobs_communities.csv')

# Function to convert member count to numeric value
def convert_member_count(member_str):
    """Convert a displayed member count such as "2M" or "273K" to a number.

    Parameters:
    - member_str: The count as text ("516", "4.5K", "2M", "1,234", ...), an
      already-numeric value, NaN/None, or the placeholder 'Unknown'.

    Returns the count as a float, or 0 when the value is missing, 'Unknown',
    or contains no number. The K/M/B suffix is case-insensitive and thousands
    separators are ignored.
    """
    if pd.isna(member_str) or member_str == 'Unknown':
        return 0

    # pandas may hand over ints/floats when a column is purely numeric.
    text = str(member_str).strip()

    # A number, optionally followed by a K/M/B suffix. The look-ahead stops the
    # suffix from matching the first letter of a following word ("516 members").
    match = re.search(r'(\d[\d,]*\.?\d*|\.\d+)\s*([KMB])?(?![A-Za-z])', text, re.IGNORECASE)
    if match is None:
        return 0

    number = float(match.group(1).replace(',', ''))
    suffix = match.group(2)
    multipliers = {'K': 1000, 'M': 1000000, 'B': 1000000000}
    return number * (multipliers[suffix.upper()] if suffix else 1)

# Create a numeric column for sorting: the 'members' column holds display
# strings such as "2M" or "273K", which would otherwise sort as text
df['member_count_numeric'] = df['members'].apply(convert_member_count)

# Sort by member count (descending), so the largest communities come first
df_sorted = df.sort_values(by='member_count_numeric', ascending=False)

# Function to remove 'r/' prefix from community names
def remove_r_prefix(name):
    """Return *name* without a leading 'r/'.

    Values that are not strings, and strings that do not begin with 'r/',
    are returned unchanged.
    """
    has_prefix = isinstance(name, str) and name[:2] == 'r/'
    return name[2:] if has_prefix else name
    
# Build the prefix-free name list, ordered like df_sorted (largest community first)
df_sorted['clean_name'] = df_sorted['name'].apply(remove_r_prefix)
community_names = df_sorted['clean_name'].tolist()

# Preview the top of the list
top_ten = community_names[:10]
print("First 10 community names without 'r/' prefix:")
for rank, community in enumerate(top_ten, start=1):
    print(f"{rank}. {community}")

# Write every name to a text file, one per line
with open('community_names.txt', 'w') as names_file:
    names_file.writelines(f"{community}\n" for community in community_names)

print(f"\nSaved {len(community_names)} community names to 'community_names.txt'")
print("\nCommunity names list (first 10):", top_ten)

First 10 community names without 'r/' prefix:
1. funny
2. AskReddit
3. aww
4. memes
5. Showerthoughts
6. pics
7. Jokes
8. AmItheAsshole
9. Futurology
10. personalfinance

Saved 243 community names to 'community_names.txt'

Community names list (first 10): ['funny', 'AskReddit', 'aww', 'memes', 'Showerthoughts', 'pics', 'Jokes', 'AmItheAsshole', 'Futurology', 'personalfinance']


In [13]:
"""
Reddit Data Science Posts and Comments Scraper
This script scrapes posts and their comments from Reddit for "data science job" posts
"""

from bs4 import BeautifulSoup
import csv
import re
import logging
import time
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger('reddit_comment_scraper')

class RedditCommentScraper:
    """Scrape Reddit search results, posts and their comments with headless Chrome.

    Typical use: create an instance, call ``process_search_results(query)``,
    then ``save_to_csv(prefix)``, and finally ``close()`` to shut the browser
    down. Scraped posts accumulate in ``self.posts``; each is a dict with the
    keys ``title``, ``url``, ``author`` and ``comments`` (a list of dicts with
    ``username`` and ``content``).
    """

    # Selectors tried in order to find a comment's author; the later ones are
    # fallbacks (e.g. for deleted users, which have no profile link).
    USERNAME_SELECTORS = (
        "a.text-12.font-bold",
        "span.text-neutral-content-weak",
        ".author-name-meta a",
    )

    def __init__(self):
        """Configure headless Chrome and start the WebDriver session."""
        self.options = Options()
        for argument in (
            '--headless',
            '--disable-gpu',
            '--no-sandbox',
            '--disable-dev-shm-usage',
            'user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        ):
            self.options.add_argument(argument)

        # Initialize the driver
        self.driver = webdriver.Chrome(options=self.options)
        self.posts = []

    @staticmethod
    def _build_search_url(search_query):
        """Return the Reddit search URL for *search_query*, URL-encoding it."""
        # Imported locally so the block does not depend on a file-level import.
        from urllib.parse import quote_plus

        # quote_plus turns spaces into '+' and escapes '&', '#', '+', etc.
        return f"https://www.reddit.com/search/?q={quote_plus(str(search_query))}"

    def search_posts(self, search_query):
        """
        Search for posts related to the search query

        Returns a list of ``{"url": ..., "title": ...}`` dicts, or an empty
        list if the search page could not be loaded or processed.
        """
        search_url = self._build_search_url(search_query)
        logger.info(f"Searching Reddit for: {search_query}")
        logger.info(f"Search URL: {search_url}")

        try:
            # Load the search page
            self.driver.get(search_url)

            # Wait for post results to load
            WebDriverWait(self.driver, 10).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, "a[data-testid='post-title']"))
            )

            # Give the page a moment to fully load
            time.sleep(3)

            # Find all post links
            post_links = self.driver.find_elements(By.CSS_SELECTOR, "a[data-testid='post-title']")
            logger.info(f"Found {len(post_links)} posts in search results")

            # Extract post URLs and titles; a link that fails is logged and skipped
            posts_data = []
            for link in post_links:
                try:
                    url = link.get_attribute("href")
                    title = link.find_element(By.TAG_NAME, "faceplate-screen-reader-content").text.strip()
                    posts_data.append({"url": url, "title": title})
                except Exception as e:
                    logger.error(f"Error extracting post data: {e}")

            logger.info(f"Extracted data for {len(posts_data)} posts")
            return posts_data

        except Exception as e:
            logger.error(f"Error searching Reddit: {e}")
            return []

    def _extract_comment(self, comment):
        """Return ``{"username", "content"}`` for one comment WebElement."""
        username = "Unknown User"
        for selector in self.USERNAME_SELECTORS:
            try:
                username = comment.find_element(By.CSS_SELECTOR, selector).text.strip()
                break
            except NoSuchElementException:
                continue

        try:
            content = comment.find_element(By.CSS_SELECTOR, "div[slot='comment']").text.strip()
        except NoSuchElementException:
            content = "No content found"

        return {"username": username, "content": content}

    def _collect_new_comments(self, post_data, seen_ids):
        """Append comments not yet recorded in *seen_ids* to *post_data*.

        Elements are identified by their WebDriver element id rather than by
        list position, because expanded replies are inserted in the middle of
        the comment tree, not at the end. Returns the number of newly seen
        comment elements.
        """
        new_count = 0
        for comment in self.driver.find_elements(By.TAG_NAME, "shreddit-comment"):
            if comment.id in seen_ids:
                continue
            seen_ids.add(comment.id)
            new_count += 1
            try:
                post_data["comments"].append(self._extract_comment(comment))
            except Exception as e:
                logger.error(f"Error extracting comment data: {e}")
        return new_count

    def scrape_post_and_comments(self, post_url, post_title):
        """
        Scrape a specific post and the comments loaded on its page

        Up to five "more replies" buttons are clicked to load additional
        comments. A post whose page shows no comments within the wait period
        is still returned, with an empty comment list.

        Returns the post dict, or None if the page could not be processed.
        """
        logger.info(f"Scraping post: {post_title}")

        try:
            # Navigate to the post
            self.driver.get(post_url)

            # Wait for comments to load; a post without comments is not an error.
            try:
                WebDriverWait(self.driver, 10).until(
                    EC.presence_of_element_located((By.TAG_NAME, "shreddit-comment"))
                )
            except TimeoutException:
                logger.warning(f"No comments appeared within 10 seconds for post: {post_title}")

            # Give the page a moment to fully load
            time.sleep(3)

            # Extract post author
            try:
                post_author_element = self.driver.find_element(By.CSS_SELECTOR, "a[data-testid='post_author_link']")
                post_author = post_author_element.text.strip()
            except NoSuchElementException:
                post_author = "Unknown"

            post_data = {
                "title": post_title,
                "url": post_url,
                "author": post_author,
                "comments": []
            }

            # Ids of comment elements already processed, to avoid duplicates.
            seen_ids = set()
            found = self._collect_new_comments(post_data, seen_ids)
            logger.info(f"Found {found} comments in post")

            # Click on "More replies" buttons to load additional comments
            try:
                more_replies_buttons = self.driver.find_elements(By.XPATH, "//button[contains(text(), 'more repl')]")
                for button in more_replies_buttons[:5]:  # Limit to 5 to avoid too many clicks
                    try:
                        button.click()
                        time.sleep(2)  # Wait for new comments to load

                        loaded = self._collect_new_comments(post_data, seen_ids)
                        logger.info(f"Loaded {loaded} additional comments")
                    except Exception as e:
                        # e.g. the button went stale after an earlier click
                        logger.error(f"Error clicking more replies button: {e}")
            except Exception as e:
                logger.error(f"Error finding more replies buttons: {e}")

            return post_data

        except Exception as e:
            logger.error(f"Error scraping post {post_url}: {e}")
            return None

    def process_search_results(self, search_query, max_posts=5):
        """
        Process the search results and scrape posts and comments

        Parameters:
        - search_query: The text to search Reddit for
        - max_posts: Maximum number of search results to scrape (default: 5)

        Returns ``self.posts``, which also keeps posts from earlier calls.
        """
        posts_to_scrape = self.search_posts(search_query)[:max_posts]

        for i, post in enumerate(posts_to_scrape):
            logger.info(f"Processing post {i+1}/{len(posts_to_scrape)}: {post['title']}")
            post_data = self.scrape_post_and_comments(post['url'], post['title'])

            if post_data:
                self.posts.append(post_data)

            # Wait between posts to avoid rate limiting
            if i < len(posts_to_scrape) - 1:
                time.sleep(2)

        return self.posts

    def save_to_csv(self, filename_prefix=None):
        """
        Save the scraped data to CSV files (one for posts, one for comments)

        Parameters:
        - filename_prefix: Prefix for "<prefix>_posts.csv" and
          "<prefix>_comments.csv" (default: "reddit_data")

        Returns True if both files were written, False otherwise (errors are
        logged; a failure writing one file does not prevent the other).
        """
        if filename_prefix is None:
            filename_prefix = "reddit_data"

        posts_filename = f"{filename_prefix}_posts.csv"
        comments_filename = f"{filename_prefix}_comments.csv"
        success = True

        # Save posts to CSV
        try:
            with open(posts_filename, 'w', newline='', encoding='utf-8') as csvfile:
                fieldnames = ['title', 'url', 'author', 'comment_count']
                writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

                writer.writeheader()
                for post in self.posts:
                    writer.writerow({
                        'title': post['title'],
                        'url': post['url'],
                        'author': post['author'],
                        'comment_count': len(post['comments'])
                    })

            logger.info(f"Saved {len(self.posts)} posts to {posts_filename}")
        except Exception as e:
            logger.error(f"Error saving posts to CSV: {e}")
            success = False

        # Save comments to CSV
        try:
            with open(comments_filename, 'w', newline='', encoding='utf-8') as csvfile:
                fieldnames = ['post_title', 'post_url', 'username', 'comment_content']
                writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

                writer.writeheader()
                for post in self.posts:
                    for comment in post['comments']:
                        writer.writerow({
                            'post_title': post['title'],
                            'post_url': post['url'],
                            'username': comment['username'],
                            'comment_content': comment['content']
                        })

            total_comments = sum(len(post['comments']) for post in self.posts)
            logger.info(f"Saved {total_comments} comments to {comments_filename}")
        except Exception as e:
            logger.error(f"Error saving comments to CSV: {e}")
            success = False

        return success

    def extract_comments_from_html(self, html_content):
        """
        Extract comments from pre-downloaded HTML content

        Parses *html_content* with BeautifulSoup (the browser is not used) and
        returns a list of ``{"username", "content"}`` dicts, one per
        ``shreddit-comment`` element.
        """
        logger.info("Extracting comments from provided HTML content")
        comments = []

        soup = BeautifulSoup(html_content, 'html.parser')

        comment_elements = soup.find_all('shreddit-comment')
        logger.info(f"Found {len(comment_elements)} comment elements in HTML")

        # Same lookup order as the live scraper, with a span fallback in the last step.
        username_selectors = (
            "a.text-12.font-bold",
            "span.text-neutral-content-weak",
            ".author-name-meta a, .author-name-meta span",
        )

        for comment in comment_elements:
            try:
                username = "Unknown User"
                for selector in username_selectors:
                    username_element = comment.select_one(selector)
                    if username_element:
                        username = username_element.text.strip()
                        break

                content = "No content found"
                content_element = comment.select_one("div[slot='comment']")
                if content_element:
                    content = content_element.text.strip()

                comments.append({
                    "username": username,
                    "content": content
                })

            except Exception as e:
                logger.error(f"Error extracting comment data from HTML: {e}")

        logger.info(f"Extracted {len(comments)} comments from HTML")
        return comments

    def close(self):
        """Close the web driver"""
        self.driver.quit()
        logger.info("Closed web driver")

def main():
    """Run a live search for data-science-job posts and print a summary.

    Processes up to three search results for the hard-coded query, saves the
    results through ``save_to_csv("data_science_job")`` and prints every post
    together with at most three sample comments (each cut to 100 characters).
    The scraper is always closed, even when scraping fails.

    Returns:
        The scraper's list of post dictionaries.
    """
    search_query = "data science job"
    divider = "=" * 80

    scraper = RedditCommentScraper()

    try:
        scraper.process_search_results(search_query, max_posts=3)
        scraper.save_to_csv("data_science_job")

        print(f"\nFound {len(scraper.posts)} posts about data science jobs:")
        print(divider)

        for post_number, post in enumerate(scraper.posts, start=1):
            print(f"Post {post_number}: {post['title']}")
            print(f"Author: {post['author']}")
            print(f"URL: {post['url']}")
            post_comments = post['comments']
            print(f"Comments: {len(post_comments)}")

            print("\nSample Comments:")
            for sample_number, comment in enumerate(post_comments[:3], start=1):
                # Long comments are cut to a 100-character preview.
                if len(comment['content']) > 100:
                    line = f"  {sample_number}. {comment['username']}: {comment['content'][:100]}..."
                else:
                    line = f"  {sample_number}. {comment['username']}: {comment['content']}"
                print(line)

            remaining = len(post_comments) - 3
            if remaining > 0:
                print(f"  ...and {remaining} more comments")

            print(divider)

        return scraper.posts

    finally:
        # Release the browser whether or not scraping succeeded.
        scraper.close()

# Function to process pre-downloaded HTML
def process_html_file(file_path, output_csv='extracted_comments.csv'):
    """Extract comments from a saved Reddit post page and export them to CSV.

    Reads the HTML file, passes it to
    ``RedditCommentScraper.extract_comments_from_html``, prints every comment
    (cut to 100 characters) and writes username/content pairs to *output_csv*.

    Parameters:
    - file_path: Path of the UTF-8 encoded HTML file to parse
    - output_csv: Destination CSV path (default: 'extracted_comments.csv')

    Returns:
        The list of extracted comment dictionaries, or an empty list when
        anything goes wrong (the error is logged, not raised).
    """
    scraper = None
    try:
        # Read the HTML file
        with open(file_path, 'r', encoding='utf-8') as f:
            html_content = f.read()

        # The scraper holds a Selenium driver (its close() quits self.driver),
        # so it has to be released in the finally clause below even though
        # parsing saved HTML does not need the browser.
        scraper = RedditCommentScraper()

        # Extract comments from the HTML
        comments = scraper.extract_comments_from_html(html_content)

        # Print the results
        print(f"\nExtracted {len(comments)} comments from HTML file:")
        print("=" * 80)

        for i, comment in enumerate(comments, 1):
            if len(comment['content']) > 100:
                print(f"{i}. {comment['username']}: {comment['content'][:100]}...")
            else:
                print(f"{i}. {comment['username']}: {comment['content']}")

        # Save to CSV
        with open(output_csv, 'w', newline='', encoding='utf-8') as csvfile:
            fieldnames = ['username', 'comment_content']
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

            writer.writeheader()
            for comment in comments:
                writer.writerow({
                    'username': comment['username'],
                    'comment_content': comment['content']
                })

        logger.info(f"Saved {len(comments)} comments to {output_csv}")

        return comments

    except Exception as e:
        logger.error(f"Error processing HTML file: {e}")
        return []

    finally:
        # Previously the scraper was never closed, leaving its driver running.
        if scraper is not None:
            try:
                scraper.close()
            except Exception as e:
                logger.error(f"Error closing scraper: {e}")

if __name__ == "__main__":
    import os

    # Location of an optional pre-downloaded post page.
    html_file_path = "reddit_post.html"  # Change this to your HTML file path

    if not os.path.exists(html_file_path):
        # No saved page available: do a live search and scrape
        main()
    else:
        # A saved page exists: parse it directly
        process_html_file(html_file_path)

2025-04-28 15:16:58,911 - INFO - Searching Reddit for: data science job
2025-04-28 15:16:58,912 - INFO - Search URL: https://www.reddit.com/search/?q=data+science+job
2025-04-28 15:17:03,825 - INFO - Found 7 posts in search results
2025-04-28 15:17:03,984 - INFO - Extracted data for 7 posts
2025-04-28 15:17:03,984 - INFO - Processing post 1/3: Update half a year into my first data science job...
2025-04-28 15:17:03,984 - INFO - Scraping post: Update half a year into my first data science job...
2025-04-28 15:17:08,181 - INFO - Found 10 comments in post
2025-04-28 15:17:10,452 - INFO - Processing post 2/3: How's the job market for data scientists?
2025-04-28 15:17:10,453 - INFO - Scraping post: How's the job market for data scientists?
2025-04-28 15:17:14,729 - INFO - Found 9 comments in post
2025-04-28 15:17:17,389 - INFO - Processing post 3/3: Should I accept this data science job (i.e. how bad is the job market?)
2025-04-28 15:17:17,389 - INFO - Scraping post: Should I accept this da

KeyboardInterrupt: 

In [12]:
import pandas as pd
# Display the scraped communities CSV. The "Unnamed: 0" column in the output
# below looks like a row index that was written into the file (presumably by
# DataFrame.to_csv without index=False - TODO confirm); passing index_col=0
# here would use it as the index instead of showing it as a data column.
pd.read_csv('jobs_communities.csv')

Unnamed: 0,name,description,members,online,url
0,r/jobs,/r/jobs is the number one community for advice...,2M,516,https://www.reddit.com/r/jobs/
1,r/RemoteJobs,Remote jobs: the future of work! READ RULES BE...,273K,28,https://www.reddit.com/r/RemoteJobs/
2,r/DesignJobs,"Some redditors are skilled professionals, some...",163K,32,https://www.reddit.com/r/DesignJobs/
3,r/AskReddit,r/AskReddit is the place to ask and answer tho...,55M,12K,https://www.reddit.com/r/AskReddit/
4,r/antiwork,"A subreddit for those who want to end work, ar...",2.9M,1.5K,https://www.reddit.com/r/antiwork/
5,r/torontoJobs,Redditor approved jobs in the GTHA.,61K,19,https://www.reddit.com/r/torontoJobs/
6,r/CanadaJobs,This is a place to support job seekers and hir...,23K,12,https://www.reddit.com/r/CanadaJobs/
7,r/recruitinghell,Did a recruiter make you send them a resume an...,844K,574,https://www.reddit.com/r/recruitinghell/
8,r/careerguidance,"A place to discuss career options, to ask ques...",4.7M,1.6K,https://www.reddit.com/r/careerguidance/
9,r/DubaiJobs,We all know how daunting the job hunt can be i...,15K,16,https://www.reddit.com/r/DubaiJobs/


In [10]:
"""
Enhanced Reddit User Comments Scraper
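This script searches Reddit for posts matching a query, scrapes their comments,
and then scrapes the comment history from each commenter's profile, scrolling
repeatedly to load more comments (up to configurable scroll and comment limits)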
"""

from bs4 import BeautifulSoup
import csv
import re
import logging
import time
import json
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException, ElementNotInteractableException

# Set up logging: INFO level and above, each record formatted as
# "<timestamp> - <level> - <message>". The scraper methods below report their
# progress and errors through the module-level `logger`.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger('reddit_comments_scraper')

class RedditCommentsScraper:
    def __init__(self):
        """Start a headless Chrome session and create empty result containers.

        Sets ``options`` (the Chrome options), ``driver`` (the Chrome
        WebDriver), ``posts`` (list of scraped post dictionaries) and
        ``user_data`` (scraped profile data keyed by username).
        """
        chrome_arguments = (
            '--headless',
            '--disable-gpu',
            '--no-sandbox',
            '--disable-dev-shm-usage',
            'user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        )

        self.options = Options()
        for argument in chrome_arguments:
            self.options.add_argument(argument)

        # Launch the browser with the options assembled above
        self.driver = webdriver.Chrome(options=self.options)
        self.posts = []
        self.user_data = {}
        
    def search_posts(self, search_query):
        """
        Search for posts related to the search query
        """
        # Format the search query for URL
        formatted_query = search_query.replace(' ', '+')
        search_url = f"https://www.reddit.com/search/?q={formatted_query}"
        logger.info(f"Searching Reddit for: {search_query}")
        logger.info(f"Search URL: {search_url}")
        
        try:
            # Load the search page
            self.driver.get(search_url)
            
            # Wait for post results to load
            WebDriverWait(self.driver, 10).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, "a[data-testid='post-title']"))
            )
            
            # Give the page a moment to fully load
            time.sleep(3)
            
            # Find all post links
            post_links = self.driver.find_elements(By.CSS_SELECTOR, "a[data-testid='post-title']")
            logger.info(f"Found {len(post_links)} posts in search results")
            
            # Extract post URLs and titles
            posts_data = []
            for link in post_links:
                try:
                    url = link.get_attribute("href")
                    title = link.find_element(By.TAG_NAME, "faceplate-screen-reader-content").text.strip()
                    posts_data.append({"url": url, "title": title})
                except Exception as e:
                    logger.error(f"Error extracting post data: {e}")
            
            logger.info(f"Extracted data for {len(posts_data)} posts")
            return posts_data
            
        except Exception as e:
            logger.error(f"Error searching Reddit: {e}")
            return []
    
    def scrape_post_and_comments(self, post_url, post_title, max_more_replies_clicks=5):
        """
        Scrape a specific post and the comments loaded on its page

        Loads the post, reads the author and every ``shreddit-comment``
        element, then clicks up to ``max_more_replies_clicks`` "more replies"
        buttons and collects whatever comments appear afterwards. Comments by
        "[deleted]" authors are skipped.

        Parameters:
        - post_url: URL of the post to open
        - post_title: Title stored in the result and used for logging
        - max_more_replies_clicks: Maximum number of "more replies" buttons
          to click (default: 5)

        Returns:
            A dictionary with "title", "url", "author" and "comments" (a list
            of {"username", "content"} dictionaries), or None if the post
            could not be scraped (the error is logged, not raised).
        """
        logger.info(f"Scraping post: {post_title}")

        try:
            # Navigate to the post
            self.driver.get(post_url)

            # Wait for the post content to load
            WebDriverWait(self.driver, 10).until(
                EC.presence_of_element_located((By.TAG_NAME, "shreddit-comment"))
            )

            # Give the page a moment to fully load
            time.sleep(3)

            # Extract post author
            try:
                post_author_element = self.driver.find_element(By.CSS_SELECTOR, "a[data-testid='post_author_link']")
                post_author = post_author_element.text.strip()
            except NoSuchElementException:
                post_author = "Unknown"

            post_data = {
                "title": post_title,
                "url": post_url,
                "author": post_author,
                "comments": []
            }

            # Ids of comment elements already handled. Freshly loaded replies
            # are not guaranteed to come last in find_elements' result, so
            # "new" elements are identified by id rather than by list position.
            seen_ids = set()

            comment_elements = self.driver.find_elements(By.TAG_NAME, "shreddit-comment")
            logger.info(f"Found {len(comment_elements)} comments in post")
            self._harvest_comments(comment_elements, seen_ids, post_data["comments"])

            # Click on "More replies" buttons to load additional comments
            try:
                more_replies_buttons = self.driver.find_elements(By.XPATH, "//button[contains(text(), 'more repl')]")
                for button in more_replies_buttons[:max_more_replies_clicks]:
                    try:
                        button.click()
                        time.sleep(2)  # Wait for new comments to load

                        current_elements = self.driver.find_elements(By.TAG_NAME, "shreddit-comment")
                        loaded = self._harvest_comments(current_elements, seen_ids, post_data["comments"])
                        logger.info(f"Loaded {loaded} additional comments")

                    except Exception as e:
                        logger.error(f"Error clicking more replies button: {e}")
            except Exception as e:
                logger.error(f"Error finding more replies buttons: {e}")

            return post_data

        except Exception as e:
            logger.error(f"Error scraping post {post_url}: {e}")
            return None

    def _harvest_comments(self, elements, seen_ids, comments):
        """Append data for every not-yet-seen comment element; return how many elements were new."""
        fresh = 0
        for element in elements:
            element_id = element.id
            if element_id in seen_ids:
                continue
            seen_ids.add(element_id)
            fresh += 1

            try:
                comment_data = self._extract_comment(element)
            except Exception as e:
                logger.error(f"Error extracting comment data: {e}")
                continue

            if comment_data is not None:
                comments.append(comment_data)
        return fresh

    def _extract_comment(self, comment):
        """Return {"username", "content"} for one comment element, or None for a "[deleted]" author."""
        username = "Unknown User"
        # Tried in order: regular author link, the span used for deleted
        # users, then a generic author link.
        username_selectors = (
            "a.text-12.font-bold",
            "span.text-neutral-content-weak",
            ".author-name-meta a",
        )
        for selector in username_selectors:
            try:
                username = comment.find_element(By.CSS_SELECTOR, selector).text.strip()
                break
            except NoSuchElementException:
                continue

        # Skip [deleted] users
        if username == "[deleted]":
            return None

        try:
            content = comment.find_element(By.CSS_SELECTOR, "div[slot='comment']").text.strip()
        except NoSuchElementException:
            content = "No content found"

        return {"username": username, "content": content}
    
    def process_search_results(self, search_query, max_posts=5):
        """
        Process the search results and scrape posts and comments
        """
        # Search for posts
        posts_data = self.search_posts(search_query)
        
        # Limit to max_posts
        posts_to_scrape = posts_data[:max_posts]
        
        # Scrape each post
        for i, post in enumerate(posts_to_scrape):
            logger.info(f"Processing post {i+1}/{len(posts_to_scrape)}: {post['title']}")
            post_data = self.scrape_post_and_comments(post['url'], post['title'])
            
            if post_data:
                self.posts.append(post_data)
            
            # Wait between posts to avoid rate limiting
            if i < len(posts_to_scrape) - 1:
                time.sleep(2)
        
        return self.posts
    
    def save_to_csv(self, filename_prefix=None):
        """
        Save the scraped comments to a CSV file
        """
        if filename_prefix is None:
            filename_prefix = "reddit_data"
        
        comments_filename = f"{filename_prefix}_comments.csv"
        
        # Save comments to CSV
        try:
            with open(comments_filename, 'w', newline='', encoding='utf-8') as csvfile:
                fieldnames = ['post_title', 'post_url', 'username', 'comment_content']
                writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
                
                writer.writeheader()
                for post in self.posts:
                    for comment in post['comments']:
                        writer.writerow({
                            'post_title': post['title'],
                            'post_url': post['url'],
                            'username': comment['username'],
                            'comment_content': comment['content']
                        })
            
            total_comments = sum(len(post['comments']) for post in self.posts)
            logger.info(f"Saved {total_comments} comments to {comments_filename}")
        except Exception as e:
            logger.error(f"Error saving comments to CSV: {e}")
        
    # USER PROFILE METHODS
    
    def handle_age_verification(self):
        """Click through the "Yes, I'm Over 18" prompt when it is present.

        Waits up to three seconds for the prompt element; if it shows up it is
        clicked and the method pauses two seconds for the page to reload.

        Returns:
            True if the prompt was found and clicked, False if it did not
            appear or could not be interacted with.
        """
        prompt_locator = (By.XPATH, "//span[contains(text(), \"Yes, I'm Over 18\")]")
        prompt_absent = (TimeoutException, NoSuchElementException, ElementNotInteractableException)

        try:
            # Short timeout: most pages have no prompt at all
            confirm_element = WebDriverWait(self.driver, 3).until(
                EC.presence_of_element_located(prompt_locator)
            )

            logger.info("Age verification detected, clicking 'Yes, I'm Over 18'")
            confirm_element.click()

            # Let the page load after verification
            time.sleep(2)
        except prompt_absent:
            # No prompt is the normal case, not an error
            logger.info("No age verification required")
            return False

        return True
    
    def click_comments_tab(self):
        """Click the Comments tab so that only comments are listed.

        Looks for a link wrapping a span whose text is exactly "Comments",
        then one whose text merely contains it. If neither can be found or
        clicked, falls back to the Overview tab.

        Returns:
            True if the Comments tab or the Overview fallback was clicked,
            False if no tab could be clicked.
        """
        try:
            # Try to find and click the Comments tab
            try:
                # First try the more specific XPath with text()
                comments_tab = self.driver.find_element(By.XPATH, "//span[text()='Comments']/ancestor::a")
                logger.info("Found Comments tab with exact text match")
            except NoSuchElementException:
                # Try the more general contains() match
                comments_tab = self.driver.find_element(By.XPATH, "//span[contains(text(), 'Comments')]/ancestor::a")
                logger.info("Found Comments tab with contains match")

            logger.info("Clicking Comments tab")
            comments_tab.click()
            time.sleep(2)
            return True

        except (NoSuchElementException, ElementNotInteractableException) as e:
            logger.error(f"Failed to click Comments tab: {e}")
            # Try to find Overview tab instead
            try:
                overview_tab = self.driver.find_element(By.XPATH, "//span[contains(text(), 'Overview')]/ancestor::a")
                logger.info("Clicking Overview tab instead")
                overview_tab.click()
                time.sleep(2)
                return True
            except Exception as fallback_error:
                # `except Exception` instead of a bare `except:` so that
                # KeyboardInterrupt/SystemExit still stop the run, and the
                # reason for the failure is logged instead of discarded.
                logger.error(f"Failed to find any navigation tabs: {fallback_error}")
                return False
    
    def scroll_and_scrape_comments(self, username, max_scrolls=10, max_comments=50):
        """
        Scroll down the page multiple times to load all comments
        and extract them after each scroll
        """
        comments = []
        last_comments_count = 0
        consecutive_no_new_comments = 0
        
        # Initial height
        last_height = self.driver.execute_script("return document.body.scrollHeight")
        
        # Scroll and extract comments in batches
        for scroll_num in range(max_scrolls):
            logger.info(f"Scroll {scroll_num+1}/{max_scrolls} for user {username}")
            
            # Scroll down
            self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            
            # Wait for new content to load
            time.sleep(3)
            
            # Get new scroll height
            new_height = self.driver.execute_script("return document.body.scrollHeight")
            
            # Extract comments after scrolling
            scroll_comments = self.extract_comments_from_page()
            
            # Add only new comments
            new_comments = [comment for comment in scroll_comments if comment not in comments]
            if new_comments:
                logger.info(f"Found {len(new_comments)} new comments after scroll {scroll_num+1}")
                comments.extend(new_comments)
                consecutive_no_new_comments = 0
            else:
                consecutive_no_new_comments += 1
                logger.info(f"No new comments found after scroll {scroll_num+1}")
            
            # If we've reached max comments, stop scrolling
            if len(comments) >= max_comments:
                logger.info(f"Reached maximum comment limit ({max_comments})")
                break
                
            # If same height for 3 consecutive times, no more content
            if new_height == last_height:
                logger.info("No more content to load (scroll height unchanged)")
                consecutive_no_new_comments += 1
            else:
                last_height = new_height
            
            # If no new comments for 3 consecutive scrolls, stop
            if consecutive_no_new_comments >= 3:
                logger.info(f"No new comments for 3 consecutive scrolls, stopping")
                break
        
        logger.info(f"Total comments extracted for {username}: {len(comments)}")
        return comments[:max_comments]
    
    def extract_comments_from_page(self):
        """Extract comments from the page currently loaded in the driver.

        Candidate elements are gathered with several CSS selectors. For each
        one the subreddit, post title, comment text and timestamp are read.
        Elements without usable text are dropped, and so are entries whose
        extracted data duplicates an earlier entry.

        Returns:
            A list of dictionaries with the keys "subreddit", "title",
            "content", "time" and "time_display".
        """
        comment_selectors = [
            "div.Comment",
            "shreddit-comment",
            "div.p-md",
            "div.mt-2xs"
        ]

        # An element can match more than one selector; keep each only once.
        comment_elements = []
        seen_element_ids = set()
        for selector in comment_selectors:
            for element in self.driver.find_elements(By.CSS_SELECTOR, selector):
                if element.id in seen_element_ids:
                    continue
                seen_element_ids.add(element.id)
                comment_elements.append(element)

        comments = []
        # Different elements (e.g. a wrapper and a block nested inside it) can
        # yield identical data; keep only the first occurrence.
        seen_comments = set()
        unusable_content = ("", "No content found", "Error extracting content")

        for element in comment_elements:
            try:
                comment_data = {}
                comment_data.update(self._extract_comment_context(element))
                comment_data["content"] = self._extract_comment_content(element)
                comment_data.update(self._extract_comment_time(element))

                # Add to comments list only if it contains actual content
                if comment_data["content"] in unusable_content:
                    continue

                key = tuple(sorted(comment_data.items()))
                if key in seen_comments:
                    continue
                seen_comments.add(key)
                comments.append(comment_data)

            except Exception as e:
                logger.error(f"Error processing comment element: {e}")

        return comments

    def _extract_comment_context(self, element):
        """Return the "subreddit" and "title" found inside a comment element."""
        # Defaults are set up front so "title" is always present, including
        # when title links exist but none of them carries a usable title.
        context = {"subreddit": "Unknown subreddit", "title": "Unknown title"}
        try:
            # Find links to subreddits
            subreddit_elements = element.find_elements(By.CSS_SELECTOR, "a[href^='/r/']")
            if subreddit_elements:
                context["subreddit"] = subreddit_elements[0].text.strip()

            # Find the post title
            title_elements = element.find_elements(By.CSS_SELECTOR, "a[aria-label*='Thread for'], a[href*='/comments/']")
            for title_elem in title_elements:
                aria_label = title_elem.get_attribute("aria-label")
                if aria_label and "comment on " in aria_label:
                    # maxsplit=1 keeps the whole title even if it contains
                    # the phrase "comment on " itself.
                    context["title"] = aria_label.split("comment on ", 1)[1]
                    break
                title_text = title_elem.text
                if title_text and len(title_text.strip()) > 5:
                    context["title"] = title_text.strip()
                    break
        except Exception as e:
            logger.error(f"Error extracting comment context: {e}")
            context = {"subreddit": "Unknown subreddit", "title": "Unknown title"}
        return context

    def _extract_comment_content(self, element):
        """Return the first non-empty text matched by the content selectors."""
        # Ordered from most to least specific
        content_selectors = [
            "div.md p",
            "div[data-click-id='text'] p",
            "div.text-neutral-content-strong p",
            "div[slot='comment']",
            "div.text-neutral-content-strong",
            "p"
        ]

        content = "No content found"
        for selector in content_selectors:
            try:
                content_elements = element.find_elements(By.CSS_SELECTOR, selector)
                if content_elements:
                    content = content_elements[0].text.strip()
                    if content:
                        break
            except Exception:
                # Best effort: a failing selector just moves on to the next.
                continue
        return content

    def _extract_comment_time(self, element):
        """Return the "time" (datetime attribute) and "time_display" (visible text) of a comment."""
        unknown = {"time": "Unknown time", "time_display": "Unknown time"}
        try:
            time_elements = element.find_elements(By.TAG_NAME, "time")
            if not time_elements:
                return unknown
            return {
                "time": time_elements[0].get_attribute("datetime"),
                "time_display": time_elements[0].text.strip(),
            }
        except Exception as e:
            logger.error(f"Error extracting comment time: {e}")
            return unknown
    
    def scrape_user_profile(self, username):
        """Scrape a Reddit user's profile focusing on comments"""
        profile_url = f"https://www.reddit.com/user/{username}/"
        logger.info(f"Scraping profile: {profile_url}")
        
        try:
            # Skip [deleted] users
            if username == "[deleted]":
                logger.info(f"Skipping [deleted] user")
                return None
                
            # Navigate to the profile
            self.driver.get(profile_url)
            
            # Wait for the page to load
            WebDriverWait(self.driver, 10).until(
                EC.presence_of_element_located((By.TAG_NAME, "body"))
            )
            
            # Handle age verification if needed
            self.handle_age_verification()
            
            # Try to click on the Comments tab to focus only on comments
            self.click_comments_tab()
            
            # Give the page a moment to load
            time.sleep(2)
            
            # Scroll and extract comments
            comments = self.scroll_and_scrape_comments(username)
            
            # Store the data
            self.user_data[username] = {
                "username": username,
                "profile_url": profile_url,
                "comments": comments
            }
            
            logger.info(f"Successfully scraped {len(comments)} comments from {username}'s profile")
            return self.user_data[username]
            
        except Exception as e:
            logger.error(f"Error scraping profile {username}: {e}")
            return None
    
    def get_unique_usernames_from_comments(self):
        """Extract unique usernames from collected comments"""
        usernames = set()
        
        for post in self.posts:
            for comment in post.get("comments", []):
                username = comment.get("username", "")
                # Skip [deleted] users and unknown users
                if username and username != "[deleted]" and username != "Unknown User":
                    usernames.add(username)
        
        logger.info(f"Extracted {len(usernames)} unique usernames from comments")
        return list(usernames)
    
    def scrape_all_users_from_comments(self, max_users=None):
        """Scrape profiles of all users who left comments"""
        usernames = self.get_unique_usernames_from_comments()
        
        # Limit to max_users if specified
        if max_users:
            usernames = usernames[:min(max_users, len(usernames))]
        
        logger.info(f"Preparing to scrape {len(usernames)} user profiles")
        
        for i, username in enumerate(usernames):
            logger.info(f"Processing user {i+1}/{len(usernames)}: {username}")
            self.scrape_user_profile(username)
            
            # Wait between profiles to avoid rate limiting
            if i < len(usernames) - 1:
                time.sleep(2)
        
        return self.user_data
    
    def save_user_comments_to_csv(self, filename="reddit_user_comments.csv"):
        """Save user comments to CSV"""
        try:
            with open(filename, 'w', newline='', encoding='utf-8') as csvfile:
                fieldnames = ['username', 'subreddit', 'post_title', 'comment_content', 'time']
                writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
                
                writer.writeheader()
                for username, user_data in self.user_data.items():
                    for comment in user_data.get('comments', []):
                        writer.writerow({
                            'username': username,
                            'subreddit': comment.get('subreddit', 'Unknown'),
                            'post_title': comment.get('title', 'Unknown'),
                            'comment_content': comment.get('content', 'No content'),
                            'time': comment.get('time', 'Unknown')
                        })
            
            total_comments = sum(len(user_data.get('comments', [])) for user_data in self.user_data.values())
            logger.info(f"Saved {total_comments} user comments to {filename}")
            return True
        except Exception as e:
            logger.error(f"Error saving user comments to CSV: {e}")
            return False
    
    def save_user_data_to_json(self, filename="reddit_users.json"):
        """Save user data to JSON"""
        try:
            with open(filename, 'w', encoding='utf-8') as f:
                json.dump(self.user_data, f, indent=4)
            
            logger.info(f"Saved data for {len(self.user_data)} users to {filename}")
            return True
        except Exception as e:
            logger.error(f"Error saving user data to JSON: {e}")
            return False
    
    def close(self):
        """Shut down the Selenium session held by this scraper.

        Quits ``self.driver`` and then logs that the driver was closed.
        """
        active_driver = self.driver
        active_driver.quit()
        logger.info("Closed web driver")

def main(search_query="data science job", max_posts=3, max_users=5,
         output_name="data_science_job"):
    """
    Run the whole pipeline: search posts, save their comments, then scrape and
    save the comment history of the users found in those comments.

    Parameters:
    - search_query: Search term passed to process_search_results
      (default: "data science job")
    - max_posts: Forwarded as max_posts to process_search_results (default: 3)
    - max_users: Forwarded as max_users to scrape_all_users_from_comments
      (default: 5, a small limit suitable for test runs)
    - output_name: Passed unchanged to scraper.save_to_csv
      (default: "data_science_job")

    Returns:
    - dict with "posts" (scraper.posts) and "users" (scraper.user_data)
    """
    # Create the scraper
    scraper = RedditCommentsScraper()

    try:
        # Process search results and save the collected comments
        scraper.process_search_results(search_query, max_posts=max_posts)
        scraper.save_to_csv(output_name)

        # Extract and scrape user profiles from the comments
        scraper.scrape_all_users_from_comments(max_users=max_users)

        # Save user comments data
        scraper.save_user_comments_to_csv()
        scraper.save_user_data_to_json()

        # Print a summary of the results
        print(f"\nScraped {len(scraper.posts)} posts for query: {search_query}")
        print(f"Scraped {len(scraper.user_data)} user profiles")

        # Sample of user comments
        sample_size = 2
        preview_length = 100
        for username, user_data in scraper.user_data.items():
            comments = user_data.get('comments') or []
            print(f"\nUser: {username}")
            print(f"Comments: {len(comments)}")

            for position, comment in enumerate(comments[:sample_size], start=1):
                print(f"  {position}. [{comment.get('subreddit', 'Unknown')}] {comment.get('title', 'Unknown title')}")
                # `or` also covers content stored as None/'' so len() cannot fail.
                content = comment.get('content') or 'No content'
                if len(content) > preview_length:
                    content = f"{content[:preview_length]}..."
                print(f"     Content: {content}")

            remaining = len(comments) - sample_size
            if remaining > 0:
                print(f"  ...and {remaining} more comments")

        return {"posts": scraper.posts, "users": scraper.user_data}

    finally:
        # Always release the browser, even when scraping fails part-way.
        scraper.close()

# Run main() only when this module is executed directly, not when it is imported.
if __name__ == "__main__":
    main()

2025-04-28 14:59:07,210 - INFO - Searching Reddit for: data science job
2025-04-28 14:59:07,210 - INFO - Search URL: https://www.reddit.com/search/?q=data+science+job
2025-04-28 14:59:12,069 - INFO - Found 7 posts in search results
2025-04-28 14:59:12,233 - INFO - Extracted data for 7 posts
2025-04-28 14:59:12,233 - INFO - Processing post 1/3: Anyone working as a data scientist/ data analyst/ data engineer? Is getting a job in this field tough as a fresher?
2025-04-28 14:59:12,233 - INFO - Scraping post: Anyone working as a data scientist/ data analyst/ data engineer? Is getting a job in this field tough as a fresher?
2025-04-28 14:59:16,533 - INFO - Found 2 comments in post
2025-04-28 14:59:18,611 - INFO - Processing post 2/3: Jobs in Entry level Data Science ?
2025-04-28 14:59:18,612 - INFO - Scraping post: Jobs in Entry level Data Science ?
2025-04-28 14:59:22,645 - INFO - Found 2 comments in post
2025-04-28 14:59:24,714 - INFO - Processing post 3/3: How's the job market for data sc


Scraped 3 posts about data science jobs
Scraped 4 user profiles

User: itsMalarky
Comments: 50
  1. [] I like the label.
     Content: Definitely worth it. Same area
  2. [] Making friends
     Content: Game night (Tuesday), trivia on Wednesdays (try to jump into a team near the bar) and cribbage night...
  ...and 48 more comments

User: Party_Lingonberry_58
Comments: 50
  1. [] ECE 606 - final?
     Content: The grades are posted in Learn!
  2. [] ECE 606 - final?
     Content: I haven’t seen the grades yet
  ...and 48 more comments

User: occasional_cynic
Comments: 50
  1. [] Opposing QB’s WILL need diapers. Giants select DL, Darius Alexander (Toledo).
     Content: BJ Hill has been pretty darn good.  He just got into Judge's doghouse because he was a terrible coac...
  2. [] Opposing QB’s WILL need diapers. Giants select DL, Darius Alexander (Toledo).
     Content: It's very important.  Dominating twenty year olds when you are that old is a red flag.  See: JMS.
  ...and 48 more com

2025-04-28 15:00:54,107 - INFO - Closed web driver


In [11]:
pd.read_csv('reddit_user_comments.csv')

Unnamed: 0,username,subreddit,post_title,comment_content,time
0,itsMalarky,,I like the label.,Definitely worth it. Same area,2025-04-28T00:58:56.716Z
1,itsMalarky,,Making friends,"Game night (Tuesday), trivia on Wednesdays (tr...",2025-04-27T23:43:02.042Z
2,itsMalarky,,I like the label.,Good beer everywhere. No point even making a c...,2025-04-27T18:17:59.349Z
3,itsMalarky,,You are now your username. What are you?,Bullshit.,2025-04-27T00:50:48.460Z
4,itsMalarky,,Nightmare pairing with a Marines,"Cool, still proving my point lol. Here ya go 🖍...",2025-04-27T00:48:06.562Z
...,...,...,...,...,...
148,occasional_cynic,Unknown subreddit,Unknown title,because AWS doesn’t route public replies back ...,Unknown time
149,occasional_cynic,Unknown subreddit,Unknown title,In theory yes.,Unknown time
150,AtmosphereGrouchy245,,Anyone working as a data scientist/ data analy...,"Hey folks, I’m on the hunt for a data science ...",2025-03-26T01:54:35.773Z
151,AtmosphereGrouchy245,Unknown subreddit,Unknown title,2,2022-02-17T00:00:00.000Z
