In [16]:
pip install requests beautifulsoup4 python-dotenv selenium pandas


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip3.12 install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [3]:
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time

class GroupsIOScraper:
    def __init__(self):
        self.session = requests.Session()
        self.base_url = "https://groups.io"
        self.login_url = f"{self.base_url}/login"
        self.driver = None
        
    def login_with_requests(self, email, password):
        """Attempt to login using requests library"""
        try:
            # Get the login page first to obtain CSRF token
            response = self.session.get(self.login_url)
            soup = BeautifulSoup(response.text, 'html.parser')
            
            # Extract CSRF token
            csrf_token = soup.find('input', {'name': 'csrf'})['value']
            monocle_token = soup.find('input', {'name': 'monocle'})['value']
            
            # Prepare login data
            login_data = {
                'email': email,
                'password': password,
                'csrf': csrf_token,
                'monocle': monocle_token,
                'timezone': 'America/New_York'
            }
            
            # Attempt login
            response = self.session.post(self.login_url, data=login_data)
            
            # Check if login was successful
            if response.url != self.login_url:  # Usually redirects after successful login
                print("Login successful using requests!")
                return True
            else:
                print("Login failed using requests, trying Selenium...")
                return False
                
        except Exception as e:
            print(f"Error during requests login: {str(e)}")
            return False
            
    def login_with_selenium(self, email, password):
        """Attempt to login using Selenium"""
        try:
            # Initialize Chrome driver if not already initialized
            if self.driver is None:
                self.driver = webdriver.Chrome()
            
            self.driver.get(self.login_url)
            
            # Wait for email field and enter credentials
            email_field = WebDriverWait(self.driver, 10).until(
                EC.presence_of_element_located((By.ID, "email"))
            )
            email_field.send_keys(email)
            
            # Find and fill password field
            password_field = self.driver.find_element(By.ID, "password")
            password_field.send_keys(password)
            
            # Click login button
            login_button = self.driver.find_element(By.ID, "loginbutton")
            login_button.click()
            
            # Wait for redirect or new element that indicates successful login
            time.sleep(3)  # Give it some time to process
            
            # Check if login was successful
            if self.driver.current_url != self.login_url:
                print("Login successful using Selenium!")
                return True
            else:
                print("Login failed using Selenium")
                self.quit_driver()
                return False
                
        except Exception as e:
            print(f"Error during Selenium login: {str(e)}")
            self.quit_driver()
            return False
    
    def quit_driver(self):
        """Safely quit the Selenium driver"""
        if self.driver is not None:
            self.driver.quit()
            self.driver = None
    
    def get_driver(self):
        """Return the current driver instance"""
        return self.driver

In [24]:
from bs4 import BeautifulSoup
from datetime import datetime
import pandas as pd
import time
import re

class GroupsIOForumScraper:
    def __init__(self, driver):
        """Initialize with Selenium webdriver instance"""
        self.driver = driver
        self.base_url = "https://groups.io/g/peds-endo"
        self.posts_data = []
        self.replies_data = []
        
    def navigate_to_topics(self):
        """Navigate to the topics page"""
        try:
            topics_url = f"{self.base_url}/topics"
            self.driver.get(topics_url)
            time.sleep(3)  # Wait for page to load
            return True
        except Exception as e:
            print(f"Error navigating to topics page: {str(e)}")
            return False

    def get_next_page_url(self):
        """Extract the URL for the next page"""
        soup = BeautifulSoup(self.driver.page_source, 'html.parser')
        pagination = soup.find('ul', {'class': 'pagination'})
        if pagination:
            next_link = pagination.find('a', href=lambda x: x and 'page=' in x)
            if next_link:
                # Return complete URL
                return f"https://groups.io{next_link['href']}" if next_link['href'].startswith('/') else next_link['href']
        return None

    def collect_all_preview_data(self, max_pages=None):
        """Collect preview data from all pages before processing individual posts"""
        print("Starting to collect preview data from all pages...")
        all_preview_data = []
        current_page = 1
        
        while True:
            print(f"Collecting previews from page {current_page}...")
            
            # Scrape current page
            page_topics = self.scrape_topics_page()
            all_preview_data.extend(page_topics)
            
            # Check if we've reached max_pages
            if max_pages and current_page >= max_pages:
                print(f"Reached maximum pages limit ({max_pages})")
                break
                
            # Get next page URL
            next_url = self.get_next_page_url()
            if not next_url:
                print("No more pages available")
                break
                
            # Navigate to next page
            print(f"Navigating to next page: {next_url}")
            self.driver.get(next_url)
            time.sleep(3)  # Wait for page load
            current_page += 1
            
        print(f"Collected preview data for {len(all_preview_data)} topics")
        return all_preview_data

    def process_posts_with_replies(self, preview_data):
        """Process individual posts that have replies"""
        print("Starting to process individual posts with replies...")
        
        for topic in preview_data:
            if topic['reply_count'] > 0:
                print(f"Processing post: {topic['title']} (Reply count: {topic['reply_count']})")
                self.scrape_single_post(topic['url'], topic.copy())
            else:
                # For posts without replies, just add the preview data
                topic['post_id'] = self.extract_post_id(topic['url'])
                topic['full_content'] = topic['preview']
                self.posts_data.append(topic)

    def scrape_topics_page(self):
        """Scrape all topics from the current page"""
        soup = BeautifulSoup(self.driver.page_source, 'html.parser')
        topics = []
        
        # Find all topic rows in the table
        topic_rows = soup.find('table', {'id': 'records'}).find_all('tr')
        
        for row in topic_rows:
            topic_data = self._parse_topic_row(row)
            if topic_data:
                topics.append(topic_data)
                
        return topics
    
    def _parse_topic_row(self, row):
        """Parse individual topic row"""
        try:
            # Main container for topic info
            topic_container = row.find('div', {'style': 'margin-top:3px;margin-bottom:3px;'})
            if not topic_container:
                return None
                
            # Get topic link and title
            subject_span = topic_container.find('span', {'class': 'subject'})
            if not subject_span:
                return None
                
            link = subject_span.find('a')
            topic_url = link['href'] if link else None
            topic_title = link.text.strip() if link else None
            
            # Get topic preview
            preview_div = topic_container.find('div', {'class': 'truncate-one-line'})
            preview_text = preview_div.text.strip() if preview_div else None
            
            # Get thread info (author, dates)
            thread_info = topic_container.find('span', {'class': 'thread-attribution'})
            author = thread_info.find(string=True, recursive=False).strip().replace('Started by', '').strip() if thread_info else None
            
            # Get dates
            dates = thread_info.find_all('span', {'title': True})
            start_date = dates[0]['title'] if dates else None
            last_reply_date = dates[1]['title'] if len(dates) > 1 else None
            
            # Get reply count
            reply_count = 0
            hashtag_span = subject_span.find('span', {'class': 'hashtag-position'})
            if hashtag_span:
                try:
                    reply_count = int(hashtag_span.text.strip())
                except ValueError:
                    pass
                    
            # Get if attachments exist
            has_attachments = bool(subject_span.find('i', {'class': 'fa-paperclip'}))
            
            return {
                'title': topic_title,
                'url': topic_url,
                'preview': preview_text,
                'author': author,
                'start_date': start_date,
                'last_reply_date': last_reply_date,
                'reply_count': reply_count,
                'has_attachments': has_attachments
            }
            
        except Exception as e:
            print(f"Error parsing row: {str(e)}")
            return None

    def extract_post_id(self, url):
        """Extract post ID from URL"""
        try:
            return url.split('/')[-1]
        except:
            return None
            
    def parse_date(self, date_str):
        """Parse date string into standardized format"""
        try:
            # Add your date parsing logic here
            return date_str
        except:
            return None
            
    def scrape_single_post(self, url, topic_data):
        """Scrape an individual post and its replies"""
        try:
            # Navigate to post page
            self.driver.get(url)
            time.sleep(2)
            soup = BeautifulSoup(self.driver.page_source, 'html.parser')
            
            # Extract main post content
            main_post = soup.find('div', {'class': 'table-background-color expanded-message'})
            if not main_post:
                return
                
            # Get main post details
            post_id = self.extract_post_id(url)
            author = main_post.find('u').text.strip() if main_post.find('u') else None
            date = main_post.find('span', {'title': True})['title'] if main_post.find('span', {'title': True}) else None
            content = main_post.find('div', {'id': lambda x: x and x.startswith('msgbody')}).text.strip() if main_post.find('div', {'id': lambda x: x and x.startswith('msgbody')}) else None
            
            # Update the topic data with full content
            topic_data.update({
                'post_id': post_id,
                'full_content': content,
                'author': author,
                'date': self.parse_date(date)
            })
            self.posts_data.append(topic_data)
            
            # Get replies (all subsequent expanded-message divs)
            replies = soup.find_all('div', {'class': 'table-background-color expanded-message'})[1:]  # Skip first one (main post)
            
            for reply in replies:
                reply_data = {
                    'parent_post_id': post_id,
                    'reply_author': reply.find('u').text.strip() if reply.find('u') else None,
                    'reply_date': self.parse_date(reply.find('span', {'title': True})['title']) if reply.find('span', {'title': True}) else None,
                    'reply_content': reply.find('div', {'id': lambda x: x and x.startswith('msgbody')}).text.strip() if reply.find('div', {'id': lambda x: x and x.startswith('msgbody')}) else None
                }
                self.replies_data.append(reply_data)
                
        except Exception as e:
            print(f"Error scraping post {url}: {str(e)}")
            
    def scrape_forum(self, max_pages=None):
        """Main method to scrape the forum"""
        if not self.navigate_to_topics():
            return False
            
        # First collect all preview data
        preview_data = self.collect_all_preview_data(max_pages)
        
        # Then process posts with replies
        self.process_posts_with_replies(preview_data)
        
        return self.posts_data, self.replies_data
            
    def save_to_csv(self, posts_filename='posts.csv', replies_filename='replies.csv'):
        """Save posts and replies to separate CSV files"""
        # Save posts
        posts_df = pd.DataFrame(self.posts_data)
        posts_df.to_csv(posts_filename, index=False)
        print(f"Saved {len(self.posts_data)} posts to {posts_filename}")
        
        # Save replies
        replies_df = pd.DataFrame(self.replies_data)
        replies_df.to_csv(replies_filename, index=False)
        print(f"Saved {len(self.replies_data)} replies to {replies_filename}")

# Example usage:
"""
scraper = GroupsIOScraper()
if scraper.login_with_selenium(email, password):
    forum_scraper = GroupsIOForumScraper(scraper.get_driver())
    posts, replies = forum_scraper.scrape_forum(max_pages=5)
    forum_scraper.save_to_csv()
"""

'\nscraper = GroupsIOScraper()\nif scraper.login_with_selenium(email, password):\n    forum_scraper = GroupsIOForumScraper(scraper.get_driver())\n    posts, replies = forum_scraper.scrape_forum(max_pages=5)\n    forum_scraper.save_to_csv()\n'

In [5]:
scraper = GroupsIOScraper()
email = "allan.ascencio@gmail.com"
password = "Sah%b5BGn9TBgia"

# Try requests first, then Selenium if needed
if not scraper.login_with_requests(email, password):
    selenium_success = scraper.login_with_selenium(email, password)
    if not selenium_success:
        print("All login attempts failed.")


Error during requests login: 'NoneType' object is not subscriptable
Login successful using Selenium!


In [6]:
# Cell 2: Use the driver for other operations
driver = scraper.get_driver()
if driver:
    # Use the driver for additional operations
    driver.get("https://groups.io/g/peds-endo/topics")
    # ... perform other operations ...



In [25]:
scraper = GroupsIOScraper()
if scraper.login_with_selenium(email, password):
    forum_scraper = GroupsIOForumScraper(scraper.get_driver())
    posts, replies = forum_scraper.scrape_forum(max_pages=150)
    forum_scraper.save_to_csv()

Login successful using Selenium!
Starting to collect preview data from all pages...
Collecting previews from page 1...
Navigating to next page: https://groups.io/g/peds-endo/topics?page=2&after=1733530647300743547
Collecting previews from page 2...
Navigating to next page: https://groups.io/g/peds-endo/topics?page=3&after=1731481815837865745
Collecting previews from page 3...
Navigating to next page: https://groups.io/g/peds-endo/topics?page=2&before=1731366218515591380
Collecting previews from page 4...
Navigating to next page: https://groups.io/g/peds-endo/topics?page=3&after=1731481815837865745
Collecting previews from page 5...
Reached maximum pages limit (5)
Collected preview data for 100 topics
Starting to process individual posts with replies...
Processing post: Ped Endo in Orlando, FL (Reply count: 2)
Processing post: Hypercalcemia Case (Reply count: 8)
Processing post: Thyroid- DIO2 rs225014 homozygous genotype (Reply count: 3)
Processing post: Ped Endo in Austin TX and Valdos

In [8]:
# Cell 3: Clean up when done
scraper.quit_driver()