# Google Groups Data Scraper

## Overview
This Python script scrapes thread titles, URLs, and post details from all pages of Google Groups search results for a specified group and search term, and saves the results to a CSV file. The group and search term are hard-coded near the top of the script as `group_name` and `search_term` (currently `"sci.space"` and `"test"`); edit those two values to target another group or query, for example `"alt.conspiracy.area51"`.

## Functionality
- **Pagination Handling**: Navigates through all available pages of search results.
- **Data Extraction**: Retrieves thread titles, URLs, authors, timestamps, and content of each post.
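- **Error Handling**: Retries failed connections, catches request errors instead of crashing, and substitutes placeholder values (e.g. "No Title", "No Author") when an expected page element is missing.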
- **Data Storage**: Results are saved into a CSV file, named with the group, search term, and the timestamp of when the file was generated.

## Output File
The output CSV file is named in the format `{group_name}_{search_term}_{timestamp}.csv`, making it easy to identify and organize files based on the group, search term, and the time of data scraping.

## Libraries Used
- `requests`: For performing HTTP requests.
- `beautifulsoup4`: For parsing HTML and XML documents.
- `csv`: For writing the data into a CSV format.
- `datetime`: For generating the current timestamp to append to the output file name.

## Usage
To use this script, install the third-party libraries (`pip install requests beautifulsoup4`; `csv` and `datetime` ship with Python) and run it in an environment with network access and permission to write files to the working directory. This script assumes compliance with Google's Terms of Service and the appropriate use of data scraped from the internet.


In [None]:
# Scratch cell: prints a fixed greeting (presumably a quick kernel check -- confirm before removing).
print("hello world")

In [3]:
# FINAL VERSION
# This script scrapes thread titles, URLs, and post details from all pages of Google Groups search results for a given query and group.
# It saves the scraped data into a CSV file with the group name, search term, and current time of file generation included in the filename.
# The script includes error handling to avoid NoneType errors and implements pagination to gather data from all result pages.

import csv
from datetime import datetime
from urllib.parse import quote, quote_plus, urljoin

import requests
from bs4 import BeautifulSoup
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry

# Hard-coded inputs for group and search term.
# Example alternative:
# group_name = "alt.conspiracy.area51"
# search_term = "disinfo"

group_name = "sci.space"
search_term = "test"

# Root of the Google Groups site; also used to turn hrefs found in scraped
# pages into absolute URLs.
base_url = "https://groups.google.com"


def build_search_url(group, term):
    """Return the search-results URL for *term* within *group*.

    Both parts are percent-encoded so that groups or search terms containing
    spaces, ``&``, ``#``, ``/`` and similar characters cannot corrupt the URL.

    Args:
        group: Google Groups group name, e.g. ``"sci.space"``.
        term: Free-text search query.

    Returns:
        The absolute URL of the first search-results page.
    """
    encoded_group = quote(group, safe="")
    encoded_term = quote_plus(term)
    return f"{base_url}/g/{encoded_group}/search?q={encoded_term}&pli=1"


# URL of the first search-results page for the configured group and term.
search_url = build_search_url(group_name, search_term)

# Shared HTTP session. The retry policy allows up to 3 retries of
# connection-related errors with a backoff factor of 0.5, and the same
# adapter is installed for both URL schemes.
retry = Retry(connect=3, backoff_factor=0.5)
adapter = HTTPAdapter(max_retries=retry)
session = requests.Session()
for scheme in ("http://", "https://"):
    session.mount(scheme, adapter)

# Accumulator for scraped posts: one dict per post, appended to while pages
# are processed and read back when the CSV file is written.
all_posts = []


# Upper bound (seconds) on how long a single HTTP request may block; without
# a timeout a stalled connection would hang the scraper indefinitely.
_REQUEST_TIMEOUT_SECONDS = 30


def _text_or_default(element, default):
    """Return the stripped text of a parsed element, or *default* if absent."""
    return element.text.strip() if element is not None else default


def _scrape_thread(title, thread_url):
    """Fetch one thread page and append a record per post to ``all_posts``.

    Raises:
        requests.exceptions.RequestException: if the thread page cannot be
            fetched or returns a 4xx/5xx status.
    """
    thread_response = session.get(thread_url, timeout=_REQUEST_TIMEOUT_SECONDS)
    thread_response.raise_for_status()
    thread_soup = BeautifulSoup(thread_response.content, "html.parser")

    for post in thread_soup.find_all("section", class_="BkrUxb"):
        content_element = post.find("div", class_="ptW7te")
        if content_element is not None:
            # Post text is taken from the <p> elements inside the content div.
            content = " ".join(
                p.text.strip() for p in content_element.find_all("p")
            )
        else:
            content = "No Content"

        all_posts.append(
            {
                "Thread Title": title,
                "Thread URL": thread_url,
                "Author": _text_or_default(
                    post.find("h3", class_="s1f8Zd"), "No Author"
                ),
                "Timestamp": _text_or_default(
                    post.find("span", class_="zX2W9c"), "No Timestamp"
                ),
                "Content": content,
            }
        )


def get_threads_on_page(url):
    """Scrape every thread listed on one search-results page.

    Each thread entry found on the page is followed to its own page, and one
    record per post is appended to the module-level ``all_posts`` list.
    A thread that has no link, or whose page cannot be fetched, is reported
    and skipped so that one bad thread does not discard the rest of the page.

    Args:
        url: Absolute URL of a search-results page.

    Returns:
        The parsed ``BeautifulSoup`` document of the results page (so the
        caller can look for a next-page link), or ``None`` if the results
        page itself could not be fetched.
    """
    try:
        response = session.get(url, timeout=_REQUEST_TIMEOUT_SECONDS)
        response.raise_for_status()  # Raise an exception for 4xx or 5xx status codes
    except requests.exceptions.RequestException as e:
        print("An error occurred while making the request:", e)
        return None

    soup = BeautifulSoup(response.content, "html.parser")

    for thread in soup.find_all("div", class_="cXEmmc"):
        title = _text_or_default(thread.find("div", class_="t17a0d"), "No Title")

        link_element = thread.find("a", class_="ZLl54")
        href = link_element.get("href") if link_element is not None else None
        if not href:
            # Without a link there is no thread page to request; requesting a
            # placeholder string would raise and abort the whole results page.
            print(f"Skipping thread without a URL: {title}")
            continue

        # urljoin copes with absolute-path, relative and fully-qualified hrefs.
        thread_url = urljoin(base_url, href)
        try:
            _scrape_thread(title, thread_url)
        except requests.exceptions.RequestException as e:
            # Keep going: a single unreachable thread should not stop the crawl.
            print(f"An error occurred while fetching thread {thread_url}:", e)

    return soup


def get_all_pages():
    """Scrape the search results page by page, starting at ``search_url``.

    After each page is processed by ``get_threads_on_page``, the anchor with
    class ``G0iuSb`` (which this script treats as the next-page link) is
    followed. The walk stops when a page fails to load, when no such link
    (or no ``href`` on it) is found, or when the link leads to a page that
    was already visited -- the last guard prevents an endless loop.
    """
    visited_urls = set()
    next_page_url = search_url

    while next_page_url and next_page_url not in visited_urls:
        visited_urls.add(next_page_url)

        soup = get_threads_on_page(next_page_url)
        if soup is None:
            break

        next_page_element = soup.find("a", class_="G0iuSb")
        href = next_page_element.get("href") if next_page_element is not None else None
        # urljoin copes with absolute-path, relative and fully-qualified hrefs.
        next_page_url = urljoin(base_url, href) if href else None


# Crawl every result page; scraped records accumulate in all_posts.
get_all_pages()

# Write the collected records to a CSV file whose name combines the group,
# the search term and the moment the file was generated.
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
filename = f"{group_name}_{search_term}_{timestamp}.csv"
csv_columns = ["Thread Title", "Thread URL", "Author", "Timestamp", "Content"]

with open(filename, "w", newline="", encoding="utf-8") as csv_file:
    csv_writer = csv.DictWriter(csv_file, fieldnames=csv_columns)
    csv_writer.writeheader()
    csv_writer.writerows(all_posts)

print(f"Data has been saved to {filename}")

Data has been saved to sci.space_test_20240707_183529.csv
