<a href="https://colab.research.google.com/github/CSE291A-GEO/anti-geo/blob/simran-dev/Anti_GEO_Dataset_Generation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [195]:
# Only needed for one-time setup
# -------------------------------
!pip3 install serpapi
!pip3 install google-search-results
!pip3 install google-search



In [196]:
import json
import csv
import zipfile
import io
from serpapi.google_search import GoogleSearch
from google.colab import files
import os
import time

In [197]:
SERP_API_KEY = "dc7249b6d400425f10ca763575494bb64e36c4d71bac350733fa624f75135297"
QUERIES_CSV_FILE = "combined_queries_10.csv"
QUERIES_JSON_FILE = "geo_bench_dataset.json"
QUERY_SOURCES = ["VACOS", "DebateQA", "HotpotQA", "Pinocchios", "QuoraQuestions"]
DATASET_ZIP_FILE = "anti_geo_dataset.tsv.zip"
DATASET_TSV_FILE = "anti_geo_dataset.tsv"
SEARCHED_QUERIES_FILE = "searched_queries.txt"

In [198]:
def load_queries_from_json(json_path=QUERIES_JSON_FILE, tags=[]):
    """
    Load queries from a JSON file, optionally filtering by tags.

    Args:
        json_path (str): Path to the JSON file containing queries and metadata.
        tags (list): List of tags to filter queries. If empty, all queries are returned.

    Returns:
        list: A list of query strings that match any of the specified tags.
              If tags list is empty, returns all queries.
    """
    with open(json_path, 'r', encoding='utf-8') as f:
        dataset = json.load(f)

    tagSet = set(tags)
    queries = []

    for entry in dataset:
        t = set(entry.get('tags', []))
        if len(tags) == 0 or len(tagSet.intersection(t)) > 0:
          queries.append(entry['query'])

    return queries

def load_queries_from_csv(csv_path=QUERIES_CSV_FILE, include_sources=[], exclude_sources=[]):
    """
    Reads queries from CSV with optional source filtering.

    Parameters:
    - csv_path: path to the CSV file
    - include_sources: list of sources to include (if empty or None, includes all)
    - exclude_sources: list of sources to exclude (if empty or None, excludes none)

    Returns:
    - List of queries after filtering
    """
    queries = []

    with open(csv_path, mode='r', encoding='utf-8') as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader:
            query = row.get('query').strip()
            source = row.get('source').strip()

            if include_sources and source not in include_sources:
                continue

            if exclude_sources and source in exclude_sources:
                continue

            queries.append(query)
    return queries

def search_queries(queries, api_key):
    """
    Perform Google search queries using serpAPI, collect and store search
    results.

    Args:
        queries (list): List of query strings to search.
        api_key (str): API key for authenticating the search requests.

    Returns:
        list: A list of dictionaries, each containing:
            - "query": the original query string,
            - "organic_results": list of search results from Google,
            - "ai_overview": AI overview related information (if available),
            - "ai_mode": AI mode related search results.
    Side Effects:
        - Appends processed queries to a local file SEARCHED_QUERIES_FILE.
        - Downloads the updated searches file for access.
    """
    print(f"Scraping results for {len(queries)} queries.")
    if len(queries) == 0:
      print(f"No queries to search.")
      return []

    dataset = []
    for i, query in enumerate(queries):
        print(f"Processing query {i + 1}: {query}")
        success = False

        while not success:
          try:
            print("Querying Google search")

            # API Spec: https://serpapi.com/search-api
            params = {
                "engine": "google",
                "q": query,
                "api_key": api_key,
                "num": 100
            }
            search = GoogleSearch(params)
            results = search.get_dict()
            organic_results = results.get('organic_results', [])

            print("Querying Google AI Overview")
            ai_overview = results.get('ai_overview')
            ai_overview_results = {}
            if ai_overview and 'page_token' in ai_overview:
                overview_params = {
                    "engine": "google_ai_overview",
                    "api_key": api_key,
                    "page_token": ai_overview['page_token']
                }
                overview_search = GoogleSearch(overview_params)
                overview_results = overview_search.get_dict()
                ai_overview_results = overview_results.get('ai_overview', {})

            print("Querying Google AI Mode")
            # API Spec: https://serpapi.com/playground?engine=google_ai_mode&q=Compare+wool%2C+down%2C+and+synthetic+jackets+in+terms+of+insulation%2C+water+resistance%2C+and+durability&gl=us&hl=en
            ai_mode_params = {
                "engine": "google_ai_mode",
                "q": query,
                "api_key": api_key
            }

            ai_mode_search = GoogleSearch(ai_mode_params)
            ai_mode_results = ai_mode_search.get_dict()

            dataset.append({
                "query": query,
                "organic_results": organic_results,
                "ai_overview": ai_overview_results,
                "ai_mode": ai_mode_results
            })

            with open(SEARCHED_QUERIES_FILE, "a", encoding="utf-8") as q:
                      q.write(query + "\n")

            success = True
          except Exception as e:
              print(f"Error processing query '{query}': {e}")
              print(f"Retrying query after {15} seconds...")
              time.sleep(15)

    print(f"File: {SEARCHED_QUERIES_FILE} updated with newly searched queries")
    files.download(SEARCHED_QUERIES_FILE)
    print(f"File: {SEARCHED_QUERIES_FILE} downloaded")
    return dataset

def create_zip_with_tsv(new_rows, zip_path=DATASET_ZIP_FILE, tsv_filename=DATASET_TSV_FILE):
    """
    Create or update a zipped TSV file by merging existing content with new rows.

    Args:
        new_rows (list): List of lists, each inner list representing a TSV row (strings).
        zip_path (str): Path to the zipped file to create or update.
        tsv_filename (str): Name of the TSV file inside the zip archive.

    Returns:
        str: The path to the updated zip file containing the TSV.
    Side Effects:
        - Reads existing TSV data from the zip if present.
        - Writes updated TSV back to zip, overwriting existing content.
    """
    # Step 1: Extract or create temporary TSV file
    existing_rows = []
    if os.path.exists(zip_path):
        print(f"Extracting existing search results from {DATASET_ZIP_FILE}.")
        with zipfile.ZipFile(zip_path, 'r') as z:
            with z.open(tsv_filename) as f:
                existing_output = io.StringIO(f.read().decode('utf-8'))
                csv_reader = csv.reader(existing_output, delimiter='\t')
                for i, row in enumerate(csv_reader):
                    existing_rows.append(row)
    else:
        os.makedirs(os.path.dirname(zip_path), exist_ok=True)
        open(tsv_filename, 'wb').close()
        existing_rows.append(["query", "organic_results", "ai_overview", "ai_mode"])

    # Step 2: Append old and new content
    all_rows = existing_rows
    all_rows.extend(new_rows)
    output = io.StringIO()
    writer = csv.writer(output, delimiter='\t')
    writer.writerows(all_rows)
    tsv_bytes = output.getvalue().encode('utf-8')

    # Step 3: Write updated TSV back to ZIP (overwrite existing)
    print(f"Writing all search results to {DATASET_TSV_FILE} and zipping it as {DATASET_ZIP_FILE}.")
    with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as z:
        z.writestr(tsv_filename, tsv_bytes)

    return zip_path

def store_results(dataset):
    """
    Process a dataset of search results and store them in a zipped TSV file.

    Args:
        dataset (list): List of dictionaries representing search results. Each entry should have keys:
            "query", "organic_results", "ai_overview", "ai_mode".

    Side Effects:
        - Serializes the dataset to TSV format inside a zipped file.
        - Downloads the zipped search results file.
        - Prints logging info about progress and file locations.
    """
    print(f"Extracting search results.")
    if len(dataset) == 0:
      print(f"No search results were returned.")
      return

    output = io.StringIO()

    results = []
    for entry in dataset:
        # Serialize each dictionary as a JSON string
        organic_results_json = json.dumps(entry['organic_results'], ensure_ascii=False)
        ai_overview_json = json.dumps(entry.get('ai_overview', None), ensure_ascii=False)
        ai_mode_json = json.dumps(entry.get('ai_mode', None), ensure_ascii=False)
        results.append([entry['query'], organic_results_json, ai_overview_json, ai_mode_json])

    zip_file = create_zip_with_tsv(results)
    print(f"Results stored in {DATASET_ZIP_FILE}")
    files.download(zip_file)
    print(f"{DATASET_ZIP_FILE} downloaded")

def filter_queries(queries, filename=SEARCHED_QUERIES_FILE):
    """
    Filter out queries that have already been scraped by reading from a file.

    Args:
        queries (list): List of query strings to filter.
        filename (str): Path to the file containing previously scraped queries.

    Returns:
        list: A filtered list of queries excluding those found in the scraped queries file.
    Side Effects:
        - Prints information about how many queries were filtered out.
    """
    print(f"Filtering out already scraped queries.")
    file_queries = set()
    if os.path.exists(filename):
      with open(filename, 'r', encoding='utf-8') as f:
          file_queries = set(line.strip() for line in f if line.strip())

    filtered_queries = [q for q in queries if q not in file_queries]

    if len(queries) == len(filtered_queries):
      print(f"All queries are new. None were filtered out.")
    else:
      print(f"Found {len(queries) - len(filtered_queries)} queries already scraped")

    return filtered_queries

def load_queries_and_scrape_results(api_key=SERP_API_KEY, batch_size=1):
    """
    Generate a dataset of search results based on filtered queries from CSV and perform scraping in batches.

    Args:
        api_key (str): API key for the search provider.
        batch_size (int): Number of queries to process in each batch (default is 5).

    Side Effects:
        - Loads queries from configured CSV with include/exclude source filtering.
        - Filters out previously scraped queries.
        - Performs search and stores results in batches.
        - Prints queries and status messages.
    """
    queries = load_queries_from_csv(include_sources=[], exclude_sources=[])
    queries = ["Black, mid-weight jacket with cell phone pocket and headphone jack in top pocket Please provide some product recommendations for me"]
    filtered_queries = filter_queries(queries)

    print(f"Total filtered queries to process: {len(filtered_queries)}")

    for i in range(0, len(filtered_queries), batch_size):
        batch = filtered_queries[i : i + batch_size]
        print(f"Processing batch {i // batch_size + 1} with {len(batch)} queries")
        print(f"Queries in batch: {batch}")
        results = search_queries(batch, api_key)
        store_results(results)

In [199]:
# Only needs to be run once to generate the dataset
zip_file = load_queries_and_scrape_results()

Filtering out already scraped queries.
All queries are new. None were filtered out.
Total filtered queries to process: 1
Processing batch 1 with 1 queries
Queries in batch: ['Black, mid-weight jacket with cell phone pocket and headphone jack in top pocket Please provide some product recommendations for me']
Scraping results for 1 queries.
Processing query 1: Black, mid-weight jacket with cell phone pocket and headphone jack in top pocket Please provide some product recommendations for me
Querying Google search
Querying Google AI Overview
Querying Google AI Mode
File: searched_queries.txt updated with newly searched queries


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

File: searched_queries.txt downloaded
Extracting search results.
Extracting existing search results from anti_geo_dataset.tsv.zip.
Writing all search results to anti_geo_dataset.tsv and zipping it as anti_geo_dataset.tsv.zip.
Results stored in anti_geo_dataset.tsv.zip


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

anti_geo_dataset.tsv.zip downloaded


In [200]:
def load_and_deserialize_zipped_tsv(zip_filepath=DATASET_ZIP_FILE, tsv_filename=DATASET_TSV_FILE):
  """
  Load and deserialize data from a TSV file inside a ZIP archive.

  This function reads a TSV file stored inside a ZIP archive,
  deserializes JSON-encoded columns into corresponding Python
  data structures, and returns a list of dictionaries representing
  the dataset.

  Args:
      zip_filepath (str): Path to the ZIP file containing the TSV.
      tsv_filename (str): Name of the TSV file inside the ZIP archive.

  Returns:
      list of dict: A list where each element is a dictionary with keys:
          - 'query' (str): The query string.
          - 'organic_results' (list): List of organic search results.
          - 'ai_overview' (dict or None): AI overview data, or None if unavailable or invalid.
          - 'ai_mode' (dict or None): AI mode data, or None if unavailable or invalid.

  Notes:
      - JSON decoding errors or missing keys in the TSV columns for
        'organic_results', 'ai_overview', or 'ai_mode' will result in
        empty lists or None as default values to ensure robustness.
  """
  dataset = []
  with zipfile.ZipFile(zip_filepath, 'r') as zf:
      with zf.open(tsv_filename) as tsv_file:
          # Read the TSV data as text
          text_file = io.TextIOWrapper(tsv_file, encoding='utf-8')
          tsv_reader = csv.DictReader(text_file, delimiter='\t')

          for row in tsv_reader:
              entry = {}
              entry['query'] = row['query']

              # Deserialize JSON string back to Python list for organic_results
              try:
                  entry['organic_results'] = json.loads(row['organic_results'])
              except (json.JSONDecodeError, KeyError):
                  entry['organic_results'] = []

              # Deserialize JSON string back to Python dict for ai_overview
              try:
                  entry['ai_overview'] = json.loads(row['ai_overview'])
              except (json.JSONDecodeError, KeyError):
                  entry['ai_overview'] = None

              # Deserialize JSON string back to Python dict for ai_mode
              try:
                  entry['ai_mode'] = json.loads(row['ai_mode'])
              except (json.JSONDecodeError, KeyError):
                  entry['ai_mode'] = None

              dataset.append(entry)

  return dataset

In [201]:
load_and_deserialize_zipped_tsv()

[{'query': 'how to propagate a monstera albo cutting in water',
  'organic_results': [{'position': 1,
    'title': "A Beginner's Guide to Propagating a Monstera Albo in Water",
    'link': 'https://tandranicole.com/propagating-monstera-albo/',
    'redirect_link': 'https://www.google.com/url?sa=t&source=web&rct=j&opi=89978449&url=https://tandranicole.com/propagating-monstera-albo/&ved=2ahUKEwjGv6OAw4GRAxVc3skDHRsgO5kQFnoECCEQAQ',
    'displayed_link': 'https://tandranicole.com › propagating-monstera-albo',
    'favicon': 'https://serpapi.com/searches/691f7404d355ccadc71921e6/images/36e82349453ce288a8b271449da801576cf414feeed13f8c3a02b987f138e2d5.png',
    'date': 'Mar 8, 2024',
    'snippet': 'Place your cutting into your clear vase then fill with water about 2 to 3 inches above the node. Your position of your cutting should be upright ...',
    'snippet_highlighted_words': ['cutting', 'water', 'cutting'],
    'sitelinks': {'inline': [{'title': 'Monstera Albo A.K.A Varigated...',
     