<a href="https://colab.research.google.com/github/Abhignya-Jagathpally/Abhignya_INFO5731_Fall2025/blob/test/ss1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Your code here
!pip -q install pandas tqdm

import urllib.request
import json
import time
import pandas as pd
from tqdm.auto import tqdm

def simple_fetch(url):
    """
    Fetch *url* and return its body parsed as JSON.

    The request carries only a basic User-Agent header and uses a 30-second
    timeout; no compression handling is attempted.

    Args:
        url: Address to request.

    Returns:
        The decoded JSON value, or ``None`` when building the request,
        fetching, decoding or parsing fails. Failures are printed together
        with the URL instead of being raised.
    """
    try:
        request = urllib.request.Request(url)
        request.add_header('User-Agent', 'Mozilla/5.0 (compatible; DataCollector 1.0)')

        with urllib.request.urlopen(request, timeout=30) as response:
            content = response.read()

        try:
            # 'utf-8-sig' is plain UTF-8 that also drops a leading byte-order
            # mark; json.loads refuses text that still starts with one.
            text = content.decode('utf-8-sig')
        except UnicodeDecodeError:
            # latin-1 maps every byte to a character, so this cannot fail.
            text = content.decode('latin-1')

        return json.loads(text)

    except Exception as e:
        # Deliberate best-effort boundary: callers treat None as
        # "fetch failed" and decide whether to stop.
        print(f"Error: {e}")
        print(f"URL: {url}")
        return None

def collect_all_narrators():
    """
    Page through the Densho narrator API and flatten every record.

    A first request with ``limit=1`` reads the ``total`` field to size the
    progress bar. The function then requests pages until one of these
    happens: a fetch fails, a page has no ``objects``, or ``next_offset`` is
    missing or does not move past the current offset.

    Returns:
        A list with one dict per narrator, holding the keys ``id``, ``name``,
        ``bio_text``, ``generation``, ``birth_location``, ``birth_date``,
        ``death_date``, ``page_url``, ``json_url``, ``interviews_api``,
        ``image_url`` and ``thumb_url``. Fields absent from the API record
        are ``None``. An empty list is returned when the initial request
        fails; if a later page fails, the records gathered so far are
        returned.
    """
    base_url = "https://ddr.densho.org/api/0.2/narrator/"
    all_narrators = []
    offset = 0
    limit = 50  # Requested page size; paging follows next_offset, not this value

    print("Starting simple data collection...")

    # First, get total count
    first_url = f"{base_url}?limit=1&offset=0"
    first_data = simple_fetch(first_url)

    if not first_data:
        print(" Could not fetch initial data")
        return []

    total_records = first_data.get('total', 0)
    print(f"Total records to collect: {total_records}")

    pbar = tqdm(total=total_records, desc="Collecting narrators")

    # try/finally so the progress bar is closed even if a page raises.
    try:
        while True:
            url = f"{base_url}?limit={limit}&offset={offset}"

            print(f"Fetching: {url}")
            data = simple_fetch(url)

            if not data:
                print(f"Failed to fetch data at offset {offset}")
                break

            narrators = data.get('objects', [])

            if not narrators:
                print("No more narrators found")
                break

            for narrator in narrators:
                # `or {}` also covers a record whose 'links' is present but
                # null, which a .get() default alone would not replace.
                links = narrator.get('links') or {}

                narrator_data = {
                    'id': narrator.get('id'),
                    'name': narrator.get('display_name'),
                    'bio_text': narrator.get('bio'),
                    'generation': narrator.get('generation'),
                    'birth_location': narrator.get('birth_location'),
                    'birth_date': narrator.get('b_date'),
                    'death_date': narrator.get('d_date'),
                    'page_url': links.get('html'),
                    'json_url': links.get('json'),
                    'interviews_api': links.get('interviews'),
                    'image_url': links.get('img'),
                    'thumb_url': links.get('thumb'),
                }

                all_narrators.append(narrator_data)

            pbar.update(len(narrators))

            # Stop when the API reports no further page, or reports one that
            # would not advance (guards against looping forever).
            next_offset = data.get('next_offset')
            if next_offset is None or next_offset <= offset:
                print("Reached end of data")
                break

            # Be polite - wait between requests
            time.sleep(1)
            offset = next_offset
    finally:
        pbar.close()

    return all_narrators

# ===== RUN THE COLLECTION =====
# Fetch every narrator, write the raw table to CSV, then print a short
# report: table size, a preview of a few columns, and per-column gaps.

print(" Starting simple Densho data collection...")

narrators = collect_all_narrators()

if not narrators:
    # Nothing came back: point the user at the likely causes.
    print(" No data was collected")
    print("\nTroubleshooting suggestions:")
    print("1. Try running this code on your local machine")
    print("2. Check your internet connection")
    print("3. The API might be temporarily unavailable")

else:
    print(f"\n Successfully collected {len(narrators)} narrator records!")

    # One row per narrator, one column per collected field
    df = pd.DataFrame(narrators)

    # Persist the untouched table before any reporting
    filename = "/content/sample_data/densho_narrators_raw.csv"
    df.to_csv(filename, index=False, encoding='utf-8')
    print(f" Saved to {filename}")

    # Size of the table
    row_count, column_count = df.shape
    kilobytes = df.memory_usage(deep=True).sum() / 1024
    print("\n Data Summary:")
    print(f"   Total records: {row_count}")
    print(f"   Columns: {column_count}")
    print(f"   Memory usage: {kilobytes:.1f} KB")

    # Preview of the identifying columns
    print("\n Sample Data:")
    pd.set_option('display.max_colwidth', 50)
    preview_columns = ['id', 'name', 'generation', 'birth_location']
    print(df[preview_columns].head())

    # Report only the columns that actually have gaps
    print("\n Missing Values:")
    missing = df.isnull().sum()
    for col, count in missing[missing > 0].items():
        pct = (count / len(df)) * 100
        print(f"   {col}: {count} ({pct:.1f}%)")

    print(f"\n Data collection complete! File saved as '{filename}'")

 Starting simple Densho data collection...
Starting simple data collection...
Total records to collect: 1009


Collecting narrators:   0%|          | 0/1009 [00:00<?, ?it/s]

Fetching: https://ddr.densho.org/api/0.2/narrator/?limit=50&offset=0
Fetching: https://ddr.densho.org/api/0.2/narrator/?limit=50&offset=25
Fetching: https://ddr.densho.org/api/0.2/narrator/?limit=50&offset=50
Fetching: https://ddr.densho.org/api/0.2/narrator/?limit=50&offset=75
Fetching: https://ddr.densho.org/api/0.2/narrator/?limit=50&offset=100
Fetching: https://ddr.densho.org/api/0.2/narrator/?limit=50&offset=125
Fetching: https://ddr.densho.org/api/0.2/narrator/?limit=50&offset=150
Fetching: https://ddr.densho.org/api/0.2/narrator/?limit=50&offset=175
Fetching: https://ddr.densho.org/api/0.2/narrator/?limit=50&offset=200
Fetching: https://ddr.densho.org/api/0.2/narrator/?limit=50&offset=225
Fetching: https://ddr.densho.org/api/0.2/narrator/?limit=50&offset=250
Fetching: https://ddr.densho.org/api/0.2/narrator/?limit=50&offset=275
Fetching: https://ddr.densho.org/api/0.2/narrator/?limit=50&offset=300
Fetching: https://ddr.densho.org/api/0.2/narrator/?limit=50&offset=325
Fetching: h