In [8]:
import requests
import pandas as pd
import concurrent.futures
import time

# --- CONFIGURATION ---
DOMAIN = "data.cityofnewyork.us"
SEARCH_TERMS = ["Education", "City Government"]
MIN_ROWS = 100000  # Threshold
MAX_WORKERS = 10  # Number of parallel threads (don't go too high or API blocks you)

def fetch_all_candidates(search_term):
    """
    Crawls the Discovery API using 'limit' and 'offset' to get EVERYTHING.
    """
    print(f"\n--- Crawling full catalog for: '{search_term}' ---")
    url = "http://api.us.socrata.com/api/catalog/v1"
    
    candidates = []
    offset = 0
    limit = 2000 # Max allowed per page usually approx 2000-5000 depending on API version
    
    while True:
        params = {
            'domains': DOMAIN,
            'q': search_term,
            'limit': limit,
            'offset': offset,
            'only': 'datasets'
        }
        
        try:
            resp = requests.get(url, params=params).json()
            results = resp.get('results', [])
            
            if not results:
                break # No more results, stop looping
                
            candidates.extend(results)
            print(f"  Fetched batch: {len(results)} items (Total so far: {len(candidates)})")
            
            offset += limit
            
        except Exception as e:
            print(f"Error fetching page: {e}")
            break
            
    return candidates

def get_row_count_safe(dataset_meta):
    """
    Worker function to check a single dataset's row count.
    """
    d = dataset_meta['resource']
    d_id = d['id']
    name = d['name']
    
    url = f"https://{DOMAIN}/resource/{d_id}.json?$select=count(*)"
    
    try:
        # Short timeout because we are doing many requests
        r = requests.get(url, timeout=5)
        if r.status_code == 200:
            count = int(r.json()[0]['count'])
            if count > MIN_ROWS:
                return {
                    'dataset_id': d_id,
                    'row_count': count,
                    'name': name,
                    'download_url': f"https://{DOMAIN}/api/views/{d_id}/rows.csv?accessType=DOWNLOAD"
                }
    except:
        return None # Fail silently to keep speed up
    return None

# --- MAIN EXECUTION ---
all_candidates = []
seen_ids = set()

# 1. HARVEST ALL CANDIDATES
for term in SEARCH_TERMS:
    results = fetch_all_candidates(term)
    for res in results:
        res_id = res['resource']['id']
        if res_id not in seen_ids:
            seen_ids.add(res_id)
            all_candidates.append(res)

print(f"\nTotal unique candidates found: {len(all_candidates)}")
print(f"Checking row counts (this may take 1-2 minutes)...")

# 2. PARALLEL PROBE
valid_datasets = []

# Using ThreadPool to check sizes concurrently
with concurrent.futures.ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
    # Submit all tasks
    future_to_meta = {executor.submit(get_row_count_safe, item): item for item in all_candidates}
    
    # Process results as they complete
    completed_count = 0
    for future in concurrent.futures.as_completed(future_to_meta):
        result = future.result()
        if result:
            valid_datasets.append(result)
            # Optional: Print live updates for big finds
            # print(f"  [FOUND] {result['row_count']:>9,} rows | {result['name'][:40]}...")
        
        completed_count += 1
        if completed_count % 100 == 0:
            print(f"  Scanned {completed_count}/{len(all_candidates)}...")

# --- OUTPUT ---
print(f"\n\n=== FINAL DATASET LIST (> {MIN_ROWS} rows) ===")
print(f"Total Qualified Datasets: {len(valid_datasets)}")

if valid_datasets:
    df = pd.DataFrame(valid_datasets)
    # Sort largest to smallest
    df = df.sort_values(by='row_count', ascending=False)
    
    print(df[['dataset_id', 'row_count', 'name']].head(20).to_string(index=False))
    
    # Save to CSV for your comparison script
    df.to_csv("huge_datasets_list.csv", index=False)
    print("\nSaved full list to 'huge_datasets_list.csv'")
    
    print("\n--- SAMPLE URLS FOR SPARK ---")
    print(df['download_url'].head(5).tolist())
else:
    print("No datasets found matching criteria.")


--- Crawling full catalog for: 'Education' ---
  Fetched batch: 693 items (Total so far: 693)

--- Crawling full catalog for: 'City Government' ---
  Fetched batch: 753 items (Total so far: 753)

Total unique candidates found: 1409
Checking row counts (this may take 1-2 minutes)...
  Scanned 100/1409...
  Scanned 200/1409...
  Scanned 300/1409...
  Scanned 400/1409...
  Scanned 500/1409...
  Scanned 600/1409...
  Scanned 700/1409...
  Scanned 800/1409...
  Scanned 900/1409...
  Scanned 1000/1409...
  Scanned 1100/1409...
  Scanned 1200/1409...
  Scanned 1300/1409...
  Scanned 1400/1409...


=== FINAL DATASET LIST (> 100000 rows) ===
Total Qualified Datasets: 94
dataset_id  row_count                                                                name
 rmhc-afj9  391530043                                                 DSNY - PlowNYC Data
 wewp-mm3p  102269689                                             311 Call Center Inquiry
 a9md-ynri    6178555                                    Ci

In [None]:
import time
import pandas as pd
import requests
import os
import tempfile
from pyspark.sql import SparkSession

import pandas as pd
from pandas.core.generic import NDFrame

import sys
import warnings

# 1. Fix the Recursion Error
sys.setrecursionlimit(5000)  # Increase from default 1000 to 5000

# 1. Save the original to_csv function
_original_to_csv = NDFrame.to_csv

# 2. Define a new wrapper function that fixes the argument
def _patched_to_csv(self, *args, **kwargs):
    # If the old argument is present, swap it for the new one
    if 'line_terminator' in kwargs:
        kwargs['lineterminator'] = kwargs.pop('line_terminator')
    
    # Call the original function with the fixed arguments
    return _original_to_csv(self, *args, **kwargs)

# 3. Apply the patch
NDFrame.to_csv = _patched_to_csv
print("Pandas 'line_terminator' patch applied.")

# --- CONFIGURATION ---
DATASET_LIST_FILE = "huge_datasets_list.csv"  # The list you generated earlier
OUTPUT_FILE = "profiler_benchmark_results.csv"

# Try to import the REAL Datamart Profiler (NYU Library)
try:
    import datamart_profiler
    HAS_DATAMART = True
    print("✅ 'datamart_profiler' library found. Using it for Local test.")
except ImportError:
    HAS_DATAMART = False
    print("⚠️ 'datamart_profiler' not found. Using Pandas .describe() as valid proxy.")

# ==========================================
# 1. LOCAL PROFILING (The Baseline)
# ==========================================
def run_local_profiler(download_url):
    """
    Simulates the standard AutoDDG flow:
    Download -> Save -> Run datamart_profiler (or Pandas) -> Load Stats
    """
    try:
        # 1. Download to a temp file (Local tools usually need a file on disk)
        # We time the download because local tools CANNOT stream efficiently like Spark
        with tempfile.NamedTemporaryFile(delete=False, suffix=".csv") as tmp:
            r = requests.get(download_url, stream=True)
            for chunk in r.iter_content(chunk_size=8192):
                tmp.write(chunk)
            tmp_path = tmp.name

        start_time = time.time()
        
        if HAS_DATAMART:
            # --- THE REAL DATAMART PROFILER ---
            # This is exactly what the original AutoDDG likely does
            metadata = datamart_profiler.process_dataset(tmp_path, include_sample=True)
        else:
            # --- PANDAS PROXY ---
            # Approximates the work: Load whole file -> Calc Stats
            df = pd.read_csv(tmp_path)
            stats = df.describe(include='all')
            # Simulate "Type Detection" (iterating columns)
            dtypes = df.dtypes.to_dict()

        duration = time.time() - start_time
        
        # Cleanup
        os.remove(tmp_path)
        return duration
        
    except Exception as e:
        print(f"  [Local Fail] {e}")
        return None

# ==========================================
# 2. SPARK PROFILING (The Scalable Solution)
# ==========================================
def run_spark_profiler(spark, download_url):
    """
    Simulates the Scalable flow:
    Stream Read -> Distributed Compute -> Collect Stats
    """
    try:
        # 1. Spark reads directly (Lazy Evaluation)
        # Note: We include reading in the time, but Spark streams it.
        start_time = time.time()
        
        # For simplicity in this script, we assume Spark can access the URL.
        # If your Spark is local, we might need to download first to be fair, 
        # BUT the advantage of Spark is streaming. 
        # Let's download to temp to give a fair "Apples to Apples" on PROCESSING speed.
        
        with tempfile.NamedTemporaryFile(delete=False, suffix=".csv") as tmp:
            r = requests.get(download_url, stream=True)
            for chunk in r.iter_content(chunk_size=8192):
                tmp.write(chunk)
            tmp_path = tmp.name

        # --- THE SPARK WORK ---
        df = spark.read.option("header", "true").option("inferSchema", "true").csv(tmp_path)
        new_columns = [c.replace('.', '_') for c in df.columns]
        df = df.toDF(*new_columns)
        
        # Force computation of the profile
        # .summary() computes count, mean, stddev, min, max, etc.
        summary_stats = df.summary().collect()
        
        duration = time.time() - start_time
        
        os.remove(tmp_path)
        return duration

    except Exception as e:
        print(f"  [Spark Fail] {e}")
        return None

# ==========================================
# 3. RUN COMPARISON
# ==========================================
def main():
    # Load your list of datasets
    if not os.path.exists(DATASET_LIST_FILE):
        print("Please run the 'Deep Scan' script first to generate the dataset list.")
        return

    df_datasets = pd.read_csv(DATASET_LIST_FILE)[4:14] # Test top 10 first
    
    # Init Spark
    spark = SparkSession.builder.appName("ProfilerBenchmark").master("local[*]").getOrCreate()
    spark.sparkContext.setLogLevel("ERROR")

    results = []
    
    print(f"\nComparing Profiling Speed on {len(df_datasets)} datasets...")
    print(f"{'Dataset Name':<30} | {'Rows':<10} | {'Local (s)':<10} | {'Spark (s)':<10} | {'Ratio'}")
    print("-" * 85)

    for _, row in df_datasets.iterrows():
        url = row['download_url']
        name = row['name'][:28]
        rows = row['row_count']
        
        # Run Local
        t_local = run_local_profiler(url)
        
        # Run Spark
        t_spark = run_spark_profiler(spark, url)
        
        # Calculate Speedup
        ratio = "N/A"
        if t_local and t_spark:
            if t_spark < t_local:
                ratio = f"{t_local / t_spark:.1f}x Faster"
            else:
                ratio = f"{t_spark / t_local:.1f}x Slower" # Happens on small data

                    # Format the values safely beforehand
            s_local = f"{t_local:.2f}" if isinstance(t_local, (int, float)) else "Err"
            s_spark = f"{t_spark:.2f}" if isinstance(t_spark, (int, float)) else "Err"
            
            # Print using string formatting (<10s is implicit for strings)
            print(f"{name:<30} | {rows:<10} | {s_local:<10} | {s_spark:<10} | {ratio}")
                    
        results.append({
            'dataset': name,
            'rows': rows,
            'time_local': t_local,
            'time_spark': t_spark
        })

    # Save
    pd.DataFrame(results).to_csv(OUTPUT_FILE, index=False)
    print(f"\nResults saved to {OUTPUT_FILE}")
    spark.stop()

if __name__ == "__main__":
    main()

Pandas 'line_terminator' patch applied.
✅ 'datamart_profiler' library found. Using it for Local test.


Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/12/09 14:05:44 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable



Comparing Profiling Speed on 10 datasets...
Dataset Name                   | Rows       | Local (s)  | Spark (s)  | Ratio
-------------------------------------------------------------------------------------


  sample = sample.applymap(truncate_string)  # Truncate long values
                                                                                

NYC Historical Vital Records   | 3725986    | 36.60      | 455.32     | 12.4x Slower


  sample = sample.applymap(truncate_string)  # Truncate long values
                                                                                

Civil List                     | 3237466    | 23.79      | 227.98     | 9.6x Slower


  sample = sample.applymap(truncate_string)  # Truncate long values
                                                                                

Asset Management Parks Syste   | 3034726    | 37.86      | 443.03     | 11.7x Slower


  sample = sample.applymap(truncate_string)  # Truncate long values
[Stage 19:>                                                         (0 + 1) / 1]

In [None]:
import time
import pandas as pd
import requests
import os
import tempfile
from pyspark.sql import SparkSession
from pandas.core.generic import NDFrame
import sys
import warnings

# --- 1. PATCHES & CONFIG ---

# Fix Recursion Error for complex datasets
sys.setrecursionlimit(5000)

# Fix Pandas 'line_terminator' Compatibility
_original_to_csv = NDFrame.to_csv

def _patched_to_csv(self, *args, **kwargs):
    if 'line_terminator' in kwargs:
        kwargs['lineterminator'] = kwargs.pop('line_terminator')
    return _original_to_csv(self, *args, **kwargs)

NDFrame.to_csv = _patched_to_csv
print("Pandas 'line_terminator' patch applied.")

# Configuration
DATASET_LIST_FILE = "huge_datasets_list.csv"
OUTPUT_FILE = "profiler_benchmark_results_top4.csv"

# Check for Datamart Profiler
try:
    import datamart_profiler
    HAS_DATAMART = True
    print("✅ 'datamart_profiler' library found. Using it for Local test.")
except ImportError:
    HAS_DATAMART = False
    print("⚠️ 'datamart_profiler' not found. Using Pandas .describe() as valid proxy.")

# --- 2. PROFILER FUNCTIONS ---

def run_local_profiler(download_url):
    try:
        # Download to temp file
        with tempfile.NamedTemporaryFile(delete=False, suffix=".csv") as tmp:
            r = requests.get(download_url, stream=True)
            for chunk in r.iter_content(chunk_size=8192):
                tmp.write(chunk)
            tmp_path = tmp.name

        start_time = time.time()
        
        if HAS_DATAMART:
            metadata = datamart_profiler.process_dataset(tmp_path, include_sample=True)
        else:
            df = pd.read_csv(tmp_path)
            stats = df.describe(include='all')
            dtypes = df.dtypes.to_dict()

        duration = time.time() - start_time
        
        os.remove(tmp_path)
        return duration
        
    except Exception as e:
        # Don't print full error to keep table clean, just log failure
        return None

def run_spark_profiler(spark, download_url):
    try:
        # Download to temp file (for fair comparison of processing time)
        with tempfile.NamedTemporaryFile(delete=False, suffix=".csv") as tmp:
            r = requests.get(download_url, stream=True)
            for chunk in r.iter_content(chunk_size=8192):
                tmp.write(chunk)
            tmp_path = tmp.name

        start_time = time.time()
        
        # Spark Read
        df = spark.read.option("header", "true").option("inferSchema", "true").csv(tmp_path)
        
        # Fix column names (replace dots with underscores)
        new_columns = [c.replace('.', '_') for c in df.columns]
        df = df.toDF(*new_columns)
        
        # Force computation
        summary_stats = df.summary().collect()
        
        duration = time.time() - start_time
        
        os.remove(tmp_path)
        return duration

    except Exception as e:
        print(f"\n[Spark Error] {e}")
        return None

# --- 3. MAIN EXECUTION ---

def main():
    if not os.path.exists(DATASET_LIST_FILE):
        print("Dataset list file not found.")
        return

    # Load and Prepare the "Top 4 Ascending" List
    df = pd.read_csv(DATASET_LIST_FILE)
    datasets = df.to_dict('records') # Convert to list of dicts for easier sorting

    # Step A: Sort Descending to find the Giants (Using 'row_count'!)
    all_sorted_desc = sorted(datasets, key=lambda x: x['row_count'], reverse=True)
    
    # Step B: Take the Top 4 Giants
    top_4_monsters = all_sorted_desc[:4]
    
    # Step C: Sort them Ascending (Smallest Giant first)
    final_list = sorted(top_4_monsters, key=lambda x: x['row_count'])

    # Init Spark
    spark = SparkSession.builder.appName("ProfilerBenchmark").master("local[*]").getOrCreate()
    spark.sparkContext.setLogLevel("ERROR") # Hide progress bars

    results = []
    
    print(f"\nComparing Profiling Speed on Top 4 Datasets (Ascending Order)...")
    print(f"{'Dataset Name':<30} | {'Rows':<10} | {'Local (s)':<10} | {'Spark (s)':<10} | {'Ratio'}")
    print("-" * 85)

    for row in final_list:
        url = row['download_url']
        name = row['name'][:28] # Truncate name
        rows = row['row_count'] # <--- Fixed Key Here
        
        # 1. Run Local
        t_local = run_local_profiler(url)
        
        # 2. Run Spark
        t_spark = run_spark_profiler(spark, url)
        
        # 3. Calculate Ratio and Format Strings
        s_local = f"{t_local:.2f}" if t_local is not None else "Err"
        s_spark = f"{t_spark:.2f}" if t_spark is not None else "Err"
        
        ratio = "N/A"
        if t_local and t_spark:
            if t_spark < t_local:
                ratio = f"{t_local / t_spark:.1f}x Faster"
            else:
                ratio = f"{t_spark / t_local:.1f}x Slower"
        elif t_local is None and t_spark is not None:
            ratio = "Spark Won"
        elif t_local is not None and t_spark is None:
            ratio = "Local Won"

        # 4. Print Row
        print(f"{name:<30} | {rows:<10} | {s_local:<10} | {s_spark:<10} | {ratio}")
                
        results.append({
            'dataset': name,
            'rows': rows,
            'time_local': t_local,
            'time_spark': t_spark
        })

    # Save
    pd.DataFrame(results).to_csv(OUTPUT_FILE, index=False)
    print(f"\nResults saved to {OUTPUT_FILE}")
    spark.stop()

if __name__ == "__main__":
    main()

Pandas 'line_terminator' patch applied.
✅ 'datamart_profiler' library found. Using it for Local test.


Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/12/09 15:26:26 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable



Comparing Profiling Speed on Top 4 Datasets (Ascending Order)...
Dataset Name                   | Rows       | Local (s)  | Spark (s)  | Ratio
-------------------------------------------------------------------------------------


  sample = sample.applymap(truncate_string)  # Truncate long values
                                                                                

J-51 Exemption and Abatement   | 4232279    | Err        | 113.71     | Spark Won


  sample = sample.applymap(truncate_string)  # Truncate long values
                                                                                

Civil Service List Certifica   | 6178555    | Err        | 764.51     | Spark Won


  sample = sample.applymap(truncate_string)  # Truncate long values
[Stage 12:==>                                                     (6 + 2) / 155]