# GeoQA — Report Generation & Batch Processing

This notebook demonstrates how to generate professional HTML quality reports
and batch-process multiple datasets.

In [None]:
import os
import geoqa
import pandas as pd

# Show which GeoQA release this notebook is running against
geoqa_version = geoqa.__version__
print(f"GeoQA version: {geoqa_version}")

## 1. Single Dataset Report

In [None]:
# Profile and generate report for a single dataset
profile = geoqa.profile(r"../../data/giza_buildings.shp")

# Make sure the output folder exists before writing into it; on a fresh
# checkout "reports/" may be missing (whether to_html creates it is not
# visible from this notebook, so do not rely on it).
os.makedirs("reports", exist_ok=True)

# Generate HTML report
# NOTE(review): to_html is assumed to return the path it wrote, since the
# value is printed below — confirm against the GeoQA API.
report_path = profile.to_html("reports/giza_buildings_report.html")
print(f"Report generated: {report_path}")
print(f"Quality Score: {profile.quality_score:.1f}/100")

## 2. Batch Process All Shapefiles

Profile every shapefile in the data directory and generate individual reports.

In [None]:
data_dir = r"../../data"

# os.listdir returns entries in arbitrary order, so sort the names to keep the
# batch order (and every table/chart derived from it) reproducible between runs.
# The extension is matched case-insensitively so files named "*.SHP" are not
# silently skipped.
shapefiles = sorted(
    f for f in os.listdir(data_dir) if f.lower().endswith('.shp')
)

print(f"Found {len(shapefiles)} shapefiles:")
for shp in shapefiles:
    print(f"  - {shp}")

In [None]:
# Batch profile all shapefiles
# Create the output folder once, before the loop: if it were missing, every
# to_html call below could fail and the broad handler would record one error
# per dataset, hiding the real cause.
os.makedirs("reports", exist_ok=True)

results = []

for shp in shapefiles:
    filepath = os.path.join(data_dir, shp)
    try:
        p = geoqa.profile(filepath)

        # Generate individual report
        report_name = f"reports/{p.name}_report.html"
        p.to_html(report_name)

        # One summary record per dataset; the list is turned into a
        # DataFrame in the comparison section.
        results.append({
            "Dataset": p.name,
            "Features": p.feature_count,
            "Columns": p.column_count,
            "Type": p.geometry_type,
            "CRS": p.crs or "None",
            "Invalid Geom": p.geometry_results['invalid_count'],
            "Empty Geom": p.geometry_results['empty_count'],
            "Null Attrs": p.attribute_results['total_nulls'],
            "Quality Score": round(p.quality_score, 1),
            "Report": report_name,
        })
        print(f"  ✅ {p.name}: Score={p.quality_score:.1f}")
    except Exception as e:
        # Deliberate best-effort: a dataset that cannot be profiled is
        # recorded with its error message and the batch continues.
        results.append({"Dataset": shp, "Error": str(e)})
        print(f"  ❌ {shp}: {e}")

## 3. Comparison Dashboard

In [None]:
# Build the comparison table from the per-dataset result records
df = pd.DataFrame(results)

# The score column only exists when at least one dataset was profiled
# successfully; in that case list the best-scoring datasets first.
has_scores = 'Quality Score' in df.columns
if has_scores:
    df = df.sort_values(by='Quality Score', ascending=False)

df

## 4. Visualize Quality Scores

In [None]:
import matplotlib.pyplot as plt

# Datasets that failed in the batch step are recorded without a score, which
# becomes NaN once the records are in a DataFrame. Plotting those rows gives
# 'nan' labels at non-finite positions, so chart only rows that have a score.
if 'Quality Score' in df.columns:
    scored = df.dropna(subset=['Quality Score'])
else:
    scored = df.iloc[0:0]

if not scored.empty:
    fig, ax = plt.subplots(figsize=(10, 5))

    scores = scored['Quality Score'].values
    names = scored['Dataset'].values
    # Traffic-light colouring: green from 80, amber from 60, red below that
    colors = ['#16a34a' if s >= 80 else '#d97706' if s >= 60 else '#dc2626' for s in scores]

    bars = ax.barh(names, scores, color=colors, edgecolor='white', linewidth=0.5)
    ax.set_xlabel('Quality Score')
    ax.set_title('GeoQA Quality Scores — Dataset Comparison')
    ax.set_xlim(0, 100)

    # Print the rounded score just past the end of each bar
    for bar, score in zip(bars, scores):
        ax.text(bar.get_width() + 1, bar.get_y() + bar.get_height()/2,
                f'{score:.0f}', va='center', fontweight='bold')

    plt.tight_layout()

    # savefig does not create missing folders, so make sure the target exists
    chart_path = 'reports/quality_comparison.png'
    os.makedirs(os.path.dirname(chart_path), exist_ok=True)
    plt.savefig(chart_path, dpi=150, bbox_inches='tight')
    plt.show()
    print(f"Chart saved to {chart_path}")

## 5. Export Summary to CSV

In [None]:
# Save comparison summary
csv_path = "reports/quality_summary.csv"

# DataFrame.to_csv raises if the parent folder is missing, so create it first;
# this keeps the cell working even when no earlier cell wrote into "reports/".
os.makedirs(os.path.dirname(csv_path), exist_ok=True)

df.to_csv(csv_path, index=False)
print(f"Summary exported to {csv_path}")

---

**GeoQA** — Geospatial Data Quality Assessment & Interactive Profiling