# GeoQA — Quality Checks Deep Dive

This notebook explores GeoQA's quality checking capabilities in detail,
including geometry validation, attribute analysis, and issue remediation.

In [None]:
# Dependencies for the whole notebook: geopandas supplies the GeoDataFrame,
# geoqa does the profiling, and shapely builds the test geometries.
# NOTE(review): Point and LineString are not referenced in the cells shown
# here — confirm they are unused elsewhere before removing them.
import geopandas as gpd
import geoqa
from shapely.geometry import Polygon, Point, LineString

# Record the installed GeoQA release in the notebook output.
print(f"GeoQA version: {geoqa.__version__}")

## 1. Create Test Data with Known Issues

Let's create a GeoDataFrame with intentional quality issues to see how GeoQA detects them.

In [None]:
# Six footprints, each chosen to exercise a different geometry check.
geometries = [
    # Row 0 — valid unit square
    Polygon([(0, 0), (1, 0), (1, 1), (0, 1)]),
    # Row 1 — invalid: the vertex order makes the ring cross itself (bowtie)
    Polygon([(0, 0), (2, 2), (2, 0), (0, 2)]),
    # Row 2 — valid
    Polygon([(3, 0), (4, 0), (4, 1), (3, 1)]),
    # Row 3 — same coordinates as row 0 (duplicate)
    Polygon([(0, 0), (1, 0), (1, 1), (0, 1)]),
    # Row 4 — empty geometry
    Polygon(),
    # Row 5 — valid
    Polygon([(5, 5), (6, 5), (6, 6), (5, 6)]),
]

# Attribute columns with deliberate gaps (None) so the completeness
# analysis has missing values to report.
attribute_columns = {
    "name": ["Building A", "Building B", None, "Building D", "Building E", "Building F"],
    "floors": [3, 5, 2, None, 1, 4],
    "area_sqm": [100.0, 200.0, None, 150.0, 50.0, 300.0],
    "use": ["residential", "commercial", "residential", None, None, "industrial"],
}

gdf = gpd.GeoDataFrame(
    data=attribute_columns,
    geometry=geometries,
    crs="EPSG:4326",
)

print(f"Created GeoDataFrame with {len(gdf)} features")
# Bare expression so Jupyter renders the (small) frame.
gdf

## 2. Profile the Problematic Data

In [None]:
# Profile the deliberately flawed GeoDataFrame. Later cells read
# `profile.name`, `profile.geometry_results`, `profile.attribute_results`
# and `profile.quality_score` from this object.
# NOTE(review): `name=` presumably sets the label later read as `profile.name` — confirm.
profile = geoqa.profile(gdf, name="Test Buildings (Issues)")
# NOTE(review): presumably prints or renders an overview of the profile — confirm against the GeoQA docs.
profile.summary()

## 3. Detailed Quality Checks

In [None]:
# View all quality checks as a DataFrame.
# `checks` is left as the cell's last expression so Jupyter renders it.
checks = profile.quality_checks()
checks

## 4. Detailed Geometry Results

In [None]:
geom = profile.geometry_results

# Headline geometry figures, printed in a fixed order as "<label>: <value>".
summary_fields = (
    ("Valid geometries", "valid_count"),
    ("Invalid geometries", "invalid_count"),
    ("Invalid indices", "invalid_indices"),
    ("Empty geometries", "empty_count"),
    ("Duplicate geometries", "duplicate_count"),
    ("Mixed types", "mixed_types"),
    ("Geometry types", "geometry_types"),
)
for label, key in summary_fields:
    print(f"{label}: {geom[key]}")

# Per-row explanation of each invalid geometry; the key is optional, so a
# missing entry simply prints nothing.
# NOTE(review): assumes 'invalid_reasons' iterates as (row, reason) pairs — confirm.
for row_idx, why in geom.get('invalid_reasons', []):
    print(f"  Row {row_idx}: {why}")

## 5. Attribute Completeness Analysis

In [None]:
attr = profile.attribute_results

print(f"Total null values: {attr['total_nulls']}")
print(f"\nCompleteness per column:")
for column, percent in attr['completeness'].items():
    # Text bar: one filled cell per 5 points, padded with light cells to 20.
    # NOTE(review): assumes completeness values are on a 0-100 scale — confirm.
    filled_cells = int(percent / 5)
    bar = '█' * filled_cells + '░' * (20 - filled_cells)
    print(f"  {column:12s} {bar} {percent}%")

## 6. Fix Invalid Geometries

In [None]:
from geoqa.geometry import GeometryChecker

# Run GeoQA's repair routine and bind the result to a new name so the
# before/after profiles can be compared below.
# NOTE(review): presumably fix_invalid() returns a repaired copy rather than mutating `gdf` — confirm.
checker = GeometryChecker(gdf)
fixed_gdf = checker.fix_invalid()

# Re-profile the repaired frame and report invalid counts side by side.
fixed_profile = geoqa.profile(fixed_gdf, name="Fixed Buildings")

invalid_before = profile.geometry_results['invalid_count']
print(f"Before fix — Invalid: {invalid_before}")

invalid_after = fixed_profile.geometry_results['invalid_count']
print(f"After fix  — Invalid: {invalid_after}")

score_before = profile.quality_score
score_after = fixed_profile.quality_score
print(f"\nQuality score improved: {score_before:.1f} → {score_after:.1f}")

## 7. Compare with a Real Dataset

In [None]:
# Profile a real dataset and show it next to the synthetic test data.
import pandas as pd


def summarize_profile(dataset_profile):
    """Condense one GeoQA profile into a row of the comparison table.

    Args:
        dataset_profile: Object exposing ``name``, ``feature_count``,
            ``geometry_results``, ``attribute_results`` and ``quality_score``,
            as the profiles built in this notebook do.

    Returns:
        dict: Column label -> value, in display order. ``Quality`` is the
        score formatted to one decimal place.
    """
    return {
        "Dataset": dataset_profile.name,
        "Features": dataset_profile.feature_count,
        "Invalid": dataset_profile.geometry_results['invalid_count'],
        "Empty": dataset_profile.geometry_results['empty_count'],
        "Null Attrs": dataset_profile.attribute_results['total_nulls'],
        "Quality": f"{dataset_profile.quality_score:.1f}",
    }


# Only the load is best-effort: the sample shapefile may be absent. Keeping
# the try body this small means a genuine bug while building the table is
# raised instead of being reported as a data-loading problem.
try:
    real_profile = geoqa.profile(r"../../data/giza_buildings.shp")
except Exception as exc:
    print(f"Could not load comparison data: {exc}")
else:
    comparison = pd.DataFrame([
        summarize_profile(profile),
        summarize_profile(real_profile),
    ])
    print(comparison.to_string(index=False))

---

**GeoQA** — Geospatial Data Quality Assessment & Interactive Profiling