# GeoQA — Quality Checks Deep Dive

This notebook explores GeoQA's quality checking capabilities in detail,
including geometry validation, attribute analysis, and issue remediation.

In [1]:
import geopandas as gpd
import geoqa
import pandas as pd
from geoqa.geometry import GeometryChecker
from shapely.geometry import Polygon, Point, LineString

# Record the installed release so the outputs below can be tied to a specific GeoQA version.
geoqa_version = geoqa.__version__
print(f"GeoQA version: {geoqa_version}")

GeoQA version: 0.1.0


## 1. Create Test Data with Known Issues

Let's create a GeoDataFrame with intentional quality issues to see how GeoQA detects them.

In [2]:
# Build a small GeoDataFrame whose rows deliberately exhibit different quality issues
def unit_square(x0, y0):
    """Return the 1x1 square Polygon whose lower-left corner is (x0, y0)."""
    return Polygon([(x0, y0), (x0 + 1, y0), (x0 + 1, y0 + 1), (x0, y0 + 1)])


geometries = [
    unit_square(0, 0),                          # Valid
    Polygon([(0, 0), (2, 2), (2, 0), (0, 2)]),  # Invalid (self-intersecting bowtie)
    unit_square(3, 0),                          # Valid
    unit_square(0, 0),                          # Duplicate of row 0
    Polygon(),                                  # Empty geometry
    unit_square(5, 5),                          # Valid
]

# Attribute columns; every None is an intentional gap for the completeness checks
attributes = {
    "name": ["Building A", "Building B", None, "Building D", "Building E", "Building F"],
    "floors": [3, 5, 2, None, 1, 4],
    "area_sqm": [100.0, 200.0, None, 150.0, 50.0, 300.0],
    "use": ["residential", "commercial", "residential", None, None, "industrial"],
}

gdf = gpd.GeoDataFrame(attributes, geometry=geometries, crs="EPSG:4326")

print(f"Created GeoDataFrame with {len(gdf)} features")
gdf

Created GeoDataFrame with 6 features


Unnamed: 0,name,floors,area_sqm,use,geometry
0,Building A,3.0,100.0,residential,"POLYGON ((0 0, 1 0, 1 1, 0 1, 0 0))"
1,Building B,5.0,200.0,commercial,"POLYGON ((0 0, 2 2, 2 0, 0 2, 0 0))"
2,,2.0,,residential,"POLYGON ((3 0, 4 0, 4 1, 3 1, 3 0))"
3,Building D,,150.0,,"POLYGON ((0 0, 1 0, 1 1, 0 1, 0 0))"
4,Building E,1.0,50.0,,POLYGON EMPTY
5,Building F,4.0,300.0,industrial,"POLYGON ((5 5, 6 5, 6 6, 5 6, 5 5))"


## 2. Profile the Problematic Data

In [3]:
# Profile the deliberately flawed GeoDataFrame; `name` is the label reported in the summary.
profile = geoqa.profile(gdf, name="Test Buildings (Issues)")
# Last expression of the cell, so the notebook renders the value summary() returns.
profile.summary()

{'name': 'Test Buildings (Issues)',
 'source': 'GeoDataFrame',
 'features': 6,
 'columns': 4,
 'geometry_type': 'Polygon',
 'crs': 'EPSG:4326',
 'bounds': {'minx': 0.0, 'miny': 0.0, 'maxx': 6.0, 'maxy': 6.0},
 'quality_score': 84.6,
 'geometry_checks': {'valid': 5, 'invalid': 1, 'empty': 1, 'duplicates': 1},
 'attribute_completeness': {'name': 83.3,
  'floors': 83.3,
  'area_sqm': 83.3,
  'use': 66.7}}

## 3. Detailed Quality Checks

In [4]:
# View all quality checks as a DataFrame
# (one row per check, with Check / Status / Count / Severity / Details columns)
checks = profile.quality_checks()
checks

Unnamed: 0,Check,Status,Count,Severity,Details
0,Geometry Validity,FAIL,1,High,1 invalid geometries found
1,Empty Geometries,WARN,1,Medium,1 empty geometries found
2,Duplicate Geometries,WARN,1,Medium,1 duplicate geometries found
3,CRS Defined,PASS,1,,CRS: EPSG:4326
4,Attribute Completeness,FAIL,5,High,5 null values (20.8% of all cells)
5,Homogeneous Geometry Types,PASS,1,,Single geometry type


## 4. Detailed Geometry Results

In [5]:
geom = profile.geometry_results

# (printed label, key in the geometry results mapping), in display order
report_fields = [
    ("Valid geometries", "valid_count"),
    ("Invalid geometries", "invalid_count"),
    ("Invalid indices", "invalid_indices"),
    ("Empty geometries", "empty_count"),
    ("Duplicate geometries", "duplicate_count"),
    ("Mixed types", "mixed_types"),
    ("Geometry types", "geometry_types"),
]
for label, key in report_fields:
    print(f"{label}: {geom[key]}")

# Show invalid geometry reasons (the key is optional, hence .get with an empty default)
for row_idx, why in geom.get('invalid_reasons', []):
    print(f"  Row {row_idx}: {why}")

Valid geometries: 5
Invalid geometries: 1
Invalid indices: [1]
Empty geometries: 1
Duplicate geometries: 1
Mixed types: False
Geometry types: {'Polygon': 6}
  Row 1: Self-intersection[1 1]


## 5. Attribute Completeness Analysis

In [6]:
attr = profile.attribute_results

BAR_WIDTH = 20     # characters in each text bar
PCT_PER_CELL = 5   # percentage points represented by one filled character

print(f"Total null values: {attr['total_nulls']}")
print("\nCompleteness per column:")
for column, percent in attr['completeness'].items():
    # Truncate rather than round, so a cell is drawn only once it is fully earned
    filled = int(percent / PCT_PER_CELL)
    bar = '█' * filled + '░' * (BAR_WIDTH - filled)
    print(f"  {column:12s} {bar} {percent}%")

Total null values: 5

Completeness per column:
  name         ████████████████░░░░ 83.3%
  floors       ████████████████░░░░ 83.3%
  area_sqm     ████████████████░░░░ 83.3%
  use          █████████████░░░░░░░ 66.7%


## 6. Fix Invalid Geometries

In [7]:
# GeometryChecker is imported in the notebook's first (imports) cell.
checker = GeometryChecker(gdf)
fixed_gdf = checker.fix_invalid()

# Verify fix: profile the repaired data and compare it with the original profile
fixed_profile = geoqa.profile(fixed_gdf, name="Fixed Buildings")
invalid_before = profile.geometry_results['invalid_count']
invalid_after = fixed_profile.geometry_results['invalid_count']

print(f"Before fix — Invalid: {invalid_before}")
print(f"After fix  — Invalid: {invalid_after}")
print(f"\nQuality score improved: {profile.quality_score:.1f} → {fixed_profile.quality_score:.1f}")

# Fail loudly if the repair did not work, instead of only printing an "improved" line
assert invalid_after == 0, f"fix_invalid() left {invalid_after} invalid geometries"

Before fix — Invalid: 1
After fix  — Invalid: 0

Quality score improved: 84.6 → 91.2


## 7. Compare with a Real Dataset

In [8]:
# Profile a real dataset for comparison
def comparison_row(dataset_profile):
    """Return one comparison-table row (column name -> value) for a GeoQA profile."""
    return {
        "Dataset": dataset_profile.name,
        "Features": dataset_profile.feature_count,
        "Invalid": dataset_profile.geometry_results['invalid_count'],
        "Empty": dataset_profile.geometry_results['empty_count'],
        "Null Attrs": dataset_profile.attribute_results['total_nulls'],
        "Quality": f"{dataset_profile.quality_score:.1f}",
    }


# Only the load is best-effort: the comparison file may be missing or unreadable.
try:
    real_profile = geoqa.profile(r"../../data/giza_buildings.shp")
except Exception as e:
    print(f"Could not load comparison data: {e}")
else:
    # Built outside the try so a bug in the table code surfaces as a traceback
    # instead of being misreported as a data-loading failure.
    comparison = pd.DataFrame([comparison_row(profile), comparison_row(real_profile)])
    print(comparison.to_string(index=False))

                Dataset  Features  Invalid  Empty  Null Attrs Quality
Test Buildings (Issues)         6        1      1           5    84.6
         giza_buildings     24615        1      0        1118    99.9


---

**GeoQA** — Geospatial Data Quality Assessment & Interactive Profiling