## 1. Setup & Imports

In [None]:
import pandas as pd
import json
from pathlib import Path
from datetime import datetime
import folium
from folium import plugins
from IPython.display import display, HTML
import rdflib
from rdflib import Graph, Namespace, URIRef, Literal
from collections import Counter

# Locations of the pipeline artifacts; everything lives under the
# processed-data folder, resolved relative to the notebook's working directory.
DATA_DIR = Path("..") / "data" / "processed"
PARQUET_FILE = DATA_DIR.joinpath("council_data.parquet")
GEOJSON_FILE = DATA_DIR.joinpath("augsburg_map.geojson")
NT_FILE = DATA_DIR.joinpath("metadata.nt")
TTL_FILE = DATA_DIR.joinpath("metadata.ttl")

# Confirm the setup cell ran and show where the data is expected to be.
print("‚úì Imports loaded", f"\nData directory: {DATA_DIR}", sep="\n")

## 2. Load Parquet Data

Load the main council data from Parquet format.

In [None]:
# Read the pipeline's Parquet output into `df`. When the file is absent,
# explain how to generate it instead of raising.
if not PARQUET_FILE.exists():
    print(f"‚ùå File not found: {PARQUET_FILE}")
    print("\nRun the pipeline first:")
    print("  python scripts/run_pipeline.py --test --limit 10")
else:
    df = pd.read_parquet(PARQUET_FILE)
    print(f"‚úì Loaded {len(df)} papers from Parquet")
    print(f"\nColumns: {df.columns.tolist()}")
    print(f"\nDate range: {df['date'].min()} to {df['date'].max()}")

## 3. Basic Statistics

In [None]:
# Overview of the loaded pipeline output: paper count, city, date range and
# how many of the extracted locations carry coordinates.
print("=" * 70)
print("PIPELINE OUTPUT STATISTICS")
print("=" * 70)

print(f"\nTotal papers: {len(df)}")
print(f"City: {df['city'].unique()[0] if len(df) > 0 else 'N/A'}")
print(f"Date range: {df['date'].min()} to {df['date'].max()}")

# Normalise every cell of the `locations` column to a plain list.
# pandas may materialise Parquet list columns as NumPy arrays rather than
# Python lists, so accept any list-like container. Missing values (None/NaN)
# and dicts (iterating one would yield keys, not records) count as "no
# locations".
location_lists = [
    list(locs)
    if pd.api.types.is_list_like(locs) and not isinstance(locs, dict)
    else []
    for locs in df['locations']
]

# Count papers with locations (explicit bool dtype keeps the mask valid even
# when `df` has no rows)
has_locations = pd.Series(
    [bool(locs) for locs in location_lists], index=df.index, dtype=bool
)
papers_with_locations = df[has_locations]
print(f"\nPapers with locations: {len(papers_with_locations)} / {len(df)}")

# Total locations
total_locations = sum(len(locs) for locs in location_lists)
print(f"Total location mentions: {total_locations}")

# Locations with coordinates. Test for missing values explicitly: a plain
# truthiness check would reject a valid 0.0 coordinate and accept NaN.
geocoded_count = sum(
    1
    for locs in location_lists
    for loc in locs
    if pd.notna(loc.get('latitude')) and pd.notna(loc.get('longitude'))
)

print(f"Geocoded locations: {geocoded_count} / {total_locations}")
if total_locations > 0:
    print(f"Geocoding success rate: {geocoded_count/total_locations*100:.1f}%")
else:
    # Keep the label so the line is still meaningful when there is no data
    print("Geocoding success rate: N/A")

print("\n" + "=" * 70)

## 4. Sample Papers

In [None]:
# Preview the leading rows using the key identifying columns; if any of them
# is missing from the frame, list the columns that do exist instead.
display_cols = ['id', 'name', 'date', 'city']
if not set(display_cols).issubset(df.columns):
    print("\nAvailable columns:", df.columns.tolist())
else:
    print("\nFirst 5 papers:")
    display(df.loc[:, display_cols].head(5))

## 5. Location Analysis

Analyze the extracted locations.

In [None]:
# Flatten the per-paper location collections into one record per mention so
# they can be counted and grouped with ordinary DataFrame operations.
all_locations = []
for _, row in df.iterrows():
    paper_locations = row['locations']
    # pandas may materialise Parquet list columns as NumPy arrays rather than
    # Python lists, so accept any list-like container. Missing values and
    # dicts (iterating one would yield keys, not records) are skipped.
    if isinstance(paper_locations, dict) or not pd.api.types.is_list_like(paper_locations):
        continue
    all_locations.extend(
        {
            'name': loc.get('name', ''),
            'latitude': loc.get('latitude'),
            'longitude': loc.get('longitude'),
            'source': loc.get('source', ''),
            'paper_id': row['id'],
            'paper_title': row.get('name', ''),
            'paper_date': row.get('date', ''),
        }
        for loc in paper_locations
    )

df_locations = pd.DataFrame(all_locations)
print(f"Total location mentions: {len(df_locations)}")

# An empty record list yields a frame without columns, so every column access
# below has to stay behind this guard.
if len(df_locations) > 0:
    # Top locations by count
    location_counts = df_locations['name'].value_counts().head(10)
    print("\n" + "=" * 70)
    print("TOP 10 MOST MENTIONED LOCATIONS")
    print("=" * 70)
    for loc, count in location_counts.items():
        print(f"  {loc:40} {count:3} mentions")

    # Source breakdown
    print("\n" + "=" * 70)
    print("LOCATIONS BY SOURCE")
    print("=" * 70)
    source_counts = df_locations['source'].value_counts()
    for source, count in source_counts.items():
        print(f"  {source:20} {count:3} locations")

    # Geocoding stats: a location only counts as geocoded when BOTH
    # coordinates are present, matching the check in the statistics cell.
    has_coordinates = df_locations['latitude'].notna() & df_locations['longitude'].notna()
    geocoded_locs = df_locations[has_coordinates]
    print(f"\nGeocoded: {len(geocoded_locs)} / {len(df_locations)} ({len(geocoded_locs)/len(df_locations)*100:.1f}%)")
else:
    print("\n‚ö†Ô∏è  No locations found in the data")

## 6. Interactive Map with PDF Links

Create an interactive map using Folium. Each location marker includes:
- Location name
- Paper title and date
- **Direct link to original PDF**

In [None]:
# Render the locations from the pipeline's GeoJSON file on an interactive
# Folium map. Each marker's popup shows the location name, the paper it came
# from and, when available, a link to the original PDF.
from html import escape  # stdlib; neutralises markup in scraped text

if GEOJSON_FILE.exists():
    with open(GEOJSON_FILE, 'r', encoding='utf-8') as f:
        geojson_data = json.load(f)

    print(f"‚úì Loaded GeoJSON with {len(geojson_data['features'])} features")

    # Create map centered on Augsburg
    augsburg_center = [48.3705, 10.8978]
    m = folium.Map(
        location=augsburg_center,
        zoom_start=13,
        tiles='OpenStreetMap'
    )

    # Add markers for each location
    marker_count = 0
    for feature in geojson_data['features']:
        # GeoJSON allows a null geometry / null properties, and non-Point
        # geometries nest their coordinates; only a flat numeric [lon, lat]
        # pair can be placed as a marker, so anything else is skipped.
        geometry = feature.get('geometry') or {}
        coords = geometry.get('coordinates') or []
        if len(coords) < 2 or not all(isinstance(c, (int, float)) for c in coords[:2]):
            continue
        props = feature.get('properties') or {}

        # Build popup HTML with PDF link. The values originate from scraped
        # documents, so they are HTML-escaped before being embedded in markup.
        popup_html = f"""
        <div style="width: 300px;">
            <h4 style="margin-bottom: 10px;">{escape(str(props.get('location_name', 'Unknown')))}</h4>
            <hr style="margin: 5px 0;">
            <p style="margin: 5px 0;"><strong>Paper:</strong> {escape(str(props.get('paper_title', 'N/A')))}</p>
            <p style="margin: 5px 0;"><strong>Date:</strong> {escape(str(props.get('paper_date', 'N/A')))}</p>
            <p style="margin: 5px 0;"><strong>Source:</strong> {escape(str(props.get('source', 'N/A')))}</p>
        """

        # Add PDF link if available (quotes are escaped too, so the URL
        # cannot break out of the href attribute)
        if props.get('pdf_url'):
            pdf_href = escape(str(props['pdf_url']), quote=True)
            popup_html += f'<p style="margin: 10px 0;"><a href="{pdf_href}" target="_blank" style="color: #0066cc; font-weight: bold;">üìÑ Open PDF</a></p>'

        popup_html += "</div>"

        # Add marker
        # NOTE(review): the tooltip text is handed to folium unescaped, as
        # before — confirm how folium renders it before adding escaping here.
        folium.Marker(
            location=[coords[1], coords[0]],  # GeoJSON is [lon, lat], Folium needs [lat, lon]
            popup=folium.Popup(popup_html, max_width=300),
            tooltip=props.get('location_name', 'Unknown'),
            icon=folium.Icon(color='blue', icon='info-sign')
        ).add_to(m)
        marker_count += 1

    # NOTE: marker clustering is not enabled. With many locations, adding the
    # markers to a plugins.MarkerCluster() layer instead of `m` is an option.

    # Display map
    display(m)

    # Report the markers actually placed, which can be fewer than the number
    # of features when some had no usable point geometry.
    print(f"\n‚úì Map created with {marker_count} location markers")
    print("üí° Click on markers to see paper details and PDF links")

else:
    print(f"‚ùå GeoJSON file not found: {GEOJSON_FILE}")
    print("\nThe pipeline should have created this file automatically.")

## 7. RDF Conversion: N-Triples → Turtle

Convert the RDF output from N-Triples (.nt) to Turtle (.ttl) format for better readability and YASGUI compatibility.

In [None]:
# Convert the pipeline's RDF output from N-Triples to Turtle: parse the .nt
# file into an rdflib graph, bind readable prefixes, write the .ttl file and
# print a short preview of the result.
if NT_FILE.exists():
    print(f"Loading RDF from: {NT_FILE}")

    # Create RDF graph
    g = Graph()

    # Parse N-Triples
    g.parse(str(NT_FILE), format='nt')

    print(f"‚úì Loaded {len(g)} triples")

    # Bind common namespaces for prettier output
    g.bind('oparl', Namespace('http://oparl.org/schema/1.1/'))
    g.bind('dct', Namespace('http://purl.org/dc/terms/'))
    g.bind('geo', Namespace('http://www.opengis.net/ont/geosparql#'))
    g.bind('xsd', Namespace('http://www.w3.org/2001/XMLSchema#'))

    # Serialize to Turtle
    print("\nConverting to Turtle format...")
    ttl_content = g.serialize(format='turtle')
    # rdflib releases before 6.0 return bytes here while newer ones return
    # str; normalise to str so the text-mode write and the preview below work
    # with either.
    if isinstance(ttl_content, bytes):
        ttl_content = ttl_content.decode('utf-8')

    # Save to file
    with open(TTL_FILE, 'w', encoding='utf-8') as f:
        f.write(ttl_content)

    print(f"‚úì Saved Turtle file: {TTL_FILE}")
    print(f"  Size: {TTL_FILE.stat().st_size / 1024:.1f} KB")

    # Show sample triples
    print("\n" + "=" * 70)
    print("SAMPLE TURTLE OUTPUT (first 30 lines):")
    print("=" * 70)
    print("\n".join(ttl_content.split('\n')[:30]))
    print("\n... (truncated) ...")

else:
    print(f"‚ùå N-Triples file not found: {NT_FILE}")
    print("\nThe pipeline should have created this file automatically.")

## 8. YASGUI Upload Instructions

To test the RDF data in YASGUI:

1. **Open YASGUI**: https://yasgui.triply.cc/
2. **Upload TTL file**: 
   - Click on "Data" tab
   - Click "Upload file"
   - Select `data/processed/metadata.ttl`
3. **Run SPARQL queries**

### Example SPARQL Queries:

```sparql
# Query 1: Count all papers
PREFIX oparl: <http://oparl.org/schema/1.1/>
SELECT (COUNT(?paper) AS ?count)
WHERE {
  ?paper a oparl:Paper .
}

# Query 2: List all papers with titles
PREFIX oparl: <http://oparl.org/schema/1.1/>
PREFIX dct: <http://purl.org/dc/terms/>
SELECT ?paper ?title ?date
WHERE {
  ?paper a oparl:Paper ;
         dct:title ?title ;
         dct:date ?date .
}
ORDER BY DESC(?date)

# Query 3: Find papers mentioning specific locations
PREFIX oparl: <http://oparl.org/schema/1.1/>
PREFIX dct: <http://purl.org/dc/terms/>
PREFIX geo: <http://www.opengis.net/ont/geosparql#>
SELECT ?paper ?title ?location ?coords
WHERE {
  ?paper a oparl:Paper ;
         dct:title ?title ;
         oparl:relatesToArea ?location .
  ?location geo:asWKT ?coords .
}
```

## 9. File Summary

Summary of all generated files.

In [None]:
# Report, for every artifact the pipeline is expected to produce, whether it
# exists on disk and, if so, how large it is.
print("=" * 70, "GENERATED FILES SUMMARY", "=" * 70, sep="\n")

# (path, human-readable label) pairs, in the order they are reported
files_to_check = [
    (PARQUET_FILE, "Parquet (main data)"),
    (GEOJSON_FILE, "GeoJSON (map data)"),
    (NT_FILE, "N-Triples (RDF)"),
    (TTL_FILE, "Turtle (RDF, converted)"),
]

for file_path, description in files_to_check:
    if not file_path.exists():
        # Missing artifact: flag it and move on to the next one
        print(f"\n‚úó {description}")
        print(f"  Path: {file_path}")
        print("  Status: NOT FOUND")
        continue

    size_kb = file_path.stat().st_size / 1024
    print(f"\n‚úì {description}")
    print(f"  Path: {file_path}")
    print(f"  Size: {size_kb:.1f} KB")

print("\n" + "=" * 70)

## 10. Next Steps

After validating the test run:

1. **Run Full Pipeline** (6 months of data):
   ```bash
   python scripts/run_pipeline.py --city augsburg
   ```

2. **Deeper Analysis**:
   - Temporal patterns (which areas are mentioned when?)
   - Spatial clustering (are decisions focused in certain districts?)
   - Topic modeling combined with locations

3. **Visualization**:
   - Heatmaps of council activity
   - Time-series animations
   - Network graphs of related locations

4. **Export for Research**:
   - Share Turtle file with collaborators
   - Publish to SPARQL endpoint
   - Create interactive web dashboard