In [None]:
# -*- sos -*-
###############################################################################


# 1. Generate query for healthcare facilities
[make_query_healthcare]
output: 'query_healthcare.osmql'
python: expand=True
    query = """[out:csv(::id,::type,name,amenity,healthcare,operator,
         "operator:type","healthcare:speciality","addr:city",
         ::lat,::lon; true; ",")][timeout:1800];

    area["ISO3166-1"="PL"][admin_level=2]->.pl;

    (
      nwr(area.pl)[amenity=hospital];
      nwr(area.pl)[amenity=clinic];
      nwr(area.pl)[amenity=pharmacy];
    );
    out center;

    """
    with open("{_output}", "w", encoding="utf-8") as f:
        f.write(query)

###############################################################################
# 2. Execute query and download data
[healthcare_data]
input: 'query_healthcare.osmql'
output: 'query_healthcare.csv'
sh: expand=True
  curl -X POST -d @{_input} https://overpass-api.de/api/interpreter -o {_output}


###############################################################################
# 3. Analyse healthcare data and generate a multi-page PDF report
[healthcare_analysis]
input: 'query_healthcare.csv'
output: 'healthcare_analysis.pdf'
python:
    import pandas as pd
    import numpy as np
    import matplotlib.pyplot as plt
    from matplotlib.backends.backend_pdf import PdfPages
    from math import radians, sin, cos, sqrt, atan2
    import matplotlib


    def plot_heatmap(data, title, label_column='city', top_n=10):
        import matplotlib.patheffects as pe
        fig, ax = plt.subplots(figsize=(6, 6))
        hb = ax.hexbin(data['lon'], data['lat'],
                       gridsize=120, cmap='inferno',
                       bins='log', mincnt=1)
        ax.set_title(title, fontweight='bold')
        ax.set_xlabel('Longitude')
        ax.set_ylabel('Latitude')
        fig.colorbar(hb, ax=ax, label='log10(count)')
        stats = (
            data.groupby(label_column)
                .agg(count=(label_column, 'size'),
                     centroid_lat=('lat', 'mean'),
                     centroid_lon=('lon', 'mean'))
                .reset_index()
                .sort_values('count', ascending=False)
        )
        for _, row in stats.head(top_n).iterrows():
            if row[label_column] != 'unknown':
                ax.text(row.centroid_lon, row.centroid_lat, row[label_column],
                        fontsize=8, ha='center', va='center', color='lightgreen',
                        fontweight='bold',
                        path_effects=[pe.withStroke(linewidth=2, foreground='black')])
        plt.tight_layout()
        pdf.savefig(fig)
        plt.close(fig)




    def plot_stacked_bar(df_grouped, title):
        plt.figure(figsize=(10, 5))
        df_grouped.plot(kind='bar', stacked=True, ax=plt.gca())
        plt.title(title)
        plt.xlabel('City'); plt.ylabel('Number of facilities')
        plt.xticks(rotation=45, ha='right')
        plt.tight_layout()
        pdf.savefig(); plt.close()

    def plot_scatter(df, top_n=10):
        import matplotlib.patheffects as pe
        plt.figure(figsize=(6, 6))
        for amenity, group in df.groupby('amenity'):
            plt.scatter(group['lon'], group['lat'], s=5, alpha=0.6, label=amenity)

        plt.title('Geographical distribution by facility type')
        plt.xlabel('Longitude')
        plt.ylabel('Latitude')

        city_counts = (
            df[df['city'] != 'unknown']
            .groupby('city')
            .size()
            .sort_values(ascending=False)
        )
        top_cities = city_counts.head(top_n).index

        centroids = (
            df[df['city'].isin(top_cities)]
            .groupby('city')
            .agg(centroid_lat=('lat', 'mean'),
                 centroid_lon=('lon', 'mean'))
            .reset_index()
        )

        for _, row in centroids.iterrows():
            plt.text(row.centroid_lon, row.centroid_lat, row['city'],
                     fontsize=7, ha='center', va='center', color='lightgreen',
                     fontweight='bold',
                     path_effects=[pe.withStroke(linewidth=1.5, foreground='black')])

        plt.legend(markerscale=3, fontsize=8)
        plt.tight_layout()
        pdf.savefig(); plt.close()

    def plot_scatter_split_by_type(df, amenities, top_n=10):
        import matplotlib.patheffects as pe
        colors = {
            'clinic': 'tab:blue',
            'hospital': 'tab:orange',
            'pharmacy': 'tab:green'
        }

        for amenity in amenities:
            subset = df[df['amenity'] == amenity]
            if subset.empty:
                continue

            fig, ax = plt.subplots(figsize=(6, 6))
            ax.scatter(subset['lon'], subset['lat'], s=5, alpha=0.6,
                       label=amenity, color=colors.get(amenity, 'gray'))

            ax.set_title(f'{amenity.title()} – geographical distribution')
            ax.set_xlabel('Longitude')
            ax.set_ylabel('Latitude')

            city_counts = (
                subset[subset['city'] != 'unknown']
                .groupby('city').size().sort_values(ascending=False)
            )
            top_cities = city_counts.head(top_n).index

            centroids = (
                subset[subset['city'].isin(top_cities)]
                .groupby('city')
                .agg(centroid_lat=('lat', 'mean'),
                     centroid_lon=('lon', 'mean'))
                .reset_index()
            )
            for _, row in centroids.iterrows():
                ax.text(row.centroid_lon, row.centroid_lat, row['city'],
                        fontsize=7, ha='center', va='center', color='lightgreen',
                        fontweight='bold',
                        path_effects=[pe.withStroke(linewidth=1.5, foreground='black')])

            legend_lines = [f"{city}: {city_counts[city]}" for city in top_cities]
            legend_lines.append(f"\nTotal cities: {city_counts.shape[0]}")
            legend_text = "\n".join(legend_lines)

            ax.text(1.05, 1.0, legend_text,
                    transform=ax.transAxes,
                    fontsize=8, va='top', ha='left',
                    bbox=dict(boxstyle="round", fc="white", ec="gray", alpha=0.8))

            plt.tight_layout()
            pdf.savefig(fig)
            plt.close(fig)




    EARTH_R = 6371.0  # km

    def haversine(lat1, lon1, lat2, lon2):
        dlat = radians(lat2 - lat1)
        dlon = radians(lon2 - lon1)
        a = sin(dlat/2)**2 + cos(radians(lat1)) * cos(radians(lat2)) * sin(dlon/2)**2
        return 2 * EARTH_R * atan2(sqrt(a), sqrt(1 - a))

    def plot_most_isolated_cities(df, title, pdf):
        # 1. Pick only hospitals and city centroids
        hospitals = df[df['amenity'] == 'hospital']
        centroids = (
            hospitals[hospitals['city'] != 'unknown']
            .groupby('city')[['lat', 'lon']]
            .mean()
            .reset_index()
        )

        # 2. for each city we search for nearest other city with hospital
        nearest_dist = []
        for i, row_i in centroids.iterrows():
            min_d = float('inf')
            for j, row_j in centroids.iterrows():
                if i == j:
                    continue
                d = haversine(row_i['lat'], row_i['lon'], row_j['lat'], row_j['lon'])
                if d < min_d:
                    min_d = d
            nearest_dist.append((row_i['city'], round(min_d, 2)))   # (miasto, km do najbliższego)

        # 3. sort desc and choose TOP-10 most isolated
        top10 = sorted(nearest_dist, key=lambda x: -x[1])[:10]
        labels  = [city for city, _ in top10]
        values  = [dist for _,  dist in top10]

        #chart
        plt.figure(figsize=(10, 4))
        bars = plt.bar(labels, values)
        plt.title(title)
        plt.ylabel('Distance to the nearest hospital city (km)')
        plt.xticks(rotation=45, ha='right')

        for bar, val in zip(bars, values):
            plt.text(bar.get_x() + bar.get_width()/2,
                     bar.get_height() + 1,
                     f'{val} km',
                     ha='center', va='bottom', fontsize=8)

        plt.tight_layout()
        pdf.savefig(); plt.close()



    def plot_city_type_heatmap(df, top_n_cities=20):
        # prepare pivot table: count of facilities by amenity type and city
        tbl = (df[df["city"] != "unknown"]
               .pivot_table(index="amenity",
                            columns="city",
                            aggfunc="size",
                            fill_value=0))

        # keep only top N cities with the most total facilities
        big = tbl.sum().sort_values(ascending=False).head(top_n_cities).index
        tbl = tbl[big]

        # draw heatmap 
        plt.figure(figsize=(10, 6), constrained_layout=True)
        im = plt.imshow(tbl, aspect="auto", cmap="viridis")

        # axis labels
        plt.xticks(ticks=range(len(tbl.columns)), labels=tbl.columns, rotation=45, ha="right")
        plt.yticks(ticks=range(len(tbl.index)),   labels=tbl.index)

        # label each row with the max city (red outline)
        for y, row in enumerate(tbl.values):
            x = row.argmax()
            plt.text(x, y, f"{row[x]}", va="center", ha="center",
                     color="white" if row[x] > row.mean() else "black",
                     bbox=dict(boxstyle="round,pad=0.2", fc="none", ec="red", lw=1))

        plt.colorbar(im, label="Number of facilities")
        plt.title("Top cities per healthcare facility type")
        pdf.savefig(); plt.close()

    def plot_bar_per_amenity(df, amenities, top_n=15):
        colors = {
            'clinic': 'tab:blue',
            'hospital': 'tab:orange',
            'pharmacy': 'tab:green'
        }

        for amenity in amenities:
            subset = df[df['amenity'] == amenity]
            city_counts = (
                subset[subset['city'] != 'unknown']
                .groupby('city').size().sort_values(ascending=False).head(top_n)
            )

            plt.figure(figsize=(10, 4))
            city_counts.plot(kind='bar', color=colors[amenity])
            plt.title(f'{amenity.title()} – top {top_n} cities')
            plt.ylabel('Number of facilities')
            plt.xlabel('City')
            plt.xticks(rotation=45, ha='right')
            plt.tight_layout()
            pdf.savefig(); plt.close()





    df = pd.read_csv('query_healthcare.csv')
    df = df.rename(columns={'@lat': 'lat', '@lon': 'lon'})
    df = df.dropna(subset=['lat', 'lon'])

    # Keep only selected relevant amenity types
    relevant_amenities = [
        'hospital', 'clinic', 'pharmacy',
    ]

    df = df[df['amenity'].isin(relevant_amenities)]

    df['city'] = df['addr:city'].fillna('unknown')
    known_df = df[df['city'] != 'unknown']

    city_amenity = (
        known_df.groupby(['city', 'amenity'])
                .size()
                .unstack(fill_value=0)
    )
    top_cities = city_amenity.sum(axis=1).sort_values(ascending=False).head(20)
    city_amenity_top = city_amenity.loc[top_cities.index]

    with PdfPages('healthcare_analysis.pdf') as pdf:
        plot_heatmap(df, 'Spatial density of healthcare facilities', top_n=18)
        plot_stacked_bar(city_amenity_top, 'Healthcare facilities by city (top 20 cities)')
        plot_bar_per_amenity(df, relevant_amenities, top_n=15)
        plot_scatter(df,top_n=18)
        plot_scatter_split_by_type(df, relevant_amenities,top_n=18)
        plot_most_isolated_cities(df, 'Top 10 isolated hospital cities', pdf)
        plot_city_type_heatmap(df, top_n_cities=20)
        info = pdf.infodict()
        info['Title'] = 'Healthcare facilities in Poland'
        info['Author'] = 'SoS pipeline'



In [59]:
%sosrun -t healthcare_analysis.pdf

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100   429    0    99  100   330    189    630 --:--:-- --:--:-- --:--:--   820


Traceback (most recent call last):
  File "/tmp/tmpklzr3db8.py", line 197, in <module>
    plot_stacked_bar(city_amenity_top, 'Healthcare facilities by city and facility type (top 20 cities)')
  File "/tmp/tmpklzr3db8.py", line 33, in plot_stacked_bar
    df_grouped.plot(kind='bar', stacked=True, ax=plt.gca())
  File "/opt/conda/lib/python3.10/site-packages/pandas/plotting/_core.py", line 1000, in __call__
    return plot_backend.plot(data, kind=kind, **kwargs)
  File "/opt/conda/lib/python3.10/site-packages/pandas/plotting/_matplotlib/__init__.py", line 71, in plot
    plot_obj.generate()
  File "/opt/conda/lib/python3.10/site-packages/pandas/plotting/_matplotlib/core.py", line 450, in generate
    self._compute_plot_data()
  File "/opt/conda/lib/python3.10/site-packages/pandas/plotting/_matplotlib/core.py", line 635, in _compute_plot_data
    raise TypeError("no numeric data to plot")
TypeError: no numeric data to plot
[91mERROR[0m: [91m[healthcare_analysis (healthcare_analysis]: 

RuntimeError: Workflow exited with code 1