## Global Location Extraction and Geospatial Mapping Using NLP and GIS


In [1]:
# Download the en_core_web_trf spaCy pipeline package so it can be loaded by name.
!python -m spacy download en_core_web_trf


Collecting en-core-web-trf==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_trf-3.8.0/en_core_web_trf-3.8.0-py3-none-any.whl (457.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m457.4/457.4 MB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting spacy-curated-transformers<1.0.0,>=0.2.2 (from en-core-web-trf==3.8.0)
  Downloading spacy_curated_transformers-0.3.1-py2.py3-none-any.whl.metadata (2.7 kB)
Collecting curated-transformers<0.2.0,>=0.1.0 (from spacy-curated-transformers<1.0.0,>=0.2.2->en-core-web-trf==3.8.0)
  Downloading curated_transformers-0.1.1-py2.py3-none-any.whl.metadata (965 bytes)
Collecting curated-tokenizers<0.1.0,>=0.0.9 (from spacy-curated-transformers<1.0.0,>=0.2.2->en-core-web-trf==3.8.0)
  Downloading curated_tokenizers-0.0.9-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.9 kB)
Downloading spacy_curated_transformers-0.3.1-py2.py3-none-any.whl (237 kB)
[2K   [90m━━━━━

In [2]:
import re, json, time
from pathlib import Path
from collections import defaultdict
import pandas as pd, geopandas as gpd
from shapely.geometry import Point
from tqdm import tqdm
import folium
from folium.plugins import MarkerCluster
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter
import spacy

# Folder that receives the exported files; created on first run if missing
OUTPUT_DIR = Path("output")
OUTPUT_DIR.mkdir(exist_ok=True)

# Name of the spaCy pipeline to load
SPACY_MODEL = "en_core_web_trf"  # or "en_core_web_sm" for low-RAM
# Pipeline components that are switched off when the model is loaded
DISABLED_COMPONENTS = ["parser", "lemmatizer", "textcat"]
nlp = spacy.load(SPACY_MODEL, disable=DISABLED_COMPONENTS)

print("✅ spaCy model loaded successfully")


✅ spaCy model loaded successfully


In [3]:
import spacy
import pandas as pd
import re
from tqdm import tqdm

# Reuse the `nlp` pipeline loaded in the setup cell (configured through
# SPACY_MODEL) instead of silently replacing it here with a second, different
# model that this notebook never downloads. To trade accuracy for speed, change
# SPACY_MODEL in the setup cell so every cell uses the same pipeline.
assert "ner" in nlp.pipe_names, "The loaded spaCy pipeline has no NER component"

# Sample messages fed to the entity extractor; each list position becomes the
# `text_id` of the rows extracted from that message.
texts = [
    "Massive flooding reported in Lagos, Oshodi this morning.",
    "Wildfire spreading near Santa Rosa, California — evacuations underway.",
    "Meeting scheduled in Abuja next week with the Ministry of Works.",
    "Just landed in London. The weather is terrible but the coffee is good!",
    "Earthquake felt in Kathmandu and nearby villages.",
    "New factory opening in Shenzhen; huge investment announced.",
    "Protest in downtown Minneapolis over recent events.",
]

# Substrings starting with "http" or "www" (up to the next whitespace),
# "@word" mentions, and bare "#" characters.
_ENTITY_NOISE = re.compile(r"http\S+|www\S+|@\w+|#")
# Characters trimmed from both ends of the cleaned string.
_EDGE_CHARS = " .,:;\"'()[]"


def clean_entity(text):
    """Strip link, mention and hash noise from an entity string.

    Args:
        text: Raw entity text.

    Returns:
        ``text`` with every noise match deleted and leading/trailing spaces
        and punctuation removed. Interior whitespace is left as-is, and the
        result may be an empty string.
    """
    return _ENTITY_NOISE.sub("", text).strip(_EDGE_CHARS)

# ====================================================================
# Extract GPE mentions and merge them into one location string per text
# ====================================================================

rows = []

for text_id, message in enumerate(tqdm(texts, desc="Extracting locations")):
    # Keep only geopolitical entities (countries, cities, states).
    gpe_mentions = [span.text for span in nlp(message).ents if span.label_ == "GPE"]
    if not gpe_mentions:
        continue

    # One join covers both cases: several mentions become a single
    # space-separated location, and a lone mention is returned unchanged.
    entity = clean_entity(" ".join(gpe_mentions))
    if not entity:
        continue

    rows.append({"text_id": text_id, "text": message, "entity": entity, "label": "GPE"})

df_entities = pd.DataFrame(rows)
print("✅ Entities extracted:", len(df_entities))
print(df_entities)

Extracting locations: 100%|██████████| 7/7 [00:00<00:00, 86.21it/s]

✅ Entities extracted: 7
   text_id                                               text  \
0        0  Massive flooding reported in Lagos, Oshodi thi...   
1        1  Wildfire spreading near Santa Rosa, California...   
2        2  Meeting scheduled in Abuja next week with the ...   
3        3  Just landed in London. The weather is terrible...   
4        4  Earthquake felt in Kathmandu and nearby villages.   
5        5  New factory opening in Shenzhen; huge investme...   
6        6  Protest in downtown Minneapolis over recent ev...   

                  entity label  
0                  Lagos   GPE  
1  Santa Rosa California   GPE  
2                  Abuja   GPE  
3                 London   GPE  
4              Kathmandu   GPE  
5               Shenzhen   GPE  
6            Minneapolis   GPE  





In [4]:
# Geocode every distinct entity once with Nominatim and attach the result
# (lat, lon, display address) to each extracted row.

geolocator = Nominatim(user_agent="colab-loc-ner-mapper", timeout=10)
# Space successive requests at least one second apart.
rate_geocode = RateLimiter(geolocator.geocode, min_delay_seconds=1)

GEO_COLUMNS = ["lat", "lon", "display"]

# Make the cell safe to re-run: without this, merging into a frame that already
# holds coordinates from a previous run would create lat_x/lat_y style
# duplicates and break the cells that read "lat" and "lon".
df_entities = df_entities.drop(columns=GEO_COLUMNS, errors="ignore")

unique_entities = sorted(df_entities["entity"].unique())
geo_results = []

for place in tqdm(unique_entities, desc="Geocoding"):
    loc = rate_geocode(place, language="en")
    # A falsy result means no usable match came back; keep the entity with
    # empty coordinates so the left merge below still finds a row for it.
    geo_results.append({
        "entity": place,
        "lat": loc.latitude if loc else None,
        "lon": loc.longitude if loc else None,
        "display": loc.address if loc else None,
    })

# Explicit columns keep the merge key present even when nothing was geocoded.
df_geo = pd.DataFrame(geo_results, columns=["entity", *GEO_COLUMNS])
df_entities = df_entities.merge(df_geo, on="entity", how="left")

print("✅ Geocoding complete")
df_entities.to_csv(OUTPUT_DIR / "entities_geocoded.csv", index=False)
df_entities.head()


Geocoding: 100%|██████████| 7/7 [00:06<00:00,  1.00it/s]

✅ Geocoding complete





Unnamed: 0,text_id,text,entity,label,lat,lon,display
0,0,"Massive flooding reported in Lagos, Oshodi thi...",Lagos,GPE,6.455057,3.394179,"Lagos, Lagos Island, Lagos State, 100242, Nigeria"
1,1,"Wildfire spreading near Santa Rosa, California...",Santa Rosa California,GPE,38.440492,-122.714105,"Santa Rosa, Sonoma County, California, United ..."
2,2,Meeting scheduled in Abuja next week with the ...,Abuja,GPE,9.064331,7.489297,"Abuja, Municipal Area Council, Federal Capital..."
3,3,Just landed in London. The weather is terrible...,London,GPE,51.489334,-0.144055,"London, Greater London, England, United Kingdom"
4,4,Earthquake felt in Kathmandu and nearby villages.,Kathmandu,GPE,27.708317,85.320582,"Kathmandu Metropolitan City, Kathmandu, Bagama..."


In [5]:
# Export the geocoded rows as a GeoJSON point layer

# Rows without coordinates cannot become point features, so leave them out.
valid = df_entities.dropna(subset=["lat", "lon"])

# Points take (x, y), i.e. (longitude, latitude).
point_geometries = [Point(lon, lat) for lon, lat in zip(valid["lon"], valid["lat"])]
gdf = gpd.GeoDataFrame(valid, geometry=point_geometries, crs="EPSG:4326")

geojson_path = OUTPUT_DIR / "locations.geojson"
gdf.to_file(geojson_path, driver="GeoJSON")
print("✅ GeoJSON saved →", geojson_path)


✅ GeoJSON saved → output/locations.geojson


In [13]:
# Build an interactive folium map of the geocoded locations, save it as HTML
# (once in the working directory, once in OUTPUT_DIR) and show it inline.
if not gdf.empty:
    from html import escape
    from IPython.display import display

    center = [gdf["lat"].mean(), gdf["lon"].mean()]
    # tiles=None: every basemap is added explicitly below, so the default
    # OpenStreetMap layer does not show up a second time in the layer control.
    m = folium.Map(location=center, zoom_start=3, tiles=None)

    # --- Add multiple basemap layers ---
    folium.TileLayer('OpenStreetMap', name='🗺️ OpenStreetMap').add_to(m)
    folium.TileLayer('CartoDB positron', name='🌤️ CartoDB Light').add_to(m)
    folium.TileLayer('CartoDB dark_matter', name='🌙 CartoDB Dark').add_to(m)

    # The Stamen styles are no longer available under their old built-in names;
    # point at the Stadia Maps-hosted tiles explicitly.
    # NOTE(review): Stadia may require an API key or a registered domain outside
    # local development — confirm for the environment this map is served from.
    stadia_attr = (
        '&copy; <a href="https://stadiamaps.com/">Stadia Maps</a> '
        '&copy; <a href="https://stamen.com/">Stamen Design</a> '
        '&copy; <a href="https://openmaptiles.org/">OpenMapTiles</a> '
        '&copy; <a href="https://www.openstreetmap.org/copyright">OpenStreetMap</a> contributors'
    )
    folium.TileLayer(
        tiles="https://tiles.stadiamaps.com/tiles/stamen_terrain/{z}/{x}/{y}{r}.png",
        attr=stadia_attr,
        name='🌎 Stamen Terrain'
    ).add_to(m)
    folium.TileLayer(
        tiles="https://tiles.stadiamaps.com/tiles/stamen_toner/{z}/{x}/{y}{r}.png",
        attr=stadia_attr,
        name='toner'
    ).add_to(m)

    # Esri satellite imagery
    folium.TileLayer(
        tiles="https://server.arcgisonline.com/ArcGIS/rest/services/World_Imagery/MapServer/tile/{z}/{y}/{x}",
        attr="Esri World Imagery",
        name='🛰️ Esri Satellite'
    ).add_to(m)

    cluster = MarkerCluster().add_to(m)

    for _, row in gdf.iterrows():
        # Popup and tooltip content is rendered as HTML, so escape the
        # extracted text to keep markup in a source message from being injected.
        entity_html = escape(str(row['entity']))
        popup = f"<b>{entity_html}</b><br>{escape(str(row['text']))}"
        folium.Marker(
            location=[row['lat'], row['lon']],
            popup=popup,
            tooltip=entity_html
        ).add_to(cluster)

    # --- Add Layer Control for switching maps ---
    folium.LayerControl(position='topright', collapsed=False).add_to(m)

    # --- Save to HTML ---
    m.save("GIS_LLM_WebApp.html")
    print("✅ Map saved as GIS_LLM_WebApp.html — open it in your browser.")

    m.save(str(OUTPUT_DIR / "map.html"))
    # A bare `m` inside an if-body is not the cell's last top-level expression,
    # so it would not be rendered; display it explicitly.
    display(m)
else:
    print("No valid coordinates to map.")

✅ Map saved as GIS_LLM_WebApp.html — open it in your browser.


In [10]:
!pip install --upgrade folium branca



In [15]:
from IPython.display import display, HTML

# A positional string is treated as HTML markup, so HTML("GIS_LLM_WebApp.html")
# would just print the file name. filename= makes IPython read the saved map
# from disk and embed its contents.
display(HTML(filename="GIS_LLM_WebApp.html"))

In [34]:
from google.colab import files

# Hand each exported artefact to the browser as a download, in this order.
for artefact_path in (
    "output/entities_geocoded.csv",
    "output/locations.geojson",
    "output/map.html",
):
    files.download(artefact_path)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>