<a href="https://colab.research.google.com/github/EtzionR/NLP4GeoAI/blob/main/Text_to_Geo.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# NLP for GeoAI
### created by Etzion Harari | RFL

[**https://github.com/EtzionR/NLP4GeoAI**](https://github.com/EtzionR/NLP4GeoAI)

## Imports

In [94]:
from geopy.geocoders import Nominatim
from transformers import pipeline, AutoModelForTokenClassification, AutoTokenizer
from time import sleep as wait
from tqdm import tqdm

import networkx as nx
import pandas as pd
import folium

## Clone git Repo
[https://github.com/EtzionR/NLP4GeoAI](https://github.com/EtzionR/NLP4GeoAI)

In [95]:
%%bash
# Remove any previous checkout first so that re-running this cell starts clean.
REPO_DIR=NLP4GeoAI
rm -rf "$REPO_DIR"
# Fetch the repository (it ships the data.csv read by the next section) into ./NLP4GeoAI.
git clone https://github.com/EtzionR/NLP4GeoAI.git "$REPO_DIR"

Cloning into 'NLP4GeoAI'...


## Load Data

In [96]:
# Read the corpus that ships with the cloned repository.
df = pd.read_csv(filepath_or_buffer='NLP4GeoAI/data.csv')

# Report (rows, columns) before previewing the table.
print(f'Dataframe shape: {df.shape}')

# Preview the first five records.
df.head(n=5)

Dataframe shape: (23072, 2)


Unnamed: 0,Text,Source
0,"Last week, Sen. Malcolm Wallop -LRB- R., Wyo. ...",ontonotes5
1,Rules that set standards for products or gover...,ontonotes5
2,Determining when handicapped access is require...,ontonotes5
3,"``It's very costly and time-consuming ,'' says...",ontonotes5
4,"Next to medical insurance, ``costs of complian...",ontonotes5


## Activate NER model
[https://huggingface.co/dslim/bert-base-NER](https://huggingface.co/dslim/bert-base-NER)

In [97]:
# Checkpoint to load and a short sentence used to sanity-check the pipeline.
MODEL = "dslim/bert-base-NER"
TEXT = "Sam Altman visited OpenAI in San Francisco."

# Token-classification pipeline. The aggregation strategy groups token-level
# predictions into entity spans (one row per entity in the table below).
ner = pipeline(task="ner", model=MODEL, aggregation_strategy='average')

# Run the model once on the example sentence.
output = ner(TEXT)

print('\n\n\nNER output:\n\n')

# One row per detected entity.
pd.DataFrame(data=output)

Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cuda:0





NER output:




Unnamed: 0,entity_group,score,word,start,end
0,PER,0.999328,Sam Altman,0,10
1,ORG,0.664299,OpenAI,19,25
2,LOC,0.999392,San Francisco,29,42


## Run NER on the entire corpus

In [None]:
# Collect one record per detected entity, across every text of the corpus.
outputs = []

# Each row of df.values is unpacked as a (text, source) pair.
for row_text, row_source in tqdm(df.values):
    for found in ner(row_text):
        # Attach provenance so every entity can be traced back to its text.
        found['source'] = row_source
        found['text'] = row_text
        outputs.append(found)

# Turn the list of entity dicts into a table (one row per entity).
outputs = pd.DataFrame(outputs)

print(f'\n\nNER output shape: {outputs.shape}\n')

outputs

 80%|████████  | 18525/23072 [09:24<02:39, 28.59it/s]

## Display the Top K locations from the corpus

In [None]:
# Number of most frequently mentioned locations to keep.
k = 10

# Count how often each LOC surface form occurs and keep the k most frequent.
# rename_axis / reset_index(name=...) pin the resulting columns to `word`
# (place name) and `count` (number of mentions). A bare reset_index() names
# them differently across pandas versions (`index`/`word` before 2.0,
# `word`/`count` from 2.0 on), and later cells read `top_k_locations.word`.
top_k_locations = (outputs[outputs.entity_group=='LOC']
                   .word
                   .value_counts()
                   .head(k)
                   .rename_axis('word')
                   .reset_index(name='count'))
top_k_locations

## Use the Nominatim geocoder (via geopy) to geocode an example place name

In [None]:
# Free-text place name used to demonstrate a single geocoding request.
example = "Tel Aviv, Israel"

# A user_agent identifying the application is passed to the geocoder.
# The timeout is raised above geopy's default so that a slow response
# does not abort the request.
geolocator = Nominatim(user_agent="GeoAI_Course_Geocoder", timeout=10)

location = geolocator.geocode(example)

# geocode() returns None when the service finds no match for the query.
if location is None:
    print(f'No geocoding result found for: {example}')
else:
    print(f'Location full address:\n{location.address}\n')
    print(f'WGS84 GEO X = {round(location.longitude,6)}, Y = {round(location.latitude,6)}')

## Geocode the Top K place names

In [None]:
# Pause (in seconds) between consecutive requests to the geocoding service.
time_gap = 1.25

x_coords = []
y_coords = []

for placename in tqdm(top_k_locations.word):
    loc = geolocator.geocode(placename)

    # geocode() returns None when no match is found; record NaN instead of
    # crashing on `None.longitude` so the remaining places are still processed.
    x_coords.append(loc.longitude if loc is not None else float('nan'))
    y_coords.append(loc.latitude if loc is not None else float('nan'))

    wait(time_gap)

top_k_locations['x'] = x_coords
top_k_locations['y'] = y_coords

# Report and drop the places that could not be geocoded: rows without
# coordinates cannot be placed on a map.
not_found = top_k_locations[top_k_locations.x.isna()].word.tolist()
if not_found:
    print(f'\nNo geocoding result for: {not_found}')

top_k_locations = top_k_locations.dropna(subset=['x', 'y']).reset_index(drop=True)

top_k_locations

## Create a folium map of the Top K places in the corpus

In [None]:

# World-level map centred on (lat=0, lon=0).
fmap = folium.Map(location=[0, 0], zoom_start=3)

# (name, longitude, latitude) triples of the geocoded top-k places.
marker_rows = list(zip(top_k_locations.word, top_k_locations.x, top_k_locations.y))

# folium expects [latitude, longitude], i.e. [y, x].
for place, lon, lat in marker_rows:
    marker = folium.Marker([lat, lon], popup=place, tooltip=place)
    marker.add_to(fmap)

# Place names in table order, and a name -> (lat, lon) lookup for later cells.
places = [place for place, _, _ in marker_rows]
place_to_xy = {place: (lat, lon) for place, lon, lat in marker_rows}

fmap

## Construct a graph from pairs of location entities that appear in the same text

In [None]:
# Co-occurrence graph: nodes are top-k places, an edge links two places that
# are mentioned in the same text, weighted by the number of such texts.
topk_places = set(places)

G = nx.Graph()

# Only entity mentions whose surface form is one of the top-k places can
# contribute to an edge; .loc on the original frame avoids assigning to a slice.
mentions = outputs.loc[outputs.word.isin(topk_places), ['text', 'word']]

# Number of texts in which each unordered pair of places co-occurs.
pair_counts = {}

for _, words in mentions.groupby('text', sort=False).word:

    # Unique + sorted place names of this text:
    #  * unique -> a place tagged with several entity types (or repeated in
    #    the text) is counted once, so no self-loops and no double counting;
    #  * sorted -> every unordered pair always maps to the same dict key.
    text_places = sorted(set(words))

    for i in range(len(text_places)):
        for j in range(i+1, len(text_places)):
            pair = (text_places[i], text_places[j])
            pair_counts[pair] = pair_counts.get(pair, 0) + 1

for (placei, placej), count in pair_counts.items():
    G.add_edge(placei, placej, weight=count)

# Key the weights by the orientation in which G.edges reports each edge, so
# that `edge_weight[(i, j)]` for `i, j in G.edges` always finds the full count.
edge_weight = {(placei, placej): count
               for placei, placej, count in G.edges(data='weight')}


print(f'\n\n\nConnections Graph created!\n|V| = {len(G.nodes)}\n|E| = {len(G.edges)}')

## Display the Graph on the MAP

In [None]:
# Fresh world-level map for the graph view.
fmap = folium.Map(location=[0, 0], zoom_start=3)

# One blue marker per geocoded top-k place (folium expects [lat, lon] = [y, x]).
for name,x,y in zip(top_k_locations.word, top_k_locations.x, top_k_locations.y):
    folium.Marker([y,x], popup=name, tooltip=name,icon = folium.Icon(color="blue") ).add_to(fmap)

for i,j in G.edges:
    # A self-loop would be a zero-length line; nothing to draw.
    if i == j:
        continue

    # G is undirected and reports each edge in a single orientation, while
    # edge_weight is keyed by ordered tuples: the count of one edge may be
    # stored under (i, j), (j, i) or both, so combine the two lookups.
    connections = edge_weight.get((i,j), 0) + edge_weight.get((j,i), 0)

    if connections:
        folium.PolyLine(locations=[place_to_xy[j],
                                   place_to_xy[i]],
                        color="blue",
                        opacity=0.3,
                        tooltip=f'Side 1: {i}<br>Side 2: {j}<br>Connections: {connections}',
                        # square root keeps heavily connected pairs from dominating the map
                        weight=connections**.5).add_to(fmap)

fmap

[https://github.com/EtzionR/NLP4GeoAI](https://github.com/EtzionR/NLP4GeoAI)