<a href="https://colab.research.google.com/github/EtzionR/NLP4GeoAI/blob/main/Text_to_Geo.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [65]:
from geopy.geocoders import Nominatim
from transformers import pipeline, AutoModelForTokenClassification, AutoTokenizer
from time import sleep as wait
from tqdm import tqdm

import networkx as nx
import pandas as pd
import folium

In [66]:
%%bash
rm -rf NLP4GeoAI
git clone https://github.com/EtzionR/NLP4GeoAI.git

Cloning into 'NLP4GeoAI'...


In [67]:
df = pd.read_csv('NLP4GeoAI/data.csv')

print(f'Dataframe shape: {df.shape}')

df.head()

Dataframe shape: (23072, 2)


Unnamed: 0,Text,Source
0,"Last week, Sen. Malcolm Wallop -LRB- R., Wyo. ...",ontonotes5
1,Rules that set standards for products or gover...,ontonotes5
2,Determining when handicapped access is require...,ontonotes5
3,"``It's very costly and time-consuming ,'' says...",ontonotes5
4,"Next to medical insurance, ``costs of complian...",ontonotes5


In [79]:
MODEL = "dslim/bert-base-NER" # https://huggingface.co/dslim/distilbert-NER
TEXT = "Sam Altman visited OpenAI in San Francisco."

ner = pipeline("ner",
               model=MODEL,
               aggregation_strategy='average')

output = ner(TEXT)

print('\n\n\nNER output:\n\n')

pd.DataFrame(output)

Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cuda:0





NER output:




Unnamed: 0,entity_group,score,word,start,end
0,PER,0.999328,Sam Altman,0,10
1,ORG,0.664299,OpenAI,19,25
2,LOC,0.999392,San Francisco,29,42


In [71]:
outputs = []

for text, source in tqdm(df.values):
    entites = ner(text)

    for entity in entites:

        entity['source'] = source
        entity['text'] = text

        outputs.append(entity)

outputs = pd.DataFrame(outputs)

print(f'\n\nNER output shape: {outputs.shape}\n')

outputs

100%|██████████| 23072/23072 [11:48<00:00, 32.59it/s]




NER output shape: (62643, 7)



Unnamed: 0,entity_group,score,word,start,end,source,text
0,PER,0.999573,Malcolm Wallop,16,30,ontonotes5,"Last week, Sen. Malcolm Wallop -LRB- R., Wyo. ..."
1,ORG,0.500653,LRB,32,35,ontonotes5,"Last week, Sen. Malcolm Wallop -LRB- R., Wyo. ..."
2,LOC,0.879631,R,37,38,ontonotes5,"Last week, Sen. Malcolm Wallop -LRB- R., Wyo. ..."
3,LOC,0.428096,Wyo,41,44,ontonotes5,"Last week, Sen. Malcolm Wallop -LRB- R., Wyo. ..."
4,ORG,0.499855,RRB,47,50,ontonotes5,"Last week, Sen. Malcolm Wallop -LRB- R., Wyo. ..."
...,...,...,...,...,...,...,...
62638,LOC,0.999821,England,158,165,conll2003,He guided Ireland to two successive World Cup ...
62639,ORG,0.999167,Leeds United,17,29,conll2003,The lanky former Leeds United defender did not...
62640,LOC,0.999798,England,56,63,conll2003,The lanky former Leeds United defender did not...
62641,MISC,0.996952,World Cup,150,159,conll2003,The lanky former Leeds United defender did not...


In [72]:
k = 10

top_k_locations = outputs[outputs.entity_group=='LOC'].word.value_counts().head(k).reset_index()
top_k_locations

Unnamed: 0,word,count
0,China,1484
1,U. S.,1289
2,Taiwan,1252
3,Iraq,605
4,Japan,595
5,US,508
6,Israel,446
7,United States,418
8,New York,406
9,Hong Kong,403


In [73]:
example = "Tel Aviv, Israel"

geolocator = Nominatim(user_agent="GeoAI_Course_Geocoder")

location = geolocator.geocode(example)

print(f'Location full adress:\n{location.address}\n')
print(f'WGS84 GEO X = {round(location.longitude,6)}, Y = {round(location.latitude,6)}')

Location full adress:
תל־אביב–יפו, נפת תל אביב, מחוז תל אביב, ישראל

WGS84 GEO X = 34.781806, Y = 32.0853


In [74]:
time_gap = 1.25

x_coords = []
y_coords = []

for placename in tqdm(top_k_locations.word):
    loc = geolocator.geocode(placename)

    x_coords.append(loc.longitude)
    y_coords.append(loc.latitude)

    wait(time_gap)

top_k_locations['x'] = x_coords
top_k_locations['y'] = y_coords

top_k_locations

100%|██████████| 10/10 [00:18<00:00,  1.82s/it]


Unnamed: 0,word,count,x,y
0,China,1484,104.999927,35.000074
1,U. S.,1289,-100.445882,39.78373
2,Taiwan,1252,120.982018,23.973937
3,Iraq,605,44.174977,33.095579
4,Japan,595,139.239418,36.574844
5,US,508,-100.445882,39.78373
6,Israel,446,34.859476,30.812425
7,United States,418,-100.445882,39.78373
8,New York,406,-74.006015,40.712728
9,Hong Kong,403,114.184916,22.350627


In [85]:

fmap = folium.Map(location=[0, 0], zoom_start=3)

places = []
place_to_xy = {}

for name,x,y in zip(top_k_locations.word, top_k_locations.x, top_k_locations.y):
    folium.Marker([y,x], popup=name, tooltip=name).add_to(fmap)
    places.append(name)
    place_to_xy[name] = (y, x)

fmap

In [86]:
topk_places = set(places)

edge_weight = {}

G = nx.Graph()

sub = outputs[['text', 'entity_group', 'word']]
sub['merged'] = [(entity, typ) for _, typ, entity in sub.values]

sub = pd.pivot_table(sub[['text', 'merged']], index='text', aggfunc=set)

for entity_set in tqdm(sub.merged[sub.merged.str.len()>1]):

    entity_list = [*entity_set]

    for i in range(len(entity_list)):
        for j in range(i+1, len(entity_list)):
            placei = entity_list[i][0]
            placej = entity_list[j][0]
            if placei in topk_places and placej in topk_places:
                G.add_edge(placei,
                           placej)
                edge_weight[(placei, placej)] = edge_weight.get((placei, placej), 0) + 1


len(G.edges)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sub['merged'] = [(entity, typ) for _, typ, entity in sub.values]
100%|██████████| 15903/15903 [00:00<00:00, 177074.97it/s]


43

In [87]:
fmap = folium.Map(location=[0, 0], zoom_start=3)

for name,x,y in zip(top_k_locations.word, top_k_locations.x, top_k_locations.y):
    folium.Marker([y,x], popup=name, tooltip=name,icon = folium.Icon(color="blue") ).add_to(fmap)

for i,j in G.edges:
    if (i,j) in edge_weight:
        folium.PolyLine(locations=[place_to_xy[j],
                                   place_to_xy[i]],
                        color="blue",
                        opacity=0.3,
                        tooltip=f'Side 1: {i}<br>Side 2: {j}<br>Connections: {edge_weight[(i,j)]}',
                        weight=edge_weight[(i,j)]**.5).add_to(fmap)

fmap

In [78]:
help(folium.PolyLine)

Help on class PolyLine in module folium.vector_layers:

class PolyLine(BaseMultiLocation)
 |  PolyLine(locations, popup=None, tooltip=None, **kwargs)
 |
 |  Draw polyline overlays on a map.
 |
 |  See :func:`folium.vector_layers.path_options` for the `Path` options.
 |
 |  Parameters
 |  ----------
 |  locations: list of points (latitude, longitude)
 |      Latitude and Longitude of line (Northing, Easting)
 |      Pass multiple sequences of coordinates for a multi-polyline.
 |  popup: str or folium.Popup, default None
 |      Input text or visualization for object displayed when clicking.
 |  tooltip: str or folium.Tooltip, default None
 |      Display a text when hovering over the object.
 |  smooth_factor: float, default 1.0
 |      How much to simplify the polyline on each zoom level.
 |      More means better performance and smoother look,
 |      and less means more accurate representation.
 |  no_clip: Bool, default False
 |      Disable polyline clipping.
 |  **kwargs
 |      O