## Recognizing Locations from extracted text

Different methods for identifying the locations that the RTS broadcasts are about.

In [None]:
# ! pip install folium

In [1]:
from emv.db.dao import DataAccessObject
from sqlalchemy.sql import text
import pandas as pd
import numpy as np
import json
import folium

from emv.client.get_content import get_features

# Load data with query

In [None]:
# Pull every 'transcript+ner' row from the feature table and wrap the result in a DataFrame.
query = text("""SELECT * FROM feature WHERE feature_type = 'transcript+ner';""")
feature_rows = DataAccessObject().fetch_all(query)
df = pd.DataFrame(feature_rows)

In [6]:
# Collect the lower-cased text of every entity labelled 'LOC' across all rows.
loc_entities = []

for _, record in df.iterrows():
    try:
        for entity in record['data']['entities']:
            # Each entity is indexed as (text, label); anything that is not a location is ignored.
            if entity[1] != 'LOC':
                continue
            loc_entities.append(entity[0].lower())
    except KeyError:
        # A missing key (e.g. no 'entities' in the row's data) ends processing of that row.
        continue

In [7]:
# Frequency table of the collected location strings: index = location text, value = number of occurrences.
series = pd.Series(loc_entities).value_counts()

In [None]:
# Materialise the (location, count) pairs of the frequency table as a list and print it.
all_locs = list(series.items())
print(all_locs)

## Match streets

In [None]:
# Gather the text ('t') of every transcript segment across all rows.
streets = []
for _, record in df.iterrows():
    try:
        streets.extend(segment['t'] for segment in record['data']['transcript'])
    except (KeyError, TypeError):
        # Rows whose data has no usable transcript are skipped from the point of failure.
        continue

In [None]:
# Print every transcript segment whose text contains "rue de" (case-insensitive).
for _, record in df.iterrows():
    try:
        for segment in record['data']['transcript']:
            segment_text = segment['t']
            if 'rue de' in segment_text.lower():
                print(segment_text)
    except (KeyError, TypeError):
        # Rows whose data has no usable transcript are skipped from the point of failure.
        continue

In [None]:
# Read the street list, one entry per line, trimming surrounding whitespace.
# Note: this rebinds `streets`, replacing any value assigned to it earlier in the notebook.
with open("data/geneva_streets.txt", "r") as street_file:
    streets = [line.strip() for line in street_file]

In [None]:
# Ordered (fused, spaced) substitutions applied by replace_street.
# Order is preserved from the original sequence of replace() calls: the longer
# "...des" forms are listed before the "...de" forms they contain.
_FUSED_STREET_PREFIXES = (
    ("ruedes", "rue des"),
    ("cheminde", "chemin de"),
    ("placedes", "place des"),
    ("placede", "place de"),
    ("routedes", "route des"),
    ("avenuede", "avenue de"),
    ("avenuedu", "avenue du"),
    ("ruede", "rue de"),
    ("quaidu", "quai du"),
    ("placedu", "place du"),
    ("promenadedu", "promenade du"),
    ("ruedu", "rue du"),
    ("routede", "route de"),
    ("passagedes", "passage des"),
    ("chemindes", "chemin des"),
    ("squaredu", "square du"),
    ("passagede", "passage de"),
    # NOTE(review): this rule appends a space after "promenade des"; if the input
    # already has one it yields a double space. Possibly "promenadedes" was
    # intended — confirm against the street data.
    ("promenade des", "promenade des "),
)


def replace_street(sent):
    """Insert the missing space in fused street prefixes such as "ruedes" -> "rue des".

    Args:
        sent: Text that may contain lower-case street-type words glued to the
            following connector (e.g. "cheminde", "quaidu").

    Returns:
        The text with every substitution from the table applied, in order.
        Matching is case-sensitive; text without any fused prefix is returned
        unchanged.
    """
    corrected_street = sent
    for fused, spaced in _FUSED_STREET_PREFIXES:
        corrected_street = corrected_street.replace(fused, spaced)
    # The original returned `sent`, discarding every replacement above.
    return corrected_street

In [None]:
import spacy
from spacy.matcher import Matcher

# French pipeline used to tokenise the street strings; the Matcher shares its vocabulary.
nlp = spacy.load("fr_core_news_sm")
matcher = Matcher(nlp.vocab)

# Word lists the token patterns are built from (matched on exact token text).
street_names = ["Rue", "Chemin", "Place", "Avenue", "Boulevard", "Quai", "Promenade", "Route", "Square"]
connectors = ["des", "de", "du", "la", "le", "les", "l'", "d'", "au", "aux"]
second_connectors = ["l'", "d'", "la"]

# Token specs common to both patterns: a street-type word, then an optional connector.
street_type_token = {"TEXT": {"IN": street_names}}
optional_connector_token = {"TEXT": {"IN": connectors}, "OP": "?"}

# ADDRESS: street type, optional connector, then one or more tokens matching the regex below.
pattern = [
    street_type_token,
    optional_connector_token,
    {"TEXT": {"REGEX": "^[a-zA-Z'-]+$"}, "OP": "+"},
]
matcher.add("ADDRESS", [pattern])

# ADDRESS2: additionally allows a second connector, to match things like
# Rue de l'Hôtel-de-Ville or Chemin de la Gravière.
pattern2 = [
    street_type_token,
    optional_connector_token,
    {"TEXT": {"IN": second_connectors}, "OP": "?"},
    {"TEXT": {"REGEX": "^[a-zA-Z-ôèéê]+$"}, "OP": "+"},
]
matcher.add("ADDRESS2", [pattern2])

# Run the matcher over every street string and keep at most one match per string.
matched_streets = []

for raw_street in streets:
    # `sentence`, `doc` and `matches` are deliberately module-level names: the next cell reuses them.
    sentence = replace_street(raw_street)
    doc = nlp(sentence)
    matches = matcher(doc)

    # When several matches are returned, only the longest span (end - start) is kept;
    # on ties the earliest one in the matcher's output wins.
    if len(matches) > 1:
        matches = [max(matches, key=lambda m: m[2] - m[1])]

    matched_streets.extend(doc[start:end].text for _, start, end in matches)

matched_streets


In [None]:
# Re-run the pipeline and the matcher on `sentence`, i.e. whatever value the
# processing loop in the previous cell last assigned to it (hidden state).
doc = nlp(sentence)
matches = matcher(doc)

# Load from API

In [12]:
# Fetch the 'transcript+ner' features through the emv client helper.
# NOTE(review): max_features=None presumably means "no limit" — confirm against get_features.
features = get_features(feature_type='transcript+ner', max_features=None)



[]
Retrieved 77520 features.




In [13]:
# Rebinds `df`: from here on it holds the features returned by get_features,
# replacing the frame built from the SQL query earlier in the notebook.
df = pd.DataFrame(features)

In [28]:
# Number of distinct media_id values (missing values, if any, count as one distinct value).
df["media_id"].nunique(dropna=False)

77520

In [14]:
def get_entities(data):
    """Return the list of entities stored in a feature's ``data`` payload.

    Two layouts are supported:
      * a top-level ``"entities"`` key, whose value is returned as is;
      * otherwise, an ``"entities"`` list on each item of ``"transcript"``,
        which are concatenated in transcript order.

    Args:
        data: Mapping holding the feature payload. ``None`` or an empty
            mapping yields an empty list.

    Returns:
        A list of entities; empty when none are present. ``None`` values for
        ``"entities"`` or ``"transcript"`` are treated as "no entities" instead
        of raising.
    """
    if not data:
        return []

    # A top-level key takes precedence, even when its list is empty.
    if "entities" in data:
        return data["entities"] or []

    flattened = []
    for segment in data.get("transcript") or []:
        flattened.extend(segment.get("entities") or [])
    return flattened

In [15]:
# Add an "entities" column holding, for each row, the result of get_entities on that row's "data" value.
df["entities"] = df["data"].apply(get_entities)

In [16]:
def _location_names(entities):
    """Return the text of every entity labelled "LOC" whose text is longer than two characters."""
    names = []
    for entity in entities:
        # Entities are indexed as (text, label).
        if entity[1] == "LOC" and len(entity[0]) > 2:
            names.append(entity[0])
    return names


df["locations"] = df["entities"].apply(_location_names)

In [17]:
# Count how often each location string occurs across all rows.
location_counts = df["locations"].explode().value_counts()

# Turn the counts into a table and rename the "locations" column to "location".
locations = (
    pd.DataFrame(location_counts)
    .reset_index()
    .rename(columns={"locations": "location"})
)
locations.head(10)

Unnamed: 0,location,count
0,Suisse,11233
1,Genève,4145
2,Lausanne,1819
3,État,1669
4,Est,1591
5,Valais,1489
6,Berne,1474
7,Europe,1469
8,Suisses,1437
9,Zurich,1189


In [18]:
# Load the city lookup from the project's JSON file.
with open("../emv/features/cities.json", "r") as cities_file:
    cities = json.load(cities_file)

In [19]:
# Build one record per city that has exactly two coordinate values; other entries are dropped.
# NOTE(review): the merged output further down shows values around 46–47 in "lon" and
# 6–9 in "lat", which for Swiss cities suggests the labels are swapped (v[0] is probably
# the latitude). The map cell passes (row['lon'], row['lat']) as the marker location, so
# the two swaps cancel out — rename both places together if this is fixed.
city_records = []
for name, coords in cities.items():
    if len(coords) != 2:
        continue
    city_records.append({"location": name, "lon": float(coords[0]), "lat": float(coords[1])})

# Rebinds `cities` from the loaded mapping to a DataFrame; re-running this cell alone will not work.
cities = pd.DataFrame(city_records)

In [20]:
# Left-join the coordinates onto the location counts, then keep only the rows
# for which both coordinates were found.
locations_with_coords = locations.merge(cities, on="location", how="left")
found_cities = locations_with_coords.dropna(subset=["lat", "lon"])
found_cities

Unnamed: 0,location,count,lon,lat
1,Genève,4145,46.200000,6.150000
2,Lausanne,1819,46.533333,6.633333
6,Berne,1474,46.947980,7.447430
9,Zurich,1189,47.374444,8.541111
11,Neuchâtel,1149,46.990278,6.930556
...,...,...,...,...
16708,Langendorf,1,47.220556,7.514444
16907,Rorschach,1,47.478611,9.493611
16935,Oberhof,1,47.451111,8.006944
16993,Cormondes,1,46.900000,7.166667


In [26]:
# Base map centred on Switzerland.
m = folium.Map(location=[46.8182, 8.2275], zoom_start=8)
size_multiplier = 1

# One circle per matched city; the radius is sqrt(count / pi) so that the circle's
# area, rather than its radius, grows linearly with the number of occurrences.
for _, city in found_cities.iterrows():
    occurrences = city['count']
    marker = folium.CircleMarker(
        # NOTE(review): the 'lon' column is passed first; judging by the displayed
        # values (~46–47) it actually holds latitudes, so the order works only because
        # the column labels look swapped where `cities` is built — confirm.
        location=(city['lon'], city['lat']),
        radius=np.sqrt(occurrences / np.pi) * size_multiplier,
        color='blue',
        fill=True,
        fill_color='blue',
        fill_opacity=0.6,
        tooltip=f"{city['location']}: {occurrences!s} occurrences",
    )
    marker.add_to(m)

m