## Recognizing Locations from extracted text

Different methods for identifying the locations that the RTS broadcasts are about.

In [None]:
%cd ../..
%load_ext autoreload

%autoreload 2

In [None]:
from emv.db.dao import DataAccessObject
from sqlalchemy.sql import text
import pandas as pd
import numpy as np
import json
import folium
from collections import Counter
import os
import matplotlib.pyplot as plt
import seaborn as sns
from ast import literal_eval

from emv.client.get_content import get_features
from emv.utils import dataframe_from_hdf5
from emv.features.wikidata import get_wikidata_id, get_property, get_wikidata_label
from emv.features.wikidata import process_batch
from emv.settings import DRIVE_PATH

# Load data with query

In [None]:
# Pull every row of the `feature` table whose type is 'transcript+ner'
# and wrap the fetched rows in a DataFrame.
query = text("""SELECT * FROM feature WHERE feature_type = 'transcript+ner';""")
df = pd.DataFrame(DataAccessObject().fetch_all(query))

In [None]:
# Collect the lower-cased text of every entity tagged 'LOC', over all rows.
loc_entities = []

for _, record in df.iterrows():
    try:
        entities = record['data']['entities']
        for entity in entities:
            # entity[0] is the entity text, entity[1] its label
            if entity[1] != 'LOC':
                continue
            loc_entities.append(entity[0].lower())
    except KeyError:
        # A KeyError anywhere in the row ends processing of that row.
        pass

In [None]:
# Count how many times each (lower-cased) location string occurs.
series = pd.Series(loc_entities).value_counts()

In [None]:
# Materialise the (location, count) pairs and print them for inspection.
all_locs = list(series.items())
print(all_locs)

## Match streets

In [None]:
# Gather the text ('t') of every transcript segment of every row.
streets = []
for _, record in df.iterrows():
    try:
        segments = record['data']['transcript']
        for segment in segments:
            streets.append(segment['t'])
    except (KeyError, TypeError):
        # Rows that raise a KeyError or TypeError are skipped from that
        # point on.
        pass

In [None]:
# Print every transcript segment whose text contains "rue de"
# (case-insensitive) as a quick manual check.
for _, record in df.iterrows():
    try:
        segments = record['data']['transcript']
        for segment in segments:
            segment_text = segment['t']
            if 'rue de' in segment_text.lower():
                print(segment_text)
    except (KeyError, TypeError):
        # Rows that raise a KeyError or TypeError are skipped from that
        # point on.
        pass

In [None]:
# Read the street list, one entry per line, stripping surrounding whitespace.
# The encoding is pinned so that accented names are decoded the same way on
# every platform instead of depending on the locale default.
# NOTE(review): assumes the file is stored as UTF-8 — confirm.
with open("data/geneva_streets.txt", "r", encoding="utf-8") as f:
    streets = [line.strip() for line in f]

In [None]:
def replace_street(sent):
    """Re-insert the space missing between a street type and its connector.

    Each rule replaces a glued form such as "ruede" with its spaced form
    "rue de". Rules are applied one after another, in the listed order, to
    every occurrence in the string. Matching is case-sensitive and all rules
    are lower-case.

    Args:
        sent: The string to correct.

    Returns:
        The corrected string. (The previous version returned the untouched
        input, so none of the replacements ever took effect.)
    """
    # NOTE(review): the last rule appends a space after "promenade des"; it
    # is kept exactly as written, but it may have been meant to be
    # "promenadedes" -> "promenade des" — confirm.
    replacements = (
        ("ruedes", "rue des"),
        ("cheminde", "chemin de"),
        ("placedes", "place des"),
        ("placede", "place de"),
        ("routedes", "route des"),
        ("avenuede", "avenue de"),
        ("avenuedu", "avenue du"),
        ("ruede", "rue de"),
        ("quaidu", "quai du"),
        ("placedu", "place du"),
        ("promenadedu", "promenade du"),
        ("ruedu", "rue du"),
        ("routede", "route de"),
        ("passagedes", "passage des"),
        ("chemindes", "chemin des"),
        ("squaredu", "square du"),
        ("passagede", "passage de"),
        ("promenade des", "promenade des "),
    )
    corrected_street = sent
    for glued, spaced in replacements:
        corrected_street = corrected_street.replace(glued, spaced)
    return corrected_street

In [None]:
import spacy
from spacy.matcher import Matcher

# French pipeline used to tokenise the street strings, plus a token matcher
# sharing its vocabulary.
nlp = spacy.load("fr_core_news_sm")
matcher = Matcher(nlp.vocab)

street_names = ["Rue", "Chemin", "Place", "Avenue", "Boulevard", "Quai", "Promenade", "Route", "Square"]
connectors = ["des", "de", "du", "la", "le", "les", "l'", "d'", "au", "aux"]
second_connectors = ["l'", "d'", "la"]

# ADDRESS: <street type> [connector] <one or more name tokens>
pattern = [
    {"TEXT": {"IN": street_names}},
    {"TEXT": {"IN": connectors}, "OP": "?"},
    {"TEXT": {"REGEX": "^[a-zA-Z'-]+$"}, "OP": "+"},
]
matcher.add("ADDRESS", [pattern])

# ADDRESS2: <street type> [connector] [second connector] <name tokens>,
# for names like Rue de l'Hôtel-de-Ville or Chemin de la Gravière
pattern2 = [
    {"TEXT": {"IN": street_names}},
    {"TEXT": {"IN": connectors}, "OP": "?"},
    {"TEXT": {"IN": second_connectors}, "OP": "?"},
    {"TEXT": {"REGEX": "^[a-zA-Z-ôèéê]+$"}, "OP": "+"},
]
matcher.add("ADDRESS2", [pattern2])

# Run the matcher over every street string and keep the matched span text.
matched_streets = []

for street in streets:
    sentence = street = replace_street(street)
    doc = nlp(sentence)
    matches = matcher(doc)

    # When several matches are returned, keep only the longest one
    # (the first such match wins on ties).
    if len(matches) > 1:
        matches = [max(matches, key=lambda m: m[2] - m[1])]

    matched_streets.extend(doc[start:end].text for _, start, end in matches)

matched_streets


In [None]:
# Re-run the pipeline and the matcher on `sentence`, which at this point holds
# the last string processed by the matching loop, for interactive inspection.
doc = nlp(sentence)
matches = matcher(doc)

# Load from API

In [None]:
# Fetch the 'transcript+ner' features through the client API.
# NOTE(review): max_features=None presumably removes the cap on the number
# of features returned — confirm against get_features.
features = get_features(feature_type='transcript+ner', max_features=None)

In [None]:
# Rebind `df` to the API result (replaces the frame loaded from the database).
df = pd.DataFrame(features)

In [None]:
# Load the RTS metadata table, move its index into a column and expose the
# media identifier under the name 'rts_id' used for joins.
metadata = (
    dataframe_from_hdf5(DRIVE_PATH + "rts/metadata", "rts_metadata")
    .reset_index()
    .rename(columns={"mediaId": "rts_id"})
)

In [None]:
# The RTS id is the second '-'-separated field of 'media_id'.
df["rts_id"] = df["media_id"].apply(lambda x: x.split("-")[1])
# Left join: every feature row is kept, with metadata columns attached
# where an 'rts_id' match exists.
df = df.merge(metadata, on='rts_id', how='left')

In [None]:
# Coverage check: summed mediaDuration over distinct videos, for the whole
# metadata table versus the videos present in df.
total_duration = metadata.drop_duplicates(subset="rts_id").mediaDuration.sum()
processed_duration = df.drop_duplicates(subset="rts_id").mediaDuration.sum()

# Report coverage both by duration and by number of distinct videos.
print(f"Processed {100 * processed_duration / total_duration:.2f}% of the total duration")
print(f"Processed {len(df.rts_id.unique())} videos out of {len(metadata.rts_id.unique())} - {len(df.rts_id.unique())/len(metadata.rts_id.unique())*100:.2f}%")

In [None]:
# Parse the publication date and keep its year in a separate column.
df["publishedDate"] = pd.to_datetime(df["publishedDate"])
df["year"] = df["publishedDate"].dt.year

In [None]:
# Horizontal bar chart of the number of videos per category.
plt.figure(figsize=(6, 4))
# Each video's categories are collapsed to a set and one element is taken.
# NOTE(review): if a video carries several categories, which one is picked
# is arbitrary — confirm that each video has a single category.
df.groupby("rts_id").categoryName.agg(set).map(lambda x: list(x)[0]).value_counts().plot(kind="barh")
plt.title("Number of videos per category")
plt.show()

In [None]:
# Histogram of how many rows (clips) each video contributes to df.
plt.figure(figsize=(6, 4))
df.rts_id.value_counts().hist(bins=100, grid = False)
plt.title("Number of clips extracted per video")
# The annotation is placed at the hard-coded position (100, 2200).
plt.text(100, 2200, f"{len(df)} clips extracted\nout of {len(df.rts_id.unique())} videos", fontdict={"size": 12, "weight": "bold"})
plt.show()

In [None]:
# Side-by-side histograms of the publication year:
# left, one value per video (mean of its rows' years); right, one per row.
fig, axs = plt.subplots(1, 2, figsize=(12, 4))
df.groupby("rts_id").year.mean().hist(bins=50, ax=axs[0], grid = False)
axs[0].set_title("Distribution of the year of publication of the videos")
df.year.hist(bins=50, ax=axs[1], grid = False)
axs[1].set_title("Distribution of the year of publication of the clips")
plt.show()

## Extracting entities

In [None]:
def get_entities(data):
    """Return the list of entities stored in one feature's `data` payload.

    Two layouts are supported:
      * a top-level "entities" list, returned as is;
      * a "transcript" list of segments, each optionally carrying its own
        "entities" list; these are concatenated in segment order.

    Args:
        data: The payload mapping, or None.

    Returns:
        A list of entities. Empty when `data` is None or empty, or when the
        relevant key is missing or holds None.
    """
    if not data:
        return []

    # A top-level "entities" key takes precedence over per-segment entities.
    if "entities" in data:
        return data["entities"] or []

    # `or []` also covers keys that are present but set to None.
    segments = data.get("transcript") or []
    return [
        entity
        for segment in segments
        for entity in (segment.get("entities") or [])
    ]

In [None]:
# One list of entities per row, extracted from the row's `data` payload.
df["entities"] = df["data"].apply(get_entities)

In [None]:
# Distinct entity labels (second element of each entity) seen in the data.
entity_types = list({entity[1] for entities in df.entities for entity in entities})
entity_types

In [None]:
def _entity_texts(entities, label, min_chars):
    """Texts of the entities tagged `label` that are longer than `min_chars` characters."""
    return [e[0] for e in entities if e[1] == label and len(e[0]) > min_chars]


# One column per entity label; ORG keeps texts longer than 1 character,
# the other labels require more than 2.
for column, label, min_chars in (
    ("locations", "LOC", 2),
    ("people", "PER", 2),
    ("orgs", "ORG", 1),
    ("misc", "MISC", 2),
):
    df[column] = df["entities"].apply(_entity_texts, args=(label, min_chars))

In [None]:
# Keep only the identifier, metadata and per-label entity columns.
keep_columns = [
    "media_id", "rts_id", "year", "categoryName", "contentType", "title",
    "resume", "mediaDuration", "locations", "people", "orgs", "misc",
]
df = df[keep_columns]

In [None]:
# Save the reduced frame as a tab-separated file, without the index.
# NOTE(review): the path is relative to the working directory, which the first
# cell changes with `%cd ../..` — confirm that "../data" is the intended target.
df.to_csv("../data/rts_transcript_ner.csv", index=False, sep = "\t")

# Load presaved data

In [None]:
# Reload the saved frame; the entity columns were written as the text form
# of Python lists, so they are parsed back with literal_eval.
list_columns = ("locations", "people", "orgs", "misc")
df = pd.read_csv(
    "../data/rts_transcript_ner.csv",
    sep="\t",
    converters=dict.fromkeys(list_columns, literal_eval),
)

In [None]:
# Coverage of the reloaded data. `metadata` is not loaded in this section;
# it has to exist already (it is built in the "Load from API" cells).
total_duration = metadata.drop_duplicates(subset="rts_id").mediaDuration.sum()
processed_duration = df.drop_duplicates(subset="rts_id").mediaDuration.sum()
# NOTE(review): dividing by 3600 assumes mediaDuration is in seconds — confirm.
print("Hours processed:", processed_duration / 3600)
print(f"Processed {100 * processed_duration / total_duration :.03f}% of the total duration")

# Locations

## Map of Switzerland

In [None]:
# Occurrences of each location string over all rows.
locations = df["locations"].explode().value_counts()
# NOTE(review): which column ends up renamed to "location" depends on the
# column names produced by value_counts().reset_index(), which changed in
# pandas 2.0 — confirm. This frame is rebuilt from scratch in the next cell.
locations = pd.DataFrame(locations).reset_index().rename(columns={"locations":"location"})
locations[:10]

In [None]:
# One row per location: total number of mentions in "count" and a Counter
# of mentions per year in "year", sorted by decreasing count.
location_years = df[["locations", "year"]].explode("locations").dropna()
locations = location_years.groupby("locations").agg(list).reset_index()
locations["count"] = locations["year"].map(len)
locations["year"] = locations["year"].map(Counter)
locations = locations.sort_values("count", ascending=False)
locations.head(10)

In [None]:
# Load the city table shipped with the package.
# NOTE(review): this path is relative to the working directory, which the
# first cell changes with `%cd ../..`; the street file is read from "data/..."
# without the "../" prefix — confirm which base directory is intended.
with open("../emv/features/cities.json", "r") as f:
    cities = json.load(f)

In [None]:
# Turn the {name: coordinates} mapping into a frame; entries whose
# coordinate list does not have exactly two values are dropped.
# NOTE(review): the first value is stored as "lon" and the second as "lat" —
# confirm this matches the order used in cities.json.
city_rows = [
    {"locations": name, "lon": float(coords[0]), "lat": float(coords[1])}
    for name, coords in cities.items()
    if len(coords) == 2
]
cities = pd.DataFrame(city_rows)

In [None]:
# Preview the first rows of the city table.
cities.head()

In [None]:
# Attach coordinates to the mentioned locations (exact match on the name)
# and keep only those for which both coordinates were found.
found_cities = pd.merge(locations, cities, on="locations", how="left").dropna(subset = ["lat", "lon"])
found_cities

In [None]:
# Base map centred at [46.8182, 8.2275], with one circle per matched city.
m = folium.Map(location=[46.8182, 8.2275], zoom_start=8)
size_multiplier = 1

for _, city in found_cities.iterrows():
    # Radius grows with the square root of the count, so the circle's area
    # is proportional to the number of mentions.
    marker_radius = np.sqrt(city['count'] / np.pi) * size_multiplier
    marker_label = city['locations'] + ': ' + str(city['count']) + ' occurrences'
    # NOTE(review): folium expects location as (latitude, longitude), but the
    # tuple here is (lon, lat) — confirm whether the "lon"/"lat" columns are
    # themselves swapped when the city table is built.
    marker = folium.CircleMarker(
        location=(city['lon'], city['lat']),
        radius=marker_radius,
        color='blue',
        fill=True,
        fill_color='blue',
        fill_opacity=0.6,
        tooltip=marker_label,
    )
    marker.add_to(m)

m

## Evolution over time

In [None]:
# Total mentions of matched cities per year.
# Summing the "year" column presumably adds the per-location Counters into a
# single Counter keyed by year — confirm.
n_mentions_per_year = found_cities.year.sum()
n_mentions_per_year = pd.DataFrame(n_mentions_per_year.items(), columns=["year", "count"])
n_mentions_per_year = n_mentions_per_year.sort_values("year")

# Line plot of the yearly totals.
plt.figure(figsize=(6, 4))
sns.lineplot(data=n_mentions_per_year, x="year", y="count")
plt.title("Number of mentions of locations in Switzerland in the RTS videos")
plt.show()

In [None]:
# {year: total mentions of matched cities in that year}
n_mentions_per_year_dict = n_mentions_per_year.set_index("year")["count"].to_dict()


def _share_of_year(yearly_counts):
    """Divide each yearly count of one location by that year's total."""
    return {
        year: count / n_mentions_per_year_dict[year]
        for year, count in yearly_counts.items()
    }


found_cities["relative_counts"] = found_cities.year.map(_share_of_year)

In [None]:
# Horizontal bar chart of the mention counts of the first 20 rows of
# found_cities, re-sorted by count for display.
plt.figure(figsize=(6, 4))
found_cities[:20].sort_values("count").set_index("locations")["count"].plot(kind="barh")
plt.xlabel("Number of mentions")
plt.title("Top 20 locations mentioned in the RTS videos")
plt.show()

In [None]:
# Yearly mention counts for `top_N` rows of found_cities, after skipping the
# first `skip_first_n` rows.
top_N = 20
skip_first_n = 1
top_cities = found_cities.iloc[skip_first_n:skip_first_n + top_N]

# Rows are locations, columns are years (ordered); missing years count as 0.
yearly_counts = pd.DataFrame(list(top_cities["year"]), index=top_cities["locations"]).fillna(0)
counts_per_year = yearly_counts.sort_index(axis=1)

# Heatmap: location x year
plt.figure(figsize=(12, 6))
sns.heatmap(counts_per_year, cmap="Blues", cbar_kws={'label': 'Number of mentions'})
plt.title("Number of mentions of locations in Switzerland in the RTS videos")
plt.xticks(rotation=45)
plt.show()

# Same data as one line per location
plt.figure(figsize=(12, 6))
for location_name, yearly in counts_per_year.iterrows():
    plt.plot(yearly.index, yearly.values, label=location_name)
plt.legend()
plt.title(f"Number of mentions of the top {len(counts_per_year)} locations in Switzerland in the RTS videos")
plt.show()

In [None]:
# Relative variant of the yearly plots: uses `relative_counts` (a location's
# count in a year divided by that year's total over matched cities) instead
# of raw counts. The labels and titles now say so; they previously read
# "Number of mentions" although normalised shares are plotted.
top_N = 20
skip_first_n = 1  # skip the first row of found_cities
top_cities = found_cities[skip_first_n:top_N + skip_first_n]
counts_per_year = pd.DataFrame(top_cities.relative_counts.tolist(), index=top_cities.locations).fillna(0)
# Order the year columns.
counts_per_year = counts_per_year.T.sort_index().T

# Normalise every location's row so that it sums to 1.
counts_per_year = counts_per_year.div(counts_per_year.sum(axis=1), axis=0)

plt.figure(figsize=(12, 6))
sns.heatmap(counts_per_year, cmap="Blues", cbar_kws={'label': 'Normalised share of yearly mentions'})
plt.title("Relative share of mentions of locations in Switzerland in the RTS videos")
plt.xticks(rotation=45)
plt.show()

plt.figure(figsize=(12, 6))
for loc, mentions in counts_per_year.iterrows():
    plt.plot(mentions.index, mentions.values, label=loc)
plt.legend()
plt.title(f"Relative share of mentions of the top {len(counts_per_year)} locations in Switzerland in the RTS videos")
plt.show()

# People

In [None]:
# Honorifics that were tagged as persons but are not names.
filter_people = ["messieurs", "monsieur", "madame", "mesdames"]


def _without_honorifics(names):
    """Drop the entries whose lower-cased form is one of `filter_people`."""
    return [name for name in names if name.lower() not in filter_people]


df["people"] = df["people"].apply(_without_honorifics)

In [None]:
# One row per person string: total number of mentions in "count" and a
# Counter of mentions per year in "year", sorted by decreasing count.
person_years = df[["people", "year"]].explode("people").dropna()
persons = person_years.groupby("people").agg(list).reset_index()
persons["count"] = persons["year"].map(len)
persons["year"] = persons["year"].map(Counter)
persons = persons.sort_values("count", ascending=False)

In [None]:
# Work on the first 1000 rows of `persons`. An explicit copy is taken because
# columns are assigned to `top_persons` afterwards; assigning to the slice
# returned by head() triggers pandas' SettingWithCopyWarning.
top_persons = persons.head(1000).copy()
top_persons.head(10)

In [None]:
# Look up each person string with get_wikidata_id and keep the raw result.
# NOTE(review): presumably one remote request per row — confirm.
top_persons["wikidata_search"] = top_persons["people"].apply(get_wikidata_id)

In [None]:
# Keep only the rows whose search returned at least one result. The copy makes
# the filtered frame independent, so that columns assigned to it afterwards do
# not trigger pandas' SettingWithCopyWarning.
top_persons = top_persons[top_persons["wikidata_search"].map(lambda x: len(x) > 0)].copy()

In [None]:
def _first_hit_field(hits, field, fallback):
    """Return `field` of the first search hit.

    Gives `fallback` when the first hit lacks the field, and None when there
    are no hits at all.
    """
    if len(hits) == 0:
        return None
    return hits[0].get(field, fallback)


# Unpack id, label and description of the first search hit into columns.
for column, field, fallback in (
    ("wikidata_id", "id", "MISSING_ID"),
    ("wikidata_label", "label", "MISSING_LABEL"),
    ("wikidata_description", "description", "MISSING_DESCRIPTION"),
):
    top_persons[column] = top_persons["wikidata_search"].apply(_first_hit_field, args=(field, fallback))

In [None]:
# Fetch property P31 (Wikidata "instance of") for every known id, in batches
# of 20, then map the result back onto the rows by id.
instance_of = process_batch(top_persons["wikidata_id"].dropna().tolist(), "P31", BATCH_SIZE=20)
top_persons["instance_of"] = top_persons["wikidata_id"].map(instance_of)

In [None]:
# Resolve the label of every distinct "instance of" value; the Wikidata id
# passed to the lookup is the last '/'-separated part of the value.
instance_uris = top_persons["instance_of"].dropna().unique().tolist()
instances = {}
for uri in instance_uris:
    instances[uri] = get_wikidata_label(uri.split("/")[-1])

In [None]:
def _english_label(label):
    """English value of a label mapping; "MISSING" when absent or unresolved."""
    if label == "MISSING":
        return "MISSING"
    return label.get("en", {}).get("value", "MISSING")


# Replace each "instance of" value by its label mapping, then reduce that
# mapping to the English label.
top_persons["instance_of"] = top_persons["instance_of"].map(instances)
top_persons["instance_of"] = top_persons["instance_of"].fillna("MISSING").map(_english_label)

In [None]:
# How many rows fall under each "instance of" label.
top_persons["instance_of"].value_counts()

## Focus on humans (instance of Q5)

In [None]:
# Drop the rows lacking a Wikidata id or an "instance of" value.
top_persons = top_persons.dropna(subset=["wikidata_id", "instance_of"])

In [None]:
# Keep only the entries whose "instance of" label is "human". The copy makes
# the filtered frame independent, so that columns assigned to it afterwards do
# not trigger pandas' SettingWithCopyWarning.
top_persons = top_persons[top_persons.instance_of == "human"].copy()

In [None]:
# (rows, columns) remaining after the filter.
top_persons.shape

In [None]:
ids = top_persons["wikidata_id"].tolist()

# Fetch two Wikidata properties for every id, in batches of 20:
# P27 (country of citizenship) and P106 (occupation).
citizenship = process_batch(ids, "P27", BATCH_SIZE=20)
occupation = process_batch(ids, "P106", BATCH_SIZE=20)

# Map both results back onto the rows by id.
top_persons["citizenship"] = top_persons["wikidata_id"].map(citizenship)
top_persons["occupation"] = top_persons["wikidata_id"].map(occupation)

In [None]:
def _fetch_labels(uris):
    """Map each value to the label looked up for its last '/'-separated part."""
    return {uri: get_wikidata_label(uri.split("/")[-1]) for uri in uris}


# Resolve labels once per distinct value, then substitute them in the column.
citizenship_labels = _fetch_labels(top_persons["citizenship"].dropna().unique())
top_persons["citizenship"] = top_persons["citizenship"].map(citizenship_labels)

occupation_labels = _fetch_labels(top_persons["occupation"].dropna().unique())
top_persons["occupation"] = top_persons["occupation"].map(occupation_labels)

In [None]:
def _english_value(label):
    """English value of a label mapping; "MISSING" when absent or unresolved."""
    if label == "MISSING":
        return "MISSING"
    return label.get("en", {}).get("value", "MISSING")


# Reduce both label columns to their English label.
for column in ("citizenship", "occupation"):
    top_persons[column] = top_persons[column].fillna("MISSING").map(_english_value)

In [None]:
# Two bar charts side by side: the 20 most frequent occupations (left) and
# citizenships (right) among the remaining persons.
fig, axs = plt.subplots(1, 2, figsize=(12, 4))
top_persons.occupation.value_counts()[:20].plot(kind="barh", ax=axs[0])
axs[0].set_title("Occupations of most occurring persons")
top_persons.citizenship.value_counts()[:20].plot(kind="barh", ax=axs[1])
axs[1].set_title("Citizenships of most occurring persons")
plt.tight_layout()
plt.show()

In [None]:
# Load the list of videos from a CSV file.
# NOTE(review): absolute, machine-specific path; the metadata cell builds its
# path from DRIVE_PATH — confirm whether this one should too.
videos = pd.read_csv("/mnt/g/rts/aibox-vectors/videos.csv")

In [None]:
# Expose the 'umid' column under the join key name 'rts_id'.
videos = videos.rename(columns={"umid": "rts_id"})

In [None]:
# Preview the videos joined with their metadata. The result is only displayed;
# it is not assigned, so `videos` itself is unchanged.
videos.merge(metadata, on="rts_id", how="left")

In [None]:
# Ids of the videos listed in `videos`, used below as the reference sample.
representative_sample = videos.rts_id.tolist()
print(len(representative_sample))

In [None]:
# Number of distinct df videos that belong to the sample, divided by the
# length of the sample list.
len(df[df.rts_id.isin(representative_sample)].rts_id.unique().tolist()) / len(representative_sample)