In [None]:
# Move two directories up (presumably to the repository root — confirm) so that the
# `emv` imports and the relative "data/..." and "emv/features/..." paths used below resolve.
%cd ../..
# Enable the autoreload extension; mode 2 is selected just below.
%load_ext autoreload

%autoreload 2

In [None]:
import pandas as pd
import numpy as np
import json
import folium
from collections import Counter
import os
import matplotlib.pyplot as plt
import seaborn as sns
from ast import literal_eval

from emv.client.get_content import get_features
from emv.utils import dataframe_from_hdf5
from emv.settings import DRIVE_PATH

from emv.features.wikidata import get_wikidata_id, get_property, get_wikidata_label
from emv.features.wikidata import process_batch

# Load from API

In [None]:
# Pull the 'transcript+ner' features through the project API client.
# max_features=None presumably lifts any cap on the number returned — confirm in get_features.
features = get_features(feature_type='transcript+ner', max_features=None)

In [None]:
# Tabulate the fetched features; later cells read its "media_id" and "data" columns.
df = pd.DataFrame(features)

In [None]:
# Load the RTS metadata table, turn its index into a regular column and expose
# "mediaId" as "rts_id", the key used to join it onto the features.
metadata = (
    dataframe_from_hdf5(DRIVE_PATH + "rts/metadata", "rts_metadata")
    .reset_index()
    .rename(columns={"mediaId": "rts_id"})
)

In [None]:
# The RTS id is the second "-"-separated field of media_id
# (assumes every media_id contains at least one hyphen — TODO confirm).
rts_ids = df["media_id"].map(lambda media_id: media_id.split("-")[1])
df = df.assign(rts_id=rts_ids)
# Left join: features without matching metadata are kept, with empty metadata columns.
df = pd.merge(df, metadata, how="left", on="rts_id")

In [None]:
# Restrict the features to the videos listed in rts/aibox-vectors/videos.csv and report coverage.
# NOTE(review): rts_id is split out of a string; if `umid` is parsed as a numeric column the
# isin() match below would silently be empty — confirm the dtypes agree.
sample_rts = pd.read_csv(DRIVE_PATH + "rts/aibox-vectors/videos.csv")
sample_rts_ids = sample_rts.umid.tolist()
in_sample = df.rts_id.isin(sample_rts.umid)
df = df[in_sample].reset_index(drop=True)
n_processed = len(df.rts_id.unique())
n_sample = len(sample_rts_ids)
print(f"Processed {n_processed} videos out of {n_sample} - {n_processed/n_sample*100:.2f}%")

In [None]:
# Parse the publication date once and derive the publication year from it.
published = pd.to_datetime(df["publishedDate"])
df["publishedDate"] = published
df["year"] = published.dt.year

## Extracting entities

In [None]:
def get_entities(data):
    """Return the flat list of named entities stored in one feature payload.

    Two layouts are supported:
      * a top-level "entities" key holding the entity list directly;
      * a "transcript" list whose segments each carry their own "entities"
        list; these are concatenated in segment order.

    Args:
        data: dict-like feature payload (the `data` field of one feature row).

    Returns:
        list: the entities, returned untouched, or an empty list when the
        payload has none.
    """
    if "entities" in data:
        # Top-level entities take precedence over per-segment ones.
        top_level = data["entities"]
        return [] if top_level is None else top_level

    # A missing or null transcript simply yields no entities.
    segments = data.get("transcript") or []
    return [
        entity
        for segment in segments
        # Skip segments that cannot be queried (e.g. raw strings) instead of failing on `.get`.
        if hasattr(segment, "get")
        for entity in (segment.get("entities") or [])
    ]

In [None]:
# Flatten each row's payload into a single list of entities (see get_entities).
df["entities"] = df["data"].apply(get_entities)

In [None]:
# Distinct entity tags in the corpus; the tag is the second field of every entity.
entity_types = list({entity[1] for entities in df.entities for entity in entities})
entity_types

In [None]:
# (target column, entity tag, longest surface form that is still discarded).
# Organisations keep 2-character names; every other type needs at least 3 characters.
_ENTITY_COLUMNS = (
    ("locations", "LOC", 2),
    ("people", "PER", 2),
    ("orgs", "ORG", 1),
    ("misc", "MISC", 2),
)
for _column, _tag, _too_short in _ENTITY_COLUMNS:
    # Defaults bind the loop values to each lambda at definition time.
    df[_column] = df["entities"].apply(
        lambda ents, tag=_tag, too_short=_too_short: [
            ent[0] for ent in ents if ent[1] == tag and len(ent[0]) > too_short
        ]
    )

In [None]:
# Keep only the columns used below; this drops the intermediate "entities" column,
# "publishedDate" and every other merged metadata column not listed here.
df = df[["media_id", "rts_id", "year", "data", "categoryName", "contentType", "title", "resume", "mediaDuration", "locations", "people", "orgs", "misc"]]

In [None]:
# Cache the processed sample as a tab-separated file so it can be reloaded
# by the "Load presaved data" cell without repeating the steps above.
# to_csv does not create missing parent directories, so make sure "data/" exists first.
os.makedirs("data", exist_ok=True)
df.to_csv("data/rts_sample.csv", index=False, sep = "\t")

# Load presaved data

In [None]:
# Columns that were written as Python literals (dicts / lists) and must be parsed back.
_LITERAL_COLUMNS = ("data", "locations", "people", "orgs", "misc")
df = pd.read_csv(
    "data/rts_sample.csv",
    sep="\t",
    converters={column: literal_eval for column in _LITERAL_COLUMNS},
)

In [None]:
plt.figure(figsize=(6, 4))
# One category per video: the first non-null category seen among its clips.
# Picking an element out of a set is not reproducible between runs when a video
# carries several categories, so the choice is made deterministic here.
video_categories = df.groupby("rts_id").categoryName.first()
video_categories.value_counts().plot(kind="barh")
plt.title("Number of videos per category")
plt.show()

In [None]:
plt.figure(figsize=(6, 4))
# Each row of df is one clip, so the per-rts_id row count is the number of clips per video.
clips_per_video = df.rts_id.value_counts()
clips_per_video.hist(bins=100, grid=False)
plt.title("Number of clips extracted per video")
# The summary text is anchored at fixed data coordinates (x=100, y=1200).
annotation = f"{len(df)} clips extracted\nout of {len(df.rts_id.unique())} videos"
plt.text(100, 1200, annotation, fontdict={"size": 12, "weight": "bold"})
plt.show()

In [None]:
fig, axs = plt.subplots(1, 2, figsize=(12, 4))
ax_videos, ax_clips = axs

# Left: one value per video (mean year over its clips). Right: one value per clip.
year_per_video = df.groupby("rts_id").year.mean()
year_per_video.hist(bins=50, ax=ax_videos, grid=False)
ax_videos.set_title("Distribution of the year of publication of the videos")

df.year.hist(bins=50, ax=ax_clips, grid=False)
ax_clips.set_title("Distribution of the year of publication of the clips")
plt.show()

# Locations

## Map of Switzerland

In [None]:
# Mention count per location name, most frequent first.
# Naming the index and the count column explicitly keeps the result
# ("location", "count") regardless of how the installed pandas version names
# the output of value_counts().
locations = (
    df["locations"]
    .explode()
    .value_counts()
    .rename_axis("location")
    .reset_index(name="count")
)
locations.head(10)

In [None]:
# One row per (location mention, year), then one row per location holding the list of years.
location_mentions = df[["locations", "year"]].explode("locations").dropna()
locations = location_mentions.groupby("locations").agg(list).reset_index()
# Total mentions, and the same mentions broken down per year.
locations["count"] = locations["year"].map(len)
locations["year"] = locations["year"].map(lambda years: Counter(years))
locations = locations.sort_values("count", ascending=False)
locations.head(10)

In [None]:
# Read the city -> coordinates lookup shipped with the project.
# The encoding is pinned so that accented place names decode identically on every
# platform instead of depending on the locale's default encoding.
with open("emv/features/cities.json", "r", encoding="utf-8") as f:
    cities = json.load(f)

In [None]:
# Turn the {name: [first, second]} lookup into a table, skipping entries that do not
# hold exactly two coordinates.
# NOTE(review): `cities` is rebound from dict to DataFrame here, so this cell cannot be
# re-run without reloading the JSON first.
cities = pd.DataFrame(
    [
        {"locations": name, "lon": float(coords[0]), "lat": float(coords[1])}
        for name, coords in cities.items()
        if len(coords) == 2
    ]
)

In [None]:
# Preview the parsed city coordinates.
cities.head()

In [None]:
# Attach coordinates to the mentioned locations and keep only those found in the city lookup.
locations_with_coords = locations.merge(cities, how="left", on="locations")
found_cities = locations_with_coords.dropna(subset=["lat", "lon"])
found_cities

In [None]:
# Create a base map centered around Switzerland
m = folium.Map(location=[46.8182, 8.2275], zoom_start=8)
# Global scale factor applied to every marker radius.
size_multiplier = 1
# Add city points to the map
for index, row in found_cities.iterrows():
    folium.CircleMarker(
        # NOTE(review): folium expects (latitude, longitude), yet the column named 'lon'
        # is passed first. Either the two columns are mislabelled where the cities table
        # is built, or the markers are transposed — confirm against cities.json.
        location=(row['lon'], row['lat']),
        # Radius grows with the square root of the count, so the circle AREA is
        # proportional to the number of mentions.
        radius=np.sqrt(row['count'] / np.pi) * size_multiplier,
        color='blue',
        fill=True,
        fill_color='blue',
        fill_opacity=0.6,
        tooltip=row['locations'] + ': ' + str(row['count']) + ' occurrences'
    ).add_to(m)

# Last expression of the cell: render the map.
m

## Evolution over time

In [None]:
# Adding up the per-location Counters gives the total number of mentions per year.
yearly_totals = found_cities.year.sum()
n_mentions_per_year = pd.DataFrame(
    yearly_totals.items(), columns=["year", "count"]
).sort_values("year")

plt.figure(figsize=(6, 4))
sns.lineplot(x="year", y="count", data=n_mentions_per_year)
plt.title("Number of mentions of locations in Switzerland in the RTS videos")
plt.show()

In [None]:
# year -> total number of location mentions in that year
n_mentions_per_year_dict = n_mentions_per_year.set_index("year").to_dict()["count"]


def _share_of_year(year_counts):
    """Divide a location's yearly counts by the total mentions of the same year."""
    return {year: n / n_mentions_per_year_dict[year] for year, n in year_counts.items()}


found_cities["relative_counts"] = found_cities.year.map(_share_of_year)

In [None]:
plt.figure(figsize=(6, 4))
# Take the first 20 rows, then sort ascending so the largest bar ends up at the top.
top_20 = found_cities[:20].sort_values("count").set_index("locations")
top_20["count"].plot(kind="barh")
plt.xlabel("Number of mentions")
plt.title("Top 20 locations mentioned in the RTS videos")
plt.show()

In [None]:
top_N = 20
# Number of leading rows left out of the plots (presumably the dominant location — confirm).
skip_first_n = 1
top_cities = found_cities.iloc[skip_first_n:skip_first_n + top_N]

# Rows: locations, columns: years in ascending order; years without a mention count as 0.
yearly_counts = top_cities.year.tolist()
counts_per_year = (
    pd.DataFrame(yearly_counts, index=top_cities.locations)
    .fillna(0)
    .sort_index(axis=1)
)

plt.figure(figsize=(12, 6))
sns.heatmap(counts_per_year, cmap="Blues", cbar_kws={'label': 'Number of mentions'})
plt.title("Number of mentions of locations in Switzerland in the RTS videos")
plt.xticks(rotation=45)
plt.show()

# Same data as one line per location.
plt.figure(figsize=(12, 6))
for location, yearly in counts_per_year.iterrows():
    plt.plot(yearly.index, yearly.values, label=location)
plt.legend()
plt.title(f"Number of mentions of the top {len(counts_per_year)} locations in Switzerland in the RTS videos")
plt.show()

In [None]:
top_N = 20
# Number of leading rows left out of the plots (presumably the dominant location — confirm).
skip_first_n = 1
top_cities = found_cities[skip_first_n:top_N + skip_first_n]
# Rows: locations, columns: years; each value is the location's share of that year's mentions.
counts_per_year = pd.DataFrame(top_cities.relative_counts.tolist(), index=top_cities.locations).fillna(0)
counts_per_year = counts_per_year.T.sort_index().T

# Normalise every row to sum to 1, so each location shows how its (year-relative)
# mentions are distributed over time rather than how often it is mentioned.
counts_per_year = counts_per_year.div(counts_per_year.sum(axis=1), axis=0)

# The plotted values are normalised shares, not raw counts, so the labels say so.
plt.figure(figsize=(12, 6))
sns.heatmap(counts_per_year, cmap="Blues", cbar_kws={'label': 'Normalised share of mentions'})
plt.title("Relative share of mentions of locations in Switzerland in the RTS videos")
plt.xticks(rotation=45)
plt.show()

plt.figure(figsize=(12, 6))
for loc,mentions in counts_per_year.iterrows():
    plt.plot(mentions.index, mentions.values, label=loc)
plt.legend()
plt.title(f"Relative share of mentions of the top {len(counts_per_year)} locations in Switzerland in the RTS videos")
plt.show()

# People

In [None]:
# Honorifics that are tagged as persons but do not name anyone.
filter_people = ["messieurs", "monsieur", "madame", "mesdames"]

# Case-insensitive removal of those honorifics from every clip's people list.
df["people"] = df["people"].map(
    lambda names: [name for name in names if name.lower() not in filter_people]
)

In [None]:
# One row per (person mention, year), then one row per person holding the list of years.
person_mentions = df[["people", "year"]].explode("people").dropna()
persons = person_mentions.groupby("people").agg(list).reset_index()
# Total mentions, and the same mentions broken down per year.
persons["count"] = persons["year"].map(len)
persons["year"] = persons["year"].map(lambda years: Counter(years))
persons = persons.sort_values("count", ascending=False)

In [None]:
# Work on the 1000 most mentioned persons.
# An explicit copy is taken because later cells add columns to `top_persons`;
# writing into a bare slice of `persons` triggers SettingWithCopyWarning.
top_persons = persons.head(1000).copy()
top_persons.head(10)

In [None]:
# Look up every name with get_wikidata_id. Later cells treat the result as a list of
# candidate dicts (keys "id", "label", "description") and use the first one.
# Presumably one Wikidata request per name, so this cell can be slow — confirm.
top_persons["wikidata_search"] = top_persons["people"].apply(get_wikidata_id)

In [None]:
# Keep only persons with at least one Wikidata candidate.
# Copy the filtered rows: the following cells assign new columns to `top_persons`,
# which would otherwise write into a slice (SettingWithCopyWarning).
has_match = top_persons["wikidata_search"].map(len) > 0
top_persons = top_persons[has_match].copy()

In [None]:
# Copy id / label / description of the first search hit into their own columns.
# (source key, target column, placeholder used when the hit lacks that key)
_WIKIDATA_FIELDS = (
    ("id", "wikidata_id", "MISSING_ID"),
    ("label", "wikidata_label", "MISSING_LABEL"),
    ("description", "wikidata_description", "MISSING_DESCRIPTION"),
)
for _key, _column, _placeholder in _WIKIDATA_FIELDS:
    # Defaults bind the loop values to each lambda at definition time.
    top_persons[_column] = top_persons["wikidata_search"].apply(
        lambda hits, key=_key, placeholder=_placeholder: (
            hits[0].get(key, placeholder) if len(hits) > 0 else None
        )
    )

In [None]:
# Resolve Wikidata property P31 ("instance of") for every matched id, 20 ids per batch,
# then attach the result to each row through its wikidata_id.
# presumably process_batch returns a mapping keyed by Wikidata id — verify.
instance_of = process_batch(top_persons["wikidata_id"].dropna().tolist(), "P31", BATCH_SIZE=20)
top_persons["instance_of"] = top_persons["wikidata_id"].map(instance_of)

In [None]:
# Fetch the label record of every distinct instance-of value. The Wikidata id is taken
# as the last "/"-separated segment of the value (presumably an entity URI — confirm).
instances = top_persons.instance_of.dropna().unique().tolist()
instances = {i:get_wikidata_label(i.split("/")[-1]) for i in instances}

In [None]:
# Replace each instance-of value by its label record, then reduce that record to its
# English text; rows without a record or without an "en" entry become "MISSING".
# NOTE(review): not idempotent — re-running this cell maps the already converted strings
# through `instances` again and turns every row into "MISSING".
top_persons["instance_of"] = top_persons["instance_of"].map(instances)
top_persons["instance_of"] = top_persons["instance_of"].fillna("MISSING").map(lambda x: x.get("en", {}).get("value", "MISSING") if x != "MISSING" else "MISSING")

In [None]:
# How many matched entities fall under each instance-of label.
top_persons["instance_of"].value_counts()

## Focus on humans (instance of Q5)

In [None]:
# Discard rows that lack a Wikidata id or an instance-of value.
top_persons = top_persons.dropna(subset=["wikidata_id", "instance_of"])

In [None]:
# Keep only entities whose instance-of label is "human".
# Copy the filtered rows: the citizenship / occupation cells below assign new columns,
# which would otherwise write into a slice (SettingWithCopyWarning).
is_human = top_persons.instance_of == "human"
top_persons = top_persons[is_human].copy()

In [None]:
# (rows, columns) left after restricting to humans.
top_persons.shape

In [None]:
# Query two more Wikidata properties for the remaining persons, 20 ids per batch:
# P27 is stored as "citizenship" and P106 as "occupation".
ids = top_persons["wikidata_id"].tolist()
citizenship = process_batch(ids, "P27", BATCH_SIZE=20)
top_persons["citizenship"] = top_persons["wikidata_id"].map(citizenship)

occupation = process_batch(ids, "P106", BATCH_SIZE=20)
top_persons["occupation"] = top_persons["wikidata_id"].map(occupation)

In [None]:
def _label_lookup(column):
    """Map every distinct non-null value of `column` to its Wikidata label record.

    The Wikidata id is the last "/"-separated segment of the value.
    """
    distinct_values = top_persons[column].dropna().unique()
    return {value: get_wikidata_label(value.split("/")[-1]) for value in distinct_values}


# Swap the raw property values for their label records.
citizenship_labels = _label_lookup("citizenship")
top_persons["citizenship"] = top_persons["citizenship"].map(citizenship_labels)

occupation_labels = _label_lookup("occupation")
top_persons["occupation"] = top_persons["occupation"].map(occupation_labels)

In [None]:
def _english_label(record):
    """Return the English text of a label record, or "MISSING" when unavailable."""
    if record == "MISSING":
        return "MISSING"
    return record.get("en", {}).get("value", "MISSING")


# Rows without a label record are filled with "MISSING" before the conversion.
for _column in ("citizenship", "occupation"):
    top_persons[_column] = top_persons[_column].fillna("MISSING").map(_english_label)

In [None]:
fig, axs = plt.subplots(1, 2, figsize=(12, 4))
# (column, panel title), drawn left to right; each panel shows the 20 most frequent values.
_panels = (
    ("occupation", "Occupations of most occurring persons"),
    ("citizenship", "Citizenships of most occurring persons"),
)
for _ax, (_column, _title) in zip(axs, _panels):
    top_persons[_column].value_counts()[:20].plot(kind="barh", ax=_ax)
    _ax.set_title(_title)
plt.tight_layout()
plt.show()

# Sentiment Analysis

In [None]:
from transformers import pipeline

In [None]:
# One row per transcript segment: explode each clip's transcript list and keep the text only.
df["transcript"] = df.data.map(lambda x: x["transcript"])
sentences = df[["media_id", "rts_id", "transcript"]].explode("transcript")
# Segments are dicts whose text lives under "t"; anything else (e.g. the NaN that an
# empty transcript explodes to) is passed through so it can be dropped right after.
# isinstance also accepts dict subclasses, unlike an exact type comparison.
# The result is assigned as a plain list, i.e. by position: explode() repeats index labels.
sentences["transcript"] = sentences.transcript.map(lambda x: x["t"] if isinstance(x, dict) else x).tolist()
# List form of `subset` works on every pandas version; rebinding avoids in-place mutation.
sentences = sentences.dropna(subset=["transcript"])
print(f"Extracted {len(sentences)} sentences")

In [None]:
# Length of every sentence in whitespace-separated words, then the 10 longest sentences.
sentences["sentence_length"] = sentences.transcript.map(lambda x: len(x.split()))
sentences.sort_values("sentence_length", ascending=False).head(10)

In [None]:
# Distribution of sentence lengths (in words).
sentences.sentence_length.hist(bins=100, grid = False)

## Sentiment Score

In [None]:
# Sentiment pipeline built on a multilingual DistilBERT student checkpoint.
# top_k=None presumably makes it return a score for every label; the cells below
# read "positive", "negative" and "neutral" from its output.
sentiment_classifier = pipeline(
    model = "lxyuan/distilbert-base-multilingual-cased-sentiments-student",
    top_k = None
)

In [None]:
# Rebuild the sentence table from scratch (this also discards columns added earlier).
df["transcript"] = df.data.map(lambda x: x["transcript"])
sentences = df[["media_id", "rts_id", "transcript"]].explode("transcript")
# Segments are dicts whose text lives under "t"; anything else (e.g. the NaN that an
# empty transcript explodes to) is passed through unchanged.
# The result is assigned as a plain list, i.e. by position: explode() repeats index labels.
sentences["transcript"] = sentences.transcript.map(lambda x: x["t"] if isinstance(x, dict) else x).tolist()
# Drop missing transcripts here, as the first extraction cell does, so that they are
# neither sent to the classifier nor counted in the sample taken below.
sentences = sentences.dropna(subset=["transcript"])
print(f"Extracted {len(sentences)} sentences")

In [None]:
# Work on the first 1000 sentences only; the classifiers below are applied one sentence at a time.
# NOTE(review): this is a slice of the exploded frame — later in-place edits on it may
# raise SettingWithCopyWarning.
sentences = sentences[:1000]

In [None]:
def get_sentiment_scores(sentences):
    """Classify one transcript text with `sentiment_classifier`.

    Args:
        sentences: the text handed to the classifier (one sentence per call
            when used through `Series.apply`).

    Returns:
        The first element of the classifier output, or None when the
        classification fails (e.g. a missing transcript).
    """
    try:
        return sentiment_classifier(sentences)[0]
    except Exception:
        # Best effort: a sentence the model cannot process yields None and is dropped later.
        # Catching Exception rather than using a bare `except` lets KeyboardInterrupt and
        # SystemExit through, so a long run can still be interrupted.
        return None

# Score every sentence; failed classifications are stored as None.
sentences["sentiment_scores"] = sentences.transcript.apply(get_sentiment_scores)

In [None]:
# Discard the sentences the classifier could not score.
sentences = sentences.dropna(subset=["sentiment_scores"])

In [None]:
# Turn the classifier output (list of {"label", "score"} dicts) into a {label: score} dict.
sentences["sentiment_scores"] = sentences["sentiment_scores"].map(
    lambda scores: {entry["label"]: entry["score"] for entry in scores}
)

# One column per label; a label absent from the output counts as 0.
_SCORE_COLUMNS = (
    ("positive", "positive_score"),
    ("negative", "negative_score"),
    ("neutral", "neutral_score"),
)
for _label, _column in _SCORE_COLUMNS:
    sentences[_column] = sentences["sentiment_scores"].map(
        lambda scores, label=_label: scores.get(label, 0)
    )

# Winning label and its score.
sentences["top_sentiment"] = sentences["sentiment_scores"].map(lambda scores: max(scores, key=scores.get))
sentences["top_sentiment_score"] = sentences["sentiment_scores"].map(lambda scores: max(scores.values()))
# "sentiment_scores" is kept on purpose: the per-clip aggregation further down reads it.

In [None]:
fig, axs = plt.subplots(1, 2, figsize=(12, 4))
_sentiment_order = ["positive", "negative", "neutral"]

# Left: number of sentences per winning label, in a fixed label order.
_label_counts = sentences.top_sentiment.value_counts().reindex(_sentiment_order)
_label_counts.plot(kind="bar", ax=axs[0])
axs[0].set_title("Sentiment distribution")
axs[0].set_xticks(range(3), _sentiment_order, rotation=0)

# Right: distribution of the winning label's score.
sentences.top_sentiment_score.hist(ax=axs[1], bins=50, grid=False)
axs[1].set_title("Sentiment score distribution")

plt.show()

In [None]:
# Top k extreme sentences, by sentiment
k = 50
_sections = (
    ("positive", f"Top {k} positive sentences"),
    ("negative", f"Top {k} negative sentences"),
)
for _position, (_label, _header) in enumerate(_sections):
    if _position:
        # Blank line between the two sections.
        print()
    print(_header)
    _ranked = sentences[sentences.top_sentiment == _label].sort_values("top_sentiment_score", ascending=False)
    for _sentence in _ranked.transcript.tolist()[:k]:
        print(_sentence)

In [None]:
# Preview the sentence table with its sentiment columns.
sentences.head()

In [None]:
def _scores_by_label(sentence_scores):
    """Regroup a clip's per-sentence {label: score} dicts into {label: [scores]}."""
    return {
        label: [scores[label] for scores in sentence_scores]
        for label in sentence_scores[0].keys()
    }


# One row per clip with, for every label, the list of its sentence scores.
_per_clip = sentences.groupby("media_id").sentiment_scores.agg(list).map(_scores_by_label)
clips = _per_clip.apply(pd.Series).reset_index()

# Mean and standard deviation of each label over the clip's sentences.
_SUMMARY_COLUMNS = (
    ("positive", "positive_mean", "positive_std"),
    ("negative", "negative_mean", "negative_std"),
    ("neutral", "neutral_mean", "neutral_std"),
)
for _label, _mean_column, _std_column in _SUMMARY_COLUMNS:
    clips[_mean_column] = clips[_label].map(np.mean)
    clips[_std_column] = clips[_label].map(np.std)

# The raw score lists are no longer needed once summarised.
clips = clips.drop(columns=["positive", "negative", "neutral"])

In [None]:
# Dominant sentiment of a clip = label with the highest mean score.
# idxmax returns the column name (e.g. "positive_mean"), so keep the part before "_".
clips["top_sentiment"] = clips[["positive_mean", "negative_mean", "neutral_mean"]].idxmax(axis=1).map(lambda x: x.split("_")[0])

In [None]:
# Preview the per-clip sentiment summary.
clips.head()

In [None]:
fig, axs = plt.subplots(1, 2, figsize=(12, 4))
_sentiment_order = ["positive", "negative", "neutral"]

# Left: number of clips per dominant sentiment, in a fixed label order.
_clip_counts = clips.top_sentiment.value_counts().reindex(_sentiment_order)
_clip_counts.plot(kind="bar", ax=axs[0])
axs[0].set_title("Sentiment distribution")
axs[0].set_xticks(range(3), _sentiment_order, rotation=0)

# Right: for every clip, the mean score of its dominant sentiment.
_dominant_scores = clips.apply(lambda row: row[row["top_sentiment"] + "_mean"], axis=1)
_dominant_scores.hist(ax=axs[1], bins=50, grid=False)
axs[1].set_xlabel("Sentiment score")
axs[1].set_title("Sentiment score distribution")

plt.show()

## Emotions

In [None]:
# Emotion classifier. top_k=None asks for the score of every label; it replaces the
# deprecated return_all_scores=True flag and matches how the sentiment pipeline is configured.
# NOTE(review): the checkpoint name says "distilbert-base-uncased" (an English model), while
# the transcripts look French — confirm that this model is appropriate.
classifier = pipeline("text-classification", model='bhadresh-savani/distilbert-base-uncased-emotion', top_k=None)

In [None]:
# Inspect the tokenizer bundled with the emotion pipeline.
classifier.tokenizer

In [None]:
def _emotion_scores(text):
    """Run the emotion classifier on one sentence and return its {label: score} dict."""
    return {entry["label"]: entry["score"] for entry in classifier(text)[0]}


# Unlike the sentiment step, a sentence the classifier rejects raises here.
sentences["emotions"] = sentences.transcript.map(_emotion_scores)

In [None]:
# Preview the sentence table with the new "emotions" column.
sentences.head()

In [None]:
# Emotion label with the highest score for each sentence.
sentences["top_emotion"] = sentences["emotions"].map(lambda x: max(x, key=x.get))

In [None]:
# Number of sentences per dominant emotion.
sentences["top_emotion"].value_counts()