In [None]:
# Move the working directory up two levels; presumably this is the repository
# root so that `emv` imports and relative `data/` paths resolve — TODO confirm.
%cd ../..
# Enable the IPython autoreload extension.
%load_ext autoreload

# Mode 2: reload all modules before executing each cell.
%autoreload 2

In [None]:
from emv.db.dao import DataAccessObject
from emv.db.queries import get_features_by_type_paginated, count_features_by_type
import json
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from ast import literal_eval
import numpy as np
from tqdm import tqdm
from emv.api.models import Feature
from emv.api.models import Projection, MapProjectionFeatureCreate
from emv.db.queries import create_projection, create_map_projection_feature, create_feature, update_feature, count_media_by_library_id
from emv.io.media import create_square_atlases
from umap import UMAP
import numba
import cv2
from PIL import Image
from sqlalchemy.sql import text
from datetime import datetime
import textwrap as tw

from emv.db.queries import get_all_media_by_library_id, get_library_id_from_name, get_library_from_name, check_media_exists, get_media_by_id, delete_feature_by_type
from emv.storage.storage import get_storage_client
from emv.features.image import embed_images

from transformers import pipeline

# Load data

In [None]:
# Count the "transcript+ner" features (with short_clips_only=True); the total is
# used by the following cell to size the paginated download.
total_features = count_features_by_type("transcript+ner", short_clips_only=True)
print(f"Total features: {total_features}")

In [None]:
# Download every short-clip "transcript+ner" feature page by page. Each request
# resumes after the last feature_id already received, and the loop ends when the
# backend returns an empty page instead of trusting a pre-computed page count.
PAGE_SIZE = 10000
MAX_FEATURES = total_features + 1  # kept for any later cell that reads it

page = get_features_by_type_paginated("transcript+ner", page_size=PAGE_SIZE, short_clips_only=True)
data = list(page)

with tqdm(total=total_features, desc="transcript+ner") as progress:
    progress.update(len(page))
    last_seen_id = None
    while page:  # an empty first page simply skips the loop (no IndexError)
        next_cursor = page[-1].get("feature_id", None)
        # Stop on a missing cursor, or on one that did not advance (which would
        # otherwise request the same page forever).
        if next_cursor is None or next_cursor == last_seen_id:
            break
        last_seen_id = next_cursor
        page = get_features_by_type_paginated(
            "transcript+ner",
            page_size=PAGE_SIZE,
            last_seen_feature_id=last_seen_id,
            short_clips_only=True,
        )
        data.extend(page)
        progress.update(len(page))

In [None]:
# Keep only the media id and the payload of each retrieved feature, then
# discard incomplete rows and renumber the index.
records = [{"media_id": item["media_id"], "data": item["data"]} for item in tqdm(data)]

df = pd.DataFrame(records).dropna().reset_index(drop=True)
print(f"Retrieved {len(df)} instances")

In [None]:
# Download every "locations" feature page by page, resuming after the last
# feature_id received and stopping when the backend returns an empty page.
# NOTE(review): the count below is restricted to short clips while the fetch
# passes no such filter, so it is only used as a progress estimate — confirm
# whether the fetch should also pass short_clips_only=True.
MAX_FEATURES = count_features_by_type("locations", short_clips_only=True) + 1
PAGE_SIZE = 10000

page = get_features_by_type_paginated("locations", page_size=PAGE_SIZE)
feature_rows = list(page)

with tqdm(total=MAX_FEATURES - 1, desc="locations") as progress:
    progress.update(len(page))
    last_seen_id = None
    while page:  # an empty first page simply skips the loop (no IndexError)
        next_cursor = page[-1].get("feature_id", None)
        # Stop on a missing cursor, or on one that did not advance (which would
        # otherwise request the same page forever).
        if next_cursor is None or next_cursor == last_seen_id:
            break
        last_seen_id = next_cursor
        page = get_features_by_type_paginated("locations", page_size=PAGE_SIZE, last_seen_feature_id=last_seen_id)
        feature_rows.extend(page)
        progress.update(len(page))

features = pd.DataFrame(feature_rows)
print(f"Retrieved {len(features)} instances")

In [None]:
# Rename the transcript payload column so it does not collide with the "data"
# column of `features`, then attach it to every feature sharing the media_id.
df = df.rename(columns={"data": "transcript_data"})
features = pd.merge(features, df, on="media_id", how="left")

In [None]:
# Pull the transcript text and the entities out of the merged payload.
# The preceding left merge leaves NaN (a float) wherever a feature has no
# transcript row, so only dict payloads are read; everything else becomes None.
for field in ("transcript", "entities"):
    features[field] = features["transcript_data"].map(
        lambda payload, field=field: payload.get(field, None) if isinstance(payload, dict) else None
    )

In [None]:
# Copy each feature payload and add its transcript under the "transcript" key.
# Zipping the two columns avoids building a Series per row, as the row-wise
# DataFrame.apply(axis=1) did.
features["data"] = [
    {**payload, "transcript": transcript}
    for payload, transcript in zip(features["data"], features["transcript"])
]

In [None]:
# Candidate sentiment / emotion classification checkpoints to experiment with.
models = [
    "tabularisai/multilingual-sentiment-analysis",
    "SamLowe/roberta-base-go_emotions",
    "nlptown/bert-base-multilingual-uncased-sentiment",
    "lxyuan/distilbert-base-multilingual-cased-sentiments-student"
]
# Use the last candidate. `top_k=None` returns the score of every label and
# replaces the deprecated `return_all_scores=True` argument.
pipe = pipeline("text-classification", model=models[3], top_k=None)

In [None]:
# Smoke-test the sentiment pipeline on the first ten transcripts.
for t in features["transcript"].head(10):
    # Skip missing transcripts: None, NaN left by the merge, or blank strings.
    if not isinstance(t, str) or not t.strip():
        continue
    try:
        # Truncate to the model's maximum input length so long transcripts are
        # scored instead of failing inside the model.
        result = pipe(t, truncation=True)
        print(result)
        print(tw.fill(t, 100))
        print()
    except Exception as e:
        # Best effort: report the failure and carry on with the next transcript.
        print(f"Error processing transcript: {e}")

## Theme classification

In [None]:
# Zero-shot classification pipeline; used further down to assign one of the
# candidate themes to each sampled transcript.
classifier = pipeline("zero-shot-classification", model="MoritzLaurer/mDeBERTa-v3-base-mnli-xnli")

In [None]:
# Candidate theme labels (in French) passed to the zero-shot classifier.
themes_possibles = [
    "Information & Actualité",
    "Débats & Talk Shows",
    "Société & Monde",
    "Culture & Connaissance",
    "Arts & Spectacles",
    "Musique",
    "Sport",
    "Fiction & Divertissement",
    "Jeunesse",
    "Religion & Spiritualité"
]

In [None]:
# Draw a reproducible sample of features to classify.
SAMPLE_SIZE = 10000
SAMPLE_SEED = 42

# Only rows with a non-empty transcript can be classified; missing transcripts
# would make the zero-shot classifier raise in the next cell.
has_transcript = features["transcript"].map(lambda t: isinstance(t, str) and bool(t.strip())).astype(bool)
classifiable = features[has_transcript]

# Cap the sample size so a small table does not raise, and fix the seed so the
# same rows are drawn on every run.
sample = classifiable.sample(n=min(SAMPLE_SIZE, len(classifiable)), random_state=SAMPLE_SEED).copy()

In [None]:
# Run the zero-shot classifier on every sampled transcript and keep the raw
# result (labels and scores) for each row.
sample["theme_llm"] = [classifier(text_, themes_possibles) for text_ in sample["transcript"]]

In [None]:
# Keep the first label of each classifier result and its matching score.
sample["theme"] = [outcome["labels"][0] for outcome in sample["theme_llm"]]
sample["theme_score"] = [outcome["scores"][0] for outcome in sample["theme_llm"]]

In [None]:
# How many sampled transcripts fall under each predicted theme.
sample["theme"].value_counts()

In [None]:
# Distribution of the score attached to the top theme.
sample["theme_score"].hist(bins=20)

In [None]:
# Theme counts restricted to predictions whose score exceeds 0.8.
sample.loc[sample["theme_score"] > 0.8, "theme"].value_counts()

In [None]:
# Preview the first rows of the classified sample.
sample.head(n=5)

# IMI Ontologies

In [None]:
# Reload every "locations" feature page by page, resuming after the last
# feature_id received and stopping when the backend returns an empty page.
# NOTE(review): the count below is restricted to short clips while the fetch
# passes no such filter, so it is only used as a progress estimate — confirm
# whether the fetch should also pass short_clips_only=True.
MAX_FEATURES = count_features_by_type("locations", short_clips_only=True) + 1
PAGE_SIZE = 10000

page = get_features_by_type_paginated("locations", page_size=PAGE_SIZE)
feature_rows = list(page)

with tqdm(total=MAX_FEATURES - 1, desc="locations") as progress:
    progress.update(len(page))
    last_seen_id = None
    while page:  # an empty first page simply skips the loop (no IndexError)
        next_cursor = page[-1].get("feature_id", None)
        # Stop on a missing cursor, or on one that did not advance (which would
        # otherwise request the same page forever).
        if next_cursor is None or next_cursor == last_seen_id:
            break
        last_seen_id = next_cursor
        page = get_features_by_type_paginated("locations", page_size=PAGE_SIZE, last_seen_feature_id=last_seen_id)
        feature_rows.extend(page)
        progress.update(len(page))

features = pd.DataFrame(feature_rows)
print(f"Retrieved {len(features)} instances")

In [None]:
# Load the metadata table from the local HDF5 file (path is relative to the
# working directory); it is joined to the features on "mediaId" below.
rts_metadata = pd.read_hdf("data/rts_metadata.hdf5")

In [None]:
# Join each feature with its metadata row. The key is the second "-"-separated
# part of media_id (assumes every media_id contains a "-" — TODO confirm).
features["original_id"] = features["media_id"].map(lambda x: x.split("-")[1])
features = features.merge(rts_metadata, left_on="original_id", right_on="mediaId", how="left")

# The left merge leaves NaN (a float, not None) in publishedDate for features
# without metadata, so only real strings are split; missing values become NaT.
# Malformed date strings still raise, as before.
published_day = features["publishedDate"].map(lambda x: x.split("T")[0] if isinstance(x, str) else None)
features["date"] = pd.to_datetime(published_day, format="%Y-%m-%d")
# Nullable integer dtype keeps years as integers even when some dates are missing.
features["year"] = features["date"].dt.year.astype("Int64")

features["year"].value_counts().sort_index().plot(kind="bar", figsize=(10, 6), title="Number of features per year")
plt.show()

In [None]:
# The metadata join can repeat a feature; keep the first row per feature_id.
features = features.drop_duplicates(subset="feature_id")
print(f"Number of features after dropping duplicates: {len(features)}")

In [None]:
# Expose the transcript stored in each payload as its own column; payloads
# without one get an empty string.
features["transcript"] = [payload.get("transcript", "") for payload in features["data"]]

In [None]:
# Read the JSON Lines file: one JSON object per line.
# The encoding is explicit so non-ASCII text decodes the same on every
# platform, and blank lines (e.g. a trailing newline) are skipped instead of
# crashing json.loads.
with open("data/locations_ontologies_all.jsonl", "r", encoding="utf-8") as f:
    records = [json.loads(line) for line in f if line.strip()]

data = pd.DataFrame(records)
# media_id is dropped here; the later merge joins on transcript_id instead.
data = data.drop(columns=["media_id"])
print(f"Retrieved {len(data)} instances of LLM semantics")

In [None]:
# Unpack the "ontology" column: one new column per ontology key.
# Keys are gathered across all rows (first-seen order) rather than from the
# first row only, so an empty table or a row missing a key no longer raises;
# missing values become None.
ontologies = data["ontology"]
ontology_keys = list(dict.fromkeys(
    key
    for ontology in ontologies
    if isinstance(ontology, dict)
    for key in ontology
))
for key in ontology_keys:
    data[key] = ontologies.map(lambda o, key=key: o.get(key) if isinstance(o, dict) else None)
data = data.drop(columns=["ontology"])
print(f"Retrieved {len(data)} instances of LLM semantics")

In [None]:
# Attach the LLM ontology columns to each feature; features without a matching
# transcript_id are kept with empty ontology columns.
features = pd.merge(
    features,
    data,
    how="left",
    left_on="feature_id",
    right_on="transcript_id",
)
print(f"Retrieved {len(features)} instances with LLM semantics")

In [None]:
# Normalise "categories" and "events" to Python lists.
# Missing values become "[]" and string-encoded lists are parsed with
# literal_eval; values that are already lists pass through unchanged.
# The result is assigned back to the column: calling fillna(inplace=True) on a
# column selection does not update the frame under pandas Copy-on-Write.
for column in ("categories", "events"):
    features[column] = (
        features[column]
        .fillna("[]")
        .map(lambda value: literal_eval(value) if isinstance(value, str) else value)
    )

In [None]:
# Location names are removed from the keywords: every key of cities.json plus
# a few extra names.
with open("emv/features/cities.json", "r", encoding="utf-8") as f:
    cities = json.load(f)

cities = list(cities.keys()) + ["Switzerland", "Geneva", "Bern"]
excluded_keywords = set(cities)  # set membership instead of scanning a list

# Collect the keywords of all events of a feature in a single pass:
# - an event whose "keywords" is missing or None contributes nothing,
# - None, empty strings and location names are dropped,
# - duplicates are removed while keeping first-seen order, so the result is
#   deterministic (list(set(...)) is not).
features["keywords"] = features["events"].map(
    lambda events: list(dict.fromkeys(
        keyword
        for event in (events if isinstance(events, (list, tuple)) else [])
        for keyword in (event.get("keywords") or [])
        if keyword is not None and keyword != "" and keyword not in excluded_keywords
    ))
)

In [None]:
# Keep only the date part (before "T") of publishedDate. Features without
# metadata carry NaN after the left merge, so non-string values become None
# instead of raising AttributeError.
features["publishedDate"] = features["publishedDate"].map(
    lambda x: x.split("T")[0] if isinstance(x, str) else None
)

In [None]:
# Write the enriched payload back for every feature and record the feature_id
# returned by update_feature.
# An explicit loop (rather than DataFrame.apply) makes the one-write-per-row
# side effect visible, shows progress, and works on an empty table.
new_feature_ids = []
for _, row in tqdm(features.iterrows(), total=len(features)):
    payload = row["data"]
    updated = update_feature(
        row["feature_id"],
        Feature(
            feature_type="locations",
            version="1.3",
            model_name="transcript+ner+geolocation",
            model_params={},
            data={
                "location": payload["location"],
                "geo_coords": payload["geo_coords"],
                "media_path": payload["media_path"],
                # The transcript is optional in the payload (an earlier cell
                # reads it with .get), so fall back to "" instead of raising.
                "transcript": payload.get("transcript", ""),
                "categories": row["categories"],
                "keywords": row["keywords"],
                "date": row["publishedDate"],
            },
            media_id=row["media_id"],
        ),
    )
    new_feature_ids.append(updated["feature_id"])

features["new_feature_id"] = new_feature_ids