In [None]:
# Change the working directory two levels up; the relative paths used later
# ("emv/features/cities.json", "data/rts_metadata.hdf5") are resolved from there.
%cd ../..
# Load IPython's autoreload extension.
%load_ext autoreload

# Mode 2: reload imported modules before each cell is executed.
%autoreload 2

In [None]:
from emv.db.dao import DataAccessObject
from emv.db.queries import get_features_by_type_paginated, count_features_by_type
import json
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from ast import literal_eval
import numpy as np
from tqdm import tqdm
from emv.api.models import Feature
from emv.api.models import Projection, MapProjectionFeatureCreate
from emv.db.queries import create_projection, create_map_projection_feature, create_feature
from emv.io.media import create_square_atlases
from umap import UMAP
import numba
import cv2
from PIL import Image
from sqlalchemy.sql import text
from datetime import datetime

from emv.db.queries import get_all_media_by_library_id, get_library_id_from_name, get_library_from_name, check_media_exists, get_media_by_id, delete_feature_by_type
from emv.storage.storage import get_storage_client
from emv.features.image import embed_images

# Create "locations" features from "transcript+ner" features

In [None]:
# Number of "transcript+ner" features restricted to short clips;
# the next cell uses it to size its pagination loop.
total_features = count_features_by_type(
    "transcript+ner",
    short_clips_only=True,
)
print("Total features: {}".format(total_features))

In [None]:
# Fetch the short-clip "transcript+ner" features page by page, using the
# feature_id of the last row received as the cursor for the next request.
PAGE_SIZE = 10000
MAX_FEATURES = total_features + 1
data = get_features_by_type_paginated("transcript+ner", page_size=PAGE_SIZE, short_clips_only=True)

for _ in tqdm(range(MAX_FEATURES // PAGE_SIZE)):
    if not data:
        # Nothing was returned at all: data[-1] would raise IndexError.
        break
    last_seen_id = data[-1].get("feature_id", None)
    if last_seen_id is None:
        break
    next_page = get_features_by_type_paginated(
        "transcript+ner",
        page_size=PAGE_SIZE,
        last_seen_feature_id=last_seen_id,
        short_clips_only=True,
    )
    if not next_page:
        # Source exhausted: stop instead of re-issuing the same query.
        break
    data.extend(next_page)

In [None]:
# Keep only the three fields used downstream, then discard rows with missing values.
records = [
    {
        "feature_id": item["feature_id"],
        "media_id": item["media_id"],
        "data": item["data"],
    }
    for item in tqdm(data)
]

df = pd.DataFrame(records).dropna().reset_index(drop=True)
print(f"Retrieved {len(df)} instances")

In [None]:
def _loc_entities(feature_data):
    """Return the first element of every entry in feature_data["entities"] whose second element is "LOC"."""
    return [entity[0] for entity in feature_data["entities"] if entity[1] == "LOC"]


# One list of location mentions per feature.
df["locations"] = df["data"].map(_loc_entities)

In [None]:
# Manual matching
# Load the place-name -> coordinates table. The encoding is pinned to UTF-8 so that
# accented names decode identically on every platform instead of following the
# locale's default encoding.
with open("emv/features/cities.json", "r", encoding="utf-8") as f:
    cities = json.load(f)

# Keep only entries that carry exactly two values; the first is stored as "lon"
# and the second as "lat".
locations = pd.DataFrame(
    [
        {"locations": name, "lon": float(coords[0]), "lat": float(coords[1])}
        for name, coords in cities.items()
        if len(coords) == 2
    ]
)

In [None]:
# Names that have coordinates. `found_locations` is kept as an array because a later
# cell reads it; membership is tested against a set so that each lookup is a hash
# probe rather than a scan of the whole array.
found_locations = locations.locations.values
known_locations = set(found_locations)

# Keep the features that mention at least one known location.
df = df[df.locations.map(lambda names: any(name in known_locations for name in names))]
print(f"Filtered to {len(df)} instances")

In [None]:
# Hash-based lookup of the names that have coordinates.
known_locations = set(found_locations)


def _known_unique(names):
    """Return the known locations among `names`, de-duplicated, in order of first mention."""
    # dict.fromkeys de-duplicates while keeping a stable order; list(set(...)) would
    # reorder the names differently from one kernel session to the next.
    return list(dict.fromkeys(name for name in names if name in known_locations))


# .copy() detaches the frame from the filtered slice it came from, so the column
# assignments below do not trigger SettingWithCopyWarning.
df = df[["data", "media_id", "locations"]].copy()
df["locations"] = df["locations"].map(_known_unique)
# Coordinates are looked up in the same order as the names, so both list columns
# stay aligned element by element.
df["geo_coords"] = df["locations"].map(lambda names: [cities[name] for name in names])
# One row per (clip, location) pair.
df = df.explode(["locations", "geo_coords"])

## Check if clip has thumbnail

In [None]:
dao = DataAccessObject()


def query_thumbnail(media_id):
    """Look up the screenshot image attached to a media item.

    Runs a parameterized query on the `media` table for rows whose `parent_id`
    equals `media_id`, with media_type 'image' and sub_type 'screenshot'.

    Returns the first matching row, or None when there is no match.
    """
    thumbnail_query = text(
        "SELECT * FROM media WHERE parent_id = :parent_id AND media_type = 'image' AND sub_type = 'screenshot'"
    )
    rows = dao.fetch_all(thumbnail_query, {"parent_id": media_id})
    return rows[0] if len(rows) > 0 else None

In [None]:
dao = DataAccessObject()

# After the explode step a clip appears once per location, so media ids repeat.
# Query each distinct id once and reuse the result for every row of that clip.
thumbnail_by_media = {
    media_id: query_thumbnail(media_id) for media_id in df["media_id"].unique()
}
df["thumbnail_media"] = df["media_id"].map(lambda media_id: thumbnail_by_media[media_id])

In [None]:
storage_client = get_storage_client()


def _thumbnail_exists(thumbnail_media):
    """Return True when the thumbnail row points at an object that exists under "rts".

    `thumbnail_media` is the value produced by `query_thumbnail`: a row exposing
    `.get`, or None when the clip has no screenshot. Rows without a usable
    "media_path" are reported as missing without calling the storage client.
    """
    # query_thumbnail returns None for clips without a screenshot; calling .get on
    # it would raise AttributeError and abort the whole column computation.
    if thumbnail_media is None or not hasattr(thumbnail_media, "get"):
        return False
    media_path = thumbnail_media.get("media_path", "")
    if not media_path:
        return False
    return bool(storage_client.object_exists("rts", media_path))


df["has_thumbnail"] = df["thumbnail_media"].map(_thumbnail_exists)

In [None]:
# Check map
# Count missing thumbnails by summing the negated flag: value_counts()[False] raises
# KeyError when every clip has a thumbnail.
missing_thumbnails = int((~df["has_thumbnail"].astype(bool)).sum())

# NOTE(review): x is geo_coords[1] and y is geo_coords[0]; the cities-loading cell
# labels index 0 "lon" and index 1 "lat" — confirm which of the two is right.
# Points are red where a thumbnail exists and blue where it is missing.
plt.scatter(df["geo_coords"].map(lambda x: float(x[1])), df["geo_coords"].map(lambda x: float(x[0])),
            s=1, marker="o",
            c=df["has_thumbnail"].map(lambda x: "red" if x else "blue"))
plt.title(f"Missing {missing_thumbnails} thumbnails out of {len(df)} instances")
plt.show()

In [None]:
# Keep only the rows whose thumbnail exists, then drop the helper flag and renumber.
df = (
    df[df["has_thumbnail"]]
    .drop(columns=["has_thumbnail"])
    .reset_index(drop=True)
)

## Save Features

In [None]:
def _store_location_feature(row):
    """Create one 'locations' feature from a (clip, location) row and return its feature_id."""
    feature = Feature(
        feature_type='locations',
        version="1",
        model_name='transcript+ner+geolocation',
        model_params={},
        data={
            "location": row["locations"],
            "geo_coords": row["geo_coords"],
            "media_path": row["thumbnail_media"].get("media_path", ""),
        },
        media_id=row['media_id'],
    )
    created = create_feature(feature)
    return created["feature_id"]


# One create_feature call per row; the returned id is stored next to the row.
df["feature_id"] = df.apply(_store_location_feature, axis=1)

# Create Atlases and Projection

## Atlases

**Note**: the same clip can mention multiple locations. Because the map is built per location, the same clip can appear on it several times.
In the atlases, however, each thumbnail only needs to be stored once, so thumbnails are de-duplicated.

In [None]:
# Fetch the "locations" features page by page, using the feature_id of the last row
# received as the cursor. MAX_FEATURES bounds the number of follow-up requests.
MAX_FEATURES = 100000
PAGE_SIZE = 10000
features = get_features_by_type_paginated("locations", page_size=PAGE_SIZE)

for _ in tqdm(range(MAX_FEATURES // PAGE_SIZE)):
    if not features:
        # Nothing was returned at all: features[-1] would raise IndexError.
        break
    last_seen_id = features[-1].get("feature_id", None)
    if last_seen_id is None:
        break
    next_page = get_features_by_type_paginated(
        "locations", page_size=PAGE_SIZE, last_seen_feature_id=last_seen_id
    )
    if not next_page:
        # Source exhausted: stop instead of re-issuing the same query.
        break
    features.extend(next_page)

features = pd.DataFrame(features)
print(f"Retrieved {len(features)} instances")

In [None]:
# Distinct thumbnail paths in order of first appearance. The position of a path in
# this list later decides its atlas and its slot inside that atlas, so the order has
# to be reproducible; list(set(...)) reorders strings from one kernel session to the next.
thumbnails_paths = features["data"].map(lambda d: d["media_path"]).unique().tolist()

In [None]:
storage_client = get_storage_client()


def get_thumbnail(media_path):
    """Download a thumbnail stored under "rts" and return it as an RGB PIL image.

    Returns None when the storage client does not return bytes, when the payload is
    empty, or when OpenCV cannot decode it.
    """
    frame_bytes = storage_client.get_bytes("rts", media_path)
    if not isinstance(frame_bytes, bytes) or len(frame_bytes) == 0:
        return None

    # Flag -1 (cv2.IMREAD_UNCHANGED) keeps the channel layout stored in the file.
    frame = cv2.imdecode(np.frombuffer(frame_bytes, np.uint8), -1)
    if frame is None:
        # imdecode signals an undecodable payload by returning None.
        return None

    if frame.ndim == 2:
        # Single-channel image: BGR2RGB would reject it, so expand gray to RGB.
        frame = cv2.cvtColor(frame, cv2.COLOR_GRAY2RGB)
    else:
        frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    return Image.fromarray(frame)

In [None]:
# Download every distinct thumbnail, in the same order as `thumbnails_paths`.
# An entry is None where get_thumbnail could not produce an image.
thumbnails = []
for thumbnail_path in tqdm(thumbnails_paths):
    thumbnails.append(get_thumbnail(thumbnail_path))

In [None]:
total_tiles = len(thumbnails) # either all features or a subset of features
atlas_width = 4096
max_tile_size = 512
# Tiles are laid out on a square grid: (atlas_width // max_tile_size) per side.
max_tiles_per_atlas = (atlas_width // max_tile_size) ** 2
# Ceiling division. int(total / per_atlas) + 1 reports one atlas too many when
# total_tiles is an exact multiple of max_tiles_per_atlas, whereas the atlas index
# assigned to each tile (position // max_tiles_per_atlas) never reaches that value.
atlas_count = (total_tiles + max_tiles_per_atlas - 1) // max_tiles_per_atlas

In [None]:
# Create the projection, replace the names with the desired ones
projection_fields = dict(
    projection_name="RTS locations 26k",
    version="1",
    library_id=get_library_id_from_name("rts"),
    model_name="whisper+spacy",
    model_params={},
    data={},
    dimension=3,
    atlas_folder_path="",
    atlas_width=atlas_width,
    tile_size=max_tile_size,
    atlas_count=atlas_count,
    total_tiles=total_tiles,
    tiles_per_atlas=max_tiles_per_atlas,
)
projection = Projection(**projection_fields)

# Persist the projection and keep the id assigned to it.
created_projection = create_projection(projection)
projection_id = created_projection['projection_id']
print(f"Projection ID: {projection_id}")

In [None]:
# NOTE(review): hard-coded id that overwrites the `projection_id` returned by
# create_projection in the previous cell, so every later cell targets projection 19.
# Presumably meant for re-runs that skip the creation cell — confirm before running
# the notebook top to bottom.
projection_id = 19

In [None]:
# Build the atlases for this projection from the downloaded thumbnails.
# create_square_atlases is a project helper; what it returns is not visible here.
atlas_options = {
    "atlas_name": "atlas_rts_locations",
    "projection_id": projection_id,
    "images": thumbnails,
    "width": atlas_width,
    "max_tile_size": max_tile_size,
    "no_border": True,
}
square_atlases = create_square_atlases(**atlas_options)

In [None]:
# Slot of each thumbnail, derived from its position in `thumbnails_paths`:
# which atlas it belongs to and its index inside that atlas.
thumbnails_df = pd.DataFrame(thumbnails_paths, columns=["thumbnail_path"])
thumbnails_df["atlas_order"] = thumbnails_df.index // max_tiles_per_atlas
thumbnails_df["index_in_atlas"] = thumbnails_df.index % max_tiles_per_atlas

# Merge with locations features
features["thumbnail_path"] = features["data"].map(lambda x: x["media_path"])
# Drop slot columns left by a previous run of this cell; otherwise the merge would
# produce atlas_order_x / atlas_order_y and break the attribute access further down.
features = features.drop(columns=["atlas_order", "index_in_atlas"], errors="ignore")
# many_to_one: several features may share a thumbnail, but each path has one slot.
features = features.merge(thumbnails_df, on="thumbnail_path", how="left", validate="many_to_one")

## Projection

In [None]:
def _geo_coords_of(feature_data):
    """Return the "geo_coords" entry of a feature's data payload."""
    return feature_data["geo_coords"]


# Lift the coordinates out of the nested "data" payload into their own column.
features["geo_coords"] = features["data"].map(_geo_coords_of)

In [None]:
# Register one map point per "locations" feature, with its atlas slot.
for _index, row in features.iterrows():
    # Cast explicitly, as the other cells that read these coordinates do
    # (float(...) when loading cities.json and when plotting): the stored values
    # may not be floats — TODO confirm their type in cities.json.
    first_coord = float(row.geo_coords[0])
    second_coord = float(row.geo_coords[1])
    create_map_projection_feature(MapProjectionFeatureCreate(
        projection_id=projection_id,
        media_id=row.media_id,
        atlas_order=row.atlas_order,
        index_in_atlas=row.index_in_atlas,
        # Third component is fixed to 0 (the projection is declared with dimension=3).
        coordinates=[first_coord, second_coord, 0],
        feature_id=row.feature_id
    ))

# Locations metadata

In [None]:
# Fetch the "locations" features page by page, using the feature_id of the last row
# received as the cursor. MAX_FEATURES bounds the number of follow-up requests.
MAX_FEATURES = 100000
PAGE_SIZE = 10000
features = get_features_by_type_paginated("locations", page_size=PAGE_SIZE)

for _ in tqdm(range(MAX_FEATURES // PAGE_SIZE)):
    if not features:
        # Nothing was returned at all: features[-1] would raise IndexError.
        break
    last_seen_id = features[-1].get("feature_id", None)
    if last_seen_id is None:
        break
    next_page = get_features_by_type_paginated(
        "locations", page_size=PAGE_SIZE, last_seen_feature_id=last_seen_id
    )
    if not next_page:
        # Source exhausted: stop instead of re-issuing the same query.
        break
    features.extend(next_page)

features = pd.DataFrame(features)
print(f"Retrieved {len(features)} instances")

In [None]:
# Metadata table read from a local HDF5 file (path relative to the working directory).
METADATA_PATH = "data/rts_metadata.hdf5"
metadata = pd.read_hdf(METADATA_PATH)

In [None]:
# Preview the metadata columns that are joined onto the features further down.
preview_columns = [
    'mediaId', 'publishedDate', 'categoryName', 'assetType',
    'contentType', 'backgoundType', 'collection', 'publishedBy',
    'title', 'resume', 'geoTheme', 'resumeSequence',
    'sequences',
]
metadata[preview_columns].head()

In [None]:
# The parent id is the second "-"-separated piece of media_id; it is matched
# against the metadata's mediaId column.
features["parent_id"] = features["media_id"].map(lambda media_id: media_id.split("-")[1])

joined_columns = [
    'mediaId', 'publishedDate', 'categoryName', 'assetType',
    'contentType', 'backgoundType', 'collection', 'publishedBy',
    'title', 'resume', 'geoTheme', 'resumeSequence',
    'sequences',
]
# Left join: features without a metadata match are kept, with empty metadata columns.
features = features.merge(
    metadata[joined_columns],
    left_on="parent_id",
    right_on="mediaId",
    how="left",
)

In [None]:
# First two rows of the merged frame, as a quick sanity check.
features.iloc[:2]

In [None]:
# Number of features per assetType.
features["assetType"].value_counts()

In [None]:
# Number of features per contentType.
features["contentType"].value_counts()

In [None]:
# backgoundType is exploded first, so list-valued cells are counted one element at a time.
features["backgoundType"].explode().value_counts()

In [None]:
# Number of features per collection.
features["collection"].value_counts()

In [None]:
# Number of features per publishedBy value.
features["publishedBy"].value_counts()

In [None]:
def _published_year(published_date):
    """Parse a "YYYY-MM-DDTHH:MM:SSZ" timestamp string and return its year."""
    return datetime.strptime(published_date, "%Y-%m-%dT%H:%M:%SZ").year


# Features without a metadata match have no publishedDate after the left merge;
# strptime would raise TypeError on those missing values, so drop them first.
features.publishedDate.dropna().map(_published_year).value_counts().sort_index().plot(kind="bar", figsize=(12, 6), title="Published date per year")

In [None]:
# Themes, each listing the raw labels that are folded into it.
# The None entry sends missing labels to the catch-all theme.
_THEME_CATEGORIES = {
    "Information & Actualité": [
        "Téléjournal, flashes",
        "Actualités régionales",
        "ACTUALITE",
        "INFORMATION",
        "Autres émissions d'actualités",
        "Autres (information)",
        "Plateaux, débats, highlights",
        "Sujets internationaux et nationaux mélangés (débats)",
        "Politique, économie, société",
        "Informations de service",
        "Rencontres, entretiens, portraits",
    ],
    "Débats & Talk Shows": [
        "Talk shows",
        "Débats",
    ],
    "Société & Monde": [
        "Société, religion",
        "Pays et peuples",
        "Emissions de conseil",
        "Emissions de compagnie",
        "Médecine, santé",
        "Histoire",
    ],
    "Culture & Connaissance": [
        "CULTURE ET CONNAISSANCE",
        "Science",
        "Emissions didactiques",
        "Magazines culturels",
        "Arts et médias",
    ],
    "Arts & Spectacles": [
        "Arts",
        "Ballets",
        "Opéras",
        "Concerts",
        "Shows et variétés musicales",
        "Cabaret, humour",
    ],
    "Musique": [
        "MUSIQUE",
        "Pop et rock, clips",
        "Folklore et musique populaire",
    ],
    "Sport": [
        "SPORT",
        "Résultats et magazines sportifs",
        "Autres émissions sportives",
        "Retransmissions en direct",
    ],
    "Fiction & Divertissement": [
        "FICTION",
        "Série et feuilletons",
        "DIVERTISSEMENT",
        "Autres émissions de divertissement",
    ],
    "Jeunesse": [
        "EMISSIONS POUR ENFANTS ET ADOLESCENTS",
        "Jeux, concours",
    ],
    "Religion & Spiritualité": [
        "RELIGION",
        "Cultes, messes et prédications",
        "Autres émissions religieuses",
    ],
    "Autres / Inclassables": [
        "AUTRES EMISSIONS",
        None,
    ],
}

# Flat lookup: raw label -> theme, built by inverting the grouping above.
categorie_to_theme = {
    category: theme
    for theme, categories in _THEME_CATEGORIES.items()
    for category in categories
}



In [None]:
def _theme_for(content_type):
    """Return the theme for a label, or "Autres / Inclassables" when the label is not in the table."""
    return categorie_to_theme.get(content_type, "Autres / Inclassables")


# NOTE(review): the lookup table is named after categories but is applied to the
# contentType column, while the frame also has a categoryName column — confirm
# which column is intended.
features["theme"] = features["contentType"].map(_theme_for)
features["theme"].value_counts()