In [None]:
# Move the working directory two levels up (presumably to the repository root — confirm),
# so that the relative "data/..." paths used below resolve from there.
# NOTE: the path is relative, so re-running this cell climbs two more levels; restart the kernel instead.
%cd ../..
# Load IPython's autoreload extension.
%load_ext autoreload

# Mode 2: reload all imported modules before each cell executes, so edits to local packages are picked up.
%autoreload 2

In [2]:
import os
import random
import textwrap as tw
from tqdm.notebook import tqdm
import numpy as np
import pandas as pd
pd.options.mode.chained_assignment = None
import matplotlib.pyplot as plt
import seaborn as sns
import cv2
from PIL import Image
from ast import literal_eval
from scipy.spatial.distance import pdist, squareform

from emv.settings import DRIVE_PATH

# Clustering
from sklearn.neighbors import NearestNeighbors
from sklearn.cluster import DBSCAN, KMeans
from hdbscan import HDBSCAN

# Dimensionality reduction (DR)
from umap import UMAP
from umap.umap_ import nearest_neighbors
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from trimap import TRIMAP

# Metrics
from emv.embeddings.dr_eval import compute_embeddings, compute_umap_embeddings, plot_embeddings, format_params, plot_embeddings_with_images
from emv.embeddings.dr_eval import \
    compute_coranking_metrics, \
    random_triplet_accuracy, \
    compute_pcc, \
    global_score

# Load data

In [3]:
def load_vectors(csv_path):
    """Read one feature-vector CSV and return it sorted by numeric media id.

    The `media_id` column is expected to contain dash-separated strings; the second
    dash-separated piece is converted to `int` and replaces the original value.

    Parameters
    ----------
    csv_path : str
        Path of the CSV file to read (must contain a `media_id` column).

    Returns
    -------
    pandas.DataFrame
        The file's rows ordered by the integer `media_id` (original index labels kept).
    """
    vectors = pd.read_csv(csv_path)
    vectors["media_id"] = vectors["media_id"].map(lambda raw_id: int(raw_id.split("-")[1]))
    return vectors.sort_values("media_id")


# The three files go through the exact same preparation, so they share one helper.
genres = load_vectors("data/mjf_vectors_genre.csv")
instruments = load_vectors("data/mjf_vectors_instrument.csv")
moods = load_vectors("data/mjf_vectors_mood.csv")

In [4]:
# The per-row feature lists below are attached positionally (`.tolist()` discards the index),
# so the three frames must list the same media ids in the same order. Check it explicitly:
# frames of equal length but with different ids would otherwise be combined into wrong rows
# without any error.
for other_name, other_frame in (("instruments", instruments), ("moods", moods)):
    if not np.array_equal(other_frame["media_id"].to_numpy(), genres["media_id"].to_numpy()):
        raise ValueError(f"media_id values of {other_name} do not line up with genres")

# One row per media: its id plus one list-valued column per feature family.
features = genres[["media_id"]].copy()
features["genres_f"] = genres.drop("media_id", axis = 1).agg(list, axis = 1).tolist()
features["insts_f"] = instruments.drop("media_id", axis = 1).agg(list, axis = 1).tolist()
features["moods_f"] = moods.drop("media_id", axis = 1).agg(list, axis = 1).tolist()

In [None]:
# Preview the first rows of the combined feature table.
features.head()

# Metadata

In [6]:
import pickle

# NOTE(review): pickle.load can execute arbitrary code stored in the file — only open trusted files.
# NOTE(review): this is an absolute, machine-specific path; consider deriving it from a configurable
# data directory.
metadata_path = "/media/data/mjf/metadata/metadata.pickle"
with open(metadata_path, "rb") as metadata_file:
    data = pickle.load(metadata_file)

In [None]:
# List the top-level keys of the loaded metadata object.
data.keys()

In [8]:
# Build one frame per metadata table. Each frame is transposed right after construction;
# later cells look rows up by id with `.loc`, so the ids are expected to end up in the row index.
songs, concerts, genres_concerts = (
    pd.DataFrame(data[table]).T for table in ("songs", "concerts", "genres")
)

In [9]:
def _genre_names(genre_ids):
    """Translate a sequence of genre ids into the matching names from `genres_concerts`."""
    return [genres_concerts.loc[int(genre_id), "name"] for genre_id in genre_ids]


# Resolve each concert's genre ids to names.
concerts["genres"] = concerts["genre_ids"].map(_genre_names)
# Every song takes over the genre list of the concert it belongs to
# (a concert id missing from `concerts` raises KeyError).
songs["genres"] = songs["concert_id"].map(lambda concert_id: concerts.loc[concert_id, "genres"])

In [10]:
# Rename "song_id" to "media_id" so songs share a key with `features`, then left-join:
# songs without a matching feature row are kept, with missing values in the feature columns.
songs = (
    songs
    .rename(columns = {"song_id": "media_id"})
    .merge(features, on = "media_id", how = "left")
)

In [11]:
# Keep only the songs for which all three feature vectors are present.
songs = songs.dropna(subset = ["genres_f", "insts_f", "moods_f"])

In [None]:
# Preview the songs with the smallest media ids (sorting here does not modify `songs`).
songs.sort_values("media_id").head()

In [None]:
def remove_noise(x, t = 0.2):
    """Return a copy of `x` as a list in which every value below `t` is replaced by 0.

    Parameters
    ----------
    x : iterable of numbers
        Values to filter.
    t : number, default 0.2
        Threshold; values strictly smaller than `t` are zeroed, values equal to `t` are kept.

    Returns
    -------
    list
        One entry per input value, in the original order.
    """
    denoised = []
    for value in x:
        if value < t:
            denoised.append(0)
        else:
            denoised.append(value)
    return denoised

In [None]:
# For each feature family, store a copy of the vector with values below 0.2 zeroed out
# in a parallel "*_filtered" column.
filtered_columns = {
    "genres_f": "genres_f_filtered",
    "insts_f": "insts_f_filtered",
    "moods_f": "moods_f_filtered",
}
for source_column, target_column in filtered_columns.items():
    songs[target_column] = songs[source_column].map(lambda vector: remove_noise(vector, 0.2))

# Embeddings

In [None]:
# Stack each list-valued feature column into a 2-D array (one row per song).
genre_matrix, inst_matrix, mood_matrix = (
    np.array(songs[column].tolist()) for column in ("genres_f", "insts_f", "moods_f")
)

# Results are collected in the order genres, instruments, moods; later cells rely on that order.
# NOTE(review): no random seed is set in this notebook; whether compute_umap_embeddings seeds
# UMAP itself is not visible from here — confirm if reproducible layouts are needed.
umap_embeddings = compute_umap_embeddings(genre_matrix, n_neighbors = [100])
for matrix in (inst_matrix, mood_matrix):
    umap_embeddings.extend(compute_umap_embeddings(matrix, n_neighbors = [100]))

In [None]:
# One scatter panel per feature family; the titles follow the order in which the
# embeddings were appended to `umap_embeddings`.
panel_titles = ["Genres", "Instruments", "Moods"]
fig, axs = plt.subplots(1, 3, figsize=(15, 5))
for ax, embedding, title in zip(axs, umap_embeddings, panel_titles):
    points = embedding["embeddings"]
    ax.scatter(points[:, 0], points[:, 1], s=0.01)
    ax.set_title(title)
    # The axis values of the 2-D projection are not shown, so hide the ticks.
    ax.set_xticks([])
    ax.set_yticks([])
plt.show()

In [None]:
# 2-D projection of the genre features (first entry appended to `umap_embeddings`).
genres_embedding = umap_embeddings[0]["embeddings"]

# The (at most) 16 most frequent concert genres across all songs.
main_genres = songs.genres.explode().value_counts()[:16].index

n_cols = 4
# Round up: with a genre count that is not a multiple of n_cols, truncating division would
# create fewer axes than genres (IndexError below), or zero rows when there are fewer than n_cols.
n_rows = max(1, int(np.ceil(len(main_genres) / n_cols)))
d = 3  # side length of one panel, in inches
# squeeze=False always returns a 2-D array of axes, so flatten() works for any grid shape.
fig, axs = plt.subplots(n_rows, n_cols, figsize=(n_cols * d, n_rows * d), squeeze=False)
axs = axs.flatten()
for i,genre in enumerate(main_genres):
    # Compute the membership mask once per genre and derive size, alpha and colour from it.
    has_genre = songs.genres.map(lambda x: genre in x).to_numpy(dtype=bool)
    axs[i].scatter(genres_embedding[:,0], genres_embedding[:,1],
                  s = np.where(has_genre, 0.1, 0.01).tolist(),
                  alpha = np.where(has_genre, 1, 0.5).tolist(),
                  color = np.where(has_genre, "red", "blue").tolist())
    axs[i].set_title(genre)
    axs[i].set_xticks([])
    axs[i].set_yticks([])
# Blank out the panels left over when the grid has more cells than genres.
for unused_ax in axs[len(main_genres):]:
    unused_ax.axis("off")
plt.tight_layout()
plt.show()

# Get media from the DB

In [13]:
from emv.db.queries import get_all_media_by_library_id, get_library_id_from_name

In [None]:
lib_id = get_library_id_from_name("mjf")

max_medias = 100000
page_size = 100

# Fetch the videos page by page. Every request after the first passes the media_id and
# created_at of the last row fetched so far, so the query can continue from there.
media_records = get_all_media_by_library_id(lib_id, page_size=page_size, media_type="video")

# The `0 <` guard matters when the first page is empty: there is then no last row
# to read the continuation values from.
while 0 < len(media_records) < max_medias:
    last_seen = media_records[-1]
    new_medias = get_all_media_by_library_id(
        lib_id,
        page_size=page_size,
        media_type="video",
        last_seen_media_id=last_seen["media_id"],
        last_seen_date=last_seen["created_at"],
    )
    if len(new_medias) == 0:
        # An empty page means everything has been fetched.
        break
    media_records.extend(new_medias)

medias = pd.DataFrame(media_records)
if not medias.empty:
    # original_id is cast to int so it can be matched against the integer song ids later on.
    medias["original_id"] = medias.original_id.astype(int)
print(f"Found {len(medias)} videos")

In [35]:
# Attach the feature vectors to the DB rows: a DB row's "original_id" is matched against the
# song's "media_id". Both frames carry a "media_id" column, so pandas suffixes them in the
# result as media_id_x (from `medias`) and media_id_y (from `songs`).
song_features = songs[["media_id", "genres_f", "insts_f", "moods_f"]]
medias = pd.merge(medias, song_features, left_on = "original_id", right_on = "media_id", how = "left")

In [40]:
# Restore the DB identifier under its plain name and discard the duplicate key that came from `songs`.
medias = (
    medias
    .rename(columns = {"media_id_x": "media_id"})
    .drop(columns = "media_id_y")
)

In [None]:
# Inspect the first merged row to check the resulting columns.
medias.iloc[0]