In [24]:
import pandas as pd
from sklearn.cluster import KMeans
import glob
from pathlib import Path
import json

In [15]:
movies = glob.glob('../emotion_analysis/embeddings/*.csv')
emotions = ['sadness', 'joy', 'love', 'anger', 'fear', 'surprise']
all_movies_emb = []
names = []

for idx, movie in enumerate(movies):
    name = Path(movie).stem.replace('_', ' ')
    names.append(name)
    emb = pd.read_csv(movie)
    all_movies_emb.append(emb[emotions].mean(axis=0).values)


all_movies_emb = pd.DataFrame(data=all_movies_emb, columns=emotions)
all_movies_emb['movie'] = names
all_movies_emb

Unnamed: 0,sadness,joy,love,anger,fear,surprise,movie
0,0.082992,0.699409,0.011532,0.143664,0.059919,0.002485,Ostrov - Lost Island 2021
1,0.049160,0.716566,0.012892,0.178026,0.040964,0.002392,The Good House 2021
2,0.041502,0.753985,0.011535,0.159816,0.030842,0.002318,The longest day 2020
3,0.092615,0.607335,0.011287,0.235128,0.050949,0.002686,Havana Widows 1933
4,0.029076,0.465802,0.007403,0.342316,0.152955,0.002448,Knackningar 2021
...,...,...,...,...,...,...,...
108,0.029188,0.565370,0.009854,0.312162,0.080657,0.002770,Cloak & Dagger 1984
109,0.046886,0.610664,0.009954,0.132148,0.197360,0.002988,Invisible Alien 2021
110,0.026736,0.746043,0.016830,0.177609,0.030290,0.002492,Poker Face 2022
111,0.070775,0.805153,0.010427,0.088435,0.023040,0.002169,The Rookie 2002


In [None]:
all_movies_emb['dominant'] = all_movies_emb[emotions].idxmax(1)
all_movies_emb

Unnamed: 0,sadness,joy,love,anger,fear,surprise,movie,dominant
0,0.082992,0.699409,0.011532,0.143664,0.059919,0.002485,Ostrov - Lost Island 2021,joy
1,0.049160,0.716566,0.012892,0.178026,0.040964,0.002392,The Good House 2021,joy
2,0.041502,0.753985,0.011535,0.159816,0.030842,0.002318,The longest day 2020,joy
3,0.092615,0.607335,0.011287,0.235128,0.050949,0.002686,Havana Widows 1933,joy
4,0.029076,0.465802,0.007403,0.342316,0.152955,0.002448,Knackningar 2021,joy
...,...,...,...,...,...,...,...,...
108,0.029188,0.565370,0.009854,0.312162,0.080657,0.002770,Cloak & Dagger 1984,joy
109,0.046886,0.610664,0.009954,0.132148,0.197360,0.002988,Invisible Alien 2021,joy
110,0.026736,0.746043,0.016830,0.177609,0.030290,0.002492,Poker Face 2022,joy
111,0.070775,0.805153,0.010427,0.088435,0.023040,0.002169,The Rookie 2002,joy


In [19]:
all_movies_emb['dominant'].unique()

<StringArray>
['joy', 'anger']
Length: 2, dtype: str

In [23]:
alg = KMeans(n_clusters=5, n_init=10, random_state=42)
clusters = alg.fit_predict(all_movies_emb[emotions])
clusters

array([4, 4, 4, 1, 2, 3, 4, 3, 4, 2, 3, 1, 1, 3, 3, 4, 4, 2, 1, 3, 3, 3,
       2, 1, 1, 3, 3, 1, 0, 3, 1, 1, 4, 1, 3, 2, 1, 1, 1, 3, 1, 3, 0, 1,
       2, 1, 1, 4, 2, 2, 1, 1, 1, 1, 1, 2, 2, 3, 4, 1, 1, 2, 4, 2, 3, 1,
       2, 0, 3, 3, 4, 3, 0, 2, 4, 4, 3, 2, 0, 2, 4, 4, 1, 4, 4, 2, 0, 4,
       2, 1, 3, 1, 3, 4, 4, 1, 3, 2, 4, 3, 1, 1, 1, 1, 4, 1, 4, 1, 3, 1,
       4, 0, 2], dtype=int32)

In [36]:
MAX_DEPTH = 4
MIN_LEAF_EX = 6
NUM_CLUSTERS = 5
def construct_clusters(depth, movies):
    node = {}
    node['count'] = len(movies)

    if depth == MAX_DEPTH:
        node['type'] = 'leaf'
        node['children'] = []
        node['movies'] = list(movies['movie'].values)

        return node


    node['type'] = 'root' if depth == 0 else 'node'
    node['children'] = []
    alg = KMeans(n_clusters=NUM_CLUSTERS, n_init=10, random_state=42)
    clusters = alg.fit_predict(movies[emotions])

    for idx in range(NUM_CLUSTERS):
        subset = movies[clusters == idx]
        if len(subset) <= MIN_LEAF_EX:
            node['children'].extend(list(subset['movie'].values))
        else:
            child = construct_clusters(depth + 1, subset)
            node['children'].append(child)

    return node

root = construct_clusters(0, all_movies_emb)
with open('./clusters.json', 'w') as f:
    json.dump(root, f, indent=4)