In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [3]:
from ast import literal_eval

# Only these three columns are used, so ask the CSV reader for just them
# instead of loading the whole file and discarding the rest afterwards.
MOVIE_COLUMNS = ['id', 'title', 'genres']

# `genres` arrives as text; `literal_eval` turns each cell back into a Python
# object (a tuple of genre names, judging by the displayed frame). It only
# evaluates literals, so it is safe to use on file contents.
# The trailing `[MOVIE_COLUMNS]` pins the column order, because `usecols`
# returns columns in file order rather than in the order requested.
movies = (
    pd.read_csv('data/movies.csv', usecols=MOVIE_COLUMNS)[MOVIE_COLUMNS]
    .assign(genres=lambda frame: frame['genres'].map(literal_eval))
)
movies

Unnamed: 0,id,title,genres
0,862,Toy Story,"(Animation, Comedy, Family)"
1,8844,Jumanji,"(Adventure, Family, Fantasy)"
2,15602,Grumpier Old Men,"(Comedy, Romance)"
3,31357,Waiting to Exhale,"(Comedy, Drama, Romance)"
4,11862,Father of the Bride Part II,"(Comedy,)"
...,...,...,...
45424,439050,Subdue,"(Drama, Family)"
45425,111109,Century of Birthing,"(Drama,)"
45426,67758,Betrayal,"(Action, Drama, Thriller)"
45427,227506,Satan Triumphant,()


In [4]:
# `movies['genres']` holds a tuple of genre names per movie. Expand it into one
# 0/1 indicator column per genre, named "genre-<lowercased genre>". A movie can
# have several indicators set at once (or none, for an empty tuple).
#
# NOTE: this cell consumes the `genres` column, so re-running it requires
# re-running the loading cell first.

from sklearn.preprocessing import MultiLabelBinarizer

mlb = MultiLabelBinarizer()
# Matrix with one row per movie and one column per entry of `mlb.classes_`.
mlb_genres = mlb.fit_transform(movies['genres'])

# Build all indicator columns in one frame, aligned on the movies' index,
# rather than inserting them into `movies` one column at a time.
genre_flags = pd.DataFrame(
    mlb_genres,
    columns=["genre-" + genre.lower() for genre in mlb.classes_],
    index=movies.index,
)

# Rebind `movies` to a new frame instead of mutating in place, so the frame
# shown by the previous cell is not changed behind the reader's back.
movies = pd.concat([movies.drop(columns=['genres']), genre_flags], axis=1)
movies

Unnamed: 0,id,title,genre-action,genre-adventure,genre-animation,genre-comedy,genre-crime,genre-documentary,genre-drama,genre-family,...,genre-history,genre-horror,genre-music,genre-mystery,genre-romance,genre-science fiction,genre-tv movie,genre-thriller,genre-war,genre-western
0,862,Toy Story,0,0,1,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1,8844,Jumanji,0,1,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2,15602,Grumpier Old Men,0,0,0,1,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
3,31357,Waiting to Exhale,0,0,0,1,0,0,1,0,...,0,0,0,0,1,0,0,0,0,0
4,11862,Father of the Bride Part II,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45424,439050,Subdue,0,0,0,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
45425,111109,Century of Birthing,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
45426,67758,Betrayal,1,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
45427,227506,Satan Triumphant,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [5]:
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity between the genre indicator vectors of the first
# GENRE_SAMPLE_SIZE movies. Entry [i, j] compares movie i with movie j.
# The sample is kept small so the square result matrix stays cheap to compute.
GENRE_SAMPLE_SIZE = 100

# Select the genre columns once; every column whose name contains "genre".
genre_sample = movies.filter(regex='genre').iloc[:GENRE_SAMPLE_SIZE]

# With a single argument, similarities are computed among the rows of that
# argument, so there is no need to pass the same matrix twice.
# NOTE(review): a movie with no genres has an all-zero vector; presumably its
# similarity is 0 against everything, itself included. Confirm whether such
# rows should be dropped before this is used for recommendations.
cosine_similarity(genre_sample)

array([[1.        , 0.33333333, 0.40824829, ..., 0.        , 0.        ,
        0.33333333],
       [0.33333333, 1.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.40824829, 0.        , 1.        , ..., 0.        , 0.        ,
        0.40824829],
       ...,
       [0.        , 0.        , 0.        , ..., 1.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 1.        ,
        0.40824829],
       [0.33333333, 0.        , 0.40824829, ..., 0.        , 0.40824829,
        1.        ]])

In [7]:
# Display only the genre indicator columns, i.e. every column whose name
# contains the substring "genre".
movies.filter(like='genre')

Unnamed: 0,genre-action,genre-adventure,genre-animation,genre-comedy,genre-crime,genre-documentary,genre-drama,genre-family,genre-fantasy,genre-foreign,genre-history,genre-horror,genre-music,genre-mystery,genre-romance,genre-science fiction,genre-tv movie,genre-thriller,genre-war,genre-western
0,0,0,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
1,0,1,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
3,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0
4,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45424,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0
45425,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
45426,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0
45427,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
