In [1]:
import matplotlib.pyplot as plt
import numpy as np
from sentence_transformers import SentenceTransformer
import pandas as pd
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import PCA

plt.style.use('ggplot')
plt.rcParams['font.family'] = 'sans-serif' 
plt.rcParams['font.serif'] = 'Ubuntu' 
plt.rcParams['font.monospace'] = 'Ubuntu Mono' 
plt.rcParams['font.size'] = 14 
plt.rcParams['axes.labelsize'] = 12 
plt.rcParams['axes.labelweight'] = 'bold' 
plt.rcParams['axes.titlesize'] = 12 
plt.rcParams['xtick.labelsize'] = 12 
plt.rcParams['ytick.labelsize'] = 12 
plt.rcParams['legend.fontsize'] = 12 
plt.rcParams['figure.titlesize'] = 12 
plt.rcParams['image.cmap'] = 'jet' 
plt.rcParams['image.interpolation'] = 'none' 
plt.rcParams['figure.figsize'] = (12, 10) 
plt.rcParams['axes.grid']=True
plt.rcParams['lines.linewidth'] = 2 
plt.rcParams['lines.markersize'] = 8
colors = ['xkcd:pale orange', 'xkcd:sea blue', 'xkcd:pale red', 'xkcd:sage green', 'xkcd:terra cotta', 'xkcd:dull purple', 'xkcd:teal', 'xkcd: goldenrod', 'xkcd:cadet blue',
'xkcd:scarlet']

In [2]:
data = pd.read_csv('media.csv')

In [3]:
movie_data = data.loc[data.media_type == 'movie']
movie_data = movie_data[['media_type','id','title','overview','credits','genre_ids','top_cast']]
movie_data.head()

Unnamed: 0,media_type,id,title,overview,credits,genre_ids,top_cast
0,movie,27205,Inception,"Cobb, a skilled thief who commits corporate es...",525 525 556 2157773 2157775 41018 54211,28 878 12,6193 24045 3899 2524 27578
1,movie,155,The Dark Knight,Batman raises the stakes in his war on crime. ...,525 525 527 282 525 556 20643 10949 10951 5421...,18 28 80 53,3894 1810 3895 64 6383
2,movie,24428,The Avengers,When an unexpected enemy emerges and threatens...,12891 12891 10850 7624 7626 15277 41018 57027,878 28 12,3223 16828 103 74568 1245
3,movie,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...",2710 2710 8529 20294 1385649,28 12 14 878,65731 8691 10205 32747 17647
4,movie,293660,Deadpool,The origin story of former Special Forces oper...,55252 7200 11092 10859 7624 7932 25292 71230 9...,28 12 35,10859 54882 1047649 51990 78452


In [4]:
tv_data = data.loc[data.media_type == 'tv']
tv_data = tv_data[['media_type','id','title','overview','credits','genre_ids','top_cast']]
tv_data.head()

Unnamed: 0,media_type,id,title,overview,credits,genre_ids,top_cast
491,tv,1399,Game of Thrones,Seven noble families fight for control of the ...,1406855 1878409 56746 1223799 33316 114404 122...,10765 18 10759,1223786 22970 239019 17286 1001657
492,tv,66732,Stranger Things,"When a young boy vanishes, a small town uncove...",1113659 3025043 937878 1718785 1179419 17825 1...,18 10765 9648,1920 35029 1356210 1442069 1653291
493,tv,1402,The Walking Dead,Sheriff's deputy Rick Grimes awakens from a co...,,10759 18 10765,4886 31535 62220 84224 1252310
494,tv,63174,Lucifer,"Bored and unhappy as the Lord of Hell, Lucifer...",3211568 1215703 3211569 192944 1512022 3950 12...,80 10765,192944 37014 116474 21356 515875
495,tv,69050,Riverdale,"Set in the present, the series offers a bold, ...",,80 18 9648,1680339 1136940 1721740 56730 1592855


In [5]:
M_overview = np.array(movie_data.overview)
M_credits = np.array(movie_data.credits)
M_genre_ids = np.array(movie_data.genre_ids)
M_top_cast = np.array(movie_data.top_cast)

In [6]:
T_overview = np.array(tv_data.overview)
T_credits = np.array(tv_data.credits)
T_genre_ids = np.array(tv_data.genre_ids)
T_top_cast = np.array(tv_data.top_cast)

In [7]:
M_overview_txt_data = M_overview.tolist()
M_overview_model = SentenceTransformer('distilbert-base-nli-mean-tokens')
M_overview_embeddings = M_overview_model.encode(M_overview_txt_data, show_progress_bar=True)
M_overview = np.array(M_overview_embeddings)

Batches:   0%|          | 0/16 [00:00<?, ?it/s]

In [8]:
T_overview_txt_data = T_overview.tolist()
T_overview_model = SentenceTransformer('distilbert-base-nli-mean-tokens')
T_overview_embeddings = T_overview_model.encode(T_overview_txt_data, show_progress_bar=True)
T_overview = np.array(T_overview_embeddings)

Batches:   0%|          | 0/22 [00:00<?, ?it/s]

In [9]:
M_credits = M_credits.astype(str)
M_credits_txt_data = M_credits.tolist()
M_credits_model = SentenceTransformer('distilbert-base-nli-mean-tokens')
M_credits_embeddings = M_credits_model.encode(M_credits_txt_data, show_progress_bar=True)
M_credits = np.array(M_credits_embeddings)

Batches:   0%|          | 0/16 [00:00<?, ?it/s]

In [10]:
T_credits = T_credits.astype(str)
T_credits_txt_data = T_credits.tolist()
T_credits_model = SentenceTransformer('distilbert-base-nli-mean-tokens')
T_credits_embeddings = T_credits_model.encode(T_credits_txt_data, show_progress_bar=True)
T_credits = np.array(T_credits_embeddings)

Batches:   0%|          | 0/22 [00:00<?, ?it/s]

In [11]:
M_genre_ids_txt_data = M_genre_ids.tolist()
M_genre_ids_model = SentenceTransformer('all-mpnet-base-v2')
M_genre_ids_embeddings = M_genre_ids_model.encode(M_genre_ids_txt_data, show_progress_bar=True)
M_genre_ids = np.array(M_genre_ids_embeddings)

Batches:   0%|          | 0/16 [00:00<?, ?it/s]

In [12]:
T_genre_ids_txt_data = T_genre_ids.tolist()
T_genre_ids_model = SentenceTransformer('all-mpnet-base-v2')
T_genre_ids_embeddings = T_genre_ids_model.encode(T_genre_ids_txt_data, show_progress_bar=True)
T_genre_ids = np.array(T_genre_ids_embeddings)

Batches:   0%|          | 0/22 [00:00<?, ?it/s]

In [13]:
M_top_cast_txt_data = M_top_cast.tolist()
M_top_cast_model = SentenceTransformer('all-mpnet-base-v2')
M_top_cast_embeddings = M_top_cast_model.encode(M_top_cast_txt_data, show_progress_bar=True)
M_top_cast = np.array(M_top_cast_embeddings)

Batches:   0%|          | 0/16 [00:00<?, ?it/s]

In [14]:
T_top_cast = T_top_cast.astype(str)
T_top_cast_txt_data = T_top_cast.tolist()
T_top_cast_model = SentenceTransformer('all-mpnet-base-v2')
T_top_cast_embeddings = T_top_cast_model.encode(T_top_cast_txt_data, show_progress_bar=True)
T_top_cast = np.array(T_top_cast_embeddings)

Batches:   0%|          | 0/22 [00:00<?, ?it/s]

In [15]:
# input 5 titles
disney_movies = []
disney_movies.append(movie_data.index[movie_data['title'] == 'Brave'][0])
disney_movies.append(movie_data.index[movie_data['title'] == 'Finding Nemo'][0]) 
disney_movies.append(movie_data.index[movie_data['title'] == 'Up'][0])
disney_movies.append(movie_data.index[movie_data['title'] == 'The Incredibles'][0])
disney_movies.append(movie_data.index[movie_data['title'] == 'Zootopia'][0])
# print(movie_data.index[movie_data['title'] == 'Neighbors']) # 283
print(disney_movies)


[7, 66, 384, 422, 122]


In [16]:
overview_cos_sim = pd.DataFrame(cosine_similarity(M_overview)) # will hold the cos values
overview_list = []
for movie_loc in disney_movies:
    new_l = overview_cos_sim.loc[movie_loc].sort_values(ascending=False).index.tolist()[1:6]
    overview_list.append(new_l)
print(overview_list)

[[260, 87, 118, 133, 425], [143, 427, 452, 358, 144], [408, 69, 424, 109, 455], [359, 387, 318, 356, 78], [192, 292, 135, 195, 131]]


In [17]:
credits_cos_sim = pd.DataFrame(cosine_similarity(M_credits)) # will hold the cos values
credits_list = []
for movie_loc in disney_movies:
    new_l = credits_cos_sim.loc[movie_loc].sort_values(ascending=False).index.tolist()[1:6]
    credits_list.append(new_l)
print(credits_list)

[[487, 350, 144, 204, 153], [228, 102, 209, 342, 201], [119, 404, 162, 282, 31], [137, 466, 334, 455, 353], [444, 116, 251, 28, 159]]


In [18]:
genre_ids_cos_sim = pd.DataFrame(cosine_similarity(M_genre_ids)) # will hold the cos values
genre_ids_list = []
for movie_loc in disney_movies:
    new_l = genre_ids_cos_sim.loc[movie_loc].sort_values(ascending=False).index.tolist()[1:6]
    genre_ids_list.append(new_l)
print(genre_ids_list)

[[280, 7, 169, 345, 485], [483, 415, 293, 267, 1], [378, 140, 313, 384, 246], [190, 248, 293, 267, 415], [6, 292, 204, 227, 9]]


In [19]:
top_cast_cos_sim = pd.DataFrame(cosine_similarity(M_top_cast)) # will hold the cos values
top_cast_list = []
for movie_loc in disney_movies:
    new_l = top_cast_cos_sim.loc[movie_loc].sort_values(ascending=False).index.tolist()[1:6]
    top_cast_list.append( new_l)
print(top_cast_list)

[[285, 33, 335, 160, 268], [159, 102, 203, 430, 463], [203, 243, 280, 200, 430], [456, 95, 271, 471, 310], [128, 481, 386, 6, 29]]


In [20]:
cos_dict = {}

# print(overview_list)
# print(credits_list)
# print(genre_ids_list)
# print(top_cast_list)
# # disney_movies = [34, 40, 50, 205, 364]
for index, movie in enumerate(disney_movies):
    for loc in overview_list[index]:
        if loc in cos_dict:
            cos_dict[loc] += overview_cos_sim[movie][loc]
        else:
            cos_dict[loc] = overview_cos_sim[movie][loc]
    for loc in credits_list[index]:
        if loc in cos_dict:
            cos_dict[loc] += credits_cos_sim[movie][loc]
        else:
            cos_dict[loc] = credits_cos_sim[movie][loc]
    for loc in genre_ids_list[index]:
        if loc in cos_dict:
            cos_dict[loc] += genre_ids_cos_sim[movie][loc]
        else:
            cos_dict[loc] = genre_ids_cos_sim[movie][loc]
    for loc in top_cast_list[index]:
        if loc in cos_dict:
            cos_dict[loc] += top_cast_cos_sim[movie][loc]
        else:
            cos_dict[loc] = top_cast_cos_sim[movie][loc]
print(cos_dict)

{260: 0.6492827, 87: 0.6203682, 118: 0.61683065, 133: 0.6060129, 425: 0.6055884, 487: 0.86048263, 350: 0.793483, 144: 1.5495981, 204: 1.7674208, 153: 0.75140566, 280: 1.8550354, 7: 1.0, 169: 1.0, 345: 1.0, 485: 1.0, 285: 0.71344817, 33: 0.6288729, 335: 0.62112606, 160: 0.61704457, 268: 0.6107468, 143: 0.78012574, 427: 0.77594525, 452: 0.76969916, 358: 0.768041, 228: 0.91233194, 102: 1.7885127, 209: 0.8890115, 342: 0.87621325, 201: 0.87516403, 483: 0.8914763, 415: 1.7139883, 293: 1.7101502, 267: 1.7101502, 1: 0.8637117, 159: 1.8054032, 203: 1.7722592, 430: 1.7365675, 463: 0.88212264, 408: 0.759803, 69: 0.7549467, 424: 0.7474871, 109: 0.7405548, 455: 1.5735462, 119: 0.9304802, 404: 0.92180765, 162: 0.9195185, 282: 0.9074293, 31: 0.9035888, 378: 1.0000002, 140: 1.0000002, 313: 1.0000002, 384: 1.0000002, 246: 1.0000002, 243: 0.86290514, 200: 0.8532745, 359: 0.72683746, 387: 0.70003426, 318: 0.69486845, 356: 0.6826486, 78: 0.68003213, 137: 0.8650371, 466: 0.8481353, 334: 0.8465189, 353: 0.8

In [21]:
# # disney_movies = [34, 40, 50, 205, 364]
# print(credits_cos_sim[34][154])
# print(genre_ids_cos_sim[364][154])



In [22]:
sorted_cos_dict = []
sorted_cos_dict = sorted(cos_dict.items(), key=lambda x:x[1], reverse=True)
# print(sorted_cos_dict[:10])
for m_id, cos_data in sorted_cos_dict[:10]:
    print(movie_data.title[m_id])

Guardians of the Galaxy
Juno
The Tomorrow War
21 Jump Street
Aquaman
Howl's Moving Castle
Alita: Battle Angel
The Gentlemen
Extraction
Captain Phillips
