In [3]:
import matplotlib.pyplot as plt
import numpy as np
from sentence_transformers import SentenceTransformer
import pandas as pd
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import PCA
import csv
from tempfile import TemporaryDirectory

# Start from the ggplot style sheet, then apply notebook-wide overrides in one call.
plt.style.use('ggplot')
plt.rcParams.update({
    # Fonts
    'font.family': 'sans-serif',
    # NOTE(review): the active family is 'sans-serif', so this 'font.serif'
    # entry presumably has no visible effect — confirm whether
    # 'font.sans-serif' was intended.
    'font.serif': 'Ubuntu',
    'font.monospace': 'Ubuntu Mono',
    'font.size': 14,
    # Text sizes for axes, ticks, legend and figure title
    'axes.labelsize': 12,
    'axes.labelweight': 'bold',
    'axes.titlesize': 12,
    'xtick.labelsize': 12,
    'ytick.labelsize': 12,
    'legend.fontsize': 12,
    'figure.titlesize': 12,
    # Image defaults
    'image.cmap': 'jet',
    'image.interpolation': 'none',
    # Figure / axes / line defaults
    'figure.figsize': (12, 10),
    'axes.grid': True,
    'lines.linewidth': 2,
    'lines.markersize': 8,
})
# Named colours using matplotlib's 'xkcd:<name>' syntax.
# The name must follow the colon directly: 'xkcd: goldenrod' (with a space)
# is not a recognised colour and fails when it is first used.
colors = [
    'xkcd:pale orange',
    'xkcd:sea blue',
    'xkcd:pale red',
    'xkcd:sage green',
    'xkcd:terra cotta',
    'xkcd:dull purple',
    'xkcd:teal',
    'xkcd:goldenrod',
    'xkcd:cadet blue',
    'xkcd:scarlet',
]

In [4]:
# Read the pipe-delimited media table. Quote characters are treated as ordinary
# text (QUOTE_NONE) and a backslash escapes special characters. Missing values
# are replaced with empty strings.
data = pd.read_csv(
    'media.csv',
    sep='|',
    quoting=csv.QUOTE_NONE,
    escapechar='\\',
).fillna('')

In [5]:
# Split the table once by media type; every array below keeps the row order of
# its subset, so position i in each M_* array refers to the same movie row
# (and likewise for the T_* arrays and TV rows).
movie_rows = data.loc[data["media_type"] == "movie"]
tv_rows = data.loc[data["media_type"] == "tv"]

# Overview column per media type.
M_overview = np.array(movie_rows["overview"])
T_overview = np.array(tv_rows["overview"])

# Credits column per media type.
M_credits = np.array(movie_rows["credits"])
T_credits = np.array(tv_rows["credits"])

# Genre-id column per media type.
M_genre_ids = np.array(movie_rows["genre_ids"])
T_genre_ids = np.array(tv_rows["genre_ids"])

In [6]:
# Load the pretrained 'all-distilroberta-v1' sentence-embedding model; the
# following cells call model.encode() on the overview/credits/genre columns.
model = SentenceTransformer('all-distilroberta-v1')

# Scratch directory for the saved model. It is removed when this object is
# cleaned up, so keep `output_dir` alive for as long as the copy is needed.
output_dir = TemporaryDirectory()

# Write a copy of the model into the scratch directory.
# NOTE(review): no visible cell reads the saved copy back — confirm it is needed.
model.save(output_dir.name)


In [7]:
# Encode the overview texts (movies first, then TV) and keep the raw encoder
# outputs under the *_embeddings names.
M_overview_embeddings, T_overview_embeddings = [
    model.encode(texts, show_progress_bar=True)
    for texts in (M_overview, list(T_overview))
]

# Rebind M_overview / T_overview from raw text to array copies of the
# embeddings; the similarity cell passes M_overview to cosine_similarity.
# NOTE(review): re-running this cell alone would feed the embeddings back into
# the encoder — re-run the column-extraction cell first.
M_overview = np.array(M_overview_embeddings)
T_overview = np.array(T_overview_embeddings)

Batches:   0%|          | 0/157 [00:00<?, ?it/s]

Batches:   0%|          | 0/157 [00:00<?, ?it/s]

In [8]:
# Encode the credits strings (movies first, then TV) and keep the raw encoder
# outputs under the *_embeddings names.
M_credits_embeddings, T_credits_embeddings = [
    model.encode(texts, show_progress_bar=True)
    for texts in (M_credits, T_credits)
]

# Rebind M_credits / T_credits from raw text to array copies of the
# embeddings; the similarity cell passes M_credits to cosine_similarity.
# NOTE(review): re-running this cell alone would feed the embeddings back into
# the encoder — re-run the column-extraction cell first.
M_credits = np.array(M_credits_embeddings)
T_credits = np.array(T_credits_embeddings)

Batches:   0%|          | 0/157 [00:00<?, ?it/s]

Batches:   0%|          | 0/157 [00:00<?, ?it/s]

In [9]:
# Encode the genre-id values (movies first, then TV) and keep the raw encoder
# outputs under the *_embeddings names.
# NOTE(review): the genre ids go through the sentence encoder as-is —
# presumably the column holds strings; confirm against media.csv.
M_genre_ids_embeddings, T_genre_ids_embeddings = [
    model.encode(texts, show_progress_bar=True)
    for texts in (M_genre_ids, T_genre_ids)
]

# Rebind M_genre_ids / T_genre_ids to array copies of the embeddings; the
# similarity cell passes M_genre_ids to cosine_similarity.
# NOTE(review): re-running this cell alone would feed the embeddings back into
# the encoder — re-run the column-extraction cell first.
M_genre_ids = np.array(M_genre_ids_embeddings)
T_genre_ids = np.array(T_genre_ids_embeddings)

Batches:   0%|          | 0/157 [00:00<?, ?it/s]

Batches:   0%|          | 0/157 [00:00<?, ?it/s]

In [26]:
# Movies
# Seed titles: recommendations are drawn from the nearest neighbours of these movies.
seed_titles = [
    "Chicken Little",
    'Father of the Bride',
    'Superbad',
    "The Emperor's New Groove",
    'Harold & Kumar Go to White Castle',
]
# Alternative seed set kept for experimentation:
# seed_titles = ['The Hunt for Red October', 'The Living Daylights', "You're Next", 'Daylight', 'The Rock']

# Labels (in `data`) of the movie rows, in the same order as the rows of
# M_overview / M_credits / M_genre_ids, which were selected with this same mask.
movie_mask = data["media_type"] == "movie"
movie_index = data.index[movie_mask]

# Resolve each seed title to its row label, looking only at movie rows so a TV
# entry with the same title can never be picked.
movies = []
for seed_title in seed_titles:
    matches = data.index[movie_mask & (data['title'] == seed_title)]
    if len(matches) == 0:
        raise KeyError(f"Movie title not found in data: {seed_title!r}")
    movies.append(matches[0])
print(movies)


def top_similar(cos_sim, seeds, k=5):
    """Return, for each seed, the labels of its k most similar non-seed movies.

    Parameters
    ----------
    cos_sim : pd.DataFrame
        Square similarity matrix whose index and columns are movie row labels.
    seeds : list
        Row labels of the seed movies.
    k : int, default 5
        Number of neighbours to keep per seed.

    Returns
    -------
    list of list
        One list per seed (same order as `seeds`), most similar first.

    Every seed is excluded from every neighbour list, which also removes the
    seed itself. No positional "skip the first hit" is applied: with tied
    scores the seed is not guaranteed to sort first, so skipping position 0
    could discard a genuine neighbour.
    """
    excluded = set(seeds)
    neighbours = []
    for seed in seeds:
        ranked = cos_sim.loc[seed].sort_values(ascending=False).index.tolist()
        neighbours.append([label for label in ranked if label not in excluded][:k])
    return neighbours


# The similarity matrices are labelled with the movie rows' labels from `data`,
# so lookups by those labels stay correct even when movie rows are not the
# first rows of the table.
overview_cos_sim = pd.DataFrame(cosine_similarity(M_overview), index=movie_index, columns=movie_index)  # will hold the cos values
overview_list = top_similar(overview_cos_sim, movies)
print(overview_list)

credits_cos_sim = pd.DataFrame(cosine_similarity(M_credits), index=movie_index, columns=movie_index)  # will hold the cos values
credits_list = top_similar(credits_cos_sim, movies)
print(credits_list)

genre_ids_cos_sim = pd.DataFrame(cosine_similarity(M_genre_ids), index=movie_index, columns=movie_index)  # will hold the cos values
genre_ids_list = top_similar(genre_ids_cos_sim, movies)
print(genre_ids_list)

[1183, 3297, 552, 623, 2263]
[[923, 2783, 893, 3220, 1039], [2784, 630, 3078, 1177, 2744], [2956, 671, 4574, 4688, 2456], [663, 1289, 1135, 4049, 3328], [3061, 150, 4591, 1295, 3097]]
[[3191, 3326, 81, 2870, 4150], [2489, 41, 1958, 4787, 377], [2953, 289, 1263, 4339, 1881], [2281, 3666, 3029, 3596, 1240], [4302, 2478, 1538, 732, 2389]]
[[1526, 1560, 134, 4835, 3586], [605, 1650, 3136, 911, 4217], [551, 671, 4015, 2598, 2305], [3420, 4539, 4767, 1792, 1228], [3015, 769, 2429, 1718, 4986]]


In [27]:
# Accumulated similarity per candidate movie: each time a candidate appears in
# a seed's neighbour list (overview, credits or genre), its cosine similarity
# to that seed is added to the candidate's running total.
cos_dict = {}

# Neighbour lists paired with the similarity matrix they were ranked from,
# visited in this order for every seed.
similarity_sources = (
    (overview_list, overview_cos_sim),
    (credits_list, credits_cos_sim),
    (genre_ids_list, genre_ids_cos_sim),
)

for index, movie_loc in enumerate(movies):
    for neighbour_lists, sim_matrix in similarity_sources:
        for loc in neighbour_lists[index]:
            score = sim_matrix[movie_loc][loc]
            if loc not in cos_dict:
                # First sighting: store the score as-is.
                cos_dict[loc] = score
            else:
                cos_dict[loc] += score
print(cos_dict)

{923: 0.52581376, 2783: 0.49517012, 893: 0.49013478, 3220: 0.4781922, 1039: 0.46949032, 3191: 0.9660518, 3326: 0.96004885, 81: 0.959752, 2870: 0.9574533, 4150: 0.95744497, 1526: 0.9999999, 1560: 0.9999999, 134: 0.9999999, 4835: 0.9999999, 3586: 0.9999999, 2784: 0.66616756, 630: 0.5362286, 3078: 0.5316645, 1177: 0.52897054, 2744: 0.5252383, 2489: 0.96941507, 41: 0.9685225, 1958: 0.96308637, 4787: 0.9629638, 377: 0.96267307, 605: 1.0000004, 1650: 1.0000004, 3136: 1.0000004, 911: 1.0000004, 4217: 1.0000004, 2956: 0.52224743, 671: 1.4975524, 4574: 0.48885316, 4688: 0.4855721, 2456: 0.47636354, 2953: 0.9554904, 289: 0.95293814, 1263: 0.9494047, 4339: 0.94785374, 1881: 0.9464012, 551: 1.0, 4015: 1.0, 2598: 1.0, 2305: 1.0, 663: 0.5395138, 1289: 0.4906416, 1135: 0.47910437, 4049: 0.47782165, 3328: 0.47137046, 2281: 0.94423103, 3666: 0.94255835, 3029: 0.9420367, 3596: 0.9412289, 1240: 0.94036037, 3420: 1.0, 4539: 0.97385097, 4767: 0.968633, 1792: 0.95211184, 1228: 0.95211184, 3061: 0.6112274, 1

In [28]:
# Rank candidates by accumulated similarity, highest first, as (label, score) pairs.
sorted_cos_dict = sorted(
    cos_dict.items(),
    key=lambda item: item[1],
    reverse=True,
)

# Show the titles of the ten best-scoring candidates.
top_ranked = sorted_cos_dict[:10]
for m_id, cos_data in top_ranked:
    print(data["title"][m_id])

Project X
The 40 Year Old Virgin
Along Came Polly
It's Complicated
American Pie 2
Date Movie
Easy A
Airplane II: The Sequel
Suck Me Shakespeer
Not Another Teen Movie
