In [105]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from boxoffice.db.frames import get_movie_frame_full
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances

In [106]:
target_slugs = ['Paddington-2', 'Deadpool-and-Wolverine-(2024)', 'Renfield']

df = get_movie_frame_full()

Reading from movies.csv


In [107]:
# make dummy variables for genre, mpaa_rating, source, production_method, creative_type
# eventually include franchise
columns = []
for col in ['genre', 'mpaa_rating', 'source', 'production_method', 'creative_type']:
    dummies = pd.get_dummies(df[col], prefix=col)
    columns.extend(dummies.columns)
    df = pd.concat([df, dummies], axis=1)
columns.extend(['budget'])

In [111]:
# make na budget 0
df_distances = df[columns]

print('Budget NA:', df['budget'].isna().sum())

# print all the NA titles in budget
na_title_df = df[df['budget'].isna()]
with open('na_budget_titles.txt', 'w') as f:
    for row in na_title_df.iterrows():
        f.write(row[1]['title'] + ' ' + row[1]['slug'] + '\n')

df_distances['budget'] = df_distances['budget'].fillna(-1)

# divide budget by 20,000,000
df_distances['budget'] = df_distances['budget'] / 20000000

df_distances['in_franchise'] = df['franchise_name'].notna().astype(int)

# multiply all the mpaas by 10
# df_distances['mpaa_rating'] = df_distances['mpaa_rating'].apply(lambda x: x * 10)

euclidean_dist = euclidean_distances(df_distances)

def format_number_in_millions(x):
    return "{:.2f}M".format(x / 1000000)

for target_slug in target_slugs:
    # find the row where the id is the target
    target = df[df['slug'] == target_slug]
    target_id = target.index[0]
    target = target[columns]
    target = target.values.reshape(1, -1)
    distances = euclidean_dist[target_id]
    closest = np.argsort(distances)
    print('Target:', target_id, df.loc[target_id]['title'])
    for i in range(0, 11):
        located = df.loc[closest[i]]
        print(i, located['title'], round(distances[closest[i]], 2), format_number_in_millions(located['total_revenue_within_365_days']))
    print()

Budget NA: 230
822
Target: 716 Paddington 2
0 Paddington 2 0.0 40.77M
1 Peter Rabbit 2: The Runaway 0.25 41.24M
2 Peter Rabbit 0.5 115.23M
3 Paddington 0.75 76.40M
4 Harold and the Purple Crayon 1.0 18.32M
5 The Nut Job 2: Nutty by Nature 1.41 28.70M
6 Goosebumps 2: Haunted Halloween 1.44 47.42M
7 Nanny McPhee and the Big Bang 1.44 29.01M
8 Clifford the Big Red Dog 1.56 48.95M
9 Alvin and the Chipmunks 1.6 217.31M
10 Goosebumps 1.68 78.12M

Target: 0 Deadpool & Wolverine
0 Deadpool & Wolverine 0.0 662.89M
1 Spider-Man: No Way Home 1.41 864.80M
2 Doctor Strange in the Multiverse of Madness 1.41 447.33M
3 Iron Man 3 1.41 405.84M
4 Guardians of the Galaxy Vol 2 1.41 406.81M
5 Ant-Man and the Wasp: Quantumania 1.41 232.01M
6 Green Lantern 1.73 116.60M
7 Captain Marvel 1.89 447.53M
8 Suicide Squad 1.89 345.60M
9 The Avengers 1.89 637.75M
10 Black Adam 2.0 175.75M

Target: 228 Renfield
0 Renfield 0.0 18.05M
1 It’s Complicated 1.42 112.74M
2 Tropic Thunder 1.43 110.50M
3 The Campaign 1.48 86.

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_distances['budget'] = df_distances['budget'].fillna(-1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_distances['budget'] = df_distances['budget'] / 20000000
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_distances['in_franchise'] = df['franchise_name'].notna().astype(int)
