In [127]:
import pandas as pd
import numpy as np
import json

In [128]:
df = pd.read_json("tmdb_movies_pages_1-50.json")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 14 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   adult              1000 non-null   bool   
 1   backdrop_path      1000 non-null   object 
 2   genre_ids          1000 non-null   object 
 3   id                 1000 non-null   int64  
 4   original_language  1000 non-null   object 
 5   original_title     1000 non-null   object 
 6   overview           1000 non-null   object 
 7   popularity         1000 non-null   float64
 8   poster_path        1000 non-null   object 
 9   release_date       1000 non-null   object 
 10  title              1000 non-null   object 
 11  video              1000 non-null   bool   
 12  vote_average       1000 non-null   float64
 13  vote_count         1000 non-null   int64  
dtypes: bool(2), float64(2), int64(2), object(8)
memory usage: 95.8+ KB


In [129]:
# keep only the columns we deem necessary for vectoring
#   maybe save a few others for display purposes like vote_average and poster_path to display posters
df = df[["original_title","genre_ids","release_date"]]

In [130]:
# ADD GENRE LABELS

# Turn genre ids into actual genre labels
with open("tmdb_genres.json", "r") as file:
    genre_raw_json = json.load(file)

genre_list_of_dicts = genre_raw_json["genres"]
id_to_name = {item['id']: item['name'] for item in genre_list_of_dicts}

named_genres_list = []
for movie_genre_ids in df["genre_ids"]:
    named_genres = [id_to_name[id] for id in movie_genre_ids]
    named_genres_list.append(named_genres)

df["genres"] = named_genres_list
df = df.drop(columns="genre_ids")

In [131]:
# ADD KEYWORDS COLUMN

df_keywords = pd.read_json("tmdb_movies_pages_1-50_keywords.json")
df = pd.concat([df, df_keywords], axis=1)

In [132]:
# ACTORS AND DIRECTOR
with open("tmdb_movies_pages_1-50_credits.json", "r") as file:
    raw_json = json.load(file)

actor_list_of_lists = []
directors_list = []

for movie in raw_json:
    cast = movie["cast"]
    crew = movie["crew"]
    actors = []
    directors = []
    for cast_member in cast:
        if cast_member["known_for_department"] == "Acting":
            actors.append(cast_member["name"])
    for crew_member in crew:
        if crew_member["known_for_department"] == "Directing":
            directors.append(crew_member["name"])
    # lets add first 10 actors per movie
    actor_list_of_lists.append(actors[:10])
    # and first director
    if directors:
        directors_list.append(directors[0])
    else:
        directors_list.append("N/A")

df["actors"] = actor_list_of_lists
df["director"] = directors_list

In [134]:
# Clean looking dataframe. Lets save it seperately for display purposes
df_clean = df
df_clean = df_clean.set_index("original_title")
df_clean.to_csv("clean_dataframe.csv")
df_clean.head()

Unnamed: 0_level_0,release_date,genres,keywords,actors,director
original_title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Inception,2010-07-15,"[Action, Science Fiction, Adventure]","[rescue, mission, dream, airplane, paris, fran...","[Leonardo DiCaprio, Joseph Gordon-Levitt, Ken ...",Christopher Nolan
Interstellar,2014-11-05,"[Adventure, Drama, Science Fiction]","[rescue, future, spacecraft, race against time...","[Matthew McConaughey, Anne Hathaway, Michael C...",Christopher Nolan
The Dark Knight,2008-07-16,"[Drama, Action, Crime, Thriller]","[joker, sadism, chaos, secret identity, crime ...","[Christian Bale, Heath Ledger, Michael Caine, ...",Christopher Nolan
Avatar,2009-12-15,"[Action, Adventure, Fantasy, Science Fiction]","[paraplegic, attachment to nature, culture cla...","[Sam Worthington, Zoe Saldaña, Sigourney Weave...",James Cameron
Deadpool,2016-02-09,"[Action, Adventure, Comedy]","[superhero, anti hero, mercenary, based on com...","[Ryan Reynolds, Morena Baccarin, Ed Skrein, T....",James Bitonti


In [135]:
# DATES

# Convert all dates to datetime format
df["release_date"] = pd.to_datetime(df["release_date"])
# Create a seperate column and convert dates to floats and put them on a linear space between 0 and 1
df["release_date_numerical"] = df["release_date"].values.astype(float)
df["release_date_numerical"] = df["release_date_numerical"] - df["release_date_numerical"].min()
df["release_date_numerical"] = df["release_date_numerical"] / df["release_date_numerical"].max()
df = df.drop(columns="release_date")

In [136]:
# Collects all unique genres from all titles
genres_list = []
for row in df["genres"]:
    genres_list = list(set(genres_list + row))
genres_list_column_names = ["Genre: " + genre for genre in genres_list]
# Creates a matrix of movie genres such that every genre has got their own column and movies containing the genre have the value 1. 0 otherwise. 
genre_matrix = []
for genre in genres_list:
    genre_matrix_row = []
    for row in df["genres"]:
        if genre in row:
            genre_matrix_row.append(1)
        else:
            genre_matrix_row.append(0)
    genre_matrix.append(genre_matrix_row)
genre_matrix = list(map(list, zip(*genre_matrix)))
# Create a seperate dataframe with this information and then add it to the base dataframe. Finally remove the old genre column.
gdf = pd.DataFrame(genre_matrix, columns=genres_list_column_names)
df = pd.concat([df, gdf], axis=1)
df = df.drop(["genres"], axis=1)

In [137]:
# Collects all unique keywords from all titles
keyword_list = []
for row in df["keywords"]:
    keyword_list = list(set(keyword_list + row))
keyword_list_column_names = ["Keyword: " + keyword for keyword in keyword_list]
# Creates a matrix of movie keywords such that every keyword has got their own column and movies containing the keyword have the value 1. 0 otherwise. 
keyword_matrix = []
for keyword in keyword_list:
    keyword_matrix_row = []
    for row in df["keywords"]:
        if keyword in row:
            keyword_matrix_row.append(1)
        else:
            keyword_matrix_row.append(0)
    keyword_matrix.append(keyword_matrix_row)
keyword_matrix = list(map(list, zip(*keyword_matrix)))
# Create a seperate dataframe with this information and then add it to the base dataframe. Finally remove the old keyword column.
kdf = pd.DataFrame(keyword_matrix, columns=keyword_list_column_names)
df = pd.concat([df, tdf], axis=1)
df = df.drop(["keywords"], axis=1)

In [138]:
# Collects all unique actors from all titles
actors_list = []
for row in df["actors"]:
    actors_list = list(set(actors_list + row))
actor_list_column_names = ["Actor: " + actor for actor in actors_list]
# Creates a matrix of movie actors such that every actor has got their own column and movies containing the actor have the value 1. 0 otherwise
actor_matrix = []
for actor in actors_list:
    actor_matrix_row = []
    for row in df["actors"]:
        if actor in row:
            actor_matrix_row.append(1)
        else:
            actor_matrix_row.append(0)
    actor_matrix.append(actor_matrix_row)
actor_matrix = list(map(list, zip(*actor_matrix)))
# Create a seperate dataframe with this information and then add it to the base dataframe. Finally remove the old actors column.
adf = pd.DataFrame(actor_matrix, columns=actor_list_column_names)
df = pd.concat([df, adf], axis=1)
df = df.drop(["actors"], axis=1)

In [153]:
# Collects all unique directors from all titles
directors_list = []
for row in df["director"]:
    directors_list.append(row)
directors_list = list(set(directors_list))
director_list_column_names = ["Director: " + director for director in directors_list]
# Creates a matrix of movie directors such that every director has got their own column and movies containing the director have the value 1. 0 otherwise
director_matrix = []
for director in directors_list:
    director_matrix_row = []
    for row in df["director"]:
        if director in row:
            director_matrix_row.append(1)
        else:
            director_matrix_row.append(0)
    director_matrix.append(director_matrix_row)
director_matrix = list(map(list, zip(*director_matrix)))
# Create a seperate dataframe with this information and then add it to the base dataframe. Finally remove the old director column.
ddf = pd.DataFrame(director_matrix, columns=director_list_column_names)
df = pd.concat([df, adf], axis=1)
df = df.drop(["director"], axis=1)

In [155]:
# Looks good and ready for machine learning and other algorithms. Lets save it!
df = df.set_index("original_title")
df.to_csv("numerical_dataframe.csv")
df.head()

Unnamed: 0_level_0,release_date_numerical,Genre: Comedy,Genre: Music,Genre: Fantasy,Genre: Western,Genre: Drama,Genre: Science Fiction,Genre: Action,Genre: Thriller,Genre: Crime,...,Actor: Stephen Fry,Actor: Larry Rapp,Actor: Ben Schwartz,Actor: Richard T. Jones,Actor: Andrew Rannells,Actor: Tony Shalhoub,Actor: David Prowse,Actor: Patrick Magee,Actor: Lea Salonga,Actor: Casey Siemaszko
original_title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Inception,0.841958,0,0,0,0,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
Interstellar,0.89196,0,0,0,0,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
The Dark Knight,0.8188,0,0,0,0,1,0,1,1,1,...,0,0,0,0,0,0,0,0,0,0
Avatar,0.835223,0,0,1,0,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
Deadpool,0.906604,1,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
