### RESEARCH ON DATA FROM https://www.kaggle.com/datasets/rounakbanik/the-movies-dataset

## IMPORTED PACKAGES

In [39]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import random
import ast

## UTIL FUNCTIONS

In [37]:
def print_df(df):
    with pd.option_context("display.max_columns", None):
        display(df.head(5))
        
def random_color():
    return "#" + "".join(random.choice("0123456789ABCDEF") for _ in range(6))

## PREPROCESSING

- Load the dataset

In [86]:
movies_metadata_df = pd.read_csv("../../data/movies_metadata.csv", low_memory=False)
credits_df = pd.read_csv("../../data/credits.csv")

In [87]:
print_df(movies_metadata_df)

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,popularity,poster_path,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",21.946943,/rhIRbceoE9lR4veEXuwCC2wARtG.jpg,"[{'name': 'Pixar Animation Studios', 'id': 3}]","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,17.015539,/vzmL6fP7aPKNKPRTFnZmiUfciyV.jpg,"[{'name': 'TriStar Pictures', 'id': 559}, {'na...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,11.7129,/6ksm1sjKMFLbO7UY2i6G1ju9SML.jpg,"[{'name': 'Warner Bros.', 'id': 6194}, {'name'...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",3.859495,/16XOMpEaLWkrcPqSQqhTmeJuqQl.jpg,[{'name': 'Twentieth Century Fox Film Corporat...,"[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,8.387519,/e64sOI48hQXyru7naBFyssKFxVd.jpg,"[{'name': 'Sandollar Productions', 'id': 5842}...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0


In [88]:
print_df(credits_df)

Unnamed: 0,cast,crew,id
0,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...",862
1,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...",8844
2,"[{'cast_id': 2, 'character': 'Max Goldman', 'c...","[{'credit_id': '52fe466a9251416c75077a89', 'de...",15602
3,"[{'cast_id': 1, 'character': ""Savannah 'Vannah...","[{'credit_id': '52fe44779251416c91011acb', 'de...",31357
4,"[{'cast_id': 1, 'character': 'George Banks', '...","[{'credit_id': '52fe44959251416c75039ed7', 'de...",11862


- Basic information

In [89]:
movies_metadata_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45466 entries, 0 to 45465
Data columns (total 24 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   adult                  45466 non-null  object 
 1   belongs_to_collection  4494 non-null   object 
 2   budget                 45466 non-null  object 
 3   genres                 45466 non-null  object 
 4   homepage               7782 non-null   object 
 5   id                     45466 non-null  object 
 6   imdb_id                45449 non-null  object 
 7   original_language      45455 non-null  object 
 8   original_title         45466 non-null  object 
 9   overview               44512 non-null  object 
 10  popularity             45461 non-null  object 
 11  poster_path            45080 non-null  object 
 12  production_companies   45463 non-null  object 
 13  production_countries   45463 non-null  object 
 14  release_date           45379 non-null  object 
 15  re

In [90]:
credits_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45476 entries, 0 to 45475
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   cast    45476 non-null  object
 1   crew    45476 non-null  object
 2   id      45476 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 1.0+ MB


- Drop meaningless columns

In [91]:
movies_metadata_df = movies_metadata_df.drop(
    columns=["homepage", 
             "imdb_id", 
             "original_language", 
             "original_title", 
             "overview",
             "popularity",
             "poster_path",
             "production_companies",
             "production_countries",
             "spoken_languages",
             "status",
             "tagline",
             "title",
             "video",
             "vote_average",
             "vote_count"]
    )

- Column type conversion and coercion

In [92]:
movies_metadata_df["adult"] = movies_metadata_df["adult"].astype(bool)
movies_metadata_df["belongs_to_collection"] = movies_metadata_df["belongs_to_collection"].astype(str)
movies_metadata_df["budget"] = pd.to_numeric(movies_metadata_df["budget"], errors="coerce")
movies_metadata_df["genres"] = movies_metadata_df["genres"].astype(str)
movies_metadata_df["id"] = movies_metadata_df["id"].astype(str)
movies_metadata_df['release_date'] = pd.to_datetime(movies_metadata_df['release_date'], errors='coerce')
movies_metadata_df["revenue"] = pd.to_numeric(movies_metadata_df["revenue"], errors="coerce")
movies_metadata_df["runtime"] = pd.to_numeric(movies_metadata_df["runtime"], errors="coerce")

In [93]:
credits_df["cast"] = credits_df["cast"].astype(str)
credits_df["crew"] = credits_df["crew"].astype(str)
credits_df["id"] = credits_df["id"].astype(str)

- NA checking

In [100]:
movies_metadata_df.isna().sum()

adult                        0
belongs_to_collection        0
budget                   36576
genres                       0
id                           0
release_date                90
revenue                  38058
runtime                   1821
dtype: int64

In [95]:
credits_df.isna().sum()

cast    0
crew    0
id      0
dtype: int64

- Budget and Revenue are important features, but they contain a lot of 0 values -> make NaN and drop them

In [97]:
movies_metadata_df["budget"] = movies_metadata_df["budget"].replace(0.0, np.nan)
movies_metadata_df["revenue"] = movies_metadata_df["revenue"].replace(0.0, np.nan)

- Runtime to be 0.0 which is noise, replace them with np.nan

In [98]:
movies_metadata_df["runtime"] = movies_metadata_df["runtime"].replace(0.0, np.nan)

- NaN dropping

In [101]:
movies_metadata_df =  movies_metadata_df.dropna(subset=["budget", "release_date", "revenue", "runtime"])

- Preprocess collection's name

In [102]:
def extract_collection_name(value):
    if pd.isna(value) or (value == "nan"):
        return "None"
    try:
        data = ast.literal_eval(value)
        return data.get("name") if isinstance(data, dict) else None
    except (ValueError, SyntaxError):
        return "None"

movies_metadata_df["belongs_to_collection"] = movies_metadata_df["belongs_to_collection"].apply(extract_collection_name)

- Preprocess movie's genre

In [103]:
movies_metadata_df["genres"] = movies_metadata_df["genres"].apply(ast.literal_eval)
# convert to a list of genres
movies_metadata_df['genres'] = movies_metadata_df['genres'].apply(lambda x: [d['name'] for d in x] if isinstance(x, list) else [])

- Dataframe merging

In [104]:
df = pd.merge(left=movies_metadata_df, right=credits_df, on="id", how="left")

- Make movies which contain no cast or no crew (noise in data) as NaN

In [105]:
df["cast"] = df["cast"].replace("[]", np.nan)
df["crew"] = df["crew"].replace("[]", np.nan)

- Drop NaN values

In [106]:
df = df.dropna(subset=["cast", "crew"])

- Preprocess cast and crew to list of dict

In [111]:
def extract_cast(cast):
    if pd.isna(cast) or (cast == "nan"):
        return None
    
    try:
        data = ast.literal_eval(cast)
        return [c for c in data] if isinstance(data, list) else None
    
    except (ValueError, SyntaxError):
        return None
    
def extract_crew(crew):
    if pd.isna(crew) or (crew == "nan"):
        return None
    
    try:
        data = ast.literal_eval(crew)
        return [c for c in data] if isinstance(data, list) else None
    
    except (ValueError, SyntaxError):
        return None

df["cast"] = df["cast"].apply(extract_cast)
df["crew"] = df["crew"].apply(extract_crew)

- Final dataframe

In [113]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5369 entries, 0 to 5380
Data columns (total 10 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   adult                  5369 non-null   bool          
 1   belongs_to_collection  5369 non-null   object        
 2   budget                 5369 non-null   float64       
 3   genres                 5369 non-null   object        
 4   id                     5369 non-null   object        
 5   release_date           5369 non-null   datetime64[ns]
 6   revenue                5369 non-null   float64       
 7   runtime                5369 non-null   float64       
 8   cast                   5369 non-null   object        
 9   crew                   5369 non-null   object        
dtypes: bool(1), datetime64[ns](1), float64(3), object(5)
memory usage: 424.7+ KB


In [112]:
print_df(df)

Unnamed: 0,adult,belongs_to_collection,budget,genres,id,release_date,revenue,runtime,cast,crew
0,True,Toy Story Collection,30000000.0,"[Animation, Comedy, Family]",862,1995-10-30,373554033.0,81.0,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de..."
1,True,,65000000.0,"[Adventure, Fantasy, Family]",8844,1995-12-15,262797249.0,104.0,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de..."
2,True,,16000000.0,"[Comedy, Drama, Romance]",31357,1995-12-22,81452156.0,127.0,"[{'cast_id': 1, 'character': 'Savannah 'Vannah...","[{'credit_id': '52fe44779251416c91011acb', 'de..."
3,True,,60000000.0,"[Action, Crime, Drama, Thriller]",949,1995-12-15,187436818.0,170.0,"[{'cast_id': 25, 'character': 'Lt. Vincent Han...","[{'credit_id': '52fe4292c3a36847f802916d', 'de..."
4,True,,35000000.0,"[Action, Adventure, Thriller]",9091,1995-12-22,64350171.0,106.0,"[{'cast_id': 1, 'character': 'Darren Francis T...","[{'credit_id': '52fe44dbc3a36847f80ae0f1', 'de..."


## MODEL TRAINING

## MODEL EVALUATION

## RESUTL

## NOTES
- If a value like 0.0 is considered to be noise or invalid data collection, use replace() to replace them with np.nan, and then drop them gracefully

### AUTHOR
- LE BUI TRUNG DUNG