# Importing Dependencies

In [41]:
import pandas as pd
import numpy as np
import json
import ast

# Combine credits and movies

In [42]:
# Read each CSV with low_memory=False (whole-file dtype inference instead of
# chunked inference) and cast the join key to str so both sides of the merge
# compare equal.
movies_df = pd.read_csv('movies_metadata.csv', low_memory=False).astype({'id': str})
credits_df = pd.read_csv('credits.csv', low_memory=False).astype({'id': str})

# Inner join on 'id': only ids present in both files are kept.
# NOTE(review): duplicate ids on either side would multiply rows here - confirm.
merged_df = movies_df.merge(credits_df, on='id', how='inner')
merged_df.head()

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,cast,crew
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de..."
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de..."
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0,"[{'cast_id': 2, 'character': 'Max Goldman', 'c...","[{'credit_id': '52fe466a9251416c75077a89', 'de..."
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0,"[{'cast_id': 1, 'character': ""Savannah 'Vannah...","[{'credit_id': '52fe44779251416c91011acb', 'de..."
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0,"[{'cast_id': 1, 'character': 'George Banks', '...","[{'credit_id': '52fe44959251416c75039ed7', 'de..."


# Reading the Dataset

In [43]:
# Work on a copy so the cells below never modify merged_df itself.
movies = merged_df.copy()

In [44]:
# Preview the first rows of the working copy.
movies.head()

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,cast,crew
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de..."
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de..."
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0,"[{'cast_id': 2, 'character': 'Max Goldman', 'c...","[{'credit_id': '52fe466a9251416c75077a89', 'de..."
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0,"[{'cast_id': 1, 'character': ""Savannah 'Vannah...","[{'credit_id': '52fe44779251416c91011acb', 'de..."
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0,"[{'cast_id': 1, 'character': 'George Banks', '...","[{'credit_id': '52fe44959251416c75039ed7', 'de..."


# Checking for missing values (per column)

In [45]:
# Null count per column, and the same count as a share of all rows.
total_rows = len(movies)
missing_per_column = movies.isna().sum()
percentage_missing = missing_per_column.div(total_rows).mul(100)

# One print call; sep="\n" reproduces the line breaks of four separate prints.
print(
    "------MISSING------",
    missing_per_column,
    "\n------PERCENTAGE MISSING------",
    percentage_missing,
    sep="\n",
)


------MISSING------
adult                        0
belongs_to_collection    41038
budget                       0
genres                       0
homepage                 37746
id                           0
imdb_id                     17
original_language           11
original_title               0
overview                   954
popularity                   3
poster_path                386
production_companies         3
production_countries         3
release_date                87
revenue                      3
runtime                    260
spoken_languages             3
status                      84
tagline                  25099
title                        3
video                        3
vote_average                 3
vote_count                   3
cast                         0
crew                         0
dtype: int64

------PERCENTAGE MISSING------
adult                     0.000000
belongs_to_collection    90.118143
budget                    0.000000
genres                  

# Dropping Columns

In [46]:
# Columns removed before any further analysis.
columns_to_drop = [
    'belongs_to_collection',
    'homepage',
    'tagline',
    'imdb_id',
    'id',
    'poster_path',
]

# drop() returns a new frame, so `movies` is left untouched;
# errors='ignore' skips any listed column that is not present.
movies_dropped = movies.drop(columns=columns_to_drop, errors='ignore')

# Checking for unique values (by column)

In [47]:
# For every remaining column, print its distinct values and how many there are.
# unique() includes NaN in the array, while nunique() does not count it.
for column in movies_dropped.columns:
    values = movies_dropped[column]
    unique_values, unique_count = values.unique(), values.nunique()
    print(
        f".. {column} -----",
        f"----- {column} -----",
        unique_values,
        sep="\n",
    )
    print(f"Number of unique values in '{column}':", unique_count)
    print("\n")

.. adult -----
----- adult -----
['False' 'True']
Number of unique values in 'adult': 2


.. budget -----
----- budget -----
['30000000' '65000000' '0' ... '3417000' '25868826' '1254040']
Number of unique values in 'budget': 1223


.. genres -----
----- genres -----
["[{'id': 16, 'name': 'Animation'}, {'id': 35, 'name': 'Comedy'}, {'id': 10751, 'name': 'Family'}]"
 "[{'id': 12, 'name': 'Adventure'}, {'id': 14, 'name': 'Fantasy'}, {'id': 10751, 'name': 'Family'}]"
 "[{'id': 10749, 'name': 'Romance'}, {'id': 35, 'name': 'Comedy'}]" ...
 "[{'id': 80, 'name': 'Crime'}, {'id': 35, 'name': 'Comedy'}, {'id': 28, 'name': 'Action'}, {'id': 10751, 'name': 'Family'}]"
 "[{'id': 28, 'name': 'Action'}, {'id': 9648, 'name': 'Mystery'}, {'id': 53, 'name': 'Thriller'}, {'id': 27, 'name': 'Horror'}]"
 "[{'id': 10751, 'name': 'Family'}, {'id': 16, 'name': 'Animation'}, {'id': 10749, 'name': 'Romance'}, {'id': 35, 'name': 'Comedy'}]"]
Number of unique values in 'genres': 4066


.. original_language -----

----- title -----
['Toy Story' 'Jumanji' 'Grumpier Old Men' ... 'Century of Birthing'
 'Satan Triumphant' 'Queerama']
Number of unique values in 'title': 42276


.. video -----
----- video -----
[False True nan]
Number of unique values in 'video': 2


.. vote_average -----
----- vote_average -----
[ 7.7  6.9  6.5  6.1  5.7  6.2  5.4  5.5  6.6  7.1  7.8  7.2  6.4  6.
  6.3  7.   7.4  7.6  6.8  7.3  3.5  6.7  8.1  5.9  5.2  3.   5.8  4.5
  4.4  2.8  4.1  5.1  3.9  7.5  0.   7.9  5.6  3.3  5.3  4.3  3.8  5.
  4.  10.   4.9  4.6  4.7  2.5  4.8  8.2  8.3  8.5  8.   2.   3.4  3.7
  4.2  3.6  2.7  3.2  2.9  9.   9.3  8.8  8.7  1.5  1.7  3.1  1.   8.4
  2.4  8.6  8.9  1.2  1.6  2.3  1.3  1.9  0.5  2.1  2.6  9.1  1.8  9.5
  9.2  9.6  2.2  nan  1.4  9.8  9.4  0.7  1.1]
Number of unique values in 'vote_average': 92


.. vote_count -----
----- vote_count -----
[5415. 2413.   92. ... 2083. 2002. 2712.]
Number of unique values in 'vote_count': 1820


.. cast -----
----- cast -----
["[{'cast_id': 14,

----- crew -----
['[{\'credit_id\': \'52fe4284c3a36847f8024f49\', \'department\': \'Directing\', \'gender\': 2, \'id\': 7879, \'job\': \'Director\', \'name\': \'John Lasseter\', \'profile_path\': \'/7EdqiNbr4FRjIhKHyPPdFfEEEFG.jpg\'}, {\'credit_id\': \'52fe4284c3a36847f8024f4f\', \'department\': \'Writing\', \'gender\': 2, \'id\': 12891, \'job\': \'Screenplay\', \'name\': \'Joss Whedon\', \'profile_path\': \'/dTiVsuaTVTeGmvkhcyJvKp2A5kr.jpg\'}, {\'credit_id\': \'52fe4284c3a36847f8024f55\', \'department\': \'Writing\', \'gender\': 2, \'id\': 7, \'job\': \'Screenplay\', \'name\': \'Andrew Stanton\', \'profile_path\': \'/pvQWsu0qc8JFQhMVJkTHuexUAa1.jpg\'}, {\'credit_id\': \'52fe4284c3a36847f8024f5b\', \'department\': \'Writing\', \'gender\': 2, \'id\': 12892, \'job\': \'Screenplay\', \'name\': \'Joel Cohen\', \'profile_path\': \'/dAubAiZcvKFbboWlj7oXOkZnTSu.jpg\'}, {\'credit_id\': \'52fe4284c3a36847f8024f61\', \'department\': \'Writing\', \'gender\': 0, \'id\': 12893, \'job\': \'Screenpla

# Most common value for each column

In [48]:
def mode_percentage(column):
    """Return the most common value of a column and its share of non-null rows.

    Parameters
    ----------
    column : pandas.Series
        Column to summarise.

    Returns
    -------
    tuple
        ``(mode_value, percentage)`` where ``percentage`` is the number of rows
        equal to the mode divided by the number of non-null rows, times 100.
        When several values tie, the first one returned by ``Series.mode()``
        is used. For an empty or all-null column there is no mode, so
        ``(None, 0.0)`` is returned instead of raising ``IndexError``.
    """
    modes = column.mode()
    total_count = column.count()  # non-null rows only

    # mode() ignores nulls, so it is empty exactly when there is nothing to count.
    if modes.empty or total_count == 0:
        return None, 0.0

    mode_value = modes.iloc[0]
    mode_count = (column == mode_value).sum()
    return mode_value, (mode_count / total_count) * 100


# Map each column name to its (mode value, percentage) pair.
mode_info = dict(
    (col, mode_percentage(movies_dropped[col])) for col in movies_dropped.columns
)

# Report one line per column, with the percentage shown to two decimals.
for column in mode_info:
    value, percentage = mode_info[column]
    print(f"{column}: Mode = {value}, Percentage = {percentage:.2f}%")


adult: Mode = False, Percentage = 99.98%
budget: Mode = 0, Percentage = 80.43%
genres: Mode = [{'id': 18, 'name': 'Drama'}], Percentage = 11.00%
original_language: Mode = en, Percentage = 70.98%
original_title: Mode = Blackout, Percentage = 0.03%
overview: Mode = No overview found., Percentage = 0.30%
popularity: Mode = 0.0, Percentage = 0.14%
production_companies: Mode = [], Percentage = 26.12%
production_countries: Mode = [{'iso_3166_1': 'US', 'name': 'United States of America'}], Percentage = 39.25%
release_date: Mode = 2008-01-01, Percentage = 0.30%
revenue: Mode = 0.0, Percentage = 83.69%
runtime: Mode = 90.0, Percentage = 5.65%
spoken_languages: Mode = [{'iso_639_1': 'en', 'name': 'English'}], Percentage = 49.25%
status: Mode = Released, Percentage = 99.19%
title: Mode = Blackout, Percentage = 0.03%
video: Mode = False, Percentage = 99.80%
vote_average: Mode = 0.0, Percentage = 6.60%
vote_count: Mode = 1.0, Percentage = 7.18%
cast: Mode = [], Percentage = 5.31%
crew: Mode = [], P

# Filtering rows and dropping further columns

In [49]:
# Successive row filters: non-adult titles, then released ones, then non-video ones.
# 'adult' is compared against the string 'False', while 'video' is compared
# against the boolean False.
movies_filtered = movies_dropped.loc[movies_dropped['adult'].eq('False')]
movies_filtered1 = movies_filtered.loc[movies_filtered['status'].eq('Released')]
movies_filtered2 = movies_filtered1.loc[movies_filtered1['video'].eq(False)]

# After filtering, each of these columns holds a single value, so drop them.
columns_to_drop = ['adult', 'status', 'video']
movies_dropped2 = movies_filtered2.drop(columns=columns_to_drop, errors='ignore')

movies_dropped2.info()

<class 'pandas.core.frame.DataFrame'>
Index: 44986 entries, 0 to 45537
Data columns (total 17 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   budget                44986 non-null  object 
 1   genres                44986 non-null  object 
 2   original_language     44976 non-null  object 
 3   original_title        44986 non-null  object 
 4   overview              44071 non-null  object 
 5   popularity            44986 non-null  object 
 6   production_companies  44986 non-null  object 
 7   production_countries  44986 non-null  object 
 8   release_date          44912 non-null  object 
 9   revenue               44986 non-null  float64
 10  runtime               44736 non-null  float64
 11  spoken_languages      44986 non-null  object 
 12  title                 44986 non-null  object 
 13  vote_average          44986 non-null  float64
 14  vote_count            44986 non-null  float64
 15  cast                  44

In [50]:
# Discard rows whose original_language is missing.
movies_filtered3 = movies_dropped2.dropna(subset=['original_language'])

# info() writes its report itself and returns None; wrapping it in print()
# appended a stray "None" line to the output.
movies_filtered3.info()

<class 'pandas.core.frame.DataFrame'>
Index: 44976 entries, 0 to 45537
Data columns (total 17 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   budget                44976 non-null  object 
 1   genres                44976 non-null  object 
 2   original_language     44976 non-null  object 
 3   original_title        44976 non-null  object 
 4   overview              44062 non-null  object 
 5   popularity            44976 non-null  object 
 6   production_companies  44976 non-null  object 
 7   production_countries  44976 non-null  object 
 8   release_date          44902 non-null  object 
 9   revenue               44976 non-null  float64
 10  runtime               44726 non-null  float64
 11  spoken_languages      44976 non-null  object 
 12  title                 44976 non-null  object 
 13  vote_average          44976 non-null  float64
 14  vote_count            44976 non-null  float64
 15  cast                  44

# Bayesian Average

In [51]:
movies = movies_filtered3.copy()

# Prior parameters taken from the data: C is the mean number of votes (the
# weight given to the prior) and m is the mean rating (the prior value).
C = movies['vote_count'].mean()
m = movies['vote_average'].mean()


def bayesian_avg(vote_count, vote_average, C=C, m=m):
    """Shrink a rating towards the prior mean ``m`` with prior weight ``C``.

    Computes ``(C * m + vote_count * vote_average) / (C + vote_count)`` and
    rounds the result to three decimals. The defaults for ``C`` and ``m`` are
    bound when this function is defined, so re-run this cell if they change.
    """
    weighted_sum = C * m + vote_count * vote_average
    total_weight = C + vote_count
    return round(weighted_sum / total_weight, 3)


# Row-wise application: one adjusted score per movie.
movies['bayesian_average'] = movies.apply(
    lambda row: bayesian_avg(row['vote_count'], row['vote_average']),
    axis=1,
)

# Parsing JSON-like strings into lists

In [52]:
# Define the data processing functions
def extract_genre_names(genre_string):
    """Return the ``'name'`` of every genre record in a stringified list of dicts.

    The input is parsed as a Python literal first. The previous approach
    (swap every ``'`` for ``"`` and parse as JSON) corrupts any record whose
    value contains an apostrophe and silently produced ``[]`` for it; it is
    kept only as a fallback for input that is real JSON.

    Parameters
    ----------
    genre_string : str or missing value
        Text such as ``"[{'id': 18, 'name': 'Drama'}]"``.

    Returns
    -------
    list
        The names in input order; ``[]`` for missing, non-string or
        unparseable input, or when the parsed value is not a list.
    """
    if not isinstance(genre_string, str):
        return []
    try:
        genre_list = ast.literal_eval(genre_string)
    except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
        # Not a Python literal (e.g. contains true/null): try the legacy JSON path.
        try:
            genre_list = json.loads(genre_string.replace("'", "\""))
        except (ValueError, RecursionError):
            return []
    if not isinstance(genre_list, list):
        return []
    # Skip malformed entries rather than failing the whole row.
    return [genre['name'] for genre in genre_list
            if isinstance(genre, dict) and 'name' in genre]

def extract_company_names(company_string):
    """Return the ``'name'`` of every company record in a stringified list of dicts.

    The input is parsed as a Python literal first. The previous approach
    (swap every ``'`` for ``"`` and parse as JSON) corrupts any record whose
    value contains an apostrophe and silently produced ``[]`` for it; it is
    kept only as a fallback for input that is real JSON.

    Parameters
    ----------
    company_string : str or missing value
        Text such as ``"[{'name': 'Pixar Animation Studios', 'id': 3}]"``.

    Returns
    -------
    list
        The names in input order; ``[]`` for missing, non-string or
        unparseable input, or when the parsed value is not a list.
    """
    if not isinstance(company_string, str):
        return []
    try:
        company_list = ast.literal_eval(company_string)
    except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
        # Not a Python literal (e.g. contains true/null): try the legacy JSON path.
        try:
            company_list = json.loads(company_string.replace("'", "\""))
        except (ValueError, RecursionError):
            return []
    if not isinstance(company_list, list):
        return []
    # Skip malformed entries rather than failing the whole row.
    return [company['name'] for company in company_list
            if isinstance(company, dict) and 'name' in company]

def extract_country_names(country_string):
    """Return the ``'name'`` of every country record in a stringified list of dicts.

    The input is parsed as a Python literal first. The previous approach
    (swap every ``'`` for ``"`` and parse as JSON) corrupts any record whose
    value contains an apostrophe and silently produced ``[]`` for it; it is
    kept only as a fallback for input that is real JSON.

    Parameters
    ----------
    country_string : str or missing value
        Text such as
        ``"[{'iso_3166_1': 'US', 'name': 'United States of America'}]"``.

    Returns
    -------
    list
        The names in input order; ``[]`` for missing, non-string or
        unparseable input, or when the parsed value is not a list.
    """
    if not isinstance(country_string, str):
        return []
    try:
        country_data = ast.literal_eval(country_string)
    except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
        # Not a Python literal (e.g. contains true/null): try the legacy JSON path.
        try:
            country_data = json.loads(country_string.replace("'", "\""))
        except (ValueError, RecursionError):
            return []
    if not isinstance(country_data, list):
        return []
    # Skip malformed entries rather than failing the whole row.
    return [country['name'] for country in country_data
            if isinstance(country, dict) and 'name' in country]

def extract_language_codes(language_string):
    """Return the ``'iso_639_1'`` code of every language record in a stringified list.

    The input is parsed as a Python literal first. The previous approach
    (swap every ``'`` for ``"`` and parse as JSON) corrupts any record whose
    value contains an apostrophe and silently produced ``[]`` for it; it is
    kept only as a fallback for input that is real JSON.

    Parameters
    ----------
    language_string : str or missing value
        Text such as ``"[{'iso_639_1': 'en', 'name': 'English'}]"``.

    Returns
    -------
    list
        The codes in input order; ``[]`` for missing, non-string or
        unparseable input, or when the parsed value is not a list.
    """
    if not isinstance(language_string, str):
        return []
    try:
        language_list = ast.literal_eval(language_string)
    except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
        # Not a Python literal (e.g. contains true/null): try the legacy JSON path.
        try:
            language_list = json.loads(language_string.replace("'", "\""))
        except (ValueError, RecursionError):
            return []
    if not isinstance(language_list, list):
        return []
    # Skip malformed entries rather than failing the whole row.
    return [language['iso_639_1'] for language in language_list
            if isinstance(language, dict) and 'iso_639_1' in language]
    
def extract_cast_name(cast_str):
    """Collect the ``'name'`` of each cast record in a stringified list of dicts.

    The text is parsed with ``ast.literal_eval``. If parsing raises
    ``ValueError`` or ``SyntaxError`` the error is printed and ``[]`` is
    returned; any other exception propagates.
    """
    names = []
    try:
        for member in ast.literal_eval(cast_str):
            names.append(member['name'])
    except SyntaxError as err:
        print(f"SyntaxError in cast: {err}")
        return []
    except ValueError as err:
        print(f"ValueError in cast: {err}")
        return []
    return names


def extract_job_name_pairs(crew_str):
    """Collect ``[job, name]`` pairs from a stringified list of crew dicts.

    The text is parsed with ``ast.literal_eval``. If parsing raises
    ``ValueError`` or ``SyntaxError`` the error is printed and ``[]`` is
    returned; any other exception propagates.
    """
    pairs = []
    try:
        for member in ast.literal_eval(crew_str):
            pairs.append([member['job'], member['name']])
    except SyntaxError as err:
        print(f"SyntaxError in crew: {err}")
        return []
    except ValueError as err:
        print(f"ValueError in crew: {err}")
        return []
    return pairs

# (new column, raw source column, parser), applied in this order so the new
# columns are appended in the same sequence as before.
for target, source, extractor in (
    ('genre_list', 'genres', extract_genre_names),
    ('company_list', 'production_companies', extract_company_names),
    ('country_list', 'production_countries', extract_country_names),
    ('language_codes', 'spoken_languages', extract_language_codes),
    ('actor_names', 'cast', extract_cast_name),
    ('job_name_pairs', 'crew', extract_job_name_pairs),
):
    movies[target] = movies[source].apply(extractor)

In [53]:
# Raw string columns, now replaced by the parsed list columns.
columns_to_drop = [
    'genres',
    'production_companies',
    'production_countries',
    'spoken_languages',
    'cast',
    'crew',
]

# Removed in place; errors='ignore' makes re-running this cell harmless.
movies.drop(columns=columns_to_drop, inplace=True, errors='ignore')

movies.head()

Unnamed: 0,budget,original_language,original_title,overview,popularity,release_date,revenue,runtime,title,vote_average,vote_count,bayesian_average,genre_list,company_list,country_list,language_codes,actor_names,job_name_pairs
0,30000000,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",21.946943,1995-10-30,373554033.0,81.0,Toy Story,7.7,5415.0,7.658,"[Animation, Comedy, Family]",[Pixar Animation Studios],[United States of America],[en],"[Tom Hanks, Tim Allen, Don Rickles, Jim Varney...","[[Director, John Lasseter], [Screenplay, Joss ..."
1,65000000,en,Jumanji,When siblings Judy and Peter discover an encha...,17.015539,1995-12-15,262797249.0,104.0,Jumanji,6.9,2413.0,6.844,"[Adventure, Fantasy, Family]","[TriStar Pictures, Teitler Film, Interscope Co...",[United States of America],"[en, fr]","[Robin Williams, Jonathan Hyde, Kirsten Dunst,...","[[Executive Producer, Larry J. Franco], [Scree..."
2,0,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,11.7129,1995-12-22,0.0,101.0,Grumpier Old Men,6.5,92.0,6.022,"[Romance, Comedy]","[Warner Bros., Lancaster Gate]",[United States of America],[en],"[Walter Matthau, Jack Lemmon, Ann-Margret, Sop...","[[Director, Howard Deutch], [Characters, Mark ..."
3,16000000,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",3.859495,1995-12-22,81452156.0,127.0,Waiting to Exhale,6.1,34.0,5.737,"[Comedy, Drama, Romance]",[Twentieth Century Fox Film Corporation],[United States of America],[en],"[Whitney Houston, Angela Bassett, Loretta Devi...","[[Director, Forest Whitaker], [Screenplay, Ron..."
4,0,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,8.387519,1995-02-10,76578911.0,106.0,Father of the Bride Part II,5.7,173.0,5.671,[Comedy],"[Sandollar Productions, Touchstone Pictures]",[United States of America],[en],"[Steve Martin, Diane Keaton, Martin Short, Kim...","[[Original Music Composer, Alan Silvestri], [D..."


# Saving the updated movies dataset

In [54]:
# Write the cleaned frame to disk; index=False leaves the row index out of the file.
# NOTE(review): the list-valued columns presumably end up as text in the CSV and
# need re-parsing on load - confirm against whatever reads movies.csv.
movies.to_csv('movies.csv', index=False)

# Extra stuff ----------- ignore below this line ---------

# Dropping rows with one or more empty values. Also dropping rows in which all list columns are empty.

In [None]:
import ast
def convert_to_list(x):
    """Parse a stringified Python literal; pass non-string values through.

    Parameters
    ----------
    x : object
        A string such as ``"['Drama', 'Comedy']"``, or any other value.

    Returns
    -------
    object
        ``x`` unchanged when it is not a string; otherwise the parsed literal,
        or ``[]`` when the text is not a valid literal. ``ast.literal_eval``
        raises ``SyntaxError`` as well as ``ValueError`` on malformed text, so
        both are handled here.
    """
    if not isinstance(x, str):
        return x
    try:
        return ast.literal_eval(x)
    except (ValueError, SyntaxError):
        return []

# Turn any stringified list columns back into real lists.
# NOTE(review): movies_filtered is assigned in the filtering cell, before
# genre_list / company_list / country_list / language_codes are created (those
# are added to `movies`), so this column lookup looks like it raises KeyError -
# confirm which frame was intended.
# NOTE(review): movies_filtered is itself a boolean-mask selection of
# movies_dropped, so the assignments below may trigger SettingWithCopyWarning.
for col in ['genre_list', 'company_list', 'country_list', 'language_codes']:
    movies_filtered[col] = movies_filtered[col].apply(convert_to_list)

# Count the null cells in each row (temporary helper column).
movies_filtered['missing_values'] = movies_filtered.isna().sum(axis=1)

# Build the two row-level drop conditions.
# The first is True for rows with at least one null cell (>= 1).
condition_more_than_one_missing = movies_filtered['missing_values'] >= 1
# The second is True when all four list columns are empty/falsy for the row;
# row.get falls back to [] when a column is absent.
condition_all_lists_empty = movies_filtered.apply(lambda row: all(not lst for lst in [
    row.get('genre_list', []),
    row.get('company_list', []),
    row.get('country_list', []),
    row.get('language_codes', [])
]), axis=1)

# Drop temporary 'missing_values' column
movies_filtered.drop('missing_values', axis=1, inplace=True)

# A row is dropped when either condition holds.
final_condition = condition_more_than_one_missing | condition_all_lists_empty


# NOTE(review): the first message says "more than one", but the condition it
# reports is "one or more" (>= 1).
print(f"Total number of rows with more than one missing value: {condition_more_than_one_missing.sum()}")
print(f"Number of rows with all four lists empty: {condition_all_lists_empty.sum()}")


# Preprocessing Credits file

In [None]:
# Keep the rows for which final_condition is False and write them to disk
# without the row index.
# NOTE(review): final_condition is defined in the preceding cell, so this cell
# only works after that one has run successfully.
final_dataframe = movies_filtered[~final_condition]
final_dataframe.to_csv('final_processed_movies.csv', index=False)

# ---------------------------------------------------------------------