# Clean and Extract Keywords

In [7]:
import pandas as pd

# Load the raw keywords table; the path is resolved relative to the
# notebook's working directory.
key_df = pd.read_csv('../Dataset/keywords.csv')
# Show (rows, columns) as a quick sanity check on the load
key_df.shape

(46419, 2)

In [8]:
# Preview the first rows to see the raw format of the 'keywords' column
key_df.head()

Unnamed: 0,id,keywords
0,862,"[{'id': 931, 'name': 'jealousy'}, {'id': 4290,..."
1,8844,"[{'id': 10090, 'name': 'board game'}, {'id': 1..."
2,15602,"[{'id': 1495, 'name': 'fishing'}, {'id': 12392..."
3,31357,"[{'id': 818, 'name': 'based on novel'}, {'id':..."
4,11862,"[{'id': 1009, 'name': 'baby'}, {'id': 1599, 'n..."


In [9]:
import ast

# Function to clean and extract keyword names
def extract_keyword_names(keyword_str):
    """Return the ``'name'`` values from a stringified list of keyword dicts.

    The expected input is the text of a Python literal such as
    ``"[{'id': 931, 'name': 'jealousy'}, {'id': 4290, 'name': 'toy'}]"``.

    Args:
        keyword_str: Text to parse with ``ast.literal_eval``.

    Returns:
        list: The ``'name'`` value of every dict entry that has one, in the
        original order. An empty list is returned when the input cannot be
        parsed or does not evaluate to a list/tuple.
    """
    try:
        # literal_eval only accepts Python literals, so no code is executed
        keywords_list = ast.literal_eval(keyword_str)
    except (ValueError, SyntaxError, TypeError):
        # Unparseable input (including non-string values such as NaN)
        return []

    # A valid literal that is not a sequence (e.g. "None" or "5") carries
    # no keyword entries; treat it like unparseable input instead of crashing.
    if not isinstance(keywords_list, (list, tuple)):
        return []

    # Skip entries that are not dicts or that lack a 'name' key rather than
    # raising and aborting the whole column transformation.
    return [
        item['name']
        for item in keywords_list
        if isinstance(item, dict) and 'name' in item
    ]

# Parse every raw 'keywords' entry into a list of keyword names
key_df['keywords'] = key_df['keywords'].map(extract_keyword_names)
key_df

Unnamed: 0,id,keywords
0,862,"[jealousy, toy, boy, friendship, friends, riva..."
1,8844,"[board game, disappearance, based on children'..."
2,15602,"[fishing, best friend, duringcreditsstinger, o..."
3,31357,"[based on novel, interracial relationship, sin..."
4,11862,"[baby, midlife crisis, confidence, aging, daug..."
...,...,...
46414,439050,[tragic love]
46415,111109,"[artist, play, pinoy]"
46416,67758,[]
46417,227506,[]


In [10]:
# Collapse each list of keyword names into one space-separated string
key_df['keywords'] = key_df['keywords'].map(' '.join)
key_df

Unnamed: 0,id,keywords
0,862,jealousy toy boy friendship friends rivalry bo...
1,8844,board game disappearance based on children's b...
2,15602,fishing best friend duringcreditsstinger old men
3,31357,based on novel interracial relationship single...
4,11862,baby midlife crisis confidence aging daughter ...
...,...,...
46414,439050,tragic love
46415,111109,artist play pinoy
46416,67758,
46417,227506,


In [11]:
# Count missing values per column.
# Note: rows without keywords now hold '' (an empty string) after the join
# step, and isna() does not count empty strings as missing.
key_df.isna().sum()

id          0
keywords    0
dtype: int64

# Clean and Extract Metadata

In [12]:
import pandas as pd

# Load the movie metadata. low_memory=False makes pandas parse the file in
# one pass instead of in chunks, which avoids mixed-type inference per column.
meta_df = pd.read_csv('../Dataset/movies_metadata.csv',low_memory=False)
# Show (rows, columns) as a quick sanity check on the load
meta_df.shape

(45466, 24)

In [13]:
# Preview the first rows to see which columns are available
meta_df.head()

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0


In [14]:
# Keep only the join key and the descriptive columns used later on
meta_df = meta_df.loc[:, ['id', 'title', 'genres', 'overview']]
# Per-column count of missing values in the reduced frame
meta_df.isna().sum()

id            0
title         6
genres        0
overview    954
dtype: int64

In [15]:
# Drop every row that has a missing value in any of the kept columns.
# The result is rebound instead of using inplace=True: meta_df was derived
# from another frame by column selection, and mutating such a frame in place
# is what pandas' chained-assignment warnings guard against.
meta_df = meta_df.dropna()
meta_df.head()

Unnamed: 0,id,title,genres,overview
0,862,Toy Story,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...","Led by Woody, Andy's toys live happily in his ..."
1,8844,Jumanji,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",When siblings Judy and Peter discover an encha...
2,15602,Grumpier Old Men,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",A family wedding reignites the ancient feud be...
3,31357,Waiting to Exhale,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...","Cheated on, mistreated and stepped on, the wom..."
4,11862,Father of the Bride Part II,"[{'id': 35, 'name': 'Comedy'}]",Just when George Banks has recovered from his ...


In [16]:
import ast

# Function to clean and extract genre names
def extract_genres_names(keyword_str):
    """Return the ``'name'`` values from a stringified list of genre dicts.

    The expected input is the text of a Python literal such as
    ``"[{'id': 16, 'name': 'Animation'}, {'id': 35, 'name': 'Comedy'}]"``.

    Args:
        keyword_str: Text to parse with ``ast.literal_eval``. The parameter
            keeps its original name so existing callers are unaffected.

    Returns:
        list: The ``'name'`` value of every dict entry that has one, in the
        original order. An empty list is returned when the input cannot be
        parsed or does not evaluate to a list/tuple.
    """
    try:
        # literal_eval only accepts Python literals, so no code is executed
        parsed = ast.literal_eval(keyword_str)
    except (ValueError, SyntaxError, TypeError):
        # Unparseable input (including non-string values such as NaN)
        return []

    # A valid literal that is not a sequence (e.g. "None" or "5") carries
    # no genre entries; treat it like unparseable input instead of crashing.
    if not isinstance(parsed, (list, tuple)):
        return []

    names = []
    for entry in parsed:
        # Skip malformed entries instead of raising KeyError/TypeError
        if isinstance(entry, dict) and 'name' in entry:
            names.append(entry['name'])
    return names

# Parse every raw 'genres' entry into a list of genre names
meta_df['genres'] = meta_df['genres'].map(extract_genres_names)
meta_df.head()

Unnamed: 0,id,title,genres,overview
0,862,Toy Story,"[Animation, Comedy, Family]","Led by Woody, Andy's toys live happily in his ..."
1,8844,Jumanji,"[Adventure, Fantasy, Family]",When siblings Judy and Peter discover an encha...
2,15602,Grumpier Old Men,"[Romance, Comedy]",A family wedding reignites the ancient feud be...
3,31357,Waiting to Exhale,"[Comedy, Drama, Romance]","Cheated on, mistreated and stepped on, the wom..."
4,11862,Father of the Bride Part II,[Comedy],Just when George Banks has recovered from his ...


In [17]:
# Collapse each list of genre names into one space-separated string
meta_df['genres'] = meta_df['genres'].map(' '.join)
meta_df.head()

Unnamed: 0,id,title,genres,overview
0,862,Toy Story,Animation Comedy Family,"Led by Woody, Andy's toys live happily in his ..."
1,8844,Jumanji,Adventure Fantasy Family,When siblings Judy and Peter discover an encha...
2,15602,Grumpier Old Men,Romance Comedy,A family wedding reignites the ancient feud be...
3,31357,Waiting to Exhale,Comedy Drama Romance,"Cheated on, mistreated and stepped on, the wom..."
4,11862,Father of the Bride Part II,Comedy,Just when George Banks has recovered from his ...


# Combine Keywords and Metadata

In [18]:
# Cast both join keys to str so the two 'id' columns share one dtype
key_df['id'] = key_df['id'].astype(str)
meta_df['id'] = meta_df['id'].astype(str)

# Inner join: keep only the ids present in both frames.
# NOTE(review): keywords.csv loaded with more rows (46419) than
# movies_metadata.csv (45466), so some ids may repeat; repeated ids would
# multiply rows in this join. Confirm whether duplicates should be dropped.
combined_df = key_df.merge(meta_df, how='inner', on='id')
combined_df.head()

Unnamed: 0,id,keywords,title,genres,overview
0,862,jealousy toy boy friendship friends rivalry bo...,Toy Story,Animation Comedy Family,"Led by Woody, Andy's toys live happily in his ..."
1,8844,board game disappearance based on children's b...,Jumanji,Adventure Fantasy Family,When siblings Judy and Peter discover an encha...
2,15602,fishing best friend duringcreditsstinger old men,Grumpier Old Men,Romance Comedy,A family wedding reignites the ancient feud be...
3,31357,based on novel interracial relationship single...,Waiting to Exhale,Comedy Drama Romance,"Cheated on, mistreated and stepped on, the wom..."
4,11862,baby midlife crisis confidence aging daughter ...,Father of the Bride Part II,Comedy,Just when George Banks has recovered from his ...


In [19]:
# Count missing values per column in the merged frame.
# Empty strings (e.g. movies without keywords) are not counted by isna().
combined_df.isna().sum()

id          0
keywords    0
title       0
genres      0
overview    0
dtype: int64

In [20]:
# Persist the merged table without the row index.
# NOTE(review): values that are empty strings are written as empty fields,
# which pd.read_csv parses as NaN by default; a consumer reloading this file
# may need fillna('') or keep_default_na=False. Confirm downstream handling.
combined_df.to_csv('../Dataset/combined.csv', index=False)