In [1]:
from ast import literal_eval

import numpy as np
import pandas as pd

## Credits.csv cleaning

In [149]:
# Load the raw credits table (one row per movie, with 'cast' and 'crew' columns)
df = pd.read_csv(filepath_or_buffer='the-movies-dataset/credits.csv')

In [150]:
# 'cast' and 'crew' are stored in the CSV as Python-literal strings; parse them
# into real Python objects. `literal_eval` comes from the standard-library
# `ast` module (imported in the notebook's import cell).
# NOTE: this cell is not re-runnable once the columns hold parsed objects,
# because literal_eval only accepts strings/AST nodes.
df['cast'] = df['cast'].apply(literal_eval)
df['crew'] = df['crew'].apply(literal_eval)

# Record how many entries each movie has before 'cast' is trimmed later on.
df['cast_size'] = df['cast'].apply(len)
df['crew_size'] = df['crew'].apply(len)

In [151]:
def get_director(x):
    """Return the name of the first crew member whose job is 'Director'.

    Parameters
    ----------
    x : iterable of dict
        Crew entries; each entry is indexed with the 'job' key and, when it
        matches, the 'name' key.

    Returns
    -------
    The matching entry's 'name' value, or ``np.nan`` when no entry has the
    job 'Director' (including when ``x`` is empty).
    """
    director_names = (member['name'] for member in x if member['job'] == 'Director')
    # next() stops at the first match, mirroring an early return from a loop.
    return next(director_names, np.nan)

In [152]:
# Look up each movie's director from its parsed crew list
df['director'] = df['crew'].map(get_director)

In [153]:
# Reduce every cast entry to its 'name'; anything that is not a list becomes []
df['cast'] = df['cast'].apply(
    lambda members: [member['name'] for member in members] if isinstance(members, list) else []
)
# Keep at most the first three names (slicing leaves shorter lists unchanged)
df['cast'] = df['cast'].apply(lambda names: names[:3])

In [154]:
# Preview the first five rows of the cleaned credits table
df.head(n=5)

Unnamed: 0,cast,crew,id,cast_size,crew_size,director
0,"[Tom Hanks, Tim Allen, Don Rickles]","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...",862,13,106,John Lasseter
1,"[Robin Williams, Jonathan Hyde, Kirsten Dunst]","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...",8844,26,16,Joe Johnston
2,"[Walter Matthau, Jack Lemmon, Ann-Margret]","[{'credit_id': '52fe466a9251416c75077a89', 'de...",15602,7,4,Howard Deutch
3,"[Whitney Houston, Angela Bassett, Loretta Devine]","[{'credit_id': '52fe44779251416c91011acb', 'de...",31357,10,10,Forest Whitaker
4,"[Steve Martin, Diane Keaton, Martin Short]","[{'credit_id': '52fe44959251416c75039ed7', 'de...",11862,12,7,Charles Shyer


# Merge Data

## Movie metadata

In [103]:
# Load the movie metadata and keep only the columns needed downstream
movie_metadata = pd.read_csv('the-movies-dataset/movies_metadata.csv')[[
    'genres',
    'id',
    'imdb_id',
    'overview',
    'popularity',
    'production_companies',
    'production_countries',
    'release_date',
    'revenue',
    'runtime',
    'spoken_languages',
    'status',
    'title',
    'vote_average',
    'vote_count',
]]


In [104]:
# Use 'tmdb_id' as the key column name so it matches the other tables at merge time
movie_metadata = movie_metadata.rename(columns={'id': 'tmdb_id'})

In [105]:
# Load the credits table and expose its key as 'tmdb_id'
credits = (
    pd.read_csv('the-movies-dataset/credits.csv')
    .rename(columns={'id': 'tmdb_id'})
)
# Convert the key to str before it is used as the join key
# (presumably to match the dtype of movie_metadata['tmdb_id'] - TODO confirm)
credits['tmdb_id'] = credits['tmdb_id'].apply(str)

In [106]:
# Left-join credits onto the movie metadata, then left-join keywords onto that,
# always matching on 'tmdb_id'.
# NOTE(review): a left join repeats a metadata row for every matching row on the
# right, so duplicated tmdb_id values in credits/keywords would inflate the
# result - TODO confirm the keys are unique.
result1 = movie_metadata.merge(credits, how='left', on='tmdb_id')

keywords = (
    pd.read_csv('the-movies-dataset/keywords.csv')
    .rename(columns={'id': 'tmdb_id'})
)
keywords['tmdb_id'] = keywords['tmdb_id'].apply(str)

result2 = result1.merge(keywords, how='left', on='tmdb_id')

In [107]:
# Keep only the movies whose status is exactly 'Released'
cleaned_data = result2.loc[result2['status'].eq('Released')]

In [108]:
# Replace every missing value with a single space character
# NOTE(review): this applies to all columns, so a numeric column that contains
# NaN will hold the string ' ' afterwards - confirm downstream code expects that.
cleaned_data = cleaned_data.fillna(value=' ')

In [112]:
# Preview the first five rows of the merged, filtered table
cleaned_data.head(n=5)

Unnamed: 0,genres,tmdb_id,imdb_id,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,title,vote_average,vote_count,cast,crew,keywords
0,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",862,tt0114709,"Led by Woody, Andy's toys live happily in his ...",21.9469,"[{'name': 'Pixar Animation Studios', 'id': 3}]","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-10-30,373554033.0,81,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Toy Story,7.7,5415.0,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...","[{'id': 931, 'name': 'jealousy'}, {'id': 4290,..."
1,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",8844,tt0113497,When siblings Judy and Peter discover an encha...,17.0155,"[{'name': 'TriStar Pictures', 'id': 559}, {'na...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-15,262797249.0,104,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Jumanji,6.9,2413.0,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...","[{'id': 10090, 'name': 'board game'}, {'id': 1..."
2,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",15602,tt0113228,A family wedding reignites the ancient feud be...,11.7129,"[{'name': 'Warner Bros.', 'id': 6194}, {'name'...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-22,0.0,101,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Grumpier Old Men,6.5,92.0,"[{'cast_id': 2, 'character': 'Max Goldman', 'c...","[{'credit_id': '52fe466a9251416c75077a89', 'de...","[{'id': 1495, 'name': 'fishing'}, {'id': 12392..."
3,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",31357,tt0114885,"Cheated on, mistreated and stepped on, the wom...",3.85949,[{'name': 'Twentieth Century Fox Film Corporat...,"[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-22,81452156.0,127,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Waiting to Exhale,6.1,34.0,"[{'cast_id': 1, 'character': ""Savannah 'Vannah...","[{'credit_id': '52fe44779251416c91011acb', 'de...","[{'id': 818, 'name': 'based on novel'}, {'id':..."
4,"[{'id': 35, 'name': 'Comedy'}]",11862,tt0113041,Just when George Banks has recovered from his ...,8.38752,"[{'name': 'Sandollar Productions', 'id': 5842}...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-02-10,76578911.0,106,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Father of the Bride Part II,5.7,173.0,"[{'cast_id': 1, 'character': 'George Banks', '...","[{'credit_id': '52fe44959251416c75039ed7', 'de...","[{'id': 1009, 'name': 'baby'}, {'id': 1599, 'n..."


## User rating data

In [128]:
# Attach tags to ratings: a tag is joined only when userId, movieId and
# timestamp all match the rating row; ratings without a match keep NaN tags.
# NOTE(review): ratings are read from 'the-movies-dataset' and tags from
# 'ml-25m', and the join requires identical timestamps - confirm both the
# dataset pairing and the exact-timestamp match are intended.
ratings = pd.read_csv('the-movies-dataset/ratings.csv')
tags = pd.read_csv('ml-25m/tags.csv')
user_data = ratings.merge(tags, how='left', on=['userId', 'movieId', 'timestamp'])