In [18]:
import pandas as pd
import numpy as np
import os
import json
import seaborn as sns
from matplotlib import pyplot as plt
import random 

##### Load required datasets

In [19]:
# Read the four raw CSV inputs: the TMDB credits and movies tables, plus the
# ratings and links tables stored under The_Movies.
credits = pd.read_csv("../datasets/TMDB/tmdb_5000_credits.csv")
movies = pd.read_csv("../datasets/TMDB/tmdb_5000_movies.csv")
ratings = pd.read_csv("../datasets/The_Movies/ratings_small.csv")
links = pd.read_csv("../datasets/The_Movies/links_small.csv")

# Keep independent snapshots of the raw frames. A plain assignment would only
# create a second name for the same object, so any in-place edit of the working
# frame would also show up through the "unchanged" name; .copy() prevents that.
movies_unchanged = movies.copy()
credits_unchanged = credits.copy()
links_unchanged = links.copy()
ratings_unchanged = ratings.copy()

In [20]:
# Preview the first two rows of the freshly loaded movies table.
movies.head(n=2)

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800
1,300000000,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",http://disney.go.com/disneypictures/pirates/,285,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2007-05-19,961000000,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500


In [21]:
# Narrow the movies frame to the columns of interest; all others are dropped.
selected_columns = [
    'genres', 'id', 'keywords', 'title',
    'overview', 'popularity', 'vote_average', 'vote_count',
]
movies = movies[selected_columns]
movies.head(2)

Unnamed: 0,genres,id,keywords,title,overview,popularity,vote_average,vote_count
0,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,7.2,11800
1,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",285,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,6.9,4500


##### Rename the movie id columns, since they are named differently across the datasets. The TMDB id is called `tmdbId` everywhere.

In [22]:

# Give the movie key the same name, tmdbId, in both frames.
movies = movies.rename(columns={"id": "tmdbId"})

# errors="ignore" turns the drop into a no-op when 'title' is not present,
# which keeps this cell safe to re-run.
credits = (
    credits
    .drop(columns=["title"], errors="ignore")
    .rename(columns={"movie_id": "tmdbId"})
)
credits.head()

Unnamed: 0,tmdbId,cast,crew
0,19995,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,285,"[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."
2,206647,"[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de..."
3,49026,"[{""cast_id"": 2, ""character"": ""Bruce Wayne / Ba...","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de..."
4,49529,"[{""cast_id"": 5, ""character"": ""John Carter"", ""c...","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de..."


##### Combine movies and credits to single dataset & check if duplicates exist

In [23]:

# Report how many tmdbId values are repeated in each frame before joining.
print("Duplicate movie id's conunt in movies df: ",movies.tmdbId.duplicated().sum())
print("Duplicate movie id's conunt in credits df: ",credits.tmdbId.duplicated().sum())

# Inner join on the shared key. validate="one_to_one" enforces what the two
# counts above only print: pandas raises MergeError if tmdbId is repeated on
# either side, instead of silently multiplying rows.
movies = movies.merge(credits, on="tmdbId", how="inner", validate="one_to_one")
movies.head(2)

Duplicate movie id's conunt in movies df:  0
Duplicate movie id's conunt in credits df:  0


Unnamed: 0,genres,tmdbId,keywords,title,overview,popularity,vote_average,vote_count,cast,crew
0,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,7.2,11800,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",285,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,6.9,4500,"[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."


##### Check if null values exist

In [24]:
# Per-column count of missing entries.
movies.isna().sum()

genres          0
tmdbId          0
keywords        0
title           0
overview        3
popularity      0
vote_average    0
vote_count      0
cast            0
crew            0
dtype: int64

##### Fill all null values with an empty string

In [25]:
# Replace every missing entry with "" and re-count to confirm none remain.
movies = movies.fillna(value="")
movies.isna().sum()

genres          0
tmdbId          0
keywords        0
title           0
overview        0
popularity      0
vote_average    0
vote_count      0
cast            0
crew            0
dtype: int64

##### check for duplicate records

In [26]:
# Count rows that exactly repeat an earlier row.
duplicate_row_count = movies.duplicated().sum()
print("duplicated records", duplicate_row_count)

duplicated records 0


##### Genres data is represented in JSON format. Re-format the column values to list

In [27]:
# Convert the JSON-encoded genres strings into plain lists of genre names.
# The isinstance guard makes the cell a no-op when re-run after conversion.
# .iloc[0] is positional, so the guard does not depend on a row labelled 0,
# and the length check avoids indexing into an empty frame.
if len(movies) > 0 and isinstance(movies['genres'].iloc[0], str):
    # Keep at most the first five names per movie.
    genreList = [
        [entry['name'] for entry in json.loads(raw_genres)][:5]
        for raw_genres in movies['genres']
    ]
    # Dropping and re-adding moves the column to the end of the frame.
    movies = movies.drop(columns=['genres'])
    # Reuse the frame's own index: column assignment aligns a Series on labels,
    # so a fresh 0..n-1 index would misplace values if the index were not default.
    movies['genres'] = pd.Series(genreList, index=movies.index)
movies.genres.head(2)

0    [Action, Adventure, Fantasy, Science Fiction]
1                     [Adventure, Fantasy, Action]
Name: genres, dtype: object

##### Cast data is represented in JSON format. Re-format the column values to list

In [28]:
# Convert the JSON-encoded cast strings into plain lists of cast member names.
# The isinstance guard makes the cell a no-op when re-run after conversion.
# .iloc[0] is positional, so the guard does not depend on a row labelled 0,
# and the length check avoids indexing into an empty frame.
if len(movies) > 0 and isinstance(movies['cast'].iloc[0], str):
    # Keep at most the first five names per movie.
    castList = [
        [entry['name'] for entry in json.loads(raw_cast)][:5]
        for raw_cast in movies['cast']
    ]
    # Dropping and re-adding moves the column to the end of the frame.
    movies = movies.drop(columns=['cast'])
    # Reuse the frame's own index: column assignment aligns a Series on labels,
    # so a fresh 0..n-1 index would misplace values if the index were not default.
    movies['cast'] = pd.Series(castList, index=movies.index)
movies.cast.head(2)

0    [Sam Worthington, Zoe Saldana, Sigourney Weave...
1    [Johnny Depp, Orlando Bloom, Keira Knightley, ...
Name: cast, dtype: object

##### Sample list of crew members' job roles in a movie

In [29]:
# Parse the crew JSON of the row labelled 0 in the raw credits table and show
# the distinct values of its 'job' field.
first_movie_crew = json.loads(credits_unchanged.crew[0])
jobs = [member['job'] for member in first_movie_crew]
print(set(jobs))

{'Conceptual Design', 'Best Boy Electric', 'Lighting Artist', 'Sound Designer', 'Art Direction', 'Writer', 'Sound Effects Editor', 'Makeup Artist', 'Visual Effects Art Director', 'Lighting Technician', 'Digital Intermediate', 'Costume Design', 'Music Editor', 'Dialogue Editor', 'Post Production Supervisor', 'Casting', 'Special Effects Coordinator', 'Publicist', 'Dialect Coach', 'Visual Effects Editor', 'Producer', 'Set Decoration', 'Motion Capture Artist', 'Supervising Art Director', 'Stunts', 'Art Department Manager', 'Sound Re-Recording Mixer', 'Set Costumer', 'Art Department Coordinator', 'Still Photographer', 'Executive Producer', 'Foley', 'Director of Photography', 'Construction Coordinator', 'Visual Effects Producer', 'Original Music Composer', 'Choreographer', 'CG Supervisor', 'Hairstylist', 'Set Designer', 'Modeling', 'Director', 'Transportation Coordinator', 'Animation Director', 'Steadicam Operator', 'Costume Supervisor', 'Camera Operator', 'Production Manager', 'Screenplay',

##### Crew data is also stored as JSON and covers many different job roles. Keep only the Director role, which is used for prediction.

In [30]:
# Replace the JSON-encoded crew column with a 'directors' column holding the
# names of crew entries whose job is exactly 'Director'.
# The guards make the cell a no-op when re-run: 'crew' is gone after the first
# run. .iloc[0] is positional, so the check does not depend on a row labelled 0,
# and the length check avoids indexing into an empty frame.
if 'crew' in movies and len(movies) > 0 and isinstance(movies['crew'].iloc[0], str):
    # Keep at most the first five director names per movie.
    directorList = [
        [
            member['name']
            for member in json.loads(raw_crew)
            if member['job'] == 'Director'
        ][:5]
        for raw_crew in movies['crew']
    ]
    movies = movies.drop(columns=['crew'])
    # Reuse the frame's own index: column assignment aligns a Series on labels,
    # so a fresh 0..n-1 index would misplace values if the index were not default.
    movies['directors'] = pd.Series(directorList, index=movies.index)
movies['directors']

0                                [James Cameron]
1                               [Gore Verbinski]
2                                   [Sam Mendes]
3                            [Christopher Nolan]
4                               [Andrew Stanton]
                          ...                   
4798                          [Robert Rodriguez]
4799                              [Edward Burns]
4800                               [Scott Smith]
4801                               [Daniel Hsia]
4802    [Brian Herzlinger, Jon Gunn, Brett Winn]
Name: directors, Length: 4803, dtype: object

##### Similarly clean keywords column data

In [31]:
# Convert the JSON-encoded keywords strings into plain lists of keyword names.
# The isinstance guard makes the cell a no-op when re-run after conversion.
# .iloc[0] is positional, so the guard does not depend on a row labelled 0,
# and the length check avoids indexing into an empty frame.
if len(movies) > 0 and isinstance(movies['keywords'].iloc[0], str):
    # Keep at most the first five keywords per movie.
    keywordsList = [
        [entry['name'] for entry in json.loads(raw_keywords)][:5]
        for raw_keywords in movies['keywords']
    ]
    # Dropping and re-adding moves the column to the end of the frame.
    movies = movies.drop(columns=['keywords'])
    # Reuse the frame's own index: column assignment aligns a Series on labels,
    # so a fresh 0..n-1 index would misplace values if the index were not default.
    movies['keywords'] = pd.Series(keywordsList, index=movies.index)
movies.keywords.head(2)

0    [culture clash, future, space war, space colon...
1    [ocean, drug abuse, exotic island, east india ...
Name: keywords, dtype: object

##### Function to join multi-word names into single tokens (e.g. "James Cameron" → "james_cameron"), so that a shared first or last name alone is not treated as a common feature

In [32]:
def formatData(column):
    """Normalise every token of a list-valued column in the global ``movies`` frame.

    Each string in each row's list has its spaces replaced by underscores and
    is then lower-cased, so a multi-word name becomes one token. The column on
    ``movies`` is overwritten with the result; nothing is returned.

    column: name of the ``movies`` column to rewrite. Every row of it must hold
        an iterable of strings.
    """
    normalised_rows = []
    for tokens in movies[column]:
        normalised_rows.append(
            [token.replace(" ", "_").lower() for token in tokens]
        )
    movies[column] = normalised_rows

In [33]:
# Normalise the tokens of each list-valued feature column, in the same order
# as before, then preview the result.
for feature_column in ('keywords', 'cast', 'directors', 'genres'):
    formatData(feature_column)
movies.head(2)

Unnamed: 0,tmdbId,title,overview,popularity,vote_average,vote_count,genres,cast,directors,keywords
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,7.2,11800,"[action, adventure, fantasy, science_fiction]","[sam_worthington, zoe_saldana, sigourney_weave...",[james_cameron],"[culture_clash, future, space_war, space_colon..."
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,6.9,4500,"[adventure, fantasy, action]","[johnny_depp, orlando_bloom, keira_knightley, ...",[gore_verbinski],"[ocean, drug_abuse, exotic_island, east_india_..."


#### Combine all the extracted features into a single column, which is used for prediction.

In [34]:
# Adding the list-valued columns concatenates the lists row by row, giving one
# token list per movie in the order keywords, cast, directors, genres.
combined_tokens = movies['keywords'] + movies['cast'] + movies['directors'] + movies['genres']
# Flatten each token list into a single space-separated string.
movies['content'] = combined_tokens.map(' '.join)
movies.loc[0, 'content']

'culture_clash future space_war space_colony society sam_worthington zoe_saldana sigourney_weaver stephen_lang michelle_rodriguez james_cameron action adventure fantasy science_fiction'