In [39]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

In [40]:
credits = pd.read_csv(r"tmdb_5000_credits.csv")
movies = pd.read_csv(r"tmdb_5000_movies.csv")


In [41]:
credits.head(1)

Unnamed: 0,movie_id,title,cast,crew
0,19995,Avatar,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."


In [42]:
credits.shape

(4803, 4)

In [43]:
credits.head(1)['crew'].values

array(['[{"credit_id": "52fe48009251416c750aca23", "department": "Editing", "gender": 0, "id": 1721, "job": "Editor", "name": "Stephen E. Rivkin"}, {"credit_id": "539c47ecc3a36810e3001f87", "department": "Art", "gender": 2, "id": 496, "job": "Production Design", "name": "Rick Carter"}, {"credit_id": "54491c89c3a3680fb4001cf7", "department": "Sound", "gender": 0, "id": 900, "job": "Sound Designer", "name": "Christopher Boyes"}, {"credit_id": "54491cb70e0a267480001bd0", "department": "Sound", "gender": 0, "id": 900, "job": "Supervising Sound Editor", "name": "Christopher Boyes"}, {"credit_id": "539c4a4cc3a36810c9002101", "department": "Production", "gender": 1, "id": 1262, "job": "Casting", "name": "Mali Finn"}, {"credit_id": "5544ee3b925141499f0008fc", "department": "Sound", "gender": 2, "id": 1729, "job": "Original Music Composer", "name": "James Horner"}, {"credit_id": "52fe48009251416c750ac9c3", "department": "Directing", "gender": 2, "id": 2710, "job": "Director", "name": "James Cam

In [44]:
movies.head(1)

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800


In [45]:
movies.shape

(4803, 20)

Now we are merging both datasets. We can merge them using either the Movie ID or the Movie Title, as both tables contain these two common columns.

In [46]:
movies = movies.merge(credits, on='title')

In [47]:
movies.merge(credits, on='title').shape

(4821, 26)

In [48]:
movies.head(1)

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,...,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,movie_id,cast,crew
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...",...,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800,19995,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."


**Here, we remove the columns that are not required for analysis. The dataset has 23 columns, but since we are building a content-based recommender system, we only keep the columns that help us create tags for each movie**

In [49]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4809 entries, 0 to 4808
Data columns (total 23 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   budget                4809 non-null   int64  
 1   genres                4809 non-null   object 
 2   homepage              1713 non-null   object 
 3   id                    4809 non-null   int64  
 4   keywords              4809 non-null   object 
 5   original_language     4809 non-null   object 
 6   original_title        4809 non-null   object 
 7   overview              4806 non-null   object 
 8   popularity            4809 non-null   float64
 9   production_companies  4809 non-null   object 
 10  production_countries  4809 non-null   object 
 11  release_date          4808 non-null   object 
 12  revenue               4809 non-null   int64  
 13  runtime               4807 non-null   float64
 14  spoken_languages      4809 non-null   object 
 15  status               

In [50]:
# genres
# id
# keywords
# title
# overview
# cast
# crew

movies = movies[['movie_id','title', 'overview','genres', 'keywords', 'cast', 'crew' ]]

We keep the columns genres, id, keywords, title, overview, cast, and crew because these columns contain textual and descriptive information that helps us create tags for each movie.

In [51]:
movies.head()

# This is the dataframe where we have to work

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...","[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...","[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...","[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...","[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."
2,206647,Spectre,A cryptic message from Bond’s past sends him o...,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...","[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de..."
3,49026,The Dark Knight Rises,Following the death of District Attorney Harve...,"[{""id"": 28, ""name"": ""Action""}, {""id"": 80, ""nam...","[{""id"": 849, ""name"": ""dc comics""}, {""id"": 853,...","[{""cast_id"": 2, ""character"": ""Bruce Wayne / Ba...","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de..."
4,49529,John Carter,"John Carter is a war-weary, former military ca...","[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 818, ""name"": ""based on novel""}, {""id"":...","[{""cast_id"": 5, ""character"": ""John Carter"", ""c...","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de..."


Now, we create a new dataframe that contains only three columns: movie_id, title, and tags. In the tags column, we combine information from overview, genres, keywords, cast, and crew 

In [52]:
movies.isnull().sum()

movie_id    0
title       0
overview    3
genres      0
keywords    0
cast        0
crew        0
dtype: int64

In [53]:
movies.dropna(inplace=True)

In [54]:
movies.isnull().sum()

movie_id    0
title       0
overview    0
genres      0
keywords    0
cast        0
crew        0
dtype: int64

In [55]:
movies.duplicated().sum()

np.int64(0)

In [56]:
movies.iloc[0].genres

'[{"id": 28, "name": "Action"}, {"id": 12, "name": "Adventure"}, {"id": 14, "name": "Fantasy"}, {"id": 878, "name": "Science Fiction"}]'

Here we do preprocessing. The genres column is in dictionary format, so we convert it into a simple list of words like action, adventure, fantasy, and science fiction. This makes the data easy to use for creating tags

In [57]:
import ast
ast.literal_eval

<function ast.literal_eval(node_or_string)>

## We import ast to convert string data into a Python-readable format like list or dictionary

In [58]:
def convert(obj):
    L = []
    for i in ast.literal_eval(obj):
        L.append(i['name'])
    return L

In [59]:
# convert([{"id": 28, "name": "Action"}, {"id": 12, "name": "Adventure"}, {"id": 14, "name": "Fantasy"}, {"id": 878, "name": "Science Fiction"}])

In [60]:
movies['genres'].apply(convert)

0       [Action, Adventure, Fantasy, Science Fiction]
1                        [Adventure, Fantasy, Action]
2                          [Action, Adventure, Crime]
3                    [Action, Crime, Drama, Thriller]
4                [Action, Adventure, Science Fiction]
                            ...                      
4804                        [Action, Crime, Thriller]
4805                                [Comedy, Romance]
4806               [Comedy, Drama, Romance, TV Movie]
4807                                               []
4808                                    [Documentary]
Name: genres, Length: 4806, dtype: object

In [61]:
movies['genres'] = movies['genres'].apply(convert)

In [62]:
movies.head()

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[Action, Adventure, Fantasy, Science Fiction]","[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...","[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...","[Adventure, Fantasy, Action]","[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...","[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."
2,206647,Spectre,A cryptic message from Bond’s past sends him o...,"[Action, Adventure, Crime]","[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...","[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de..."
3,49026,The Dark Knight Rises,Following the death of District Attorney Harve...,"[Action, Crime, Drama, Thriller]","[{""id"": 849, ""name"": ""dc comics""}, {""id"": 853,...","[{""cast_id"": 2, ""character"": ""Bruce Wayne / Ba...","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de..."
4,49529,John Carter,"John Carter is a war-weary, former military ca...","[Action, Adventure, Science Fiction]","[{""id"": 818, ""name"": ""based on novel""}, {""id"":...","[{""cast_id"": 5, ""character"": ""John Carter"", ""c...","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de..."


In [63]:
movies['keywords'].apply(convert)

0       [culture clash, future, space war, space colon...
1       [ocean, drug abuse, exotic island, east india ...
2       [spy, based on novel, secret agent, sequel, mi...
3       [dc comics, crime fighter, terrorist, secret i...
4       [based on novel, mars, medallion, space travel...
                              ...                        
4804    [united states–mexico barrier, legs, arms, pap...
4805                                                   []
4806    [date, love at first sight, narration, investi...
4807                                                   []
4808            [obsession, camcorder, crush, dream girl]
Name: keywords, Length: 4806, dtype: object

In [64]:
movies['keywords'] = movies['keywords'].apply(convert)

In [65]:
def convert3(obj):
    L = []
    counter = 0
    for i in obj:
        if counter < 3:
            L.append(i)
            counter += 1
        else:
            break
    return L


In [66]:
movies['cast'].apply(convert3)

0       [[, {, "]
1       [[, {, "]
2       [[, {, "]
3       [[, {, "]
4       [[, {, "]
          ...    
4804    [[, {, "]
4805    [[, {, "]
4806    [[, {, "]
4807    [[, {, "]
4808    [[, {, "]
Name: cast, Length: 4806, dtype: object

In [67]:
movies['cast'] = movies['cast'].apply(convert3)

In [68]:
def fetch_director(crew):
    if isinstance(crew, list):
        for person in crew:
            if person == 'James Cameron':  # example
                return [person]
        return []
    return []




In [69]:
movies['crew'].apply(fetch_director)

0       []
1       []
2       []
3       []
4       []
        ..
4804    []
4805    []
4806    []
4807    []
4808    []
Name: crew, Length: 4806, dtype: object

In [70]:
movies['crew'].apply(fetch_director)

0       []
1       []
2       []
3       []
4       []
        ..
4804    []
4805    []
4806    []
4807    []
4808    []
Name: crew, Length: 4806, dtype: object

In [71]:
movies['crew'] = movies['crew'].apply(convert)

In [72]:
movies.head()

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colon...","[[, {, ""]","[Stephen E. Rivkin, Rick Carter, Christopher B..."
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...","[Adventure, Fantasy, Action]","[ocean, drug abuse, exotic island, east india ...","[[, {, ""]","[Dariusz Wolski, Gore Verbinski, Jerry Bruckhe..."
2,206647,Spectre,A cryptic message from Bond’s past sends him o...,"[Action, Adventure, Crime]","[spy, based on novel, secret agent, sequel, mi...","[[, {, ""]","[Thomas Newman, Sam Mendes, Anna Pinnock, John..."
3,49026,The Dark Knight Rises,Following the death of District Attorney Harve...,"[Action, Crime, Drama, Thriller]","[dc comics, crime fighter, terrorist, secret i...","[[, {, ""]","[Hans Zimmer, Charles Roven, Christopher Nolan..."
4,49529,John Carter,"John Carter is a war-weary, former military ca...","[Action, Adventure, Science Fiction]","[based on novel, mars, medallion, space travel...","[[, {, ""]","[Andrew Stanton, Andrew Stanton, John Lasseter..."


In [73]:
movies.head()

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colon...","[[, {, ""]","[Stephen E. Rivkin, Rick Carter, Christopher B..."
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...","[Adventure, Fantasy, Action]","[ocean, drug abuse, exotic island, east india ...","[[, {, ""]","[Dariusz Wolski, Gore Verbinski, Jerry Bruckhe..."
2,206647,Spectre,A cryptic message from Bond’s past sends him o...,"[Action, Adventure, Crime]","[spy, based on novel, secret agent, sequel, mi...","[[, {, ""]","[Thomas Newman, Sam Mendes, Anna Pinnock, John..."
3,49026,The Dark Knight Rises,Following the death of District Attorney Harve...,"[Action, Crime, Drama, Thriller]","[dc comics, crime fighter, terrorist, secret i...","[[, {, ""]","[Hans Zimmer, Charles Roven, Christopher Nolan..."
4,49529,John Carter,"John Carter is a war-weary, former military ca...","[Action, Adventure, Science Fiction]","[based on novel, mars, medallion, space travel...","[[, {, ""]","[Andrew Stanton, Andrew Stanton, John Lasseter..."


In [76]:
movies['crew'] = movies['crew'].apply(convert)

ValueError: malformed node or string: ['Stephen E. Rivkin', 'Rick Carter', 'Christopher Boyes', 'Christopher Boyes', 'Mali Finn', 'James Horner', 'James Cameron', 'James Cameron', 'James Cameron', 'James Cameron', 'James Cameron', 'Andrew Menzies', 'Jill Brooks', 'Margery Simkin', 'Kevin Ishioka', 'Dick Bernstein', 'Shannon Mills', 'Dennie Thorpe', 'Jana Vance', 'Deborah Lynn Scott', 'Jon Landau', 'Sean Haworth', 'Kim Sinclair', 'Kim Sinclair', 'Richard F. Mays', 'Laeta Kalogridis', 'Mayes C. Rubeo', 'Mauro Fiore', 'Scott Herbertson', 'Woody Schultz', 'Linda DeVetta', 'Linda DeVetta', 'Richard Bluck', 'Simon Bright', 'Richard Martin', 'Steve R. Moore', 'John Refoua', 'Karl J. Martin', 'Chiling Lin', 'Ilram Choi', 'Steven Quale', 'Carla Meyer', 'Nick Bassett', 'Jill Cormack', 'Andy McLaren', 'Terry Notary', 'Garrett Warren', 'Jonathan Rothbart', 'Stefan Dechant', 'Todd Cherniawsky', 'Miranda Rivers', 'Robert Stromberg', 'John Harding', 'Roberto De Angelis', 'Mike Smithson', 'Alain Lalanne', 'Lucas Salton', 'Janace Tashjian', 'Stephen Rosenbaum', 'Frankie Karena', 'Lisa Lovaas', 'Jonathan Fawkner', 'Robert Bavin', 'Anthony Almaraz', 'Carolyn M. Fenton', 'Beth Koenigsberg', 'Sam Page', 'Tex Kadonaga', 'Kim Foscato', 'Tammy S. Lee', 'Denny Caira', 'James Waitkus', 'Addison Teague', 'C. Scott Baker', 'Luke Caska', 'David Chow', 'Jonathan Dyer', 'Joseph Hiura', 'Rebecca Jellie', 'Robert Andrew Johnson', 'Mike Stassi', 'John Villarino', 'Jeffrey Wisniewski', 'Cheryl Nardi', 'Marshall Winn', 'Gwendolyn Yates Whittle', 'William Stein', 'Lula Washington', 'Chris Del Conte', 'R. Christopher White', 'Dan Lemmon', 'Tim Nielsen', 'Michael Mulholland', 'Thomas Nittmann', 'Edson Williams', 'Christine Carr', 'John Bruno', 'David Emmerichs', 'Christopher Scarabosio', 'Jennifer Teves', 'Brigitte Yorke', 'Ken Fischer', 'Iain Hutton', 'Steve Ingram', 'Joyce Cox', 'Jenny Foster', 'Christopher Marino', 'Jim Milton', 'Cyndi Ochs', 'Lucas Putnam', "Anthony 'Max' Ivins", 'John Knoll', 'Eric Saindon', 'Wayne Stables', 'David Stinnett', 'Guy Williams', 'Stuart Thorp', 'Giles Coburn', 'Mark Fellman', 'Scott Sprague', 'Jeremy Hollobon', 'Orlando Meunier', 'Taisuke Tanimura', 'Lilia Mishel Acevedo', 'Alejandro M. Hernandez', 'Marvin Hall', 'Judy Alley', 'Mike Perry', 'Andrew Morley', 'Seth Engstrom', 'Eric Oliver', 'Matsune Suzuki', 'Paul Tobin', 'Roxane Griffin', 'Arun Ram-Mohan', 'Georgia Lockhart-Adams', 'Thrain Shadbolt', 'Brad Alexander', 'Shadi Almassizadeh', 'Simon Clutterbuck', 'Graeme Demmocks', 'Adrian Fernandes', 'Mitch Gates', 'Jerry Kung', 'Andy Lomas', 'Sebastian Marino', 'Matthias Menz', 'Sergei Nevshupov', 'Philippe Rebours', 'Michael Takarangi', 'David Weitzberg', 'Ben White', 'Min Windle']

In [None]:
movies['crew'].apply(fetch_director)

0       [James Cameron]
1                    []
2                    []
3                    []
4                    []
             ...       
4804                 []
4805                 []
4806                 []
4807                 []
4808                 []
Name: crew, Length: 4806, dtype: object

In [None]:
movies['crew'] = movies['crew'].apply(convert)

ValueError: malformed node or string: ['Stephen E. Rivkin', 'Rick Carter', 'Christopher Boyes', 'Christopher Boyes', 'Mali Finn', 'James Horner', 'James Cameron', 'James Cameron', 'James Cameron', 'James Cameron', 'James Cameron', 'Andrew Menzies', 'Jill Brooks', 'Margery Simkin', 'Kevin Ishioka', 'Dick Bernstein', 'Shannon Mills', 'Dennie Thorpe', 'Jana Vance', 'Deborah Lynn Scott', 'Jon Landau', 'Sean Haworth', 'Kim Sinclair', 'Kim Sinclair', 'Richard F. Mays', 'Laeta Kalogridis', 'Mayes C. Rubeo', 'Mauro Fiore', 'Scott Herbertson', 'Woody Schultz', 'Linda DeVetta', 'Linda DeVetta', 'Richard Bluck', 'Simon Bright', 'Richard Martin', 'Steve R. Moore', 'John Refoua', 'Karl J. Martin', 'Chiling Lin', 'Ilram Choi', 'Steven Quale', 'Carla Meyer', 'Nick Bassett', 'Jill Cormack', 'Andy McLaren', 'Terry Notary', 'Garrett Warren', 'Jonathan Rothbart', 'Stefan Dechant', 'Todd Cherniawsky', 'Miranda Rivers', 'Robert Stromberg', 'John Harding', 'Roberto De Angelis', 'Mike Smithson', 'Alain Lalanne', 'Lucas Salton', 'Janace Tashjian', 'Stephen Rosenbaum', 'Frankie Karena', 'Lisa Lovaas', 'Jonathan Fawkner', 'Robert Bavin', 'Anthony Almaraz', 'Carolyn M. Fenton', 'Beth Koenigsberg', 'Sam Page', 'Tex Kadonaga', 'Kim Foscato', 'Tammy S. Lee', 'Denny Caira', 'James Waitkus', 'Addison Teague', 'C. Scott Baker', 'Luke Caska', 'David Chow', 'Jonathan Dyer', 'Joseph Hiura', 'Rebecca Jellie', 'Robert Andrew Johnson', 'Mike Stassi', 'John Villarino', 'Jeffrey Wisniewski', 'Cheryl Nardi', 'Marshall Winn', 'Gwendolyn Yates Whittle', 'William Stein', 'Lula Washington', 'Chris Del Conte', 'R. Christopher White', 'Dan Lemmon', 'Tim Nielsen', 'Michael Mulholland', 'Thomas Nittmann', 'Edson Williams', 'Christine Carr', 'John Bruno', 'David Emmerichs', 'Christopher Scarabosio', 'Jennifer Teves', 'Brigitte Yorke', 'Ken Fischer', 'Iain Hutton', 'Steve Ingram', 'Joyce Cox', 'Jenny Foster', 'Christopher Marino', 'Jim Milton', 'Cyndi Ochs', 'Lucas Putnam', "Anthony 'Max' Ivins", 'John Knoll', 'Eric Saindon', 'Wayne Stables', 'David Stinnett', 'Guy Williams', 'Stuart Thorp', 'Giles Coburn', 'Mark Fellman', 'Scott Sprague', 'Jeremy Hollobon', 'Orlando Meunier', 'Taisuke Tanimura', 'Lilia Mishel Acevedo', 'Alejandro M. Hernandez', 'Marvin Hall', 'Judy Alley', 'Mike Perry', 'Andrew Morley', 'Seth Engstrom', 'Eric Oliver', 'Matsune Suzuki', 'Paul Tobin', 'Roxane Griffin', 'Arun Ram-Mohan', 'Georgia Lockhart-Adams', 'Thrain Shadbolt', 'Brad Alexander', 'Shadi Almassizadeh', 'Simon Clutterbuck', 'Graeme Demmocks', 'Adrian Fernandes', 'Mitch Gates', 'Jerry Kung', 'Andy Lomas', 'Sebastian Marino', 'Matthias Menz', 'Sergei Nevshupov', 'Philippe Rebours', 'Michael Takarangi', 'David Weitzberg', 'Ben White', 'Min Windle']