In [2]:
import numpy as np 
import pandas as pd 


In [3]:
movies = pd.read_csv("movies.csv")
credits = pd.read_csv("credits.csv")


In [4]:
#merging two datasets on the basis of title
movies = movies.merge(credits, on = "title")

In [5]:
#checking the datasets
movies.shape

(4809, 23)

In [6]:
movies.columns

Index(['budget', 'genres', 'homepage', 'id', 'keywords', 'original_language',
       'original_title', 'overview', 'popularity', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'vote_average',
       'vote_count', 'movie_id', 'cast', 'crew'],
      dtype='object')

In [7]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4809 entries, 0 to 4808
Data columns (total 23 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   budget                4809 non-null   int64  
 1   genres                4809 non-null   object 
 2   homepage              1713 non-null   object 
 3   id                    4809 non-null   int64  
 4   keywords              4809 non-null   object 
 5   original_language     4809 non-null   object 
 6   original_title        4809 non-null   object 
 7   overview              4806 non-null   object 
 8   popularity            4809 non-null   float64
 9   production_companies  4809 non-null   object 
 10  production_countries  4809 non-null   object 
 11  release_date          4808 non-null   object 
 12  revenue               4809 non-null   int64  
 13  runtime               4807 non-null   float64
 14  spoken_languages      4809 non-null   object 
 15  status               

In [8]:
#these are the columns that we are going to need. 

# genres 
# id 
# keywords 
# title 
# overview 
# release date
# cast 
# crew


In [9]:
movies = movies[["movie_id", "title", "overview", "genres", "keywords", "cast", "crew"]]
movies.head()

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...","[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...","[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...","[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...","[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."
2,206647,Spectre,A cryptic message from Bond’s past sends him o...,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...","[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de..."
3,49026,The Dark Knight Rises,Following the death of District Attorney Harve...,"[{""id"": 28, ""name"": ""Action""}, {""id"": 80, ""nam...","[{""id"": 849, ""name"": ""dc comics""}, {""id"": 853,...","[{""cast_id"": 2, ""character"": ""Bruce Wayne / Ba...","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de..."
4,49529,John Carter,"John Carter is a war-weary, former military ca...","[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 818, ""name"": ""based on novel""}, {""id"":...","[{""cast_id"": 5, ""character"": ""John Carter"", ""c...","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de..."


In [10]:
#Since we have shrinked our dataframe into a tiny one. Now we will merge overview, genres, keywords, cast, crew columns into a single column,
#so that we have only three columns: movie_id, title, and a merged column that contains the overview of the movie. 

In [11]:
#Before that we should check for null vlaues.

In [12]:
movies.isnull().sum()

movie_id    0
title       0
overview    3
genres      0
keywords    0
cast        0
crew        0
dtype: int64

In [13]:
#Now we have three null values in overview column, we will remove those three rows. 

In [14]:
movies.dropna(inplace=True)

In [15]:
movies.isnull().sum()

movie_id    0
title       0
overview    0
genres      0
keywords    0
cast        0
crew        0
dtype: int64

In [16]:
movies.duplicated().sum() 

np.int64(0)

In [17]:
#Finally we have cleaned our data.

In [18]:
#We are going to merge those aforementioned columns into a single one. 
#But before that we need to extract some meaningful data from those columns.

In [21]:
movies.iloc[0].genres

'[{"id": 28, "name": "Action"}, {"id": 12, "name": "Adventure"}, {"id": 14, "name": "Fantasy"}, {"id": 878, "name": "Science Fiction"}]'

In [None]:
# the movie in first row has these tangled data about the genres. so now we are going to extract the 
# things that will be useful like: Action, Adventure. Also note that this is not a list nor a dict but a string.
# for that we will import ast which has a function called literal_eval() which will convert a string into the list.

In [22]:
import ast
def conversion(obj): 
    list1 =[]
    for i in ast.literal_eval(obj): 
        list1.append(i["name"]) 
    return list1 

In [26]:
movies["genres"] = movies["genres"].apply(conversion) 
# this one goes through the every element of genres column and applies the conversion function and stores thoes values in the same column.

In [30]:
movies.head()


Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[Action, Adventure, Fantasy, Science Fiction]","[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...","[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...","[Adventure, Fantasy, Action]","[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...","[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."
2,206647,Spectre,A cryptic message from Bond’s past sends him o...,"[Action, Adventure, Crime]","[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...","[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de..."
3,49026,The Dark Knight Rises,Following the death of District Attorney Harve...,"[Action, Crime, Drama, Thriller]","[{""id"": 849, ""name"": ""dc comics""}, {""id"": 853,...","[{""cast_id"": 2, ""character"": ""Bruce Wayne / Ba...","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de..."
4,49529,John Carter,"John Carter is a war-weary, former military ca...","[Action, Adventure, Science Fiction]","[{""id"": 818, ""name"": ""based on novel""}, {""id"":...","[{""cast_id"": 5, ""character"": ""John Carter"", ""c...","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de..."


In [31]:
movies.keywords = movies.keywords.apply(conversion)

In [40]:
movies.head()


Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colon...","[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...","[Adventure, Fantasy, Action]","[ocean, drug abuse, exotic island, east india ...","[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."
2,206647,Spectre,A cryptic message from Bond’s past sends him o...,"[Action, Adventure, Crime]","[spy, based on novel, secret agent, sequel, mi...","[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de..."
3,49026,The Dark Knight Rises,Following the death of District Attorney Harve...,"[Action, Crime, Drama, Thriller]","[dc comics, crime fighter, terrorist, secret i...","[{""cast_id"": 2, ""character"": ""Bruce Wayne / Ba...","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de..."
4,49529,John Carter,"John Carter is a war-weary, former military ca...","[Action, Adventure, Science Fiction]","[based on novel, mars, medallion, space travel...","[{""cast_id"": 5, ""character"": ""John Carter"", ""c...","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de..."


In [41]:
def cast_pickup(obj):
    list2 = []
    count = 0
    for i in ast.literal_eval(obj):
        if count == 3: 
            print(" ")
            break
        else: 
            print(i["character"])
            count += 1
movies.cast.apply(cast_pickup)

Jake Sully
Neytiri
Dr. Grace Augustine
 
Captain Jack Sparrow
Will Turner
Elizabeth Swann
 
James Bond
Blofeld
Madeleine
 
Bruce Wayne / Batman
Alfred Pennyworth
James Gordon
 
John Carter
Dejah Thoris
Sola
 
Peter Parker / Spider-Man
Mary Jane Watson
Harry Osborn / New Goblin
 
Flynn Rider (voice)
Rapunzel (voice)
Mother Gothel (voice)
 
Tony Stark / Iron Man
Thor Odinson
Bruce Banner / Hulk
 
Harry Potter
Ron Weasley
Hermione Granger
 
Bruce Wayne / Batman
Clark Kent / Superman
Diana Prince / Wonder Woman
 
Superman / Clark Kent
Lex Luthor
Lois Lane
 
James Bond
Camille Montes
Dominic Greene
 
Captain Jack Sparrow
Will Turner
Elizabeth Swann
 
Tonto
John Reid / The Lone Ranger
Butch Cavendish
 
Clark Kent / Kal-El
Lois Lane
General Zod
 
Prince Caspian
Peter Pevensie
Susan Pevensie
 
Tony Stark / Iron Man
Steve Rogers / Captain America
Bruce Banner / The Hulk
 
Captain Jack Sparrow
Angelica Teach
Captain Edward "Blackbeard" Teach
 
Agent J
Agent K
Young Agent K
 
Bilbo Baggins
Gandal

0       None
1       None
2       None
3       None
4       None
        ... 
4804    None
4805    None
4806    None
4807    None
4808    None
Name: cast, Length: 4806, dtype: object