In [19]:
# IMPORTING LIBRARIES

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns


In [20]:
# Using the imdb-5000-movie-dataset
# license - http://opendatacommons.org/licenses/dbcl/1.0/

imdb_data = pd.read_csv('movie_metadata.csv')
imdb_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5043 entries, 0 to 5042
Data columns (total 28 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   color                      5024 non-null   object 
 1   director_name              4939 non-null   object 
 2   num_critic_for_reviews     4993 non-null   float64
 3   duration                   5028 non-null   float64
 4   director_facebook_likes    4939 non-null   float64
 5   actor_3_facebook_likes     5020 non-null   float64
 6   actor_2_name               5030 non-null   object 
 7   actor_1_facebook_likes     5036 non-null   float64
 8   gross                      4159 non-null   float64
 9   genres                     5043 non-null   object 
 10  actor_1_name               5036 non-null   object 
 11  movie_title                5043 non-null   object 
 12  num_voted_users            5043 non-null   int64  
 13  cast_total_facebook_likes  5043 non-null   int64

In [21]:
imdb_data.isnull().sum()

color                         19
director_name                104
num_critic_for_reviews        50
duration                      15
director_facebook_likes      104
actor_3_facebook_likes        23
actor_2_name                  13
actor_1_facebook_likes         7
gross                        884
genres                         0
actor_1_name                   7
movie_title                    0
num_voted_users                0
cast_total_facebook_likes      0
actor_3_name                  23
facenumber_in_poster          13
plot_keywords                153
movie_imdb_link                0
num_user_for_reviews          21
language                      12
country                        5
content_rating               303
budget                       492
title_year                   108
actor_2_facebook_likes        13
imdb_score                     0
aspect_ratio                 329
movie_facebook_likes           0
dtype: int64

In [22]:

# Columns that must be populated for a row to be kept. `gross`, `budget`,
# `content_rating`, `title_year` and `aspect_ratio` are deliberately absent
# from this list, so rows missing only those values survive this step.
required_columns = [
    'color',
    'director_name',
    'num_critic_for_reviews',
    'duration',
    'director_facebook_likes',
    'actor_3_facebook_likes',
    'actor_2_name',
    'actor_1_facebook_likes',
    'actor_1_name',
    'actor_3_name',
    'facenumber_in_poster',
    'num_user_for_reviews',
    'language',
    'country',
    'actor_2_facebook_likes',
    'plot_keywords',
]

# Drop, in place, every row that has a null in any required column.
imdb_data.dropna(axis=0, subset=required_columns, inplace=True)

In [23]:
imdb_data.shape

(4729, 28)

In [38]:
from imdb import IMDb

# Experiment: try to read a box-office figure for a single title through the
# IMDb client. The recorded run of this cell printed None.
ia = IMDb()

# Take the top search hit for the title, then load its full record.
movie = ia.search_movie('The Dark Knight')[0]
ia.update(movie)

# Look the value up in two steps; either key being absent yields None.
box_office = movie.get('box office', {})
gross = box_office.get('Gross United States', None)

print(gross)

None


We lost around 6% of the data (5,043 → 4,729 rows), which is within an acceptable range. We will now try to replace the remaining null values with average values to obtain a tidy dataset.

In [24]:
imdb_data.isnull().sum()

color                          0
director_name                  0
num_critic_for_reviews         0
duration                       0
director_facebook_likes        0
actor_3_facebook_likes         0
actor_2_name                   0
actor_1_facebook_likes         0
gross                        637
genres                         0
actor_1_name                   0
movie_title                    0
num_voted_users                0
cast_total_facebook_likes      0
actor_3_name                   0
facenumber_in_poster           0
plot_keywords                  0
movie_imdb_link                0
num_user_for_reviews           0
language                       0
country                        0
content_rating               169
budget                       351
title_year                     0
actor_2_facebook_likes         0
imdb_score                     0
aspect_ratio                 205
movie_facebook_likes           0
dtype: int64

In [25]:
rating_data = imdb_data['content_rating'].value_counts()
rating_data

R            2076
PG-13        1434
PG            681
G             109
Not Rated     100
Unrated        58
Approved       55
X              13
Passed          9
NC-17           7
GP              6
M               5
TV-14           3
TV-G            3
TV-PG           1
Name: content_rating, dtype: int64

In [26]:
imdb_data[imdb_data['content_rating'].isnull()]['movie_title']

98                              Godzilla Resurgence 
204                             Godzilla Resurgence 
242                    Asterix at the Olympic Games 
489                                       Evolution 
810     Xi you ji zhi: Sun Wukong san da Baigu Jing 
                            ...                     
5017                                      Dry Spell 
5018                                       Flywheel 
5029                                       The Cure 
5032                                           Bang 
5038                        Signed Sealed Delivered 
Name: movie_title, Length: 169, dtype: object

In [27]:
# Removing duplicate values
imdb_data.drop_duplicates(inplace=True)


In [10]:
imdb_data.shape

(4687, 28)

In [28]:
import os
from getpass import getpass

import requests

# TMDB API key used by the lookup helpers below. It is read from the
# TMDB_API_KEY environment variable, falling back to an interactive prompt,
# so the secret is never stored in the notebook or its saved outputs.
# NOTE(review): a key that was previously committed in this cell should be
# treated as exposed and rotated.
api_key = os.environ.get('TMDB_API_KEY') or getpass('TMDB API key: ')

In [29]:
def getMovieID(title):
    """Search TMDB for a movie title and return the ID of the top hit.

    Parameters
    ----------
    title : str
        Movie title to search for. Leading/trailing whitespace is stripped
        before the lookup (the titles shown in this notebook carry trailing
        whitespace).

    Returns
    -------
    The ``id`` field of the first entry in the response's ``results`` list.

    Raises
    ------
    IndexError
        If the search returns no results (callers rely on this to skip
        titles that cannot be matched).
    requests.HTTPError
        If TMDB answers with an error status.
    """
    # Hand the query to requests via `params` so it is URL-encoded properly.
    # Replacing spaces with '+' by hand left characters such as '&', '#' or
    # '+' unescaped, which truncated or altered the query string.
    response = requests.get(
        "https://api.themoviedb.org/3/search/movie",
        params={"api_key": api_key, "query": title.strip()},
        timeout=10,  # seconds; fail instead of hanging on a stalled connection
    )
    response.raise_for_status()
    return response.json()['results'][0]['id']

In [30]:
def getContentRating(id):
    """Return the US certification TMDB lists for a movie, if there is one.

    Parameters
    ----------
    id
        TMDB movie ID (as returned by ``getMovieID``).

    Returns
    -------
    str or None
        The first non-empty ``certification`` among the US release dates, or
        ``None`` when there is no US entry or none of its releases carries a
        certification.

    Raises
    ------
    requests.HTTPError
        If TMDB answers with an error status.
    """
    response = requests.get(
        f"https://api.themoviedb.org/3/movie/{id}/release_dates",
        params={"api_key": api_key},
        timeout=10,  # seconds; fail instead of hanging on a stalled connection
    )
    response.raise_for_status()

    for country in response.json()['results']:
        if country['iso_3166_1'] != 'US':
            continue
        # Only looking at the first release returned '' whenever that entry
        # had no certification, which then overwrote a null with an empty
        # string. Scan all US releases and skip blank certifications.
        for release in country['release_dates']:
            certification = release.get('certification')
            if certification:
                return certification
    return None


In [31]:
def getGross(id):
    """Return the revenue TMDB reports for a movie, or None if it is unknown.

    Parameters
    ----------
    id
        TMDB movie ID (as returned by ``getMovieID``).

    Returns
    -------
    int or None
        The ``revenue`` field of the movie-details response. A value of 0 is
        returned as ``None``: TMDB's API reference documents 0 as the default
        for this field, so it is a placeholder rather than a real figure and
        must not be written into the dataset as an actual gross.

    Raises
    ------
    requests.HTTPError
        If TMDB answers with an error status.
    """
    response = requests.get(
        f"https://api.themoviedb.org/3/movie/{id}",
        params={"api_key": api_key},
        timeout=10,  # seconds; fail instead of hanging on a stalled connection
    )
    response.raise_for_status()
    revenue = response.json()['revenue']
    # Map the 0 placeholder to None so the cell stays null downstream.
    return revenue or None

In [32]:
def getBudget(id):
    """Return the budget TMDB reports for a movie, or None if it is unknown.

    Parameters
    ----------
    id
        TMDB movie ID (as returned by ``getMovieID``).

    Returns
    -------
    int or None
        The ``budget`` field of the movie-details response. A value of 0 is
        returned as ``None``: TMDB's API reference documents 0 as the default
        for this field, so it is a placeholder rather than a real figure and
        must not be written into the dataset as an actual budget.

    Raises
    ------
    requests.HTTPError
        If TMDB answers with an error status.
    """
    response = requests.get(
        f"https://api.themoviedb.org/3/movie/{id}",
        params={"api_key": api_key},
        timeout=10,  # seconds; fail instead of hanging on a stalled connection
    )
    response.raise_for_status()
    budget = response.json()['budget']
    # Map the 0 placeholder to None so the cell stays null downstream.
    return budget or None

In [272]:
getBudget(getMovieID('Great Gatsby'))


105000000

In [33]:
# Boolean mask with the same shape as the frame: True wherever a cell is null.
null_values = imdb_data.isna()

# Collapse the mask across columns: True for each row with at least one null.
null_rows = null_values.any(axis='columns')

# Keep only the rows flagged above.
result = imdb_data.loc[null_rows]

# Show the column index, then how many rows contain a null.
print(result.columns)
print(len(result))

Index(['color', 'director_name', 'num_critic_for_reviews', 'duration',
       'director_facebook_likes', 'actor_3_facebook_likes', 'actor_2_name',
       'actor_1_facebook_likes', 'gross', 'genres', 'actor_1_name',
       'movie_title', 'num_voted_users', 'cast_total_facebook_likes',
       'actor_3_name', 'facenumber_in_poster', 'plot_keywords',
       'movie_imdb_link', 'num_user_for_reviews', 'language', 'country',
       'content_rating', 'budget', 'title_year', 'actor_2_facebook_likes',
       'imdb_score', 'aspect_ratio', 'movie_facebook_likes'],
      dtype='object')
964


In [34]:
def fillValues(nulldataSet,original):
    """Fill missing content_rating, budget and gross cells from TMDB.

    For every row of `nulldataSet` that is null in at least one of those
    three columns, the movie is looked up on TMDB by its `movie_title` and
    the missing cells are written into `original` at the same index label.

    Parameters
    ----------
    nulldataSet : pandas.DataFrame
        Rows to inspect; must contain `movie_title`, `content_rating`,
        `budget` and `gross`.
    original : pandas.DataFrame
        Frame that is modified in place; must share index labels with
        `nulldataSet`.

    Returns
    -------
    None

    Notes
    -----
    An IndexError raised during a row's lookups (e.g. no TMDB match for the
    title) skips the rest of that row and moves on to the next one.
    """
    api_columns = ('content_rating', 'budget', 'gross')

    for index, row in nulldataSet.iterrows():
        missing = [column for column in api_columns if pd.isnull(row[column])]
        if not missing:
            # Nothing here that TMDB can fill, so make no network calls.
            continue

        try:
            # Resolve the title once per row. The search used to be repeated
            # for every missing column, tripling the requests for rows that
            # lack all three values.
            movie_id = getMovieID(row['movie_title'])
            if 'content_rating' in missing:
                original.at[index, 'content_rating'] = getContentRating(movie_id)
            if 'budget' in missing:
                original.at[index, 'budget'] = getBudget(movie_id)
            if 'gross' in missing:
                original.at[index, 'gross'] = getGross(movie_id)
        except IndexError:
            # Best effort: leave this row's remaining cells null.
            continue
    

In [35]:
fillValues(result,imdb_data)

KeyboardInterrupt: 

In [36]:
imdb_data.isna().sum()

color                          0
director_name                  0
num_critic_for_reviews         0
duration                       0
director_facebook_likes        0
actor_3_facebook_likes         0
actor_2_name                   0
actor_1_facebook_likes         0
gross                        539
genres                         0
actor_1_name                   0
movie_title                    0
num_voted_users                0
cast_total_facebook_likes      0
actor_3_name                   0
facenumber_in_poster           0
plot_keywords                  0
movie_imdb_link                0
num_user_for_reviews           0
language                       0
country                        0
content_rating               151
budget                       278
title_year                     0
actor_2_facebook_likes         0
imdb_score                     0
aspect_ratio                 203
movie_facebook_likes           0
dtype: int64

In [None]:
# Discard, in place, every row that still lacks a content rating, gross or budget.
imdb_data.dropna(subset=['content_rating', 'gross', 'budget'], inplace=True)

In [298]:
result.isna().sum()

color                          0
director_name                  0
num_critic_for_reviews         0
duration                       0
director_facebook_likes        0
actor_3_facebook_likes         0
actor_2_name                   0
actor_1_facebook_likes         0
gross                          6
genres                         0
actor_1_name                   0
movie_title                    0
num_voted_users                0
cast_total_facebook_likes      0
actor_3_name                   0
facenumber_in_poster           0
plot_keywords                  0
movie_imdb_link                0
num_user_for_reviews           0
language                       0
country                        0
content_rating                46
budget                         1
title_year                     0
actor_2_facebook_likes         0
imdb_score                     0
aspect_ratio                 203
movie_facebook_likes           0
dtype: int64

In [299]:
imdb_data.isnull().sum()

color                          0
director_name                  0
num_critic_for_reviews         0
duration                       0
director_facebook_likes        0
actor_3_facebook_likes         0
actor_2_name                   0
actor_1_facebook_likes         0
gross                        630
genres                         0
actor_1_name                   0
movie_title                    0
num_voted_users                0
cast_total_facebook_likes      0
actor_3_name                   0
facenumber_in_poster           0
plot_keywords                  0
movie_imdb_link                0
num_user_for_reviews           0
language                       0
country                        0
content_rating               167
budget                       348
title_year                     0
actor_2_facebook_likes         0
imdb_score                     0
aspect_ratio                 203
movie_facebook_likes           0
dtype: int64

In [300]:
result.isna().sum()

color                          0
director_name                  0
num_critic_for_reviews         0
duration                       0
director_facebook_likes        0
actor_3_facebook_likes         0
actor_2_name                   0
actor_1_facebook_likes         0
gross                          6
genres                         0
actor_1_name                   0
movie_title                    0
num_voted_users                0
cast_total_facebook_likes      0
actor_3_name                   0
facenumber_in_poster           0
plot_keywords                  0
movie_imdb_link                0
num_user_for_reviews           0
language                       0
country                        0
content_rating                46
budget                         1
title_year                     0
actor_2_facebook_likes         0
imdb_score                     0
aspect_ratio                 203
movie_facebook_likes           0
dtype: int64