### Movie Database 
This dataset is the IMDB 5000 Movie Dataset from Kaggle. <br>

https://www.kaggle.com/deepmatrix/imdb-5000-movie-dataset

In [1]:
%matplotlib inline       
import numpy as np
import pandas as pd      # Import 'numpy' and 'pandas' modules

# Configure how pandas renders output in this notebook: plain-text reprs,
# at most 10 columns and 15 rows shown, wrapped at 120 characters.
pd.options.display.notebook_repr_html = False
pd.options.display.max_columns = 10
pd.options.display.max_rows = 15
pd.options.display.width = 120

# Load the movie metadata CSV into the `movie` DataFrame
movie = pd.read_csv("data/movie_metadata.csv")

# Preview the first five rows
movie.head(5)

   color      director_name  num_critic_for_reviews  duration  director_facebook_likes         ...           \
0  Color      James Cameron                   723.0     178.0                      0.0         ...            
1  Color     Gore Verbinski                   302.0     169.0                    563.0         ...            
2  Color         Sam Mendes                   602.0     148.0                      0.0         ...            
3  Color  Christopher Nolan                   813.0     164.0                  22000.0         ...            
4    NaN        Doug Walker                     NaN       NaN                    131.0         ...            

   title_year actor_2_facebook_likes  imdb_score  aspect_ratio movie_facebook_likes  
0      2009.0                  936.0         7.9          1.78                33000  
1      2007.0                 5000.0         7.1          2.35                    0  
2      2015.0                  393.0         6.8          2.35             

In [2]:
movie.columns    # Names of all columns in the dataset

Index(['color', 'director_name', 'num_critic_for_reviews', 'duration', 'director_facebook_likes',
       'actor_3_facebook_likes', 'actor_2_name', 'actor_1_facebook_likes', 'gross', 'genres', 'actor_1_name',
       'movie_title', 'num_voted_users', 'cast_total_facebook_likes', 'actor_3_name', 'facenumber_in_poster',
       'plot_keywords', 'movie_imdb_link', 'num_user_for_reviews', 'language', 'country', 'content_rating', 'budget',
       'title_year', 'actor_2_facebook_likes', 'imdb_score', 'aspect_ratio', 'movie_facebook_likes'],
      dtype='object')

In [3]:
movie.describe()           # Summary statistics (count, mean, std, min, quartiles, max) for every numeric column

       num_critic_for_reviews     duration  director_facebook_likes  actor_3_facebook_likes  actor_1_facebook_likes  \
count             4993.000000  5028.000000              4939.000000             5020.000000             5036.000000   
mean               140.194272   107.201074               686.509212              645.009761             6560.047061   
std                121.601675    25.197441              2813.328607             1665.041728            15020.759120   
min                  1.000000     7.000000                 0.000000                0.000000                0.000000   
25%                 50.000000    93.000000                 7.000000              133.000000              614.000000   
50%                110.000000   103.000000                49.000000              371.500000              988.000000   
75%                195.000000   118.000000               194.500000              636.000000            11000.000000   
max                813.000000   511.000000      

In [4]:
movie.loc[:, 'movie_title']        # The title column for every movie in the dataset

0                                                 Avatar 
1               Pirates of the Caribbean: At World's End 
2                                                Spectre 
3                                  The Dark Knight Rises 
4       Star Wars: Episode VII - The Force Awakens    ...
5                                            John Carter 
6                                           Spider-Man 3 
                              ...                        
5036                                     The Mongol King 
5037                                           Newlyweds 
5038                             Signed Sealed Delivered 
5039                           The Following             
5040                                A Plague So Pleasant 
5041                                    Shanghai Calling 
5042                                   My Date with Drew 
Name: movie_title, Length: 5043, dtype: object

In [5]:
movie.shape        # (rows, columns): 5043 observations and 28 variables in the raw dataset

(5043, 28)

In [6]:
movie.duplicated().sum()     # Number of rows that exactly repeat an earlier row (all columns equal)

45

In [7]:
movie.loc[movie.duplicated(), 'movie_title']    # Titles of the rows flagged as duplicates

137                          The Legend of Tarzan 
187     The Twilight Saga: Breaking Dawn - Part 2 
204                           Godzilla Resurgence 
303                                           Pan 
389                                Fantastic Four 
395                      The Fast and the Furious 
590                                      Hercules 
                           ...                    
4631                                       Trance 
4769                                   Crossroads 
4882                              The Love Letter 
4927                                  The Calling 
4942                                   Cat People 
4950                            A Dog's Breakfast 
4951                     Night of the Living Dead 
Name: movie_title, Length: 45, dtype: object

In [8]:
# Remove exact duplicate rows, keeping the first occurrence of each.
# Reassign instead of using inplace=True: in-place mutation brings no
# performance benefit and prevents method chaining.
movie = movie.drop_duplicates()
movie.shape                             # (rows, columns) after de-duplication

(4998, 28)

In [9]:
movie.duplicated().sum()     # Sanity check: count of remaining duplicate rows (expected to be 0)

0

#### Top movies by country, based on imdb_score

In [10]:
movie['imdb_score'].isna().sum()        # Count of movies whose IMDB score is missing

0

In [11]:
movie['country'].isna().sum()        # Count of rows whose country is missing (NaN)

5

In [45]:
# Fill missing countries with the placeholder string 'Not Available'.
# Bracket indexing is used for the assignment: attribute-style access can
# silently misbehave if a column name collides with a DataFrame attribute.
movie['country'] = movie['country'].fillna('Not Available')

In [46]:
movie['country'].isna().sum()    # Count of countries that are still missing

0

In [47]:
# Ten actors (as listed in actor_1_name) with the largest summed gross
# across their movies, highest first

top_10_actors = (
    movie.groupby('actor_1_name')['gross']
    .sum()
    .sort_values(ascending=False)
    .head(10)
)
top_10_actors

actor_1_name
Johnny Depp          3.688020e+09
Harrison Ford        3.391556e+09
Tom Hanks            3.264559e+09
Tom Cruise           2.987622e+09
J.K. Simmons         2.856407e+09
Will Smith           2.762618e+09
Leonardo DiCaprio    2.640582e+09
Robert Downey Jr.    2.456990e+09
Jennifer Lawrence    2.367856e+09
Robin Williams       2.297193e+09
Name: gross, dtype: float64

In [48]:
# Ten directors with the largest summed gross across their movies,
# highest first

top_10_directors = (
    movie.groupby('director_name')['gross']
    .sum()
    .sort_values(ascending=False)
    .head(10)
)
top_10_directors


director_name
Steven Spielberg     4.114233e+09
Peter Jackson        2.592969e+09
Michael Bay          2.231243e+09
Tim Burton           2.071275e+09
Sam Raimi            2.049549e+09
James Cameron        1.948126e+09
Christopher Nolan    1.813228e+09
George Lucas         1.741418e+09
Robert Zemeckis      1.619309e+09
Chris Columbus       1.618708e+09
Name: gross, dtype: float64

In [16]:
movie['color'].unique()         # Distinct values of the color column; the output shows NaN and a leading space in ' Black and White'

array(['Color', nan, ' Black and White'], dtype=object)

In [20]:
len(movie[movie['duration'] >= 180])   # Number of movies with a duration of 3 hours (180) or longer

68

In [21]:
len(movie[movie['imdb_score'] > 8.0])   # Number of movies rated strictly above 8.0 on IMDB (a score of exactly 8.0 is excluded)

250

In [22]:
movie['director_name'].nunique(dropna=True)      # Number of distinct directors (missing names are not counted)

2398

In [36]:
# The 15 directors with the most movies in the dataset
top_15_number = (
    movie['director_name']
    .value_counts()
    .nlargest(15)
)
top_15_number

Steven Spielberg     26
Woody Allen          22
Martin Scorsese      20
Clint Eastwood       20
Ridley Scott         17
Spike Lee            16
Steven Soderbergh    16
Tim Burton           16
Renny Harlin         15
Oliver Stone         14
Robert Zemeckis      13
Robert Rodriguez     13
Michael Bay          13
Joel Schumacher      13
Barry Levinson       13
Name: director_name, dtype: int64

In [43]:
# Ten directors with the highest mean IMDB score across their movies.
# No minimum number of movies is required, so a director with a single
# highly rated movie can appear here.

avg_rating_directors = (
    movie.groupby('director_name')['imdb_score']
    .mean()
    .sort_values(ascending=False)
    .head(10)
)
avg_rating_directors

director_name
John Blanchard      9.5
Cary Bell           8.7
Sadyk Sher-Niyaz    8.7
Mitchell Altieri    8.7
Mike Mayhall        8.6
Charles Chaplin     8.6
Ron Fricke          8.5
Damien Chazelle     8.5
Majid Majidi        8.5
Raja Menon          8.5
Name: imdb_score, dtype: float64

In [44]:
# Ten directors with the longest mean movie duration, longest first

avg_duration_directors = (
    movie.groupby('director_name')['duration']
    .mean()
    .sort_values(ascending=False)
    .head(10)
)
avg_duration_directors

director_name
Chatrichalerm Yukol     300.0
Ron Maxwell             275.5
Peter Flinth            270.0
Michael Cimino          254.0
Joseph L. Mankiewicz    251.0
George Stevens          225.0
Michael Wadleigh        215.0
Stanley Kramer          191.5
David Lean              188.0
Kevin Costner           184.0
Name: duration, dtype: float64

In [42]:
# Number of movies for each (director, release year) pair
(
    movie
    .groupby(['director_name', 'title_year'])['title_year']
    .count()
)

director_name       title_year
A. Raven Cruz       2005.0        1
Aaron Hann          2015.0        1
Aaron Schneider     2009.0        1
Aaron Seltzer       2006.0        1
Abel Ferrara        1996.0        1
Adam Brooks         2008.0        1
Adam Carolla        2015.0        1
                                 ..
Zal Batmanglij      2011.0        1
                    2013.0        1
Zoran Lisinac       2013.0        1
Álex de la Iglesia  2008.0        1
Émile Gaudreault    2003.0        1
Éric Tessier        2003.0        1
Étienne Faure       2015.0        1
Name: title_year, Length: 4739, dtype: int64