In [1]:

import pandas as pd
import requests
import json
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker # updated
import seaborn as sns
import numpy as np

# Extract the money and data

In [2]:
# Load the movies CSV, then relabel the columns used later with display-friendly names.
movies = pd.read_csv("Resources/tmdb_5000_movies.csv")
movies_df = movies.rename(
    columns={
        "title": "Title",
        "overview": "Overview",
        "budget": "Budget",
        "revenue": "Revenue",
        "vote_average": "Average Vote",
    }
)
# Preview the first three rows to confirm the new labels.
movies_df.head(3)

Unnamed: 0,Budget,genres,homepage,id,keywords,original_language,original_title,Overview,popularity,production_companies,production_countries,release_date,Revenue,runtime,spoken_languages,status,tagline,Title,Average Vote,vote_count
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800
1,300000000,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",http://disney.go.com/disneypictures/pirates/,285,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2007-05-19,961000000,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500
2,245000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.sonypictures.com/movies/spectre/,206647,"[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...",en,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,"[{""name"": ""Columbia Pictures"", ""id"": 5}, {""nam...","[{""iso_3166_1"": ""GB"", ""name"": ""United Kingdom""...",2015-10-26,880674609,148.0,"[{""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""},...",Released,A Plan No One Escapes,Spectre,6.3,4466


# movies_df.describe()

In [3]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
movies_df.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,Budget,id,popularity,Revenue,runtime,Average Vote,vote_count
count,4803.0,4803.0,4803.0,4803.0,4801.0,4803.0,4803.0
mean,29045040.0,57165.484281,21.492301,82260640.0,106.875859,6.092172,690.217989
std,40722390.0,88694.614033,31.81665,162857100.0,22.611935,1.194612,1234.585891
min,0.0,5.0,0.0,0.0,0.0,0.0,0.0
25%,790000.0,9014.5,4.66807,0.0,94.0,5.6,54.0
50%,15000000.0,14629.0,12.921594,19170000.0,103.0,6.2,235.0
75%,40000000.0,58610.5,28.313505,92917190.0,118.0,6.8,737.0
max,380000000.0,459488.0,875.581305,2787965000.0,338.0,10.0,13752.0


In [4]:
# Flag, per column, whether at least one value is missing.
movies_df.isna().any(axis=0)

Budget                  False
genres                  False
homepage                 True
id                      False
keywords                False
original_language       False
original_title          False
Overview                 True
popularity              False
production_companies    False
production_countries    False
release_date             True
Revenue                 False
runtime                  True
spoken_languages        False
status                  False
tagline                  True
Title                   False
Average Vote            False
vote_count              False
dtype: bool

In [5]:
# Number of distinct non-null values in each column.
movies_df.nunique(axis=0, dropna=True)

Budget                   436
genres                  1175
homepage                1691
id                      4803
keywords                4222
original_language         37
original_title          4801
Overview                4800
popularity              4802
production_companies    3697
production_countries     469
release_date            3280
Revenue                 3297
runtime                  156
spoken_languages         544
status                     3
tagline                 3944
Title                   4800
Average Vote              71
vote_count              1609
dtype: int64

In [6]:
# How many records fall under each release status.
movies_df.loc[:, "status"].value_counts()

Released           4795
Rumored               5
Post Production       3
Name: status, dtype: int64

In [7]:
# Count the records whose release_date is missing.
movies_df["release_date"].isna().sum()

1

In [9]:
# Remove records that have no release date.
# Rebind the name instead of using inplace=True: the resulting rows are the same,
# but the step stays chainable and no existing frame object is mutated behind the scenes.
movies_df = movies_df.dropna(subset=["release_date"])

In [10]:
# Per-column non-null counts for records whose Revenue or Budget is zero or negative.
lacks_money = movies_df["Revenue"].le(0) | movies_df["Budget"].le(0)
movies_df[lacks_money].count()

Budget                  1573
genres                  1573
homepage                 365
id                      1573
keywords                1573
original_language       1573
original_title          1573
Overview                1570
popularity              1573
production_companies    1573
production_countries    1573
release_date            1573
Revenue                 1573
runtime                 1571
spoken_languages        1573
status                  1573
tagline                  975
Title                   1573
Average Vote            1573
vote_count              1573
dtype: int64

In [11]:
# Per-column non-null counts for records whose Average Vote is exactly zero.
has_zero_vote = movies_df["Average Vote"].eq(0)
movies_df[has_zero_vote].count()

Budget                  62
genres                  62
homepage                13
id                      62
keywords                62
original_language       62
original_title          62
Overview                61
popularity              62
production_companies    62
production_countries    62
release_date            62
Revenue                 62
runtime                 61
spoken_languages        62
status                  62
tagline                 15
Title                   62
Average Vote            62
vote_count              62
dtype: int64

# Movie data analysis results

- Out of 4,803 records in the source data, 1,574 have a zero value for Budget and/or Revenue - removed.
- 1 record had a null release date - removed.
- 8 records with status Rumored (5) or Post Production (3) - removed.
- Average Vote is 0 for 62 records - to be taken into account.

# Create Clean Dataset

In [12]:
# Keep only released titles with a positive Revenue and a positive Budget,
# restricted to the columns needed for the money analysis.
is_released = movies_df["status"].eq("Released")
has_revenue = movies_df["Revenue"].gt(0)
has_budget = movies_df["Budget"].gt(0)
money_columns = ["Budget", "Revenue", "Title", "Average Vote", "release_date"]
movies_money_df = movies_df.loc[is_released & has_revenue & has_budget, money_columns]

In [13]:
# Confirm that no column of the cleaned frame still contains a missing value.
movies_money_df.isna().any(axis=0)

Budget          False
Revenue         False
Title           False
Average Vote    False
release_date    False
dtype: bool

In [14]:
# Summary statistics for the numeric columns of the cleaned frame.
movies_money_df.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,Budget,Revenue,Average Vote
count,3228.0,3228.0,3228.0
mean,40666420.0,121280300.0,6.309665
std,44398400.0,186319700.0,0.873846
min,1.0,5.0,0.0
25%,10500000.0,17000000.0,5.8
50%,25000000.0,55191500.0,6.3
75%,55000000.0,146343400.0,6.9
max,380000000.0,2787965000.0,8.5
