# Getting familiar with Movies_Metadata File

#### Loading libraries and data

In [1]:
import pandas as pd
import numpy as np
import matplotlib as plt

In [2]:
#Set the path to the data files
path="C:/Users/elsad/OneDrive/master/03_reve/data/"
metadata_file="movies_metadata.csv"

In [3]:
metadata=pd.read_csv(path+metadata_file,low_memory=False)
#The reason you get this low_memory warning is because guessing dtypes for each column is very memory demanding. 
#Pandas tries to determine what dtype to set by analyzing the data in each column

#### Analyzing what we have

In [4]:
print(type(metadata))
print(metadata.shape)
print(metadata.columns)

<class 'pandas.core.frame.DataFrame'>
(45466, 24)
Index(['adult', 'belongs_to_collection', 'budget', 'genres', 'homepage', 'id',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'video',
       'vote_average', 'vote_count'],
      dtype='object')


We have a 45466 row DataFrame, with 24 columns or variables. At first sight, we are interested in the following columns
- Id
- Belongs to collection
- Budget
- Genres
- Popularity
- Production companies
- Production countries
- Release date
- **Revenue**
- Runtime
- Title


In [5]:
metadata.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45466 entries, 0 to 45465
Data columns (total 24 columns):
adult                    45466 non-null object
belongs_to_collection    4494 non-null object
budget                   45466 non-null object
genres                   45466 non-null object
homepage                 7782 non-null object
id                       45466 non-null object
imdb_id                  45449 non-null object
original_language        45455 non-null object
original_title           45466 non-null object
overview                 44512 non-null object
popularity               45461 non-null object
poster_path              45080 non-null object
production_companies     45463 non-null object
production_countries     45463 non-null object
release_date             45379 non-null object
revenue                  45460 non-null float64
runtime                  45203 non-null float64
spoken_languages         45460 non-null object
status                   45379 non-null objec

In [6]:
metadata.isnull().sum()

adult                        0
belongs_to_collection    40972
budget                       0
genres                       0
homepage                 37684
id                           0
imdb_id                     17
original_language           11
original_title               0
overview                   954
popularity                   5
poster_path                386
production_companies         3
production_countries         3
release_date                87
revenue                      6
runtime                    263
spoken_languages             6
status                      87
tagline                  25054
title                        6
video                        6
vote_average                 6
vote_count                   6
dtype: int64

In [46]:
#print (metadata[pd.to_numeric(metadata['id'], errors='coerce').isnull()])

                                                   adult  \
19730                                 - Written by Ørnås   
29503   Rune Balot goes to a casino connected to the ...   
35587   Avalanche Sharks tells the story of a bikini ...   

      belongs_to_collection                            budget  \
19730              0.065736  /ff9qCepilowshEtG2GYWwzt2bs4.jpg   
29503              1.931659  /zV8bHuSL6WXoD6FWogP9j4x80bL.jpg   
35587              2.185485  /zaSf5OG7V8X8gqFvly88zDdRm46.jpg   

                                                  genres  \
19730  [{'name': 'Carousel Productions', 'id': 11176}...   
29503  [{'name': 'Aniplex', 'id': 2883}, {'name': 'Go...   
35587  [{'name': 'Odyssey Media', 'id': 17161}, {'nam...   

                                                homepage          id imdb_id  \
19730  [{'iso_3166_1': 'CA', 'name': 'Canada'}, {'iso...  1997-08-20       0   
29503  [{'iso_3166_1': 'US', 'name': 'United States o...  2012-09-29       0   
35587           [

Apparently, seems like there are no null, but let's take a deeper look at each column

In [7]:
metadata[metadata.columns].dtypes

adult                     object
belongs_to_collection     object
budget                    object
genres                    object
homepage                  object
id                        object
imdb_id                   object
original_language         object
original_title            object
overview                  object
popularity                object
poster_path               object
production_companies      object
production_countries      object
release_date              object
revenue                  float64
runtime                  float64
spoken_languages          object
status                    object
tagline                   object
title                     object
video                     object
vote_average             float64
vote_count               float64
dtype: object

- Budget should be numeric
- Belongs_to_collection should be bool
- Id should be numeric

#### 0. Id

This will be the Key to join the different dataframes analyzed in this project. As we have established before, there are no missing values for this key-column, but type is wrong because it shouldn't be object but numeric. 

In [8]:
metadata["id"] =pd.to_numeric(metadata['id'], errors='coerce',downcast="integer")

In [10]:
metadata["id"].dtype

dtype('float64')

#### 1. Revenue

In [6]:
metadata["revenue"].describe()

count    4.546000e+04
mean     1.120935e+07
std      6.433225e+07
min      0.000000e+00
25%      0.000000e+00
50%      0.000000e+00
75%      0.000000e+00
max      2.787965e+09
Name: revenue, dtype: float64

In [14]:
metadata["revenue"].isnull().sum()

6

The whole ponint to our model is being able to predict a movie revenue, so, this column is essential for this project. That's why the first thing we should do is find out how many movies do we have that actually have a registered value for the revenue. If this value is null, we will have to drop those rows.

As we found out in the previous cell, and also checked at metadata's stats, there are 6 null values in Revenue. Furthermore, we ought to check if we have values worth $0 revenue, because they'd be like null for us. 

In [16]:
metadata[metadata["revenue"]>0].shape

(7408, 24)

Out of 45466 movies, there are only 7408 rows with revenue metadata. We are going to create a new dataframe containing only the movies with revenue information, since it's essencial for our model. From now on, we would be using this new dataframe named "reves_metadata".

In [18]:
reves_metadata=metadata[metadata["revenue"]>0]

In [22]:
print(reves_metadata.shape)
print(reves_metadata.columns)

(7408, 24)
Index(['adult', 'belongs_to_collection', 'budget', 'genres', 'homepage', 'id',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'video',
       'vote_average', 'vote_count'],
      dtype='object')


The new file has 7408 rows with the same 24 columns

In [26]:
print(reves_metadata["revenue"].describe())
print("-----")
print("There are %d null values in revenue column in this new dataframe" % (reves_metadata["revenue"].isnull().sum()))

count    7.408000e+03
mean     6.878739e+07
std      1.464203e+08
min      1.000000e+00
25%      2.400000e+06
50%      1.682272e+07
75%      6.722707e+07
max      2.787965e+09
Name: revenue, dtype: float64
-----
There are 0 null values in revenue column in this new dataframe


Now let's check some random values. They seem like reasonable numbers for a movie revenue, don't they?

In [27]:
reves_metadata["revenue"].sample(10)

26562    519311965.0
1868      32980880.0
10870      3910019.0
2526     300218018.0
1667        302204.0
2283     289317794.0
23349     10429707.0
14817       150000.0
35455       140779.0
1924      21288692.0
Name: revenue, dtype: float64

#### 2. Belongs to Collection

In [12]:
reves_metadata["belongs_to_collection"].head(10)

0     {'id': 10194, 'name': 'Toy Story Collection', ...
1                                                   NaN
3                                                   NaN
4     {'id': 96871, 'name': 'Father of the Bride Col...
5                                                   NaN
8                                                   NaN
9     {'id': 645, 'name': 'James Bond Collection', '...
10                                                  NaN
12    {'id': 117693, 'name': 'Balto Collection', 'po...
13                                                  NaN
Name: belongs_to_collection, dtype: object

It seems that this column contains 2 different types of values: 

- If, indeed, the movie belongs to a series or a collection, the row contains a dict with the Id and the Name of that collection. 
- Otherwise, if the movie does not belong to a collection, the value is NaN. 

For our model, the valuable information is to know whether a movie belongs to a collection or no, because our assumption is that if it does, the expected revenue could be higher. 

Values different from NaN mean that they actually belong to a collection, so this could be simplified if translated into a dummie column where 0 means "Doesn't belong to collection" and 1 "Does belong to a collection". 

We will also create a  new column named "Collection info" with the id of the collection, in order to use it in the future if necessary. 

In [13]:
type(reves_metadata["belongs_to_collection"][0])

str

In [14]:
type(reves_metadata["belongs_to_collection"][1])

float

We have different type of data in this Pandas Series:
- Dict values --> str
- NaN values --> float

In [15]:
reves_metadata["collection"]=reves_metadata["belongs_to_collection"].copy()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [16]:
reves_metadata["belongs_to_collection"][1]

nan

In [None]:
for index,value in reves_metadata["belongs_to_collection"].items():
    reves_metadata["belongs_to_collection"][index]=value
    if type (value) == str:
        value=1
        print(value)
    else:
        value=0
        print(value)
    

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  exec(code_obj, self.user_global_ns, self.user_ns)


1
0
0
1
0
0
1
0
1
0
0
0
0
0
1
0
1
0
0
0
0
0
0
1
0
0
1
0
0
0
1
0
0
0
0
1
1
1
0
0
0
0
1
0
0
0
0
0
0
0
0
0
0
1
0
0
0
0
0
0
0
0
0
0
1
0
0
0
1
1
0
1
0
0
0
0
1
0
1
0
0
0
0
0
0
0
0
0
1
0
1
0
0
1
0
1
1
0
1
0
0
0
0
0
1
0
0
1
0
1
0
0
1
0
0
1
0
0
0
1
0
0
0
0
0
0
1
0
0
1
0
0
0
0
0
0
0
0
0
0
1
0
0
0
0
0
0
0
0
0
0
1
0
0
1
0
0
0
1
0
0
0
0
0
0
0
1
0
0
1
0
1
0
1
0
0
0
0
0
0
1
0
1
1
1
0
0
1
1
0
0
0
0
0
0
1
1
0
1
0
0
0
1
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
1
1
0
0
0
1
1
0
0
0
0
0
1
0
0
0
0
0
0
0
0
0
0
1
0
0
0
0
0
0
0
0
0
0
0
1
0
0
0
0
0
0
0
1
0
0
0
0
1
0
0
0
0
0
0
0
0
0
0
0
0
0
1
0
1
1
0
1
1
0
1
0
0
0
0
1
0
1
1
0
0
0
1
0
0
0
0
1
1
0
0
1
1
1
0
0
0
0
0
0
0
0
0
0
0
0
0
0
1
1
0
0
0
0
0
1
1
0
1
0
0
0
1
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
1
1
0
1
0
0
0
1
1
0
0
0
0
0
1
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
1
0
0
0
0
0
0
0
0
0
0
0
0
1
0
0
0
1
0
0
0
0
0
0
0
0
0
0
0
0
0
1
0
1
0
0
0
1
0
0
0
0
0
0
1
0
1
1
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
1
0
0
1
1
0
0
0
0
0
0
0
0
0
0
0
0
1
0
0
0
1
1
0
0
0
0
0
0
0
0
0
0
0
0
0


In [19]:
reves_metadata["belongs_to_collection"].head()

0    {'id': 10194, 'name': 'Toy Story Collection', ...
1                                                  NaN
3                                                  NaN
4    {'id': 96871, 'name': 'Father of the Bride Col...
5                                                  NaN
Name: belongs_to_collection, dtype: object

In [None]:
#hint! Use replace(what,with)

#### 3. Budget

First, basic data exploring

In [28]:
reves_metadata["budget"].head()

0    30000000
1    65000000
3    16000000
4           0
5    60000000
Name: budget, dtype: object

In [29]:
reves_metadata["budget"].describe()

count     7408
unique     705
top          0
freq      2027
Name: budget, dtype: object

Surprise! Seems like values are not numeric! We should fix that.

In [30]:
reves_metadata["budget"]=pd.to_numeric(reves_metadata["budget"], errors="coerce")
reves_metadata["budget"]=reves_metadata["budget"].replace(0,np.nan)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [31]:
reves_metadata["budget"].dtype

dtype('float64')

Now we have to analyze, out of the 7408, how many movies contain budget information in order to determine if we can use this variable in our model. As it happened with "Revenue" seems like we have not nulls, but we have false positives, since a 0 budget is null for us and a perfectly "not null" value for Python. This column should be numeric and $0 budget should be Nan.

In [69]:
reves_metadata[reves_metadata["budget"]>0]["budget"].count()

5381

In [32]:
reves_metadata["budget"].sample(5)

900      1688000.0
3894    43000000.0
8057     2500000.0
4626    40000000.0
2292    25000000.0
Name: budget, dtype: float64

Out of 7404 movies, 5381 have a positive value for budget. Is that enough data for our model?
Also, taking a deeper look at the values, seems that most of them are exact and round amounts, and they don't seem very accurate. Should we use this data in our model?

#### 4. Genres

In [28]:
reves_metadata["genres"].sample(5)

21841    [{'id': 18, 'name': 'Drama'}, {'id': 27, 'name...
12299    [{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...
2603                        [{'id': 35, 'name': 'Comedy'}]
2870     [{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...
6489     [{'id': 35, 'name': 'Comedy'}, {'id': 10749, '...
Name: genres, dtype: object

In [33]:
reves_metadata["genres"][0]

"[{'id': 16, 'name': 'Animation'}, {'id': 35, 'name': 'Comedy'}, {'id': 10751, 'name': 'Family'}]"

In [34]:
reves_metadata["genres"][1]

"[{'id': 12, 'name': 'Adventure'}, {'id': 14, 'name': 'Fantasy'}, {'id': 10751, 'name': 'Family'}]"

In [35]:
reves_metadata["genres"][235]

"[{'id': 10749, 'name': 'Romance'}, {'id': 16, 'name': 'Animation'}, {'id': 10751, 'name': 'Family'}, {'id': 35, 'name': 'Comedy'}, {'id': 12, 'name': 'Adventure'}]"

Seems that in this column, we can find a list of dicts that indicate the different movie genres associated to each movie. As we can see, there are many different generes associated to each movie, but not always the same ammount of them. First of all, we should find out how many different genres we've got, in order to decide what to do and how to use this information. 

Basic questions we should ask ourselves about the column "Genres":
- Should we create dummie variables with all the genres? 
- What's the maxium numbers of genres per movie we want to use in our model 
- ...
- Maybe we shoul just keep the id_genre info, and save the dict of (index,genre) outside the main dataframe, just for querys.


First, let's find out how many different genres we have got, and what's the maximum number of genres any of the movies has

In [33]:
from ast import literal_eval
reves_metadata['genres'] = reves_metadata['genres'].fillna('[]').apply(literal_eval).apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [34]:
reves_metadata["genres"].head(10)

0          [Animation, Comedy, Family]
1         [Adventure, Fantasy, Family]
3             [Comedy, Drama, Romance]
4                             [Comedy]
5     [Action, Crime, Drama, Thriller]
8        [Action, Adventure, Thriller]
9        [Adventure, Action, Thriller]
10            [Comedy, Drama, Romance]
12      [Family, Animation, Adventure]
13                    [History, Drama]
Name: genres, dtype: object

In [35]:
genres_lenghts=[]
for line in reves_metadata["genres"]:
    lenght=len(line)
    genres_lenghts.append(lenght)
print("There are a max of %d genres per movie"%(max(genres_lenghts)))

There are a max of 8 genres per movie


How many genres per movie are we using for out model?

In [118]:
np.mean(genres_lenghts)

2.4966252699784017

Maybe we should set up the maximux on 3. It's a reasonable number of categories for a variable. In case we do that, we would be assuming that the first 3 genres mentioned in this column would be the most reliable, and the ones that best describe the movie's storyline.

Let's see how many genres we've got just in case we want to create dummie coluns for each of them. 

In [36]:
genres=[]
for line in reves_metadata["genres"]:
    genres.append(line)
flat_list=[item for sublist in genres for item in sublist]
for sublist in genres:
    for item in sublist:
        flat_list.append(item)
genres=list(set(flat_list))
print("There are %d unique genres"%(len(genres)))
print(genres)

There are 20 unique genres
['Horror', 'Fantasy', 'TV Movie', 'Animation', 'Drama', 'Romance', 'Science Fiction', 'History', 'Adventure', 'Family', 'Crime', 'Documentary', 'Foreign', 'Western', 'War', 'Comedy', 'Thriller', 'Action', 'Mystery', 'Music']


In [None]:
#we should do a barplot with the most common genres.

#### 5. Popularity

In [None]:
reves_metadata["popularity"].describe()

In [None]:
reves_metadata["popularity"].sample(5)

#### 6. Production Companies

In [122]:
reves_metadata["production_companies"].head(5)

0       [{'name': 'Pixar Animation Studios', 'id': 3}]
1    [{'name': 'TriStar Pictures', 'id': 559}, {'na...
3    [{'name': 'Twentieth Century Fox Film Corporat...
4    [{'name': 'Sandollar Productions', 'id': 5842}...
5    [{'name': 'Regency Enterprises', 'id': 508}, {...
Name: production_companies, dtype: object

Same as it happened with genres. Contains a list of dicts.

In [123]:
reves_metadata["production_companies"] = reves_metadata["production_companies"].fillna('[]').apply(literal_eval).apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [125]:
reves_metadata["production_companies"].head(5)

0                            [Pixar Animation Studios]
1    [TriStar Pictures, Teitler Film, Interscope Co...
3             [Twentieth Century Fox Film Corporation]
4         [Sandollar Productions, Touchstone Pictures]
5    [Regency Enterprises, Forward Pass, Warner Bros.]
Name: production_companies, dtype: object

In [133]:
production_companies=[]
for line in reves_metadata["production_companies"]:
    production_companies.append(line)
flat_list2=[item for sublist in production_companies for item in sublist]
for sublist in production_companies:
    for item in sublist:
        flat_list2.append(item)
production_companies=list(set(flat_list2))
print("There are %d unique production companies"%(len(production_companies)))
production_companies

There are 7091 unique production companies


['Chessman Park Productions',
 'Yari Film Group Releasing',
 'The Last Picture Company',
 'Whitewater Films',
 'Dentsu',
 'Centropolis Entertainment',
 'Investec Merchant Bank',
 'Rai 1',
 'Sovik Global Contents Fund',
 'Technical Black',
 'ABC Motion Pictures',
 'TV 1000',
 'Crab Apple Films',
 'Page 114',
 'Distant Horizon',
 'Filmax International',
 "Willie's Movies",
 'Stone Village Pictures',
 'Digital Image Associates',
 'Chevy Chase Films',
 'Spanish Ministry of Culture',
 'Niama Film',
 'TDJ Enterprises / New Dimensions Entertainment',
 'Mindshare Media',
 'New World Pictures',
 'Lonetree Entertainment',
 'Generalitat Valenciana',
 'Bad Robot',
 'New Horizon Picture Corp.',
 'British Film Council',
 'Emmitt Productions Limited',
 'Artfire Films',
 'Harris Company',
 'Huayi Brothers Media',
 'Treasure Company',
 'India Talkies',
 'Producers Sales Organization',
 'Red Hour Films',
 'Government of Saudi Arabia',
 'Vif Babelsberger Filmproduktion GmbH & Co. Zweite KG',
 'Top Shelf 

There are to many production companies for our model. Maybe we should get the most common ones, or the 20% that represent the 80% of the movies. 

#### 7. Production Countries

In [134]:
reves_metadata["production_countries"].sample(5)

1187     [{'iso_3166_1': 'US', 'name': 'United States o...
15581    [{'iso_3166_1': 'US', 'name': 'United States o...
886      [{'iso_3166_1': 'US', 'name': 'United States o...
24372    [{'iso_3166_1': 'US', 'name': 'United States o...
2877     [{'iso_3166_1': 'BS', 'name': 'Bahamas'}, {'is...
Name: production_countries, dtype: object

Almost same as before

In [38]:
#production_countries=[]
#for line in reves_metadata["production_countries"]:
#    production_countries.append(line)
#flat_list3=[item for sublist in production_countries for item in sublist]
#for sublist in production_countries:
#    for item in sublist:
#        flat_list3.append(item)
#production_countries=list(set(flat_list3))
#print("There are %d unique production countries"%(len(production_countries)))
#production_countries

#### 8. Release Date

Release Date can be a very significant variable in our model since we assume that the movies released nearby certain months or times of the year, can reach more revenue more easily. 

In [None]:
print(reves_metadata["release_date"].sample(5))
print("There are %d null values" % (reves_metadata["release_date"].isnull().sum()))

In [None]:
reves_metadata["release_date"][35207][8:11]

In [None]:
type(reves_metadata["release_date"][2255])

Info is stored in string format, so we should transform that to Date format, in order to be useful. 

We can also create new columns just storing the year and the month, since the day or the exact date wont be necessary

In [None]:
reves_metadata["release_date"]=reves_metadata["release_date"].apply(lambda x: str(x).replace("-","/"))
reves_metadata["release_date"]=reves_metadata["release_date"].apply(lambda x: pd.to_datetime(x, infer_datetime_format=True))

In [None]:
from datetime import datetime
#for line in reves_metadata:
    #str(reves_metadata["release_date"])
    #reves_metadata["release_date"].replace("-","/")
reves_metadata["release_date"]=reves_metadata["release_date"].apply(lambda x: datetime.strptime(x, '%Y-%m-%d'))

In [None]:
reves_metadata["release_year"]=reves_metadata["release date"][0:3]
reves_metadata["release_month"]=""
reves_metadata["release_day"]=""
def split_to_date(x):
    year=x[0:3]
    month=x[5:7]
    day=x[8:11]

#### 9. Runtime

In [None]:
reves_metadata["runtime"].sample(5)

In [None]:
reves_metadata["runtime"].describe()

Seems like runtime means how many days the movie has been running on cinemas. The min is 0, and that should mean that some movies have not been released. We should check if that's true, because if they have not been released, they shouldn't have revenue. 

#### 10. Title