# Import Libraries

In [1]:
# Linear algebra
import numpy as np
# Data storage and manipulation
import pandas as pd
# Safely evaluate a string containing a Python literal (used to parse the genres column)
from ast import literal_eval
# Statistical data visualization
import seaborn as sns
# Generate plots
import matplotlib.pyplot as plt
# Generate interactive plots
from plotly.offline import init_notebook_mode, plot, iplot
import plotly.graph_objs as go
# Convert strings to datetime
from datetime import datetime

## Load Dataset from CSV

In [2]:
# Read the CSV, skipping three rows that contain errors in the dataset
movies = pd.read_csv('movies_metadata.csv',
                     skiprows=[19731, 29504, 35588])

## Extract genres as a list of names
# Each raw value is parsed with literal_eval (no arbitrary code execution)
# and the 'name' entry of every parsed item is kept.
movies['genres'] = movies['genres'].apply(lambda x: [i['name'] for i in literal_eval(x)])

## Add the year the movie was released
# Unparseable dates are coerced to NaT, so `year` is NaN for those rows.
movies['release_date'] = pd.to_datetime(movies['release_date'], errors='coerce')
# Vectorized .dt accessor instead of a per-row Python lambda
movies['year'] = movies['release_date'].dt.year
movies.head()

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,year
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[Animation, Comedy, Family]",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0,1995.0
1,False,,65000000,"[Adventure, Fantasy, Family]",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0,1995.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[Romance, Comedy]",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0,1995.0
3,False,,16000000,"[Comedy, Drama, Romance]",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0,1995.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,[Comedy],,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0,1995.0


## Dataset info

In [3]:
# Print column names, non-null counts and dtypes of the loaded frame
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45463 entries, 0 to 45462
Data columns (total 25 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   adult                  45463 non-null  bool          
 1   belongs_to_collection  4491 non-null   object        
 2   budget                 45463 non-null  int64         
 3   genres                 45463 non-null  object        
 4   homepage               7779 non-null   object        
 5   id                     45463 non-null  int64         
 6   imdb_id                45446 non-null  object        
 7   original_language      45452 non-null  object        
 8   original_title         45463 non-null  object        
 9   overview               44509 non-null  object        
 10  popularity             45460 non-null  float64       
 11  poster_path            45077 non-null  object        
 12  production_companies   45460 non-null  object        
 13  p

In [4]:
## Dataset summary

In [5]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
movies.describe()

Unnamed: 0,budget,id,popularity,revenue,runtime,vote_average,vote_count,year
count,45463.0,45463.0,45460.0,45460.0,45203.0,45460.0,45460.0,45376.0
mean,4224579.0,108359.918813,2.921478,11209350.0,94.128199,5.618207,109.897338,1991.881193
std,17424130.0,112460.749278,6.005414,64332250.0,38.40781,1.924216,491.310374,24.05536
min,0.0,2.0,0.0,0.0,0.0,0.0,0.0,1874.0
25%,0.0,26449.5,0.385948,0.0,85.0,5.0,3.0,1978.0
50%,0.0,60003.0,1.127685,0.0,95.0,6.0,10.0,2001.0
75%,0.0,157328.0,3.678902,0.0,107.0,6.8,34.0,2010.0
max,380000000.0,469172.0,547.488298,2787965000.0,1256.0,10.0,14075.0,2020.0


## When did the movies hit the big screen?

In [10]:
# Count how many movies were released each year, ordered from the oldest year
movies_per_year = movies['year'].value_counts().sort_index()

## Build the scatter trace
sc = go.Scatter(x=movies_per_year.index,
                y=movies_per_year.values,
                marker={'color': '#1d00db'})

## Build the layout
# value_counts() drops rows with a missing year, so the title reports the
# number of films actually plotted rather than the total number of rows.
films_with_year = int(movies_per_year.sum())
lyt = {'title': f"{films_with_year} Films classified by release year ",
       'xaxis': {'title': 'Release Year'},
       'yaxis': {'title': 'Films'}}

## Generate plot
fig = go.Figure(data=[sc], layout=lyt)
iplot(fig)


(Release year, number of films released that year)

## Most Popular Film Categories

In [26]:
# Count how often each genre appears across all movies.
# explode() yields one row per genre; a movie with an empty genre list becomes
# NaN, which value_counts() drops by default.
films_category = movies['genres'].explode().value_counts()
films_category

Drama              20265
Comedy             13182
Thriller            7624
Romance             6735
Action              6596
Horror              4673
Crime               4307
Documentary         3932
Adventure           3496
Science Fiction     3049
Family              2770
Mystery             2467
Fantasy             2313
Animation           1935
Foreign             1622
Music               1598
History             1398
War                 1323
Western             1042
TV Movie             767
dtype: int64

In [31]:
# Horizontal bar trace: one bar per genre, bar length = number of films
bar = go.Bar(x=films_category.values,
             y=films_category.index,
             orientation='h')

# Title and axis labels so the figure can be read on its own
bar_lyt = {'title': 'Films per genre',
           'xaxis': {'title': 'Films'},
           'yaxis': {'title': 'Genre'}}

# Generate figure
fig = go.Figure(data=[bar], layout=bar_lyt)
fig.show()