# Import Libraries

In [1]:
# Linear algebra
import numpy as np
#Storage data
import pandas as pd
# Safely evaluate strings that contain Python literals (used to parse the list/dict columns)
from ast import literal_eval
# Statistical data visualization
import seaborn as sns
# Generate plots
import matplotlib.pyplot as plt
# Generate interactive plots
#!pip install plotly==5.8.0 
from plotly.offline import init_notebook_mode, plot, iplot
import plotly.graph_objs as go
# Convert String in Datetime 
from datetime import datetime

## Load Main Dataset from CSV

In [None]:
# Dataset source: https://www.kaggle.com/datasets/rounakbanik/the-movies-dataset?select=movies_metadata.csv

# Read the CSV, skipping the rows that contain an error in the dataset
movies = pd.read_csv('movies_metadata.csv',
                     skiprows=[19731, 29504, 35588])

# Extract the genre names into a list: every `genres` cell is parsed with
# literal_eval and only the 'name' entry of each parsed item is kept
movies['genres'] = movies['genres'].apply(lambda x: [i['name'] for i in literal_eval(x)])

# Add the year each movie was released.
# errors='coerce' turns unparseable dates into NaT; the vectorized `.dt.year`
# accessor then yields NaN for those rows instead of calling a lambda per row.
movies['release_date'] = pd.to_datetime(movies['release_date'], errors='coerce')
movies['year'] = movies['release_date'].dt.year
movies.head()

## Dataset info

In [None]:
# Print a summary of the loaded frame (columns, non-null counts, dtypes)
movies.info()

### Filling percentage of each column of dataset

In [None]:
# One row per column of `movies`: its name, how many NaN cells it has,
# and the percentage of rows in which it is filled
missing_columns = (
    movies.isnull()
    .sum()
    .rename_axis('variables')
    .reset_index(name='nan_values')
)
missing_columns['filling_factor'] = (
    (len(movies) - missing_columns['nan_values']) / len(movies) * 100
)
missing_columns

In [None]:
# Bar chart of the filling percentage of every column.
# A title and axis labels are set so the figure can be read on its own,
# consistent with the other charts in this notebook.
fig = go.Figure(
    data=[go.Bar(x=missing_columns['variables'],
                 y=missing_columns['filling_factor'])],
    layout={'title': 'Filling percentage of each column',
            'xaxis': {'title': 'Column'},
            'yaxis': {'title': 'Filled values (%)'}})

fig.show()

The good news is that the variables we want to use for this notebook have few NaN values.

## When did the movies hit the big screen?

In [None]:
# Count how many movies were released each year, ordered from the oldest year.
# value_counts() leaves out rows whose year is missing (NaN).
movies_per_year = movies['year'].value_counts().sort_index()

# Scatter trace: release year on x, number of films on y
sc = go.Scatter(x=movies_per_year.index,
                y=movies_per_year.values,
                marker={'color': '#1d00db'})

# Layout. The title reports the number of films that are actually plotted
# (films with a known release year), not the total number of rows.
lyt = {'title': f"{movies_per_year.sum()} Films classified by release year from this Dataset",
       'xaxis': {'title': 'Release Year'},
       'yaxis': {'title': 'Films'}}

# Build and display the figure (fig.show() as in the rest of the notebook)
fig = go.Figure(data=[sc], layout=lyt)
fig.show()

(Release Year, number of films made that year )

## Most popular original language of Films

In [None]:
# Number of films per original language, most frequent first
langs = movies['original_language'].value_counts()
# Keep the ten most frequent languages. `.iloc` makes the positional slicing
# explicit, because the Series is indexed by language code, not by position.
or_languages_film = pd.DataFrame(langs.iloc[:10])
# Collapse every remaining language into a single extra row
or_languages_film.loc['Other languages', :] = langs.iloc[10:].sum()
# Bare last expression -> rich notebook display instead of plain-text print()
or_languages_film

In [None]:
# Pie chart with one slice per row of the language table
lyt = dict(title='Films by language')
sc = go.Pie(
    labels=or_languages_film.index,
    values=or_languages_film.values.flatten(),
)

fig = go.Figure(data=[sc], layout=lyt)
fig.show()

## Most Popular Categories of Films

In [None]:
# Count how many films belong to each genre.
# explode() turns every element of the per-film genre lists into its own row;
# films with an empty list become NaN, which value_counts() leaves out.
films_category = movies['genres'].explode().value_counts()
films_category

In [None]:
# Horizontal bar chart: one bar per genre, bar length = number of films
lyt = dict(
    title='Films by category',
    xaxis=dict(title='Number of Films'),
    yaxis=dict(title='Categories'),
)
sc = go.Bar(
    orientation='h',
    x=films_category.values,
    y=films_category.index,
)

fig = go.Figure(data=[sc], layout=lyt)
fig.show()

## Countries with the most film production

In [None]:
def _parse_country_names(raw):
    """Return the list of country names stored in one `production_countries` cell.

    Parameters
    ----------
    raw : str, list or NaN
        The raw cell value. A string is parsed with literal_eval and the
        'name' entry of each parsed item is kept.

    Returns
    -------
    list
        Country names; ['unknown'] when the cell is missing.
    """
    # Already parsed (the cell was executed before): leave untouched so that
    # re-running this cell does not fail inside literal_eval
    if isinstance(raw, list):
        return raw
    if pd.isna(raw) or raw == 'unknown':
        return ['unknown']
    return [country['name'] for country in literal_eval(raw)]


movies['production_countries'] = movies['production_countries'].apply(_parse_country_names)

In [None]:
# Count how many films list each country among its production countries.
# explode() gives every list element its own row; empty lists become NaN,
# which value_counts() leaves out.
production_countries = movies['production_countries'].explode().value_counts()
production_countries

In [None]:
# Horizontal bar chart restricted to the first 15 entries of the country counts
lyt = dict(
    title='15 countries with the most film production ',
    xaxis=dict(title='Number of Films produced'),
    yaxis=dict(title='Countries'),
)
sc = go.Bar(
    orientation='h',
    x=production_countries.iloc[:15].values,
    y=production_countries.iloc[:15].index,
)

fig = go.Figure(data=[sc], layout=lyt)
fig.show()

### Number of films by category for each country

In [None]:
# Helper that counts the film categories produced by a given country and plots them
def mostCategoresProducedByCountry(country: str):
    """Show a bar chart of the number of films per category for one country.

    Reads the global `movies` frame, keeps the films whose
    `production_countries` list contains `country`, counts the entries of
    their `genres` lists and displays a horizontal bar chart.

    Parameters
    ----------
    country : str
        Country name, compared for equality against the entries of
        `production_countries`.

    Returns
    -------
    None
        The figure is displayed with `show()`.
    """
    # Boolean mask: films that list `country` among their production countries
    produced_here = movies['production_countries'].apply(
        lambda countries: country in countries)
    # Count the categories. explode() also copes with an empty selection
    # (no film for `country`), which yields an empty chart instead of an error.
    cat_counts = movies.loc[produced_here, 'genres'].explode().value_counts()

    # Horizontal bar chart, one bar per category
    scx = go.Bar(x=cat_counts.values, y=cat_counts.index, orientation='h')

    lytx = {'title': f'Films by category : {country}',
            'xaxis': {'title': 'Number of Films'},
            'yaxis': {'title': 'Categories'}}

    figx = go.Figure(data=[scx], layout=lytx)

    figx.show()


### United States of America

In [None]:
# Category counts for films whose production countries include 'United States of America'
mostCategoresProducedByCountry('United States of America')

### United Kingdom

In [None]:
# Category counts for films whose production countries include 'United Kingdom'
mostCategoresProducedByCountry('United Kingdom')

### France

In [None]:
# Category counts for films whose production countries include 'France'
mostCategoresProducedByCountry('France')

### Correlations in Movies Metadata

In [None]:
# Correlations are only defined for numeric data. `movies` also holds text and
# list columns, so the numeric/boolean columns are selected explicitly;
# calling corr() on the whole frame raises on pandas >= 2.0.
numeric_movies = movies.select_dtypes(include=['number', 'bool'])

fig_corr, ax_corr = plt.subplots(figsize=(15, 8))
heatmap = sns.heatmap(numeric_movies.corr(), annot=True, cmap='YlGnBu', ax=ax_corr)
heatmap.set_title('Correlation Heatmap', fontdict={'fontsize':16}, pad=12)
plt.show()

## Relating the credits dataset to the Movies Metadata

In [None]:
# Load the credits table and preview its first rows
credits = pd.read_csv('credits.csv')
credits.head()

In [None]:
# Auxiliary function that returns the director of a movie and their gender code
def getDirTuple(x: list):
    """Return ``[name, gender]`` of the first crew member whose job is 'Director'.

    Parameters
    ----------
    x : list of dict
        Parsed crew entries; each entry is looked up by the keys 'job',
        'name' and 'gender'.

    Returns
    -------
    list
        ``[name, gender]`` of the first director found, or ``[np.nan, 0]``
        when no entry has the job 'Director'. A director entry that lacks
        'name' or 'gender' falls back to ``np.nan`` / ``0`` for that field.
    """
    for member in x:
        # .get() tolerates crew entries that lack a key instead of raising KeyError
        if member.get('job') == 'Director':
            return [member.get('name', np.nan), member.get('gender', 0)]
    return [np.nan, 0]

# The meaning of the gender codes comes from: https://towardsdatascience.com/bechdel-test-comparing-female-representation-metrics-in-movies-6cbade15010f
def getGender(x: int):
    """Translate a numeric gender code into its label.

    Parameters
    ----------
    x : int
        Gender code: 2, 1 or 0.

    Returns
    -------
    str or None
        'male' for 2, 'female' for 1, 'unknown' for 0, and None for any
        other value.
    """
    labels = {2: 'male', 1: 'female', 0: 'unknown'}
    # .get() makes the "no match -> None" outcome explicit
    return labels.get(x)
    
# Parse every `crew` string and keep the [name, gender code] pair of its director
credits['director_ls'] = credits['crew'].map(
    lambda raw_crew: getDirTuple(literal_eval(raw_crew))
)
# Spread the two-element lists over two columns
# https://stackoverflow.com/questions/35491274/split-a-pandas-column-of-lists-into-multiple-columns
credits['director_name'] = credits['director_ls'].str.get(0)
# The numeric gender code is converted to its text label right away
credits['director_gender'] = credits['director_ls'].str.get(1).map(getGender)
# Names of all cast members of each film
credits['actors'] = credits['cast'].map(
    lambda raw_cast: [member['name'] for member in literal_eval(raw_cast)]
)
credits.head()

In [None]:
# Join the credits columns onto the movie metadata through the shared `id` column.
# Note: `movies` is overwritten, so running this cell twice merges credits
# into the already-merged frame.
movies = pd.merge(movies, credits, on='id')
movies.head(n=2)

### Percentage according to the gender of the director

In [None]:
# Pie chart with one slice per director gender label
gender_values = movies['director_gender'].value_counts()
lyt = dict(title='Percentage according to the gender of the director')
sc = go.Pie(
    labels=gender_values.index,
    values=gender_values.values,
)
fig = go.Figure(data=[sc], layout=lyt)
fig.show()

## Highest-grossing films (not adjusted for inflation)

In [None]:
# Seven films with the largest revenue. nlargest() is used in place of
# sorting the whole frame by revenue and taking the head.
index_revenues = movies['revenue'].nlargest(7).index
most_revenues = movies.loc[index_revenues, ['original_title', 'revenue']]
most_revenues

In [None]:
# Horizontal bar chart: one bar per film, bar length = revenue
lyt = dict(
    title='Highest-grossing films (without adjusted for inflation)',
    xaxis=dict(title='Revenue'),
    yaxis=dict(title='Title of film'),
)
sc = go.Bar(
    orientation='h',
    x=most_revenues['revenue'],
    y=most_revenues['original_title'],
)

fig = go.Figure(data=[sc], layout=lyt)
fig.show()

In [None]:
# Most Popular Director

In [None]:
# Number of rows of `movies` per director name, most frequent first
n_director = movies['director_name'].value_counts()
n_director

In [None]:
# TODO: perform inference
# - Rank directors by the average rating of their movies
# - Percentage of male and female actors in movies
# - Percentage of major movies that have spent money to make

