In [None]:
import pandas as pd
import requests
import matplotlib.pyplot as plt
import matplotlib
import seaborn as sns
import numpy as np
from numpy import mean
import json
import gzip
from subprocess import check_output
import math
import ast
%matplotlib inline

In [None]:
# Load the four source spreadsheets; in each one the first column is used as
# the row index.
production_df, movie_df, genre_df, cast_df = (
    pd.read_excel(path, index_col=0)
    for path in (
        '../data/tmdb_production_data.xlsx',
        '../data/tmdb_movie_data.xlsx',
        '../data/movie_genres_data.xlsx',
        '../data/tmdb_cast_data.xlsx',
    )
)

In [None]:
# Keep only cast rows whose "known for" department is Directing.
# NOTE(review): "known for" may not mean the person directed the film on that
# row -- confirm against the cast data schema.
director_df = cast_df[cast_df['cast_known_for_department'].eq('Directing')]
director_df

In [None]:
# One row per director, with the list of movie ids found on their rows.
director_movie = (
    director_df[['movie_id', 'cast_name']]
    .groupby('cast_name')
    .agg({'movie_id': list})
    .reset_index()
    .rename(columns={'cast_name': 'Director'})
)
director_movie

In [None]:
# Title and budget per movie. Despite the variable name, the only monetary
# column selected here is the budget, not a profit figure.
movie_profits = production_df.loc[:, ['movie_title', 'movie_budget']]
movie_profits

In [None]:
# Lookup table pairing each movie id with its title.
movie_ids = movie_df.loc[:, ['movie_id', 'movie_title']]
movie_ids

In [None]:
# Attach movie ids to budgets. 'movie_title' is the only column the two frames
# share, so it is the join key; it is spelled out here for readability.
# NOTE(review): if titles are not unique, this join duplicates rows -- verify.
profit_ids = (
    movie_profits.merge(movie_ids, on='movie_title')
    .loc[lambda merged: merged['movie_budget'] != 0]  # drop zero budgets
    .drop(columns=['movie_title'])
)
profit_ids

In [None]:
# Join budgets with the genre table on whatever columns the two frames share
# (presumably 'movie_id' -- TODO confirm), then keep rows flagged as Animation.
profit_genre = pd.merge(profit_ids, genre_df)

animation_movies = profit_genre.loc[profit_genre['Animation'].eq(1)]
animation_movies

In [None]:
# The genre indicator columns are no longer needed once the Animation subset
# has been selected. Note: this cell overwrites its own input, so running it a
# second time raises KeyError because the columns are already gone.
animation_movies = animation_movies.drop(
    columns=[
        'Action', 'Adventure', 'Animation', 'Comedy', 'Crime', 'Drama',
        'Family', 'Fantasy', 'History', 'Horror', 'Music', 'Mystery',
        'Romance', 'Science Fiction', 'TV Movie', 'Thriller', 'War', 'Western',
    ]
)
animation_movies

In [None]:
# Mean budget across the animation subset, displayed in millions.
avg_ani_budget = animation_movies.loc[:, 'movie_budget'].mean()
round(avg_ani_budget / 1_000_000, 0)

In [None]:
# Build one row per (animation movie, director) pair.
#
# Invert the director -> [movie ids] table once into a movie id -> [directors]
# lookup, instead of rescanning the whole director table for every film.
directors_by_movie = {}
for director, directed_ids in zip(director_movie['Director'], director_movie['movie_id']):
    # dict.fromkeys de-duplicates while keeping order, so a director whose list
    # contains the same movie id twice still produces a single row for that film.
    for directed_id in dict.fromkeys(directed_ids):
        directors_by_movie.setdefault(directed_id, []).append(director)

repeated_rows = [
    {'movie_id': movie_id, 'movie_budget': budget, 'Director': director}
    for movie_id, budget in zip(animation_movies['movie_id'], animation_movies['movie_budget'])
    for director in directors_by_movie.get(movie_id, [])
]

# Passing the columns explicitly keeps the expected schema even when no film
# matched any director, so later column-based operations do not raise KeyError.
animation_movies_repeated = pd.DataFrame(
    repeated_rows, columns=['movie_id', 'movie_budget', 'Director']
)

In [None]:
# Display the (movie, budget, director) rows built in the previous cell.
animation_movies_repeated

In [None]:
# The movie id is not needed for the per-director aggregation that follows.
# Note: this cell overwrites its own input, so re-running it raises KeyError.
animation_movies_repeated = animation_movies_repeated.drop(columns=['movie_id'])

In [None]:
# Count how many rows (films) each director has in the animation subset.
movies_directed = (
    animation_movies_repeated.groupby('Director')
    .size()
    .reset_index(name='Movies Directed')
)
movies_directed

In [None]:
# Average budget per director, expressed in millions and rounded to a whole
# number.
# Note: this cell overwrites its own input, so running it twice divides the
# already-scaled values by a million again.
animation_movies_repeated = (
    animation_movies_repeated.groupby('Director')
    .mean()
    .reset_index()
    .assign(movie_budget=lambda avg: (avg['movie_budget'] / 1_000_000).round(0))
)
animation_movies_repeated

In [None]:
# Combine the average budget and the film count for each director.
# NOTE(review): the join uses whatever columns the two frames share, which is
# presumably only 'Director' -- confirm.
director_data = pd.merge(animation_movies_repeated, movies_directed)
director_data

In [None]:
# Directors to compare in the final table and chart.
# (The variable name is a misspelling of "candidates"; it is left unchanged
# here because the filtering cell refers to it by this name.)
canidates = ['Brad Bird', 'Lee Unkrich', 'Stephen J. Anderson']

In [None]:
# Restrict to the chosen directors, give the budget column a presentable name,
# and order from lowest to highest average budget.
result = (
    director_data.loc[director_data['Director'].isin(canidates)]
    .rename(columns={'movie_budget': 'Average Budget (Millions)'})
    .sort_values(by='Average Budget (Millions)', ascending=True)
)
result

In [None]:
# Whole-number average budgets, in the same row order as `result`; these are
# used as the text labels above the bars.
means = [int(value) for value in result['Average Budget (Millions)']]
means

In [None]:
# Bar chart of the average animation budget for each selected director.
#
# Apply the theme *before* creating the figure: rcParams such as the figure
# face colour are read when the figure is created, so creating it first made
# the first run on a fresh kernel look different from a re-run.
sns.set(style="ticks", context="talk")
plt.style.use("dark_background")
plt.figure(figsize=(14,11))

palette = {'Brad Bird':'#485c70', 'Lee Unkrich':'#630330', 'Stephen J. Anderson':'#D61A1F'}

# ci=None is the documented way to switch off error bars (False only worked
# because it was treated as the number 0).
bar = sns.barplot(x='Director', y='Average Budget (Millions)', data=result, palette=palette, ci=None);

# Write each bar's value just above it. `means` is in the same order as the
# rows of `result`, which is the order the bars are drawn in.
for position, value in enumerate(means):
    bar.text(position, value + 1.5, str(round(value, 0)),
            fontdict = dict(color='white', fontsize=18),
            horizontalalignment = 'center')