In [None]:
import pandas as pd
import requests
import matplotlib.pyplot as plt
import matplotlib
import seaborn as sns
import numpy as np
from numpy import mean
import json
import gzip
from subprocess import check_output
import math
%matplotlib inline

In [None]:
# Load the three TMDb spreadsheets from ../data. In each file the first
# spreadsheet column is used as the DataFrame index (index_col=0).
production_df = pd.read_excel(
    '../data/tmdb_production_data.xlsx',
    index_col=0,
)
movie_df = pd.read_excel(
    '../data/tmdb_movie_data.xlsx',
    index_col=0,
)
# NOTE(review): genre_df is not referenced by the cells in this section.
genre_df = pd.read_excel(
    '../data/movie_genres_data.xlsx',
    index_col=0,
)

In [None]:
# Title and profit for every production whose revenue AND budget are both
# different from zero. The revenue/budget columns are only used for the
# filter and are not kept in the result.
# NOTE(review): zero presumably marks "not reported" in the source data -- confirm.
revenue = production_df.loc[
    lambda df: (df['movie_revenue'] != 0) & (df['movie_budget'] != 0),
    ['movie_title', 'movie_profits'],
]
revenue

In [None]:
# Ratings table restricted to titles with more than 1000 votes.
# NOTE(review): the 1000-vote cutoff presumably filters out averages based on
# few votes -- confirm the intended threshold.
# .copy() makes movie_votes an independent frame rather than a slice of
# movie_df, so later column assignments cannot raise SettingWithCopyWarning.
movie_votes = movie_df.loc[
    movie_df['movie_vote_count'] > 1000,
    ['movie_title', 'movie_avg_vote', 'movie_vote_count'],
].copy()
movie_votes

In [None]:
# Inner-join profits with ratings on the movie title, keeping only titles
# present in both tables. The key is spelled out so that a column added to
# both inputs later cannot silently become part of the join key.
# NOTE(review): titles are not guaranteed unique; titles repeated in either
# input produce one output row per matching pair.
rating_revenue = revenue.merge(movie_votes, how='inner', on='movie_title')
rating_revenue

In [None]:
# Independent snapshot of the merged table (DataFrame.copy is deep by
# default). It is taken before rating_revenue's profits are rescaled, so
# box_data keeps the unscaled movie_profits values.
box_data = rating_revenue.copy()
box_data

In [None]:
# Express profits in millions.
# NOTE: this overwrites the column in place, so running the cell a second
# time divides by one million again -- run it exactly once per kernel session.
rating_revenue['movie_profits'] = rating_revenue['movie_profits'].div(1_000_000)

In [None]:
# Mean profit (in millions, rounded to a whole number) for each distinct
# average-vote value, one row per rating.
# movie_profits is selected before aggregating: rating_revenue still carries
# the string column movie_title, and pandas >= 2.0 raises TypeError when
# asked to average a non-numeric column instead of silently dropping it.
data = (
    rating_revenue
    .groupby('movie_avg_vote')[['movie_profits']]
    .mean()
    .round(0)
    .reset_index()
    .rename(columns={
        'movie_avg_vote': 'Rating',
        'movie_profits': 'Average Profit (Millions)',
    })
)
data

In [None]:
# Per-rating average profits as plain Python ints, in the same row order as
# `data`; used for the overall average and for the bar labels.
means = list(map(int, data['Average Profit (Millions)']))
means

In [None]:
# Unweighted mean of the per-rating averages: every rating value counts
# once, regardless of how many movies share that rating.
avg_profit = np.mean(means)
avg_profit

In [None]:
# Number of movies at each average-vote value. After dropping the title,
# movie_vote_count is the only remaining column, so its non-null count per
# group is what gets reported.
movie_count = (
    movie_votes
    .drop(columns=['movie_title'])
    .groupby('movie_avg_vote')
    .count()
)
movie_count

In [None]:
# Bar chart: average profit (millions) for each rating value.
plt.figure(figsize=(14, 11))
sns.set(style="ticks", context="talk")
plt.style.use("dark_background")

# One colour per bar: red when the rating's average profit is below the
# overall average (avg_profit), yellow otherwise.
cols = [
    '#D61A1F' if (profit < avg_profit) else '#F9C31A'
    for profit in data['Average Profit (Millions)']
]

# NOTE(review): `ci=` and a bare `palette=` are deprecated in recent seaborn
# releases -- confirm against the installed version before changing them.
bar = sns.barplot(
    data=data,
    x='Rating',
    y='Average Profit (Millions)',
    palette=cols,
    ci=False,
)

# Write each bar's value 2.5 units above it. The bar index is used as the x
# coordinate, which relies on `means` being in the same order as `data`.
for position, value in enumerate(means):
    bar.text(
        position,
        value + 2.5,
        str(round(value, 0)),
        fontdict={'color': 'white', 'fontsize': 18},
        horizontalalignment='center',
    )

In [None]:
# Only profit and rating are needed for the box plot.
# NOTE: re-running this cell raises KeyError because the columns are already gone.
box_data = box_data.drop(
    columns=['movie_title', 'movie_vote_count'],
)

In [None]:
def custom_round(x):
    """Round ``x`` to the nearest multiple of 0.5.

    Used to bucket ratings into half-point steps (e.g. 7.3 -> 7.5,
    7.1 -> 7.0, 7.8 -> 8.0).

    Parameters
    ----------
    x : int or float
        Value to round.

    Returns
    -------
    float
        The nearest multiple of 0.5, always as a float so that applying the
        function over a column yields a single numeric type. NaN is returned
        unchanged.

    Notes
    -----
    Exact ties follow the built-in ``round`` (round-half-to-even on the
    doubled value): a fractional part of exactly .25 goes down to the whole
    number and exactly .75 goes up to the next whole number.
    """
    # round() cannot convert NaN to an integer; pass missing values through.
    if isinstance(x, float) and math.isnan(x):
        return x
    # Doubling, rounding to an integer and halving snaps to the 0.5 grid.
    return round(x * 2) / 2

In [None]:
# Bucket ratings with custom_round so the box plot gets one box per bucket,
# and express profits in billions.
# NOTE: the profit division overwrites the column in place, so running this
# cell a second time divides by one billion again.
box_data['movie_avg_vote'] = box_data['movie_avg_vote'].apply(custom_round)
box_data['movie_profits'] = box_data['movie_profits'].div(1_000_000_000)
box_data

In [None]:
# Horizontal box plot: distribution of profits (billions) per rating bucket.
plt.figure(figsize=(25, 11))
sns.set(style="ticks", context="talk")
plt.style.use("dark_background")

box = sns.boxplot(
    data=box_data,
    x='movie_profits',
    y='movie_avg_vote',
    orient='h',
    color='#F9C31A',
)
box.set(ylabel='TMDb Rating', xlabel='Movie Profits (Billions)')