In [83]:
# Shared display switch: every plotting cell passes this value on as
# view.display(..., inline=inline). Setting it to True should display graphs inline.
inline = False

In [69]:
'''
Plot Movie Budget vs Expert Ratings
'''
import pandas as pd
import altair as alt
import altair_viewer as view

alt.data_transformers.disable_max_rows()

# Load the data from the CSV file
data = pd.read_csv('data/master_dataset.csv')

# Keep movies whose budget lies between $10,000 and $300,000,000 (inclusive).
# The lower bound already excludes zero budgets, so no separate check is needed.
data = data[(data['budget'] >= 10000) & (data['budget'] <= 300000000)]

# Keep only movies with a positive expert rating (this also drops missing ratings)
data = data.query('RT_expert_rating > 0')

# Colour assigned to each tomatometer_status label
color_scheme = {
    'Certified Fresh': '#E0B713',
    'Fresh': '#b30000',
    'Rotten': '#444444'
}

# Pin the scale's domain as well as its range so every label is tied to its
# colour explicitly instead of relying on the order Altair infers from the data.
status_scale = alt.Scale(
    domain=list(color_scheme.keys()),
    range=list(color_scheme.values())
)

# Create the scatter plot
chart = alt.Chart(data).mark_point().encode(
    x=alt.X('budget', axis=alt.Axis(title='Budget')),
    y=alt.Y('RT_expert_rating', axis=alt.Axis(title='Expert Rating')),
    color=alt.Color('tomatometer_status:N', legend=alt.Legend(title='Tomato Status'), scale=status_scale),
    tooltip=['title', 'budget', 'RT_expert_rating']
).properties(
    title='Movie Budget vs Expert Ratings',
    width=800,
    height=400
).interactive()

# Display the chart
view.display(chart, inline=inline)



In [70]:
'''
Plot Movie Budget vs User Ratings
'''

import pandas as pd
import altair as alt
import altair_viewer as view

# This scatter plot embeds one row per movie, so lift Altair's default row
# limit here rather than relying on another cell having done it already.
alt.data_transformers.disable_max_rows()

# Load the data from the CSV file
data = pd.read_csv('data/master_dataset.csv')

# Keep movies whose budget lies between $10,000 and $300,000,000 (inclusive).
# The lower bound already excludes zero budgets, so no separate check is needed.
data = data[(data['budget'] >= 10000) & (data['budget'] <= 300000000)]

# Keep only movies with a positive audience rating and a known audience status
data = data.query('audience_rating > 0')
data = data.dropna(subset=['audience_status'])

# Colour assigned to each audience_status label
color_scheme = {
    'Spilled': '#444444',
    'Upright': '#b30000'
}

# Pin the scale's domain as well as its range so every label is tied to its
# colour explicitly instead of relying on the order Altair infers from the data.
status_scale = alt.Scale(
    domain=list(color_scheme.keys()),
    range=list(color_scheme.values())
)

# Create the scatter plot
chart = alt.Chart(data).mark_point().encode(
    x=alt.X('budget', axis=alt.Axis(title='Budget')),
    y=alt.Y('audience_rating', axis=alt.Axis(title='User Rating')),
    color=alt.Color('audience_status:N', legend=alt.Legend(title='Tomato Status'), scale=status_scale),
    tooltip=['title', 'budget', 'audience_rating']
).properties(
    title='Movie Budget vs User Ratings',
    width=800,
    height=400
).interactive()

# Display the chart
view.display(chart, inline=inline)


In [71]:
''' 
Plot Revenue by Year
'''

import pandas as pd
import altair as alt
import altair_viewer as view

alt.data_transformers.disable_max_rows()

# Read the dataset and keep releases dated from 1930-01-01 up to, but not
# including, 2017-01-01.
data = pd.read_csv("data/master_dataset.csv").query(
    "release_date >= '1930-01-01' and release_date < '2017-01-01'"
)

# Encodings: one bar per release year; bar height and bar colour are both
# driven by the revenue summed over that year.
year_axis = alt.X(
    'year(release_date):T',
    axis=alt.Axis(title='Release Date'),
    scale=alt.Scale(domain=(1930, 2016)),
)
revenue_axis = alt.Y('sum(revenue)', axis=alt.Axis(title='Total Revenue'))
revenue_color = alt.Color('sum(revenue)', scale=alt.Scale(scheme='goldred'), legend=None)

# Assemble the bar chart from the encodings above
bar_chart = (
    alt.Chart(data)
    .mark_bar()
    .encode(
        x=year_axis,
        y=revenue_axis,
        color=revenue_color,
        tooltip=['year(release_date):T', 'sum(revenue)'],
    )
    .properties(title='Revenue by Year', width=800, height=400)
    .interactive()
)

# Show the chart
view.display(bar_chart, inline=inline)


In [72]:
''' 
Plot User Rating vs Expert Rating
'''

import pandas as pd
import altair as alt
import altair_viewer as view

# This scatter plot embeds one row per movie, so lift Altair's default row
# limit here rather than relying on another cell having done it already.
alt.data_transformers.disable_max_rows()

# load the data
data = pd.read_csv("data/master_dataset.csv")

# Filter out budgets that are zero
data = data.query('budget > 0')

# Drop rows whose RT_expert_rating is exactly 0 and rows with no audience_rating.
# NOTE(review): the chart below plots `user_rating`, but this filter checks
# `audience_rating` - confirm which column is intended.
data = data[(data['RT_expert_rating'] != 0) & (data['audience_rating'].notna())]

# create a scatter plot showing the relationship between user rating and expert rating
scatter_plot = alt.Chart(data).mark_point().encode(
    x=alt.X('user_rating', axis=alt.Axis(title='User Rating')),
    y=alt.Y('RT_expert_rating', axis=alt.Axis(title='Expert Rating')),
    tooltip=['title', 'user_rating', 'RT_expert_rating', 'budget']
).properties(
    width=800,
    height=400,
    title='User Rating vs Expert Rating'
)

# Display the chart
view.display(scatter_plot, inline=inline)


In [73]:
'''
Production Companies vs Total Revenue
'''

import pandas as pd
import altair as alt
import altair_viewer as view

alt.data_transformers.disable_max_rows()

# Number of production companies to plot. The chart title is derived from the
# number actually plotted, so the label and the data cannot drift apart.
top_n = 15

# Load the data from the CSV file
data = pd.read_csv('data/master_dataset.csv')

# Split the ", "-separated production company names and give each company its own row
data['production_companies'] = data['production_companies'].str.split(", ")
data = data.explode('production_companies')

# Total revenue per production company, highest first
revenue_by_company = data.groupby('production_companies')['revenue'].sum().sort_values(ascending=False)

# Keep only the rows belonging to the top companies by total revenue
top_producers = revenue_by_company.head(top_n).index.tolist()
data = data[data['production_companies'].isin(top_producers)]

# Colour scale whose domain follows the revenue ranking of the plotted companies
color_scale = alt.Scale(scheme='goldred', domain=top_producers)

# Create a chart for all selected production companies, sorting in descending order
chart = alt.Chart(data).mark_bar().encode(
    x=alt.X('production_companies:N', sort='-y', axis=alt.Axis(labelAngle=45, title='Production Companies')),
    y=alt.Y('sum(revenue):Q', axis=alt.Axis(title='Total Revenue')),
    color=alt.Color('production_companies:N', sort=alt.EncodingSortField('revenue', order='descending'),
                    scale=color_scale, legend=None),
    tooltip=['production_companies:N', 'sum(revenue):Q']
).properties(
    title=f'Top {len(top_producers)} Production Companies vs Total Revenue',
    width=800,
    height=400
).interactive()

# Display the chart
view.display(chart, inline=inline)

# Print the top 10 production companies based on total revenue
print("Top 10 production companies based on total revenue:")
print(revenue_by_company.head(10))



Top 10 production companies based on total revenue:
production_companies
Warner Bros.                              6.604922e+10
Universal Pictures                        6.194357e+10
Paramount Pictures                        5.409983e+10
Walt Disney Pictures                      5.266523e+10
Twentieth Century Fox Film Corporation    4.876887e+10
Columbia Pictures                         3.551452e+10
New Line Cinema                           2.294863e+10
Amblin Entertainment                      1.752602e+10
DreamWorks SKG                            1.636693e+10
Relativity Media                          1.574079e+10
Name: revenue, dtype: float64


In [74]:
''' 
Genre vs Average User Rating
'''
import altair as alt
import altair_viewer as view
import pandas as pd

# Read the dataset, split the ", "-separated genres and give each genre its own row
data = pd.read_csv('data/master_dataset.csv')
data = data.assign(genres=data['genres'].str.split(', ')).explode('genres')

# Genres ordered by how many rows carry them; keep only rows whose genre is in that index
top_genres = data.groupby('genres').size().sort_values(ascending=False).index
data = data[data['genres'].isin(top_genres)]

# One colour slot per genre, addressed by rank position
num_colors = len(top_genres)
color_scale = alt.Scale(scheme='goldred', domain=list(range(num_colors)))

# Mean user rating per genre, best first, with a fresh 0..n-1 index
genre_ratings = (
    data.groupby('genres')
    .agg({'user_rating': 'mean'})
    .reset_index()
    .sort_values('user_rating', ascending=False)
    .reset_index(drop=True)
)

# The position in the sorted table doubles as the genre's rank
genre_ratings['rank'] = genre_ratings.index

# Bar chart of the average user rating per genre, coloured by rank
genre_axis = alt.X('genres:N', sort='-y', axis=alt.Axis(labelAngle=45, title='Genres'))
rating_axis = alt.Y('user_rating:Q', axis=alt.Axis(title='Average User Rating'))
rank_color = alt.Color('rank:O', scale=color_scale, legend=None)

chart = (
    alt.Chart(genre_ratings)
    .mark_bar()
    .encode(x=genre_axis, y=rating_axis, color=rank_color)
    .properties(title='Genre vs Average User Rating', width=800, height=400)
    .interactive()
)

# Show the chart and report how many distinct genres remain
view.display(chart, inline=inline)
print(f"There are {len(data['genres'].unique())} unique genres in the data")








There are 20 unique genres in the data


In [75]:
'''
Genres vs Expert Ratings
'''

import pandas as pd
import altair as alt
import altair_viewer as view

# Read the dataset, split the ", "-separated genres and give each genre its own row
data = pd.read_csv('data/master_dataset.csv')
data = data.assign(genres=data['genres'].str.split(', ')).explode('genres')

# Genres ordered by how many rows carry them; keep only rows whose genre is in that index
top_genres = data.groupby('genres').size().sort_values(ascending=False).index
data = data[data['genres'].isin(top_genres)]

# One colour slot per genre, addressed by rank position
num_colors = len(top_genres)
color_scale = alt.Scale(scheme='goldred', domain=list(range(num_colors)))

# Mean expert rating per genre, best first, with a fresh 0..n-1 index
genre_ratings = (
    data.groupby('genres')
    .agg({'RT_expert_rating': 'mean'})
    .reset_index()
    .sort_values('RT_expert_rating', ascending=False)
    .reset_index(drop=True)
)

# The position in the sorted table doubles as the genre's rank
genre_ratings['rank'] = genre_ratings.index

# Bar chart of the average expert rating per genre, coloured by rank
genre_axis = alt.X('genres:N', sort='-y', axis=alt.Axis(labelAngle=45, title='Genres'))
rating_axis = alt.Y('RT_expert_rating:Q', axis=alt.Axis(title='Average Expert Rating'))
rank_color = alt.Color('rank:O', scale=color_scale, legend=None)

chart = (
    alt.Chart(genre_ratings)
    .mark_bar()
    .encode(x=genre_axis, y=rating_axis, color=rank_color)
    .properties(title='Genre vs Expert Rating', width=800, height=400)
    .interactive()
)

# Show the chart and report how many distinct genres remain
view.display(chart, inline=inline)
print(f"There are {len(data['genres'].unique())} unique genres in the data")


There are 20 unique genres in the data


In [76]:
''' 
Genre vs Total Revenue
'''

import pandas as pd
import altair as alt
import altair_viewer as view

# Read the dataset and reshape it to one row per (movie, genre):
#   - keep movies with positive revenue and a non-missing genres value
#   - split the comma-separated genres, one row per genre, whitespace trimmed
#   - keep a single row for each (genre, imdb_id) pair
data = (
    pd.read_csv("data/master_dataset.csv")
    .query('revenue > 0')
    .dropna(subset=['genres'])
    .assign(genres=lambda frame: frame['genres'].str.split(','))
    .explode('genres')
    .assign(genres=lambda frame: frame['genres'].str.strip())
    .drop_duplicates(subset=['genres', 'imdb_id'])
)

# Revenue summed per genre
genre_revenue_sum = data.groupby('genres')['revenue'].sum().reset_index()
print("Total Revenue:")
print(genre_revenue_sum)

# Bar chart of total revenue per genre, bars ordered by height
genre_axis = alt.X('genres:N', sort='-y', axis=alt.Axis(labelAngle=45, title='Genres'))
revenue_axis = alt.Y('revenue:Q', axis=alt.Axis(title='Total Revenue'))
genre_color = alt.Color(
    'genres:N',
    sort=alt.EncodingSortField('revenue', order='descending'),
    scale=alt.Scale(scheme='goldred', reverse=False),
    legend=None,
)

chart = (
    alt.Chart(genre_revenue_sum)
    .mark_bar()
    .encode(x=genre_axis, y=revenue_axis, color=genre_color, tooltip=['genres:N', 'revenue:Q'])
    .properties(title='Genre vs Total Revenue', width=800, height=400)
    .interactive()
)

# Show the chart
view.display(chart, inline=inline)



Total Revenue:
             genres       revenue
0            Action  2.013881e+11
1         Adventure  1.999787e+11
2         Animation  6.743297e+10
3            Comedy  1.668450e+11
4             Crime  6.337573e+10
5       Documentary  1.449112e+09
6             Drama  1.607544e+11
7            Family  1.070768e+11
8           Fantasy  1.039202e+11
9           Foreign  2.915363e+08
10          History  1.490220e+10
11           Horror  3.083709e+10
12            Music  1.337029e+10
13          Mystery  3.475461e+10
14          Romance  7.347319e+10
15  Science Fiction  9.784796e+10
16         TV Movie  4.200000e+07
17         Thriller  1.297246e+11
18              War  1.591046e+10
19          Western  5.122499e+09


In [77]:
''' 
Genre vs Average Revenue
https://vega.github.io/vega/docs/schemes/
'''
import pandas as pd
import altair as alt
import altair_viewer as view

alt.data_transformers.disable_max_rows()

# Read the dataset and reshape it to one row per (movie, genre):
#   - keep movies with positive revenue and a non-missing genres value
#   - split the comma-separated genres, one row per genre, whitespace trimmed
#   - keep a single row for each (genre, imdb_id) pair
data = (
    pd.read_csv("data/master_dataset.csv")
    .query('revenue > 0')
    .dropna(subset=['genres'])
    .assign(genres=lambda frame: frame['genres'].str.split(','))
    .explode('genres')
    .assign(genres=lambda frame: frame['genres'].str.strip())
    .drop_duplicates(subset=['genres', 'imdb_id'])
)

# Mean revenue per genre, highest first
genre_revenue = (
    data.groupby('genres')['revenue']
    .mean()
    .reset_index()
    .sort_values('revenue', ascending=False)
)

# Bar chart of average revenue per genre, bars ordered by height
genre_axis = alt.X('genres:N', sort='-y', axis=alt.Axis(labelAngle=45, title='Genres'))
revenue_axis = alt.Y('revenue:Q', axis=alt.Axis(title='Average Revenue'))
genre_color = alt.Color(
    'genres:N',
    sort=alt.EncodingSortField('revenue', order='descending'),
    scale=alt.Scale(scheme='goldred', reverse=False),
    legend=None,
)

chart = (
    alt.Chart(genre_revenue)
    .mark_bar()
    .encode(x=genre_axis, y=revenue_axis, color=genre_color, tooltip=['genres:N', 'revenue:Q'])
    .properties(title='Genre vs Average Revenue', width=800, height=400)
    .interactive()
)

# Show the chart
view.display(chart, inline=inline)



In [78]:
''' 
Top actors vs Revenue
'''
import pandas as pd
import altair as alt
import altair_viewer as view

# Read the dataset
data = pd.read_csv('data/master_dataset.csv')

# Row selectors for each actor tier flag, plus "none of the three flags set"
is_a_list = data['a_list'] == 1
is_top_100 = data['top_100'] == 1
is_top_1k = data['top_1k'] == 1
has_no_top_actor = (data['a_list'] == 0) & (data['top_100'] == 0) & (data['top_1k'] == 0)

# Mean revenue of the movies selected by each tier
alist_avg_revenue = data.loc[is_a_list, 'revenue'].mean()
top100_avg_revenue = data.loc[is_top_100, 'revenue'].mean()
top1k_avg_revenue = data.loc[is_top_1k, 'revenue'].mean()
no_top_avg_revenue = data.loc[has_no_top_actor, 'revenue'].mean()

# Bar colour per tier; the key order is also the left-to-right bar order
color_scheme = {
    'A List': '#b30000',
    'Top 100': '#E0B713',
    'Top 1K': '#13A3E0',
    'No Top Actors': '#444444'
}
tier_labels = list(color_scheme.keys())

# One row per tier for plotting
plot_data = pd.DataFrame({
    'actor_type': pd.Categorical(tier_labels, categories=tier_labels, ordered=True),
    'average_revenue': [alist_avg_revenue, top100_avg_revenue, top1k_avg_revenue, no_top_avg_revenue],
})

# Bar chart of average revenue per tier
tier_axis = alt.X('actor_type', title='Actor Type', axis=alt.Axis(labelAngle=0), sort=tier_labels)
revenue_axis = alt.Y('average_revenue', title='Average Revenue')
tier_color = alt.Color(
    'actor_type:N',
    legend=None,
    scale=alt.Scale(domain=tier_labels, range=list(color_scheme.values())),
)

chart = (
    alt.Chart(plot_data)
    .mark_bar()
    .encode(x=tier_axis, y=revenue_axis, color=tier_color, tooltip=['actor_type:N', 'average_revenue:Q'])
    .properties(title='Type of Actor vs Average Revenue', width=400, height=400)
    .interactive()
)

# Show the chart
view.display(chart, inline=inline)




In [79]:
'''
Top actors vs User_Ratings
'''
import pandas as pd
import altair as alt
import altair_viewer as view

# load data
data = pd.read_csv('data/master_dataset.csv')

# Every bar must average the same rating column, otherwise the bars are not
# comparable. Naming the column once keeps the four groups in sync.
rating_col = 'audience_rating'

# calculate average user rating for movies with A-list actors
alist_avg_rating = data[data['a_list'] == 1][rating_col].mean()

# calculate average user rating for movies with top 100 actors
top100_avg_rating = data[data['top_100'] == 1][rating_col].mean()

# calculate average user rating for movies with top 1k actors
top1k_avg_rating = data[data['top_1k'] == 1][rating_col].mean()

# calculate average user rating for movies without top actors
no_top_avg_rating = data[(data['a_list'] == 0) & (data['top_100'] == 0) & (data['top_1k'] == 0)][rating_col].mean()

# create a DataFrame to use for plotting
plot_data = pd.DataFrame({
    'actor_type': pd.Categorical(['A List', 'Top 100', 'Top 1K', 'No Top Actors'], categories=['A List', 'Top 100', 'Top 1K', 'No Top Actors'], ordered=True),
    'average_rating': [alist_avg_rating, top100_avg_rating, top1k_avg_rating, no_top_avg_rating],
})

# define the desired colors for each bar
color_scheme = {
    'A List': '#b30000',
    'Top 100': '#E0B713',
    'Top 1K': '#13A3E0',
    'No Top Actors': '#444444'
}

# create bar chart
chart = alt.Chart(plot_data).mark_bar().encode(
    x=alt.X('actor_type', title='Actor Type', axis=alt.Axis(labelAngle=0), sort=['A List', 'Top 100', 'Top 1K', 'No Top Actors']),
    y=alt.Y('average_rating', title='Average User Rating'),
    color=alt.Color('actor_type:N', legend=None, scale=alt.Scale(domain=list(color_scheme.keys()), range=list(color_scheme.values()))),
    tooltip=['actor_type:N', 'average_rating:Q']
).properties(
    title='Top Actors vs User Ratings',
    width=400,
    height=400
).interactive()


# display chart
view.display(chart, inline=inline)




In [80]:
'''
Top actors vs Expert_Ratings
'''
import pandas as pd
import altair as alt
import altair_viewer as view

# Read the dataset
data = pd.read_csv('data/master_dataset.csv')

# Row selectors for each actor tier flag, plus "none of the three flags set"
is_a_list = data['a_list'] == 1
is_top_100 = data['top_100'] == 1
is_top_1k = data['top_1k'] == 1
has_no_top_actor = (data['a_list'] == 0) & (data['top_100'] == 0) & (data['top_1k'] == 0)

# Mean expert rating of the movies selected by each tier
alist_avg_rating = data.loc[is_a_list, 'RT_expert_rating'].mean()
top100_avg_rating = data.loc[is_top_100, 'RT_expert_rating'].mean()
top1k_avg_rating = data.loc[is_top_1k, 'RT_expert_rating'].mean()
no_top_avg_rating = data.loc[has_no_top_actor, 'RT_expert_rating'].mean()

# Bar colour per tier; the key order is also the left-to-right bar order
color_scheme = {
    'A List': '#b30000',
    'Top 100': '#E0B713',
    'Top 1K': '#13A3E0',
    'No Top Actors': '#444444'
}
tier_labels = list(color_scheme.keys())

# One row per tier for plotting
plot_data = pd.DataFrame({
    'actor_type': pd.Categorical(tier_labels, categories=tier_labels, ordered=True),
    'average_rating': [alist_avg_rating, top100_avg_rating, top1k_avg_rating, no_top_avg_rating],
})

# Bar chart of average expert rating per tier
tier_axis = alt.X('actor_type', title='Actor Type', axis=alt.Axis(labelAngle=0), sort=tier_labels)
rating_axis = alt.Y('average_rating:Q', title='Average Expert Rating')
tier_color = alt.Color(
    'actor_type:N',
    legend=None,
    scale=alt.Scale(domain=tier_labels, range=list(color_scheme.values())),
)

chart = (
    alt.Chart(plot_data)
    .mark_bar()
    .encode(x=tier_axis, y=rating_axis, color=tier_color, tooltip=['actor_type:N', 'average_rating:Q'])
    .properties(title='Top Actors vs Expert Ratings', width=400, height=400)
    .interactive()
)

# Show the chart
view.display(chart, inline=inline)


In [81]:
''' 
Release Data vs Expert Rating
'''
import pandas as pd
import altair as alt
import altair_viewer as view
from scipy.signal import savgol_filter

# Load the data
data = pd.read_csv("data/master_dataset.csv")

# Convert release_date to datetime type
data['release_date'] = pd.to_datetime(data['release_date'])

# Keep releases dated from 1930-01-01 up to, but not including, 2017-01-01
data = data.query("release_date >= '1930-01-01' and release_date < '2017-01-01'")

# Calculate the mean expert rating for each year
avg_rating = data.groupby(pd.Grouper(key='release_date', freq='Y'))['RT_expert_rating'].mean(numeric_only=True).reset_index()

# A year with no rated movies has a NaN mean. Drop such years before smoothing
# so the NaN does not propagate through the filter window.
avg_rating = avg_rating.dropna(subset=['RT_expert_rating']).reset_index(drop=True)

# Smooth the yearly means with a Savitzky-Golay filter (window length 50, polynomial order 7)
avg_rating['RT_expert_rating'] = savgol_filter(avg_rating['RT_expert_rating'], 50, 7)


# Create a line graph showing average expert rating over the years
chart = alt.Chart(avg_rating).mark_line(color='#b30000').encode(
    x=alt.X('year(release_date):T', axis=alt.Axis(title='Release Date'), scale=alt.Scale(domain=(1930, 2016))),
    y=alt.Y('RT_expert_rating', axis=alt.Axis(title='Average Expert Rating')),
    tooltip=['year(release_date):T', 'RT_expert_rating'],
)

# set chart properties
chart = chart.properties(
    width=800,
    height=400,
    title='Average Expert Rating per Year'
)

# display the chart
view.display(chart, inline=inline)




In [82]:
'''
Plot Movie Budget vs Average Revenue
'''

import pandas as pd
import altair as alt
import altair_viewer as view
from scipy.signal import savgol_filter

# Read the dataset
data = pd.read_csv('data/master_dataset.csv')

# Keep movies budgeted between $10,000 and $40,000,000 (both ends inclusive)
# that also report a positive budget and a positive revenue.
budget_in_range = data['budget'].between(10000, 40000000)
data = data[budget_in_range].query('budget > 0').query('revenue > 0')

# Mean revenue for every distinct budget value, then smoothed with a
# Savitzky-Golay filter (window length 70, polynomial order 10).
avg_revenue = data.groupby('budget')['revenue'].mean().reset_index()
avg_revenue['revenue'] = savgol_filter(avg_revenue['revenue'], 70, 10)

# Line plot of the smoothed average revenue against budget
budget_axis = alt.X('budget', axis=alt.Axis(title='Budget'))
revenue_axis = alt.Y('revenue', axis=alt.Axis(title='Average Revenue'))

chart = (
    alt.Chart(avg_revenue)
    .mark_line(color='#E0B713', interpolate='bundle')
    .encode(x=budget_axis, y=revenue_axis, tooltip=['budget', 'revenue'])
    .properties(title='Movie Budget vs Average Revenue', width=800, height=400)
    .interactive()
)

# Show the chart
view.display(chart, inline=inline)


