Can we use this as inspiration for some graphs?  
[Example Notebook](https://www.kaggle.com/code/gnk980/night-in-let-s-watch-a-movie)

In [4]:
'''
Plot Movie Budget vs Expert Ratings
'''
import pandas as pd
import altair as alt
import altair_viewer as view

alt.data_transformers.disable_max_rows()

# Load the data from the CSV file
data = pd.read_csv('data/master_dataset.csv')

# Filter out movies with budgets less than $10,000
data = data[data['budget'] >= 10000]

data = data.query('budget > 0')
data = data.query('RT_expert_rating > 0')
# Create the line chart
chart = alt.Chart(data).mark_point().encode(
    x='budget',
    y='RT_expert_rating',
    tooltip=['budget', 'RT_expert_rating']
).properties(
    title='Movie Budget vs Expert Ratings',
    width=800,
    height=400
).interactive()

# Display the chart
view.display(chart)


In [6]:
'''
Plot Movie Budget vs User Ratings
'''

import pandas as pd
import altair as alt
import altair_viewer as view

alt.data_transformers.disable_max_rows()

# Load the data from the CSV file
data = pd.read_csv('data/master_dataset.csv')

data = data[data['budget'] >= 10000]

data = data.query('budget > 0')

# Create the line chart
chart = alt.Chart(data).mark_point().encode(
    x='budget',
    y='user_rating',
    tooltip=['budget', 'user_rating']
).properties(
    title='Movie Budget vs User Ratings',
    width=800,
    height=400
).interactive()

# Display the chart
view.display(chart)


In [11]:
''' 
Plot Revenue by Year
'''

import pandas as pd
import altair as alt
import altair_viewer as view

alt.data_transformers.disable_max_rows()

# load the data
data = pd.read_csv("data/master_dataset.csv")

# filter data by year
data = data.query("release_date >= '1930-01-01' and release_date < '2017-01-01'")

# create a bar chart showing revenue by year
bar_chart = alt.Chart(data).mark_bar(color='green').encode(
    x=alt.X('year(release_date):T', scale=alt.Scale(domain=(1930,2016))),
    y='sum(revenue)',
    tooltip=['year(release_date):T', 'sum(revenue)'],
)

# set chart properties
bar_chart = bar_chart.properties(
    width=800,
    height=400,
    title='Revenue by Year'
)

# display the chart
bar_chart.show()


Displaying chart at http://localhost:64594/


In [13]:
''' 
Plot User Rating vs Expert Rating
difference on y and avg rating
'''

import pandas as pd
import altair as alt
import altair_viewer as view

alt.data_transformers.disable_max_rows()

# load the data
data = pd.read_csv("data/master_dataset.csv")

# filter out rows where RT_expert_rating is 0
data = data[data['RT_expert_rating'] != 0]

# create a scatter plot showing the relationship between user rating and expert rating
scatter_plot = alt.Chart(data).mark_point().encode(
    x='user_rating:Q',
    y='RT_expert_rating:Q',
    tooltip=['user_rating', 'RT_expert_rating']
).properties(
    width=800,
    height=400,
    title='User Rating vs Expert Rating'
)

# Display the chart
scatter_plot.show()


Displaying chart at http://localhost:64594/


KeyboardInterrupt: 

In [2]:
'''
Production Companies vs Revenue
Filter to get the top 5 most popular production companies (based on how many movies they have released) 
filter out companies who have made less than 10 movies and take the bottom 5 after that and compare the 10 companies
Use a scatter plot that has different colors for each company and use the circle size to show amount of movies released by that company 
'''

import pandas as pd
import altair as alt
import altair_viewer as view

# Load the data from the CSV file
data = pd.read_csv('data/master_dataset.csv')

# Extract the name of each production company from the dictionary and explode the column
data['production_companies'] = data['production_companies'].apply(lambda x: [company['name'] for company in eval(x) if 'name' in company] if not pd.isna(x) else [])
data = data.explode('production_companies')

# Filter to get the top 5 and bottom 5 production companies based on total revenue
top_producers = data.groupby('production_companies')['revenue'].sum().sort_values(ascending=False).head(10).index.tolist()

data = data[data['production_companies'].isin(top_producers)]

# Create a chart for all selected production companies
chart = alt.Chart(data).mark_bar().encode(
    x=alt.X('production_companies:N', sort='-y', axis=alt.Axis(labelAngle=45)),
    y=alt.Y('sum(revenue):Q', axis=alt.Axis(title='Total Revenue')),
    color=alt.Color('production_companies:N', scale=alt.Scale(scheme='dark2'), legend=None),
    tooltip=['production_companies:N', 'sum(revenue):Q']
).properties(
    title='Top 10 Production Companies vs Total Revenue',
    width=800,
    height=400
).interactive()

# Display the chart
view.display(chart)


# Print the top 5 production companies based on total revenue
print("Top 5 production companies based on total revenue:")
print(data.groupby('production_companies')['revenue'].sum().sort_values(ascending=False).head(10))






Top 5 production companies based on total revenue:
production_companies
Warner Bros.                              6.754320e+10
Universal Pictures                        6.194357e+10
Paramount Pictures                        5.414377e+10
Twentieth Century Fox Film Corporation    5.376612e+10
Walt Disney Pictures                      5.266523e+10
Columbia Pictures                         3.551452e+10
New Line Cinema                           2.302646e+10
Amblin Entertainment                      1.752602e+10
DreamWorks SKG                            1.636693e+10
Dune Entertainment                        1.620930e+10
Name: revenue, dtype: float64


In [1]:
''' 
Genre vs Rating
Either use a pie chart or stacked bar chart showing the different genres and what the avg rating was for each
'''


' \nGenre vs Rating\n\n'

In [14]:
''' 
Genre vs Revenue
Use simiar graph as the genre vs rating but use avg revenue as the scale
'''

import pandas as pd
import altair as alt
import altair_viewer as view

alt.data_transformers.disable_max_rows()

# load the data
data = pd.read_csv("data/master_dataset.csv")

# filter out movies with zero revenue and missing genres
data = data.query('revenue > 0')
data = data.dropna(subset=['genres'])

# explode the genres column to make a row for each genre in a movie
data = data.assign(genres=data['genres'].str.split(',')).explode('genres')

# calculate the total revenue for each genre
genre_revenue = data.groupby('genres')['revenue'].sum().reset_index()

# create the stacked bar chart showing revenue by genre
chart = alt.Chart(genre_revenue).mark_bar().encode(
    x=alt.X('genres:N', sort='-y'),
    y='revenue:Q',
    color=alt.Color('genres:N', scale=alt.Scale(scheme='category20'), legend=None),
    tooltip=['genres:N', 'revenue:Q']
).properties(
    title='Genre vs Revenue',
    width=800,
    height=400
).interactive()

# display the chart
view.display(chart)


KeyError: ['genres']

In [None]:
''' 
Production Companies vs 
'''