Can we use this as inspiration for some graphs?  
[Example Notebook](https://www.kaggle.com/code/gnk980/night-in-let-s-watch-a-movie)

In [172]:
'''
Plot Movie Budget vs Expert Ratings
add color for fresh, rotten, CF
'''
import pandas as pd
import altair as alt
import altair_viewer as view

alt.data_transformers.disable_max_rows()

# Load the data from the CSV file
data = pd.read_csv('data/master_dataset.csv')

# Filter out movies with budgets less than $10,000
data = data[data['budget'] >= 10000]

data = data.query('budget > 0')
data = data.query('RT_expert_rating > 0')
# Create the line chart
chart = alt.Chart(data).mark_point().encode(
    x='budget',
    y='RT_expert_rating',
    tooltip=['title','budget', 'RT_expert_rating']
).properties(
    title='Movie Budget vs Expert Ratings',
    width=800,
    height=400
).interactive()

# Display the chart
view.display(chart)


In [173]:
'''
Plot Movie Budget vs User Ratings
'''

import pandas as pd
import altair as alt
import altair_viewer as view

alt.data_transformers.disable_max_rows()

# Load the data from the CSV file
data = pd.read_csv('data/master_dataset.csv')

data = data[data['budget'] >= 10000]

data = data.query('budget > 0')

# Create the line chart
chart = alt.Chart(data).mark_point().encode(
    x='budget',
    y='user_rating',
    tooltip=['title','budget', 'user_rating']
).properties(
    title='Movie Budget vs User Ratings',
    width=800,
    height=400
).interactive()

# Display the chart
view.display(chart)


In [174]:
''' 
Plot Revenue by Year
'''

import pandas as pd
import altair as alt
import altair_viewer as view

alt.data_transformers.disable_max_rows()

# load the data
data = pd.read_csv("data/master_dataset.csv")

# filter data by year
data = data.query("release_date >= '1930-01-01' and release_date < '2017-01-01'")

# create a bar chart showing revenue by year
bar_chart = alt.Chart(data).mark_bar(color='green').encode(
    x=alt.X('year(release_date):T', scale=alt.Scale(domain=(1930,2016))),
    y='sum(revenue)',
    tooltip=['year(release_date):T', 'sum(revenue)'],
)

# set chart properties
bar_chart = bar_chart.properties(
    width=800,
    height=400,
    title='Revenue by Year'
)

# display the chart
view.display(bar_chart)


In [175]:
''' 
Plot User Rating vs Expert Rating
difference on y and avg rating
'''

import pandas as pd
import altair as alt
import altair_viewer as view

alt.data_transformers.disable_max_rows()

# load the data
data = pd.read_csv("data/master_dataset.csv")

# filter out rows where RT_expert_rating is 0
data = data[(data['RT_expert_rating'] != 0) & (data['user_rating'].notna())]

# create a scatter plot showing the relationship between user rating and expert rating
scatter_plot = alt.Chart(data).mark_point().encode(
    x='user_rating:Q',
    y='RT_expert_rating:Q',
    tooltip=['title','user_rating', 'RT_expert_rating']
).properties(
    width=800,
    height=400,
    title='User Rating vs Expert Rating'
)

# Display the chart
view.display(scatter_plot)


In [176]:
'''
Production Companies vs Revenue
Filter to get the top 10 most popular production companies (based on total revenue generated) 
Bar Graph to display data by company
'''

import pandas as pd
import altair as alt
import altair_viewer as view

# Load the data from the CSV file
data = pd.read_csv('data/master_dataset.csv')

# Extract the name of each production company from the dictionary and explode the column
data['production_companies'] = data['production_companies'].apply(lambda x: [company['name'] for company in eval(x) if 'name' in company] if not pd.isna(x) else [])
data = data.explode('production_companies')

# Filter to get the top 5 and bottom 5 production companies based on total revenue
top_producers = data.groupby('production_companies')['revenue'].sum().sort_values(ascending=False).head(10).index.tolist()

data = data[data['production_companies'].isin(top_producers)]

# Create a chart for all selected production companies
chart = alt.Chart(data).mark_bar().encode(
    x=alt.X('production_companies:N', sort='-y', axis=alt.Axis(labelAngle=45)),
    y=alt.Y('sum(revenue):Q', axis=alt.Axis(title='Total Revenue')),
    color=alt.Color('production_companies:N', scale=alt.Scale(scheme='dark2'), legend=None),
    tooltip=['production_companies:N', 'sum(revenue):Q']
).properties(
    title='Top 10 Production Companies vs Total Revenue',
    width=800,
    height=400
).interactive()

# Display the chart
view.display(chart)


# Print the top 10 production companies based on total revenue
print("Top 10 production companies based on total revenue:")
print(data.groupby('production_companies')['revenue'].sum().sort_values(ascending=False).head(10))

SyntaxError: invalid syntax (<string>, line 1)

In [177]:
''' 
Genre vs Average User Rating
Either use a pie chart or stacked bar chart showing the different genres and what the avg rating was for each
'''
import pandas as pd
import altair as alt
import altair_viewer as view

# Load the data from the CSV file
data = pd.read_csv('data/master_dataset.csv')

# Remove punctuation from the genres column
data['genres'] = data['genres'].str.replace('[^\w\s]','')

# Split the genres column and explode the column
data['genres'] = data['genres'].apply(lambda x: x.split('|') if not pd.isna(x) else [])
data = data.explode('genres')

# Filter to get the top 10 genres based on count
top_genres = data.groupby('genres').size().sort_values(ascending=False).head(10).index
data = data[data['genres'].isin(top_genres)]

# Group the data by genre and calculate the average user rating for each genre
genre_ratings = data.groupby('genres').agg({'user_rating': 'mean'}).reset_index()

# Define the color scale as a gradient
color_scale = alt.Scale(
    domain=top_genres.tolist(),
    range=['#256fcd', '#fbf4f2', '#d1d5cb', '#baccdc', '#7fbbf7','#7aaff3', '#548bd9', '#edcfc1','#356cb0','#295da4']
)

# Create a stacked bar chart showing the average user rating for each genre
chart = alt.Chart(genre_ratings).mark_bar().encode(
    x=alt.X('genres:N', sort='-y', axis=alt.Axis(labelAngle=45)),
    y=alt.Y('user_rating:Q', axis=alt.Axis(title='Average User Rating')),
    color=alt.Color('genres:N', scale=color_scale, legend=None)
).properties(
    title='Genre vs Average User Rating',
    width=800,
    height=400
).interactive()

# Display the chart
view.display(chart)
print(f"There are {len(data['genres'].unique())} unique genres in the data")




  data['genres'] = data['genres'].str.replace('[^\w\s]','')


There are 10 unique genres in the data


In [178]:
''' 
Genre vs Average Expert Rating
Either use a pie chart or stacked bar chart showing the different genres and what the avg rating was for each
'''
import pandas as pd
import altair as alt
import altair_viewer as view

# Load the data from the CSV file
data = pd.read_csv('data/master_dataset.csv')

# Remove punctuation from the genres column
data['genres'] = data['genres'].str.replace('[^\w\s]','')

# Split the genres column and explode the column
data['genres'] = data['genres'].apply(lambda x: x.split('|') if not pd.isna(x) else [])
data = data.explode('genres')

# Filter to get the top 10 genres based on count
top_genres = data.groupby('genres').size().sort_values(ascending=False).head(10).index
data = data[data['genres'].isin(top_genres)]

# Group the data by genre and calculate the average user rating for each genre
genre_ratings = data.groupby('genres').agg({'RT_expert_rating': 'mean'}).reset_index()

# Create a stacked bar chart showing the average user rating for each genre
chart = alt.Chart(genre_ratings).mark_bar().encode(
    x=alt.X('genres:N', sort='-y', axis=alt.Axis(labelAngle=45)),
    y=alt.Y('RT_expert_rating:Q', axis=alt.Axis(title='Average Expert Rating')),
    color=alt.Color('genres:N', scale=alt.Scale(scheme='dark2'), legend=None)
).properties(
    title='Genre vs Average Expert Rating',
    width=800,
    height=400
).interactive()

# Display the chart
view.display(chart)
print(f"There are {len(data['genres'].unique())} unique genres in the data")

  data['genres'] = data['genres'].str.replace('[^\w\s]','')


There are 10 unique genres in the data


In [179]:
''' 
Genre vs Total Revenue
Use simiar graph as the genre vs rating but use avg revenue as the scale
'''

import pandas as pd
import altair as alt
import altair_viewer as view

alt.data_transformers.disable_max_rows()

# load the data
data = pd.read_csv("data/master_dataset.csv")

# filter out movies with zero revenue and missing genres
data = data.query('revenue > 0')
data = data.dropna(subset=['genres'])

# explode the genres column to make a row for each genre in a movie
data = data.assign(genres=data['genres'].str.split(',')).explode('genres')

# calculate the total revenue for each genre
genre_revenue = data.groupby('genres')['revenue'].sum().reset_index()

# create the stacked bar chart showing revenue by genre
chart = alt.Chart(genre_revenue).mark_bar().encode(
    x=alt.X('genres:N', sort='-y', axis=alt.Axis(labelAngle=45)),
    y='revenue:Q',
    color=alt.Color('genres:N', scale=alt.Scale(scheme='yellowgreenblue'), legend=None),
    tooltip=['genres:N', 'revenue:Q']
).properties(
    title='Genre vs Total Revenue',
    width=800,
    height=400
).interactive()

# display the chart
view.display(chart)


In [180]:
''' 
Genre vs Average Revenue
Use a similar graph as the genre vs rating but use avg revenue as the scale
https://vega.github.io/vega/docs/schemes/
'''


import pandas as pd
import altair as alt
import altair_viewer as view

alt.data_transformers.disable_max_rows()

# load the data
data = pd.read_csv("data/master_dataset.csv")

# filter out movies with zero revenue and missing genres
data = data.query('revenue > 0')
data = data.dropna(subset=['genres'])

# explode the genres column to make a row for each genre in a movie
data = data.assign(genres=data['genres'].str.split(',')).explode('genres')

# calculate the average revenue for each genre
genre_revenue = data.groupby('genres')['revenue'].mean().reset_index()

# sort the genres by revenue in descending order
genre_revenue = genre_revenue.sort_values('revenue', ascending=False)

# create the bar chart showing average revenue by genre
chart = alt.Chart(genre_revenue).mark_bar().encode(
    x=alt.X('genres:N', sort=alt.EncodingSortField(field='revenue', order='descending'), 
            axis=alt.Axis(labelAngle=45)),
    y='revenue:Q',
    color=alt.Color('genres:N', scale=alt.Scale(scheme='sinebow'), legend=None),
    tooltip=['genres:N', 'revenue:Q']
).properties(
    title='Genre vs Average Revenue',
    width=800,
    height=400
).interactive()

# display the chart
view.display(chart)



In [181]:
''' 
Top actors vs Revenue
'''
import pandas as pd
import altair as alt

# load data
data = pd.read_csv('data/master_dataset.csv')

# calculate average revenue for movies with top 100 actors
alist_avg_revenue = data[data['a_list'] == 1]['revenue'].mean()

# calculate average revenue for movies with top 100 actors
top100_avg_revenue = data[data['top_100'] == 1]['revenue'].mean()

# calculate average revenue for movies with top 1k actors
top1k_avg_revenue = data[data['top_1k'] == 1]['revenue'].mean()

# calculate average revenue for movies with no top actors
no_top_avg_revenue = data[(data['a_list'] == 0) & (data['top_100'] == 0) & (data['top_1k'] == 0)]['revenue'].mean()

# create a DataFrame to use for plotting
plot_data = pd.DataFrame({
    'actor_type': pd.Categorical(['A List', 'Top 100', 'Top 1K', 'No Top Actors'], categories=['A List', 'Top 100', 'Top 1K', 'No Top Actors'], ordered=True),
    'average_revenue': [alist_avg_revenue, top100_avg_revenue, top1k_avg_revenue, no_top_avg_revenue],
    'color': ['red', 'green', 'blue', 'gray']
})

# create bar chart
chart = alt.Chart(plot_data).mark_bar().encode(
    x=alt.X('actor_type', title='Actor Type', axis=alt.Axis(labelAngle=0), sort=['A List', 'Top 100', 'Top 1K', 'No Top Actors']),
    y=alt.Y('average_revenue', title='Average Revenue'),
    color=alt.Color('color', legend=None)
).properties(
    title='Type of Actor vs Average Revenue',
    width=400,
    height=400
).interactive()

# display chart
chart.display()




In [182]:
'''
Top actors vs User_Ratings
'''
import pandas as pd
import altair as alt

# load data
data = pd.read_csv('data/master_dataset.csv')

# calculate average user rating for movies with A-list actors
alist_avg_rating = data[data['a_list'] == 1]['user_rating'].mean()

# calculate average user rating for movies with top 100 actors
top100_avg_rating = data[data['top_100'] == 1]['user_rating'].mean()

# calculate average user rating for movies with top 1k actors
top1k_avg_rating = data[data['top_1k'] == 1]['user_rating'].mean()

# calculate average user rating for movies without top actors
no_top_avg_rating = data[(data['a_list'] == 0) & (data['top_100'] == 0) & (data['top_1k'] == 0)]['user_rating'].mean()

# create a DataFrame to use for plotting
plot_data = pd.DataFrame({
    'actor_type': ['A List', 'Top 100', 'Top 1K', 'No Top Actors'],
    'average_rating': [alist_avg_rating, top100_avg_rating, top1k_avg_rating, no_top_avg_rating]
})

# create bar chart
chart = alt.Chart(plot_data).mark_bar().encode(
    x=alt.X('actor_type', title='Actor Type', axis=alt.Axis(labelAngle=0),sort=['A List', 'Top 100', 'Top 1K', 'No Top Actors']),
    y=alt.Y('average_rating', title='Average User Rating'),
    color=alt.Color('actor_type', legend=None, scale=alt.Scale(domain=['A List', 'Top 100', 'Top 1K', 'No Top Actors'], range=['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728']))
).properties(
    title='Top Actors vs User Ratings',
    width=400,
    height=400
).interactive()


# display chart
chart.display()




In [183]:
'''
Top actors vs Expert_Ratings
'''
import pandas as pd
import altair as alt

# load data
data = pd.read_csv('data/master_dataset.csv')

# calculate average user rating for movies with A-list actors
alist_avg_rating = data[data['a_list'] == 1]['RT_expert_rating'].mean()

# calculate average user rating for movies with top 100 actors
top100_avg_rating = data[data['top_100'] == 1]['RT_expert_rating'].mean()

# calculate average user rating for movies with top 1k actors
top1k_avg_rating = data[data['top_1k'] == 1]['RT_expert_rating'].mean()

# calculate average user rating for movies without top actors
no_top_avg_rating = data[(data['a_list'] == 0) & (data['top_100'] == 0) & (data['top_1k'] == 0)]['RT_expert_rating'].mean()

# create a DataFrame to use for plotting
plot_data = pd.DataFrame({
    'actor_type': ['A List', 'Top 100', 'Top 1K', 'No Top Actors'],
    'average_rating': [alist_avg_rating, top100_avg_rating, top1k_avg_rating, no_top_avg_rating]
})

# create bar chart
chart = alt.Chart(plot_data).mark_bar().encode(
    x=alt.X('actor_type:N', title='Actor Type:N', axis=alt.Axis(labelAngle=0), sort=['A List', 'Top 100', 'Top 1K', 'No Top Actors']),
    y=alt.Y('average_rating:Q', title='Average Expert Rating:Q'),
    color=alt.Color('actor_type:N', legend=None)
).properties(
    title='Top Actors vs Expert Ratings',
    width=400,
    height=400
).interactive()

# display chart
chart.display()
