In [None]:
import numpy as np 
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from scipy import stats

In-class exercise: movie data

#### For this exercise read in movie_dataset.csv. This is a Kaggle dataset. The original version can be found [here](https://www.kaggle.com/datasets/utkarshx27/movies-dataset?resource=download). We'll do a bit of exploratory visual analysis on these data. Questions as well as some code to get you started are below.

In [None]:
# Load the movie dataset from the CSV file (path is relative to the working directory)
csv_path = 'movie_dataset.csv'
movies = pd.read_csv(csv_path)

In [None]:
# Preview the first five rows to check what was loaded
movies.head(n=5)

In [None]:
# Parse the release date once, then store both the timestamp and its calendar year
release_dates = pd.to_datetime(movies['release_date'])
movies['release_date'] = release_dates
movies['year'] = release_dates.dt.year

In [None]:
# We only concern ourselves with records that have both a revenue
# and a budget value, so drop every row where either one is missing.

required_columns = ['revenue', 'budget']
movies = movies.dropna(subset=required_columns, how='any')

In [None]:
# We also assume that no movie really has a $0 budget,
# so keep only the rows with a strictly positive budget.

has_budget = movies['budget'] > 0
movies = movies.loc[has_budget, :].copy()

#### 1. Run the following cell. 
* Is this plot accurate?<br><br>
* Experiment with different marker sizes. Is anything obscured/highlighted?<br><br>
* Discuss. Any suggestions? Feel free to experiment with changes.

In [None]:
# Create an interactive scatter plot of vote average vs revenue with Plotly
fig = px.scatter(
    movies,
    x='vote_average',
    y='revenue',
    opacity=0.5,
    hover_name='title',  # Show title on hover
    hover_data={
        'vote_average': ':.1f',
        'revenue': ':$,.0f',
        'title': False  # Hide duplicate title in hover data
    },
    labels={
        'vote_average': 'Vote Average',
        'revenue': 'Revenue'
    },
    title='Movie Vote Average vs Revenue'
)

# Prefix the y-axis ticks with '$'. Revenue is plotted in raw dollars, so the
# ticks rely on Plotly's default abbreviation of large numbers (e.g. 500M, 1B).
# The previous tickformat '$,.0fM' is not a valid d3 format string, and a
# hard-coded 'M' suffix on raw dollar values overstated them a million-fold.
fig.update_layout(
    yaxis=dict(
        tickprefix='$',
        title='Revenue'
    ),
    xaxis=dict(title='Vote Average'),
    height=600,
    width=900
)

# Hover shows the title, the vote average and the full revenue in dollars
fig.update_traces(
    hovertemplate='<b>%{hovertext}</b><br>Vote: %{x:.1f}<br>Revenue: $%{y:,.0f}'
)

fig.show()

#### 2. What percent of the observed revenues are at or below 500 million USD?

* Plot those observations versus 'vote_average'.<br><br>
* _hint: use scipy.stats percentileofscore()_

In [None]:
# Calculate the percentage of revenues at or below 500 million USD
threshold = 500000000  # 500 million USD
# kind='weak' is the share of values <= threshold, i.e. exactly "at or below".
# The default kind='rank' averages over ties, so movies with a revenue of
# exactly the threshold would only be partially counted.
percentage = stats.percentileofscore(movies['revenue'], threshold, kind='weak')
print(f"{percentage:.2f}% of movies have revenue at or below $500 million")

# Filter movies with revenue at or below the threshold
movies_below_threshold = movies[movies['revenue'] <= threshold]

# Human-readable threshold for the title (e.g. "$500M" for 500000000)
threshold_label = f"${threshold / 1000000:,.0f}M"

# Create an interactive scatter plot with Plotly for movies below threshold
fig = px.scatter(
    movies_below_threshold,
    x='vote_average',
    y='revenue',
    opacity=0.5,
    hover_name='title',
    hover_data={
        'vote_average': ':.1f',
        'revenue': ':$,.0f',
        'title': False
    },
    labels={
        'vote_average': 'Vote Average',
        'revenue': 'Revenue'
    },
    title=(
        f'Movies with Revenue ≤ {threshold_label} '
        f'({len(movies_below_threshold)} movies, {percentage:.1f}% of dataset)'
    )
)

# Prefix the y-axis ticks with '$'. Revenue is plotted in raw dollars, so the
# ticks rely on Plotly's default abbreviation of large numbers (e.g. 100M).
# The previous tickformat '$,.0fM' is not a valid d3 format string, and a
# hard-coded 'M' suffix on raw dollar values overstated them a million-fold.
fig.update_layout(
    yaxis=dict(
        tickprefix='$',
        title='Revenue'
    ),
    xaxis=dict(title='Vote Average'),
    height=600,
    width=900
)

# Hover shows the title, the vote average and the full revenue in dollars
fig.update_traces(
    hovertemplate='<b>%{hovertext}</b><br>Vote: %{x:.1f}<br>Revenue: $%{y:,.0f}'
)

fig.show()

#### 3. Plot the distribution of 'vote_average' using a histogram. 
* Compare the shape of the distribution to the scatter plot of 'vote_average' by 'revenue'.<br><br>
* What do you observe?

In [None]:
# Plot the distribution of vote_average: a histogram of counts (left axis)
# with a kernel density estimate overlaid on a hidden secondary axis.

# Drop missing ratings up front: gaussian_kde cannot work with NaN values
votes = movies['vote_average'].dropna()
n_bins = 20

# Calculate descriptive statistics
mean_vote = votes.mean()
median_vote = votes.median()
std_vote = votes.std()

# Create a figure with both histogram and KDE
fig = make_subplots(specs=[[{"secondary_y": True}]])

# Add histogram
histogram = go.Histogram(
    x=votes,
    nbinsx=n_bins,
    opacity=0.7,
    name='Frequency',
    marker_color='skyblue',
    marker_line_color='black',
    marker_line_width=1,
    hovertemplate='Vote Range: %{x}<br>Count: %{y}<extra></extra>'
)
fig.add_trace(histogram)

# Add the KDE as a smooth line. It is drawn on its own (secondary) axis, so the
# true density values are plotted as-is and the hover text reports real densities.
kde = stats.gaussian_kde(votes)
x_range = np.linspace(votes.min(), votes.max(), 100)
y_kde = kde(x_range)

kde_line = go.Scatter(
    x=x_range,
    y=y_kde,
    mode='lines',
    name='Density',
    line=dict(color='navy', width=2),
    hovertemplate='Vote: %{x:.2f}<br>Density: %{y:.4f}<extra></extra>'
)
fig.add_trace(kde_line, secondary_y=True)

# Height for the mean/median marker lines: the tallest bin count, so the lines
# span the histogram without stretching the count axis. The count comes from
# numpy's equal-width binning, which may differ slightly from the bins Plotly
# chooses, so it only approximates the tallest drawn bar.
bin_counts, _ = np.histogram(votes, bins=n_bins)
line_top = int(bin_counts.max())

# Add a vertical line for the mean
mean_line = go.Scatter(
    x=[mean_vote, mean_vote],
    y=[0, line_top],
    mode='lines',
    name=f'Mean: {mean_vote:.2f}',
    line=dict(color='red', width=2, dash='dash'),
    hovertemplate=f'Mean: {mean_vote:.2f}<extra></extra>'
)
fig.add_trace(mean_line)

# Add a vertical line for the median
median_line = go.Scatter(
    x=[median_vote, median_vote],
    y=[0, line_top],
    mode='lines',
    name=f'Median: {median_vote:.2f}',
    line=dict(color='green', width=2, dash='dash'),
    hovertemplate=f'Median: {median_vote:.2f}<extra></extra>'
)
fig.add_trace(median_line)

# Add stats as an annotation
stats_text = f"Mean: {mean_vote:.2f}<br>Median: {median_vote:.2f}<br>Std Dev: {std_vote:.2f}"
fig.add_annotation(
    x=0.95,
    y=0.95,
    xref="paper",
    yref="paper",
    text=stats_text,
    showarrow=False,
    font=dict(size=12),
    bgcolor="white",
    opacity=0.8,
    bordercolor="black",
    borderwidth=1,
    borderpad=4,
    align="right"
)

# Update layout
fig.update_layout(
    title='Distribution of Vote Average Ratings',
    xaxis_title='Vote Average',
    yaxis_title='Frequency',
    yaxis2=dict(
        title='Density',
        rangemode='tozero',  # share the zero baseline with the histogram bars
        showgrid=False,
        zeroline=False,
        showticklabels=False
    ),
    height=600,
    width=1000,
    legend=dict(
        orientation="h",
        yanchor="bottom",
        y=1.02,
        xanchor="right",
        x=1
    ),
    margin=dict(t=100)  # Add more top margin for the legend
)

# Add grid lines on the count axis only; restricting to the primary axis keeps
# showgrid=False on the hidden density axis from being overridden.
fig.update_yaxes(showgrid=True, gridwidth=1, gridcolor='rgba(0,0,0,0.1)', secondary_y=False)

fig.show()

# Observation: The distribution of vote_average appears to be approximately normal,
# centered around 6.5-7.0. When comparing to the scatter plot of vote_average vs revenue,
# we can observe that while most movies cluster in this middle range of ratings,
# the relationship with revenue doesn't appear to be strongly linear. There are high-revenue
# movies across different rating values, suggesting that factors beyond just ratings
# influence a movie's financial success.

#### 4. How can we visually identify records that have a high ROI?

In [None]:
# Calculate ROI (Return on Investment) for each movie:
# profit relative to budget, so -1 means the whole budget was lost
movies['ROI'] = (movies['revenue'] - movies['budget']) / movies['budget']

# Clip ROI values for sizing to avoid extreme sizes
roi_cap = 20
movies['ROI_clipped'] = movies['ROI'].clip(0, roi_cap)

# Create a scatter plot with ROI represented by color and size using Plotly
fig = px.scatter(
    movies,
    x='budget',
    y='revenue',
    color='ROI',
    # Pin the colour scale to the same cap used for sizing; otherwise any
    # ROI values far above the cap would stretch the scale and leave the
    # remaining points almost indistinguishable in colour.
    range_color=[-1, roi_cap],
    size='ROI_clipped',  # Size based on clipped ROI
    size_max=50,  # Maximum marker size
    opacity=0.7,
    hover_name='title',
    hover_data={
        'budget': ':$,.0f',
        'revenue': ':$,.0f',
        'ROI': ':.2f',
        'ROI_clipped': False,  # Hide this from hover data
        'title': False  # Hide duplicate title
    },
    labels={
        'budget': 'Budget',
        'revenue': 'Revenue',
        'ROI': 'ROI (Return on Investment)'
    },
    color_continuous_scale='viridis',
    title='Movie Budget vs Revenue with ROI Visualization'
)

# Movies with ROI <= 0 have a clipped size of 0 and would not be drawn at all;
# a minimum marker size keeps them visible on the plot.
fig.update_traces(marker_sizemin=3)

# Prefix both axes with '$'. Budget and revenue are plotted in raw dollars, so
# the ticks rely on Plotly's default abbreviation of large numbers (e.g. 100M, 1B).
# The previous tickformat '$,.0fM' is not a valid d3 format string, and a
# hard-coded 'M' suffix on raw dollar values overstated them a million-fold.
fig.update_layout(
    xaxis=dict(
        title='Budget',
        tickprefix='$'
    ),
    yaxis=dict(
        title='Revenue',
        tickprefix='$'
    ),
    height=700,
    width=1000,
    coloraxis_colorbar=dict(
        title='ROI'
    )
)

# Add reference line for ROI = 0 (break-even point, revenue == budget).
# The line only spans the observed budget range: extending it to the largest
# revenue would stretch the budget axis far beyond the data and squeeze the
# points against the left edge.
max_budget = movies['budget'].max()
fig.add_trace(
    go.Scatter(
        x=[0, max_budget],
        y=[0, max_budget],
        mode='lines',
        line=dict(color='red', width=2, dash='dash'),
        name='Break-even (ROI = 0)',
        hoverinfo='skip'
    )
)

# Label the five movies with the highest ROI directly on the plot
top_roi_movies = movies.nlargest(5, 'ROI')

# Arrow and label styling shared by all five annotations
label_style = dict(
    showarrow=True,
    arrowhead=2,
    arrowsize=1,
    arrowwidth=2,
    arrowcolor='black',
    ax=20,
    ay=-30,
    bgcolor='yellow',
    opacity=0.8,
    bordercolor='black',
    borderwidth=1
)

label_points = zip(
    top_roi_movies['title'],
    top_roi_movies['budget'],
    top_roi_movies['revenue']
)
for movie_title, movie_budget, movie_revenue in label_points:
    fig.add_annotation(
        x=movie_budget,
        y=movie_revenue,
        text=movie_title,
        **label_style
    )

# One hover layout for the figure: bold title, then budget and revenue as
# whole dollars and the ROI (read from the marker colour) to two decimals
roi_hover_template = (
    '<b>%{hovertext}</b>'
    '<br>Budget: $%{x:,.0f}'
    '<br>Revenue: $%{y:,.0f}'
    '<br>ROI: %{marker.color:.2f}'
)
fig.update_traces(hovertemplate=roi_hover_template)

fig.show()

# Show the ten highest-ROI movies, with budget and revenue in millions of dollars
print("Top 10 Movies by ROI:")
top10_roi = movies.nlargest(10, 'ROI')[['title', 'budget', 'revenue', 'ROI']]
for money_column in ('budget', 'revenue'):
    in_millions = top10_roi[money_column] / 1000000
    top10_roi[money_column] = in_millions.map("${:.2f}M".format)
top10_roi['ROI'] = top10_roi['ROI'].map("{:.2f}".format)
print(top10_roi)

#### 5. What can we see regarding the relationship between ROI and popularity? (There is actually a feature called 'popularity'.)

In [None]:
# Explore how ROI relates to popularity with an interactive Plotly scatter.
# The 99th percentile of ROI is the initial upper limit of the y-axis.
roi_upper_limit = np.percentile(movies['ROI'], 99)

# Number formats for the hover box and display names for the plotted columns
hover_formats = {
    'popularity': ':.1f',
    'ROI': ':.2f',
    'vote_average': ':.1f',
    'title': False  # Hide duplicate title
}
axis_labels = {
    'popularity': 'Popularity',
    'ROI': 'ROI (Return on Investment)',
    'vote_average': 'Vote Average'
}

fig = px.scatter(
    movies,
    x='popularity',
    y='ROI',
    color='vote_average',
    color_continuous_scale='plasma',
    opacity=0.7,
    hover_name='title',
    hover_data=hover_formats,
    labels=axis_labels,
    title='Relationship Between ROI and Popularity',
    log_x=True  # Use log scale for x-axis since popularity has a wide range
)

# Axis titles, initial y-axis range, figure size and colour bar title
fig.update_layout(
    xaxis_title='Popularity (log scale)',
    yaxis_title='ROI (Return on Investment)',
    yaxis_range=[-1, roi_upper_limit],  # Set initial y-axis range
    height=600,
    width=1000,
    coloraxis_colorbar_title='Vote Average'
)

# Add a horizontal break-even line at ROI = 0, together with its label.
# add_hline spans the full plot width and anchors the label to the plot area,
# so no x-coordinates are needed. This matters because the x-axis is
# logarithmic: an annotation's x on a log axis must be given in log10 units,
# so the previous label (x given in raw popularity units) was misplaced.
fig.add_hline(
    y=0,
    line_color='red',
    line_width=2,
    line_dash='dash',
    annotation_text="Break-even (ROI = 0)",
    annotation_position="top left",
    annotation_font_color="red"
)

# Add annotations for interesting cases
def annotate_movies(subset, ax, ay, bgcolor):
    """Label each movie in `subset` on `fig` at its (popularity, ROI) point.

    Movies whose ROI exceeds `roi_upper_limit` are skipped because they fall
    outside the initial y-axis range.

    subset  -- DataFrame with 'title', 'popularity' and 'ROI' columns
    ax, ay  -- pixel offsets of the label from the annotated point
    bgcolor -- background colour of the label box
    """
    for _, movie in subset.iterrows():
        if movie['ROI'] <= roi_upper_limit:  # Only annotate if within our y-axis limits
            fig.add_annotation(
                # The x-axis is logarithmic, and annotation positions on a log
                # axis must be given in log10 units rather than data units.
                x=np.log10(movie['popularity']),
                y=movie['ROI'],
                text=movie['title'],
                showarrow=True,
                arrowhead=2,
                arrowsize=1,
                arrowwidth=2,
                arrowcolor='black',
                ax=ax,
                ay=ay,
                bgcolor=bgcolor,
                opacity=0.8,
                bordercolor='black',
                borderwidth=1
            )


# Top 3 movies by popularity
top_popular = movies.nlargest(3, 'popularity')
annotate_movies(top_popular, ax=20, ay=-30, bgcolor='lightblue')

# Top 3 ROI movies that are also somewhat popular (popularity > median)
median_popularity = movies['popularity'].median()
top_roi_popular = movies[movies['popularity'] > median_popularity].nlargest(3, 'ROI')
annotate_movies(top_roi_popular, ax=-20, ay=30, bgcolor='lightyellow')

# Correlation between ROI and popularity, shown in a boxed note placed in
# paper coordinates near the top-left corner of the plot
correlation = movies['ROI'].corr(movies['popularity'])

correlation_box = dict(
    bgcolor="white",
    opacity=0.8,
    bordercolor="black",
    borderwidth=1,
    borderpad=4
)
fig.add_annotation(
    text=f"Correlation: {correlation:.3f}",
    xref="paper",
    yref="paper",
    x=0.05,
    y=0.95,
    showarrow=False,
    font=dict(size=12),
    **correlation_box
)

# Add a row of buttons that reset the y-axis to run from -1 up to a chosen
# percentile of ROI (the 100th percentile being the maximum)
percentile_choices = [
    (90, "90th Percentile"),
    (95, "95th Percentile"),
    (99, "99th Percentile"),
    (100, "Max"),
]
range_buttons = [
    dict(
        args=[{"yaxis.range": [-1, np.percentile(movies['ROI'], pct)]}],
        label=button_label,
        method="relayout"
    )
    for pct, button_label in percentile_choices
]

button_row = dict(
    type="buttons",
    direction="left",
    buttons=range_buttons,
    pad={"r": 10, "t": 10},
    showactive=True,
    x=0.1,
    xanchor="left",
    y=1.1,
    yanchor="top"
)
fig.update_layout(updatemenus=[button_row])

fig.show()

# Additional analysis: summarise ROI within each popularity quartile
quartile_labels = ['Low', 'Medium-Low', 'Medium-High', 'High']
movies['popularity_quartile'] = pd.qcut(movies['popularity'], 4, labels=quartile_labels)

# The quartile column is categorical; passing `observed` explicitly avoids the
# pandas FutureWarning about its changing default and keeps the result stable
# across pandas versions.
roi_by_popularity = (
    movies
    .groupby('popularity_quartile', observed=True)['ROI']
    .agg(['mean', 'median', 'count'])
)
print("\nROI by Popularity Quartile:")
print(roi_by_popularity)

# Print the written observations as a single block of text
observation_lines = [
    "\nObservations:",
    "1. There doesn't appear to be a strong linear correlation between ROI and popularity.",
    "2. Some movies with moderate popularity have extremely high ROI, suggesting they were low-budget successes.",
    "3. Highly popular movies tend to have more consistent but moderate ROI values.",
    "4. The most profitable investments (highest ROI) are often not the most popular movies.",
    "5. This suggests that while popularity can drive revenue, the budget-to-revenue ratio (ROI) depends on many other factors.",
]
print("\n".join(observation_lines))
