In [2]:
import pandas as pd
import seaborn as sns
import plotly.express as px

Data Preparation

In [14]:
# Load the raw movie data. `df` is never modified below, so this cell is
# safe to re-run on its own.
df = pd.read_csv('data/movies.csv')

# 1. IDENTIFY: keep only rows where BOTH money columns contain a literal '$'.
# regex=False treats '$' as a plain character (as a regex it would mean
# "end of string"), so no backslash escape is needed. Placeholder values such
# as 'Unknown' and missing values (na=False) fail this check and are dropped.
mask = (
    df['Income'].str.contains('$', regex=False, na=False)
    & df['Budget'].str.contains('$', regex=False, na=False)
)
df_clean = df[mask].copy()

# 2. CLEAN & CONVERT: strip everything except digits and dots
# ('$1,000' -> '1000'), then parse. errors='coerce' turns anything that still
# is not a number (e.g. a bare '$' that strips to '') into NaN instead of
# raising; .astype(float) keeps the dtype float even when every value is whole.
for col in ['Income', 'Budget']:
    df_clean[col] = pd.to_numeric(
        df_clean[col].str.replace(r'[^0-9.]', '', regex=True),
        errors='coerce',
    ).astype(float)

# Rows whose amounts could not be parsed are unusable for the ROI math.
df_clean = df_clean.dropna(subset=['Income', 'Budget']).copy()

# 3. CALCULATE: ROI = (Income - Budget) / Budget.
# A budget of zero would produce inf/NaN through division by zero, so
# non-positive budgets are masked to NaN and yield a NaN ROI instead.
positive_budget = df_clean['Budget'].where(df_clean['Budget'] > 0)
df_clean['ROI'] = (df_clean['Income'] - positive_budget) / positive_budget


print(df_clean[['Income', 'Budget']].head())


         Income       Budget
0  2.267947e+09  350000000.0
1  1.089670e+05   35000000.0
2  2.392686e+08   85900000.0
4  1.712539e+08   12000000.0
6  3.124581e+07   80000000.0


Data Visualization using Scatter Plot

In [None]:
# Log-scaled axes cannot represent zero or negative values, so restrict the
# plot (and the reference line below) to strictly positive budget/income pairs.
plot_df = df_clean[(df_clean['Budget'] > 0) & (df_clean['Income'] > 0)]

# Create the scatter plot
fig = px.scatter(
    plot_df,
    x="Budget",
    y="Income",
    hover_name="Title",        # Shows the movie name when you hover
    log_x=True,                # Spreads out the low-budget movies
    log_y=True,                # Spreads out the low-income movies
    trendline="ols",           # Adds a 'Line of Best Fit' to see the correlation
    # Fit the trendline in log space so it matches the log-log axes; a fit in
    # linear space would be drawn as a curve on these axes.
    trendline_options=dict(log_x=True, log_y=True),
    title="Market Reality: Budget vs. Income",
    labels={"Budget": "Budget (USD)", "Income": "Worldwide Income (USD)"},
    template="plotly_dark"     # 'plotly_dark' or 'plotly_white'
)

# Break-even reference line (Income == Budget) across the plotted budget range.
# Points above the dashed red line earned more than they cost; points below lost money.
budget_lo = plot_df['Budget'].min()
budget_hi = plot_df['Budget'].max()
fig.add_shape(
    type='line', line=dict(dash='dash', color="red", width=2),
    x0=budget_lo, y0=budget_lo,
    x1=budget_hi, y1=budget_hi
)

fig.show()