# Box Office Revenue Prediction - Exploratory Data Analysis

This notebook explores the movie dataset to understand patterns and relationships that can help predict box office revenue.


In [None]:
# Import necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Set visualization styles
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette('viridis')

# Display settings
%matplotlib inline
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)

## 1. Load and Examine the Data

First, we'll load the dataset and examine its structure.


In [None]:
# Load data
# The dataset is read from the raw CSV file through the project's DataLoader.
import sys

# Make the parent directory importable so that the `src` package resolves when
# the notebook is run from its own folder. The import must therefore stay
# below this line. The membership check keeps re-runs of this cell from
# appending duplicate entries to sys.path.
if ".." not in sys.path:
    sys.path.append("..")
from src.data.data_loader import DataLoader

data_loader = DataLoader("../data/raw/movies_data.csv")
# NOTE(review): load_data() is used as a pandas DataFrame everywhere below
# (.shape, .head(), column access) -- confirm against DataLoader.
df = data_loader.load_data()

# Display basic information
print(f"Dataset shape: {df.shape}")
df.head()

In [None]:
# Report the dtype of every column, then summarise missing data per column
print("\nData Types:")
print(df.dtypes)

print("\nMissing Values:")
missing = df.isna().sum()
missing_percent = missing.div(len(df)).mul(100)
missing_df = pd.concat(
    [missing.rename("Missing Values"), missing_percent.rename("Percentage")],
    axis=1,
)

# Show only the columns that have at least one missing value, worst first
has_gaps = missing_df["Missing Values"].gt(0)
missing_df.loc[has_gaps].sort_values(by="Missing Values", ascending=False)

## 2. Data Distribution Analysis

Let's analyze the distribution of key features, especially the target variable (revenue).


In [None]:
# Target variable (revenue): raw histogram next to its log1p-transformed version
revenue_fig, (raw_ax, log_ax) = plt.subplots(1, 2, figsize=(12, 6))

# Left panel: revenue on its original scale
sns.histplot(df["revenue"], kde=True, ax=raw_ax)
raw_ax.set_title("Revenue Distribution")
raw_ax.set_xlabel("Revenue")

# Right panel: log(1 + revenue)
log_revenue = np.log1p(df["revenue"])
sns.histplot(log_revenue, kde=True, ax=log_ax)
log_ax.set_title("Log-transformed Revenue Distribution")
log_ax.set_xlabel("Log(Revenue+1)")

revenue_fig.tight_layout()
plt.show()

In [None]:
# Budget: its own distribution (left) and how it relates to revenue (right)
budget_fig, (hist_ax, scatter_ax) = plt.subplots(1, 2, figsize=(12, 6))

# Left panel: histogram of budget with a KDE overlay
sns.histplot(df["budget"], kde=True, ax=hist_ax)
hist_ax.set_title("Budget Distribution")
hist_ax.set_xlabel("Budget")

# Right panel: one point per movie, semi-transparent to reveal dense regions
sns.scatterplot(data=df, x="budget", y="revenue", alpha=0.6, ax=scatter_ax)
scatter_ax.set_title("Budget vs. Revenue")
scatter_ax.set_xlabel("Budget")
scatter_ax.set_ylabel("Revenue")

budget_fig.tight_layout()
plt.show()

In [None]:
# Create an interactive plot for budget vs. revenue
# Only request hover fields that are actually present, so that a dataset
# without "title" or "release_date" still renders instead of raising.
hover_cols = [col for col in ["title", "release_date"] if col in df.columns]

fig = px.scatter(
    df,
    x="budget",
    y="revenue",
    opacity=0.6,
    hover_data=hover_cols or None,
    title="Budget vs. Revenue",
    # NOTE: plotly's "ols" trendline needs the statsmodels package installed.
    trendline="ols",
)
fig.show()

## 3. Correlation Analysis

Let's examine the correlations between numerical features.


In [None]:
# Select numerical columns
# "number" matches every numeric dtype (int32, float32, nullable ints, ...),
# not only int64/float64, so no numeric column is silently left out.
num_cols = df.select_dtypes(include="number").columns.tolist()

# Calculate the pairwise correlation matrix of the numeric columns
corr_matrix = df[num_cols].corr()

# Plot correlation heatmap
plt.figure(figsize=(14, 12))
# The matrix is symmetric: mask the upper triangle (diagonal included) so each
# pair is shown only once.
mask = np.triu(np.ones_like(corr_matrix, dtype=bool))
sns.heatmap(
    corr_matrix,
    mask=mask,
    annot=True,
    fmt=".2f",
    cmap="coolwarm",
    linewidths=0.5,
    # Pin the colour scale to the full correlation range so 0 sits at the
    # centre of the diverging colormap.
    vmin=-1,
    vmax=1,
)
plt.title("Correlation Matrix")
plt.tight_layout()
plt.show()

In [None]:
# Calculate correlation with target (revenue)
# Revenue itself is excluded: its self-correlation is always 1.0 and would
# only add an uninformative bar that dominates the chart.
feature_cols = [col for col in num_cols if col != "revenue"]
target_corr = (
    df[feature_cols]
    .corrwith(df["revenue"])
    .dropna()  # constant / all-missing columns have no defined correlation
    .sort_values(ascending=False)
)

# Plot correlations, strongest positive at the top
plt.figure(figsize=(12, 8))
sns.barplot(x=target_corr.values, y=target_corr.index)
plt.title("Correlation with Revenue")
plt.xlabel("Correlation Coefficient")
plt.tight_layout()
plt.show()

## 4. Categorical Feature Analysis

Let's analyze categorical features like genres and their impact on revenue.


In [None]:
# Analyze genres
if "genres" in df.columns:
    # How many of the most frequent genres are compared on revenue
    top_n_genres = 10

    # Split the genres (assuming they're in a format like "Action|Adventure")
    # into one row per (movie, genre). The result keeps the movie's original
    # row label as its index; rows with a missing genres value drop out.
    genres_expanded = (
        df["genres"]
        .str.split("|", expand=True)
        .stack()
        .reset_index(level=1, drop=True)
        .rename("genre")
    )
    genres_df = pd.DataFrame({"genre": genres_expanded})

    # Count genre occurrences
    genre_counts = genres_df["genre"].value_counts()

    # Plot genre distribution
    plt.figure(figsize=(14, 8))
    sns.barplot(x=genre_counts.values, y=genre_counts.index)
    plt.title("Genre Distribution")
    plt.xlabel("Count")
    plt.tight_layout()
    plt.show()

    # Analyze revenue by genre.
    # Membership is derived from the split genre tokens rather than from
    # str.contains(): that avoids regex interpretation of genre names,
    # accidental substring matches, and NaN masks for movies without genres
    # (which would make boolean indexing and .astype(int) fail).
    genre_revenue = df.copy()
    genre_avg_revenue = []
    for genre in genre_counts.index[:top_n_genres]:
        rows_with_genre = genres_expanded.index[genres_expanded == genre]
        has_genre = df.index.isin(rows_with_genre)

        # 0/1 indicator column for this genre
        genre_revenue[f"is_{genre}"] = has_genre.astype(int)

        avg_rev = df.loc[has_genre, "revenue"].mean()
        genre_avg_revenue.append({"Genre": genre, "Average Revenue": avg_rev})

    # Explicit columns keep sort_values working even if no genre was found
    genre_avg_df = pd.DataFrame(
        genre_avg_revenue, columns=["Genre", "Average Revenue"]
    ).sort_values("Average Revenue", ascending=False)

    # Plot average revenue by genre
    plt.figure(figsize=(14, 8))
    sns.barplot(x="Average Revenue", y="Genre", data=genre_avg_df)
    plt.title("Average Revenue by Genre")
    plt.tight_layout()
    plt.show()

## 5. Temporal Analysis

Let's analyze how revenue trends over time.


In [None]:
# Convert release_date to datetime
if "release_date" in df.columns:
    # Years with this many movies or fewer are left out of the yearly plots
    min_movies_per_year = 5

    # Unparseable dates become NaT instead of raising
    df["release_date"] = pd.to_datetime(df["release_date"], errors="coerce")

    # Extract year and month. These columns are added to df itself;
    # release_month is reused by the feature-engineering cell further down.
    df["release_year"] = df["release_date"].dt.year
    df["release_month"] = df["release_date"].dt.month

    # Analyze revenue by year
    yearly_revenue = df.groupby("release_year")["revenue"].agg(
        ["mean", "median", "count"]
    )
    yearly_revenue = yearly_revenue[
        yearly_revenue["count"] > min_movies_per_year
    ]  # Filter years with few movies

    plt.figure(figsize=(14, 8))
    plt.subplot(2, 1, 1)
    sns.lineplot(x=yearly_revenue.index, y="mean", data=yearly_revenue)
    plt.title("Average Revenue by Year")
    plt.ylabel("Average Revenue")

    plt.subplot(2, 1, 2)
    sns.lineplot(x=yearly_revenue.index, y="count", data=yearly_revenue, color="orange")
    plt.title("Number of Movies by Year")
    plt.xlabel("Year")
    plt.ylabel("Count")

    plt.tight_layout()
    plt.show()

    # Analyze revenue by month
    monthly_revenue = df.groupby("release_month")["revenue"].agg(
        ["mean", "median", "count"]
    )

    month_labels = [
        "Jan",
        "Feb",
        "Mar",
        "Apr",
        "May",
        "Jun",
        "Jul",
        "Aug",
        "Sep",
        "Oct",
        "Nov",
        "Dec",
    ]

    # Build the plotting frame with one row for every calendar month.
    # Relabelling tick positions 0..11 after the fact would attach the wrong
    # month name to a bar whenever a month is absent from the data, so the
    # month name is attached to the data instead. The index is cast to int
    # because .dt.month yields floats when some dates are NaT.
    monthly_plot = monthly_revenue.copy()
    monthly_plot.index = monthly_plot.index.astype(int)
    monthly_plot = monthly_plot.reindex(range(1, 13))
    monthly_plot["month"] = month_labels

    plt.figure(figsize=(14, 6))
    sns.barplot(x="month", y="mean", data=monthly_plot, order=month_labels)
    plt.title("Average Revenue by Month")
    plt.xlabel("Month")
    plt.ylabel("Average Revenue")
    plt.tight_layout()
    plt.show()

## 6. Feature Relationships

Let's explore relationships between key features.


In [None]:
# Analyze relationship between runtime and revenue
if "runtime" in df.columns:
    plt.figure(figsize=(10, 6))
    sns.scatterplot(x="runtime", y="revenue", data=df, alpha=0.6)
    plt.title("Runtime vs. Revenue")
    plt.xlabel("Runtime (minutes)")
    plt.ylabel("Revenue")
    plt.tight_layout()
    plt.show()

    # Create runtime bins and analyze average revenue.
    # Bins are right-inclusive, e.g. "60-90" covers (60, 90]. The last edge is
    # open-ended so that the ">180" bucket really holds every runtime above
    # 180 minutes instead of dropping anything longer than 300 as NaN.
    # A runtime of exactly 0 falls outside the first bin and stays unbinned.
    df["runtime_bin"] = pd.cut(
        df["runtime"],
        bins=[0, 60, 90, 120, 150, 180, np.inf],
        labels=["<60", "60-90", "90-120", "120-150", "150-180", ">180"],
    )

    # observed=False keeps every bin on the x-axis (even empty ones) and states
    # the categorical-groupby behaviour explicitly rather than relying on the
    # library default.
    runtime_revenue = (
        df.groupby("runtime_bin", observed=False)["revenue"].mean().reset_index()
    )

    plt.figure(figsize=(10, 6))
    sns.barplot(x="runtime_bin", y="revenue", data=runtime_revenue)
    plt.title("Average Revenue by Runtime")
    plt.xlabel("Runtime (minutes)")
    plt.ylabel("Average Revenue")
    plt.tight_layout()
    plt.show()

In [None]:
# Revenue against the two audience metrics, one scatter panel per metric.
# Runs only when both metric columns are present.
audience_panels = [
    ("popularity", "Popularity vs. Revenue", "Popularity"),
    ("vote_average", "Vote Average vs. Revenue", "Vote Average"),
]
if "popularity" in df.columns and "vote_average" in df.columns:
    audience_fig, audience_axes = plt.subplots(1, 2, figsize=(12, 6))

    for panel_ax, (metric, panel_title, metric_label) in zip(
        audience_axes, audience_panels
    ):
        sns.scatterplot(data=df, x=metric, y="revenue", alpha=0.6, ax=panel_ax)
        panel_ax.set_title(panel_title)
        panel_ax.set_xlabel(metric_label)
        panel_ax.set_ylabel("Revenue")

    audience_fig.tight_layout()
    plt.show()

## 7. Advanced Visualizations

Let's create some advanced visualizations to better understand the data.


In [None]:
# Create a 3D scatter plot of budget, popularity, and revenue
if all(col in df.columns for col in ["budget", "popularity", "revenue"]):
    # The guard above only covers the plotted axes; hover fields are optional,
    # so keep just the ones that exist rather than failing on a missing column.
    hover_cols = [col for col in ["title", "release_date"] if col in df.columns]

    fig = px.scatter_3d(
        df,
        x="budget",
        y="popularity",
        z="revenue",
        opacity=0.7,
        color="revenue",
        hover_data=hover_cols or None,
        title="Budget, Popularity, and Revenue Relationship",
    )
    fig.show()

In [None]:
# Bubble chart: budget vs. revenue, marker size encodes vote count and
# marker colour encodes popularity. Skipped unless all four columns exist.
bubble_cols = {"budget", "popularity", "revenue", "vote_count"}
if bubble_cols.issubset(df.columns):
    bubble_fig, bubble_ax = plt.subplots(figsize=(12, 8))
    sns.scatterplot(
        data=df,
        x="budget",
        y="revenue",
        hue="popularity",
        size="vote_count",
        sizes=(20, 500),
        palette="viridis",
        alpha=0.6,
        ax=bubble_ax,
    )
    bubble_ax.set_title("Budget vs. Revenue (Size: Vote Count, Color: Popularity)")
    bubble_ax.set_xlabel("Budget")
    bubble_ax.set_ylabel("Revenue")
    bubble_fig.tight_layout()
    plt.show()

## 8. Feature Engineering Ideas

Based on our exploratory analysis, here are some feature engineering ideas:


In [None]:
# Create some example engineered features
# Work on a copy so the frame used by the exploratory cells is left untouched.
df_features = df.copy()

# 1. Log transformations for skewed features
if "budget" in df_features.columns:
    df_features["log_budget"] = np.log1p(df_features["budget"])

# 2. Budget per minute (production efficiency)
# A runtime of 0 is mapped to NaN so the ratio is undefined for those rows;
# dividing by a stand-in value of 1 would report the whole budget as the
# per-minute figure.
if "budget" in df_features.columns and "runtime" in df_features.columns:
    runtime_minutes = df_features["runtime"].replace(0, np.nan)
    df_features["budget_per_minute"] = df_features["budget"] / runtime_minutes

# 3. Return on investment (ROI)
# A budget of 0 is mapped to NaN for the same reason: with a stand-in of 1 the
# "ROI" would simply equal the raw revenue.
# NOTE: ROI is computed from revenue, the prediction target. It is useful for
# analysis but must not be fed to a revenue model as an input (target leakage).
if "budget" in df_features.columns and "revenue" in df_features.columns:
    nonzero_budget = df_features["budget"].replace(0, np.nan)
    df_features["roi"] = (
        df_features["revenue"] - df_features["budget"]
    ) / nonzero_budget

# 4. Weighted rating (IMDB formula)
#    WR = v / (v + m) * R + m / (v + m) * C
#    v = vote_count, R = vote_average,
#    m = 75th percentile of vote_count, C = mean vote_average over all movies
if "vote_average" in df_features.columns and "vote_count" in df_features.columns:
    m = df_features["vote_count"].quantile(0.75)
    C = df_features["vote_average"].mean()
    df_features["weighted_rating"] = (
        df_features["vote_count"]
        / (df_features["vote_count"] + m)
        * df_features["vote_average"]
    ) + (m / (df_features["vote_count"] + m) * C)

# 5. Season features
# release_month only exists if the temporal-analysis cell ran with a
# release_date column available. Rows with a missing month get 0 for both flags.
if "release_month" in df_features.columns:
    df_features["is_summer_release"] = (
        df_features["release_month"].isin([5, 6, 7, 8]).astype(int)
    )
    df_features["is_holiday_release"] = (
        df_features["release_month"].isin([11, 12]).astype(int)
    )

# Display the new features (columns present in df_features but not in df)
new_features = [col for col in df_features.columns if col not in df.columns]
df_features[new_features].head()

## 9. Summary and Insights

Based on our exploratory data analysis, here are the key insights:

1. **Revenue Distribution**: Movie revenue is highly skewed, suggesting a log transformation may be beneficial for modeling.

2. **Budget-Revenue Relationship**: There's a strong positive correlation between budget and revenue, making budget one of the most important predictors.

3. **Genre Impact**: Certain genres (like Action, Adventure, and Sci-Fi) tend to generate higher revenues on average.

4. **Seasonal Patterns**: Movies released during summer months and holiday season tend to perform better at the box office.

5. **Popularity Metrics**: Both popularity and vote average show positive correlations with revenue, indicating audience reception is important.

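6. **Runtime Relationship**: There appears to be a runtime range (120-150 minutes) associated with the highest average revenue; this is an observed association in binned averages, not evidence that runtime itself drives revenue.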

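7. **Feature Engineering Opportunities**: Several derived features like budget per minute and weighted ratings could improve model performance. ROI is also informative for analysis, but because it is computed from revenue (the prediction target) it must not be used as a model input, as that would leak the target.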

These insights will guide our feature engineering and modeling approaches in the next notebooks.
