# Movie Studio Box Office Analysis: Comprehensive Exploratory Data Analysis

## Domestic Box Office for 2024 and 2025, Scraped from https://www.boxofficemojo.com/

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import itertools
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Suppress warnings: the 'ignore' action with no category or module filter
# silences every warning (including deprecation notices) from here on.
import warnings
warnings.filterwarnings('ignore')

def _summary_value(summary, label):
    """Return the stripped text of the <span> following the *label* <span>.

    Returns None when either the label span or its sibling value span is
    absent, so callers can simply skip fields the page does not provide.
    """
    label_elem = summary.find('span', string=label)
    if label_elem is None:
        return None
    value_elem = label_elem.find_next_sibling('span')
    if value_elem is None:
        # Label present but no value span next to it: treat as "not available"
        # instead of failing with AttributeError on None.text.
        return None
    return value_elem.text.strip()


def extract_movie_details(url):
    """Fetch a movie page and pull selected fields from its summary box.

    Args:
        url: Address of the movie page to download.

    Returns:
        A dict containing whichever of these keys could be found:
        'MPAA', 'Running Time', 'In Release' (stripped strings) and
        'Genres' (list of non-empty, stripped lines of the genres text).
        An empty dict is returned when the page has no
        ``div.mojo-summary-values`` section, so the result is always safe
        to pass to ``dict.update``.

    Raises:
        requests.RequestException: if the HTTP request fails or times out.
    """
    # Without a timeout requests may wait forever on an unresponsive server.
    response = requests.get(url, timeout=30)

    # Parse the HTML content
    soup = BeautifulSoup(response.text, "html.parser")

    # Find the summary details section
    summary_details = soup.find('div', class_='mojo-summary-values')
    if summary_details is None:
        return {}

    details = {}
    # Labels are processed in a fixed order so the resulting key order (and
    # therefore any DataFrame column order derived from it) stays stable.
    for label in ('MPAA', 'Running Time', 'Genres', 'In Release'):
        value = _summary_value(summary_details, label)
        if value is None:
            continue
        if label == 'Genres':
            # Genres are rendered one per line inside a single span.
            details[label] = [genre.strip() for genre in value.split('\n') if genre.strip()]
        else:
            details[label] = value

    return details

def scrape_box_office_data(years):
    """Scrape the Box Office Mojo yearly chart for each year into a DataFrame.

    For every table row with enough cells, the chart columns are collected
    and, when the row links to a movie page, the fields returned by
    ``extract_movie_details`` for that page are merged in.

    Args:
        years: Iterable of years; each one is substituted into the
            ``/year/{year}/?grossesOption=totalGrosses`` chart URL.

    Returns:
        pandas.DataFrame with one row per scraped movie. Chart values are
        kept as strings; '$' and ',' are removed from the two gross columns.

    Raises:
        requests.RequestException: if a chart page request fails or times out.
    """
    base_url = "https://www.boxofficemojo.com"
    # Highest cell index read below is 12 (Distributor), so a usable row
    # needs at least 13 <td> cells.
    min_columns = 13

    data = []
    for year in years:
        url = f'{base_url}/year/{year}/?grossesOption=totalGrosses'
        # Without a timeout requests may wait forever on an unresponsive server.
        response = requests.get(url, timeout=30)
        soup = BeautifulSoup(response.text, "html.parser")

        # Skip the first <tr>, which is the header row
        rows = soup.find_all('tr')[1:]

        for row in rows:
            cols = row.find_all('td')
            if len(cols) < min_columns:
                continue

            # Find the movie link and build its absolute URL once
            movie_link = cols[1].find('a', class_='a-link-normal')
            has_link = movie_link is not None
            movie_url = base_url + movie_link['href'] if has_link else 'N/A'

            # Base movie data
            movie_data = {
                'Rank': cols[0].text.strip(),
                'Year': year,
                'Movie': movie_link.text.strip() if has_link else 'N/A',
                'Movie Link': movie_url,
                'Total Gross': cols[5].text.strip().replace('$', '').replace(',', ''),
                'Max Theaters': cols[6].text.strip(),
                'Opening Weekend Gross': cols[7].text.strip().replace('$', '').replace(',', ''),
                'Opening Weekend % of Total': cols[8].text.strip(),
                'Opening Theaters': cols[9].text.strip(),
                'Open Date': cols[10].text.strip(),
                'Distributor': cols[12].text.strip(),
            }

            # Get additional movie details
            if has_link:
                additional_details = extract_movie_details(movie_url)
                # Only merge mapping results: passing a non-dict value such
                # as an error-message string to dict.update raises ValueError.
                if isinstance(additional_details, dict):
                    movie_data.update(additional_details)

            data.append(movie_data)

    return pd.DataFrame(data)

# Example usage: scrape the yearly charts for 2024 and 2025. This issues one
# HTTP request per year page plus one per linked movie page.
years = [2024, 2025]
box_office_data = scrape_box_office_data(years)

# Preview the first 10 rows of the DataFrame
box_office_data.head(10)