In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import re
from concurrent.futures import ThreadPoolExecutor, as_completed
import multiprocessing as mp

# Función para limpiar los nombres de las hojas
def clean_sheet_name(name):
    return re.sub(r'[\/:*?"<>|]', '_', name)

# Función para formatear la fecha de estreno
def format_release_date(date_str):
    try:
        formatted_date = pd.to_datetime(date_str, errors='coerce')
        if pd.isnull(formatted_date):
            return None
        return formatted_date.strftime('%Y-%m-%d')
    except Exception as e:
        return None

# Función para extraer detalles adicionales de la página de la película
def get_movie_details(movie_url):
    movie_details = {}
    try:
        movie_response = requests.get(movie_url, timeout=10)
        movie_response.raise_for_status()
        movie_soup = BeautifulSoup(movie_response.text, 'html.parser')

        domestic_distributor = movie_soup.find('span', text='Domestic Distributor')
        if domestic_distributor:
            movie_details['Domestic Distributor'] = domestic_distributor.find_next_sibling('span').text.strip()

        domestic_opening = movie_soup.find('span', text='Domestic Opening')
        if domestic_opening:
            movie_details['Domestic Opening'] = domestic_opening.find_next_sibling('span').text.strip()

        budget = movie_soup.find('span', text='Budget')
        if budget:
            movie_details['Budget'] = budget.find_next_sibling('span').text.strip()

        earliest_release_date = movie_soup.find('span', text='Earliest Release Date')
        if earliest_release_date:
            movie_details['Earliest Release Date'] = earliest_release_date.find_next_sibling('span').text.strip()

        mpaa = movie_soup.find('span', text='MPAA')
        if mpaa:
            movie_details['MPAA'] = mpaa.find_next_sibling('span').text.strip()

        running_time = movie_soup.find('span', text='Running Time')
        if running_time:
            movie_details['Running Time'] = running_time.find_next_sibling('span').text.strip()

        genres = movie_soup.find('span', text='Genres')
        if genres:
            movie_details['Genres'] = genres.find_next_sibling('span').text.strip()

    except Exception as e:
        print(f"Error al obtener detalles de la película: {e}")

    return movie_details

# Función para procesar un género y obtener detalles de películas
def process_genre(genre_data):
    genre_name, full_genre_url = genre_data
    try:
        genre_response = requests.get(full_genre_url, timeout=10)
        genre_response.raise_for_status()
        genre_soup = BeautifulSoup(genre_response.text, 'html.parser')
        movie_table = genre_soup.find('table')

        movie_details = []
        if movie_table:
            movie_rows = movie_table.find_all('tr')[1:]

            for movie_row in movie_rows:
                movie_cells = movie_row.find_all('td')
                if len(movie_cells) >= 6:
                    movie_title = movie_cells[1].text.strip()
                    distributor = movie_cells[2].text.strip()
                    release_date = format_release_date(movie_cells[3].text.strip())
                    production_cost = movie_cells[4].text.strip()
                    box_office = movie_cells[5].text.strip()

                    movie_link = movie_cells[1].find('a')['href']
                    full_movie_url = f"https://www.boxofficemojo.com{movie_link}"
                    additional_details = get_movie_details(full_movie_url)

                    movie_info = {
                        'Movie Title': movie_title,
                        'Distributor': distributor,
                        'Release Date': release_date,
                        'Production Cost': production_cost,
                        'Box Office': box_office
                    }
                    movie_info.update(additional_details)
                    movie_details.append(movie_info)

        cleaned_genre_name = clean_sheet_name(genre_name)
        genre_df = pd.DataFrame(movie_details).drop_duplicates()

        return cleaned_genre_name, genre_df
    except Exception as e:
        print(f"Error al procesar el género {genre_name}: {e}")
        return genre_name, pd.DataFrame()

# URL de la página de géneros en Box Office Mojo
url = "https://www.boxofficemojo.com/genre/?ref_=bo_lnav_hm_shrt"

response = requests.get(url)
response.raise_for_status()
soup = BeautifulSoup(response.text, 'html.parser')

all_genres_data = []
genre_movie_dataframes = {}

genre_table = soup.find('table')

if genre_table:
    rows = genre_table.find_all('tr')[1:]

    genres_to_process = []
    for row in rows:
        cells = row.find_all('td')
        if len(cells) >= 5:
            genre_name = cells[0].text.strip()
            genre_link = cells[0].find('a')['href']
            full_genre_url = f"https://www.boxofficemojo.com{genre_link}"
            genres_to_process.append((genre_name, full_genre_url))

    # Usar concurrent.futures con 5 trabajadores (60% de 8 núcleos)
    max_workers = int(mp.cpu_count() * 0.6)
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = {executor.submit(process_genre, genre_data): genre_data for genre_data in genres_to_process}
        for future in as_completed(futures):
            genre_name, df = future.result()
            if not df.empty:
                genre_movie_dataframes[genre_name] = df

else:
    print("No se encontró la tabla de géneros.")

# Guardar la información en un archivo Excel con múltiples pestañas
with pd.ExcelWriter('movies_data.xlsx') as writer:
    for genre_name, df in genre_movie_dataframes.items():
        df.to_excel(writer, sheet_name=genre_name, index=False)

print("Datos guardados en 'movies_data.xlsx' con éxito.")


  domestic_distributor = movie_soup.find('span', text='Domestic Distributor')
  domestic_opening = movie_soup.find('span', text='Domestic Opening')
  budget = movie_soup.find('span', text='Budget')
  earliest_release_date = movie_soup.find('span', text='Earliest Release Date')
  mpaa = movie_soup.find('span', text='MPAA')
  running_time = movie_soup.find('span', text='Running Time')
  genres = movie_soup.find('span', text='Genres')


Datos guardados en 'movies_data.xlsx' con éxito.
