
## Extract the 30 best movies from the action, comedy, drama and horror genres on FilmAffinity, as well as movies from 1980 onwards.

In [2]:
import requests
from bs4 import BeautifulSoup

# Empty list to store the information about the movies
all_movies = []

# Set of genres
genres = {'action', 'comedy', 'drama', 'horror'}

# Iterate over each genre
for genre in genres:
    # Build the URL for each genre
    genre_url = f'https://www.filmaffinity.com/us/topgen.php?genres=%2B{genre[:2].upper()}&chv=0&orderby=rc&movietype=movie%7C&country=&fromyear=1980&toyear=2023&ratingcount=2&runtimemin=0&runtimemax=4'
    
    try:
        # Make a GET request to the genre URL and get the page
        req = requests.get(genre_url)
        req.raise_for_status()  # Check if the request was successful

        # Create a BeautifulSoup object to parse the HTML of the genre page
        soup = BeautifulSoup(req.text, 'html.parser')
        
        # Find all movies on the genre page using the 'mc-title' class
        for movie in soup.select('.mc-title'):
            # Get the full title of the movie
            full_title = movie.text.strip()

            # Extract the title and year if they are available
            try:
                title, year = full_title.split("(")
                year = year.replace(")", "").strip()
            except ValueError:
                # Skip the entry if it doesn't have the expected format
                continue

            # Find the next div containing the cast using the 'mc-cast' class
            cast_div = movie.find_next_sibling(class_='mc-cast')
            actors = []
            if cast_div:
                # Get the names of the actors from the '.nb' elements within the cast div
                actors = [actor.text.strip() for actor in cast_div.select('.nb') if actor.a]

            # Add the movie information to the list of movies
            all_movies.append({
                "title": title.strip(),
                "year": year,
                "genre": genre,
                "actors": ", ".join(actors)
            })
    
    except requests.exceptions.RequestException as e:
        print(f"Error fetching data for genre {genre}: {e}")

# Return the complete list of collected movies
all_movies


[{'title': 'Men in Black',
  'year': 'MIB',
  'genre': 'action',
  'actors': "Tommy Lee Jones,, Will Smith,, Linda Fiorentino,, Vincent D'Onofrio,, Rip Torn"},
 {'title': 'Sin City',
  'year': "Frank Miller's Sin City",
  'genre': 'action',
  'actors': 'Bruce Willis,, Mickey Rourke,, Clive Owen,, Rosario Dawson,, Jessica Alba'},
 {'title': 'Men in Black',
  'year': 'MIB',
  'genre': 'comedy',
  'actors': "Tommy Lee Jones,, Will Smith,, Linda Fiorentino,, Vincent D'Onofrio,, Rip Torn"}]

In [6]:
import requests
from bs4 import BeautifulSoup

full_peliculas = []  # Lista vacía para almacenar la información de las películas
generos = ['action', 'comedy', 'drama', 'horror']  # Lista de géneros

# Iterar sobre cada género
for genero in generos:
    # Construir la URL para cada género
    genre_url = f'https://www.filmaffinity.com/us/topgen.php?genres=%2B{genero[:2].upper()}&chv=0&orderby=rc&movietype=movie%7C&country=&fromyear=1980&toyear=2023&ratingcount=2&runtimemin=0&runtimemax=4'
    
    # Realizar una solicitud GET a la URL del género y obtener la página
    req = requests.get(genre_url)
    if req.status_code != 200:  # Verificar que la solicitud fue exitosa
        print(f"Error fetching page for genre {genero}: {req.status_code}")
        continue

    # Crear un objeto BeautifulSoup para analizar el HTML de la página del género
    soup = BeautifulSoup(req.text, 'html.parser')
    
    # Encontrar todas las películas en la página del género
    peliculas = soup.select('.mc-title')

    # Extraer información de las películas
    for pelicula in peliculas:
        # Obtener el título de la película
        titulo = pelicula.text.strip()
        
        # Encontrar el div siguiente que contiene el reparto
        cast_div = pelicula.find_next_sibling(class_='mc-cast')
        
        # Obtener los nombres de los actores
        actores = [actor.text.strip() for actor in cast_div.select('.nb') if actor.a]
        
        # Agregar la información de la película a la lista de películas
        full_peliculas.append({
            "title": titulo.split("(")[0].strip(),  # Título de la película
            "year": titulo.split("(")[1].strip().replace(")", ""),  # Año de la película
            "genre": genero,  # Género de la película
            "actors": ", ".join(actores)  # Lista de actores de la película
        })

# Devolver la lista completa de películas recolectadas
for i, pelicula in enumerate(full_peliculas, 1):
    print(f"{i}. Title: {pelicula['title']}, Year: {pelicula['year']}, Genre: {pelicula['genre']}, Actors: {pelicula['actors']}")


Error fetching page for genre action: 429
Error fetching page for genre comedy: 429
Error fetching page for genre drama: 429
Error fetching page for genre horror: 429


## Save the list of all movies as a csv file.

In [3]:
import csv

# Specify the filename
filename = "movies.csv"

# Define the header based on the dictionary keys in all_movies
header = ["title", "year", "genre", "actors"]

# Write the data to a CSV file
with open(filename, mode="w", newline="", encoding="utf-8") as file:
    writer = csv.DictWriter(file, fieldnames=header)
    writer.writeheader()  # Write the header
    writer.writerows(all_movies)  # Write each movie as a row

print(f"Data successfully written to {filename}")


Data successfully written to movies.csv
