# Total Oscar candidate actors/actresses who have the most famous films by genre.

## And at the end answer two questions:

* What is the gender with the highest number of candidates?

* What is the film with the greatest percentage difference in nominations compared to the average for its genre?


In [19]:
import re
import json
from urllib.parse import urljoin
import requests
from bs4 import BeautifulSoup
import pandas as pd


## Extract the 30 best movies from the action, comedy, drama and horror genres on FilmAffinity, as well as movies from 1980 onwards.

In [20]:
full_peliculas = []  # Lista vacía para almacenar la información de las películas
generos = {'action', 'comedy', 'drama', 'horror'}  # Conjunto de géneros

# Iterar sobre cada género
for genero in generos:
    # Construir la URL para cada género
    genre_url = f'https://www.filmaffinity.com/us/topgen.php?genres=%2B{genero[:2].upper()}&chv=0&orderby=rc&movietype=movie%7C&country=&fromyear=1980&toyear=2023&ratingcount=2&runtimemin=0&runtimemax=4'
    
    # Realizar una solicitud GET a la URL del género y obtener la página
    req = requests.get(genre_url)
    
    # Crear un objeto BeautifulSoup para analizar el HTML de la página del género
    soup = BeautifulSoup(req.text, 'html.parser')
    
    # Encontrar todas las películas en la página del género utilizando la clase 'mc-title'
    for pelicula in soup.select('.mc-title'):
        # Obtener el título de la película
        titulo = pelicula.text.strip()
        
        # Encontrar el div siguiente que contiene el reparto utilizando la clase 'mc-cast'
        cast_div = pelicula.find_next_sibling(class_='mc-cast')
        
        # Obtener los nombres de los actores de los elementos '.nb' dentro del div de reparto
        actores = [actor.text.strip() for actor in cast_div.select('.nb') if actor.a]
        
        # Agregar la información de la película a la lista de películas
        full_peliculas.append({
            "title": titulo.split("(")[0].strip(),  # Título de la película
            "year": titulo.split("(")[1].strip().replace(")", ""),  # Año de la película
            "genre": genero,  # Género de la película
            "actors": ", ".join(actores)  # Lista de actores de la película
        })    

# Devolver la lista completa de películas recolectadas
full_peliculas


[{'title': 'The Matrix',
  'year': '1999',
  'genre': 'action',
  'actors': 'Keanu Reeves,, Laurence Fishburne,, Carrie-Anne Moss,, Joe Pantoliano,, Hugo Weaving,, Marcus Chong,, Gloria Foster,, Matt Doran'},
 {'title': 'The Lord of the Rings: The Fellowship of the Ring',
  'year': '2001',
  'genre': 'action',
  'actors': 'Elijah Wood,, Ian McKellen,, Viggo Mortensen,, Sean Astin,, Sean Bean,, John Rhys-Davies,, Orlando Bloom,, Dominic Monaghan'},
 {'title': 'Gladiator',
  'year': '2000',
  'genre': 'action',
  'actors': 'Russell Crowe,, Joaquin Phoenix,, Connie Nielsen,, Oliver Reed,, Richard Harris,, Ralf Moeller,, Derek Jacobi,, Djimon Hounsou'},
 {'title': 'Kill Bill: Volume 1',
  'year': '2003',
  'genre': 'action',
  'actors': 'Uma Thurman,, Lucy Liu,, Daryl Hannah,, Vivica A. Fox,, Sonny Chiba,, Chiaki Kuriyama,, Michael Bowen,, Julie Dreyfus'},
 {'title': 'The Lord of the Rings: The Return of the King',
  'year': '2003',
  'genre': 'action',
  'actors': 'Elijah Wood,, Viggo Mor

In [21]:
movies = pd.DataFrame(full_peliculas)
movies["name"] = movies["actors"].str.split(",")
movies

Unnamed: 0,title,year,genre,actors,name
0,The Matrix,1999,action,"Keanu Reeves,, Laurence Fishburne,, Carrie-Ann...","[Keanu Reeves, , Laurence Fishburne, , Carri..."
1,The Lord of the Rings: The Fellowship of the Ring,2001,action,"Elijah Wood,, Ian McKellen,, Viggo Mortensen,,...","[Elijah Wood, , Ian McKellen, , Viggo Morten..."
2,Gladiator,2000,action,"Russell Crowe,, Joaquin Phoenix,, Connie Niels...","[Russell Crowe, , Joaquin Phoenix, , Connie ..."
3,Kill Bill: Volume 1,2003,action,"Uma Thurman,, Lucy Liu,, Daryl Hannah,, Vivica...","[Uma Thurman, , Lucy Liu, , Daryl Hannah, , ..."
4,The Lord of the Rings: The Return of the King,2003,action,"Elijah Wood,, Viggo Mortensen,, Ian McKellen,,...","[Elijah Wood, , Viggo Mortensen, , Ian McKel..."
...,...,...,...,...,...
85,City of God,2002,drama,"Alexandre Rodrigues,, Leandro Firmino,, Phelli...","[Alexandre Rodrigues, , Leandro Firmino, , P..."
86,The Truman Show,1998,drama,"Jim Carrey,, Laura Linney,, Noah Emmerich,, Ed...","[Jim Carrey, , Laura Linney, , Noah Emmerich..."
87,The Curious Case of Benjamin Button,2008,drama,"Brad Pitt,, Cate Blanchett,, Taraji P. Henson,...","[Brad Pitt, , Cate Blanchett, , Taraji P. He..."
88,Full Metal Jacket,1987,drama,"Matthew Modine,, Vincent D'Onofrio,, R. Lee Er...","[Matthew Modine, , Vincent D'Onofrio, , R. L..."


In [22]:
movies = movies.explode("name").drop(columns="actors")
movies["name"] = movies["name"].str.strip()

In [23]:
movies = movies.reset_index(drop=True)

movies

Unnamed: 0,title,year,genre,name
0,The Matrix,1999,action,Keanu Reeves
1,The Matrix,1999,action,
2,The Matrix,1999,action,Laurence Fishburne
3,The Matrix,1999,action,
4,The Matrix,1999,action,Carrie-Anne Moss
...,...,...,...,...
1316,Requiem for a Dream,2000,drama,Louise Lasser
1317,Requiem for a Dream,2000,drama,
1318,Requiem for a Dream,2000,drama,Marcia Jean Kurtz
1319,Requiem for a Dream,2000,drama,


## Extract the Oscar awards from the year 1980.

In [24]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

actor_types = [
    ('al', 'actor'),
    ('a_la', 'actriz'),
    ('al', 'actor_de_reparto'),
    ('a_la', 'actriz_de_reparto')
]

all_actors_movies = []

for art, actor_type in actor_types:
    url = f'https://es.wikipedia.org/wiki/Anexo:%C3%93scar_{art}_mejor_{actor_type}'

    # Realizar la solicitud GET a la página
    response = requests.get(url)

    # Crear el objeto BeautifulSoup para analizar el HTML
    soup = BeautifulSoup(response.content, 'html.parser')

    # Encontrar todas las tablas en la página
    tablas = soup.find_all('table')

    # Lista para almacenar los actores y películas de esta URL
    actors_movies = []

    # Iterar sobre cada tabla y extraer los nombres de los actores y películas
    for tabla in tablas:
        filas = tabla.find_all('tr')  # Encontrar todas las filas de la tabla
        for fila in filas:
            columnas = fila.find_all(['th', 'td'])  # Encontrar las columnas (celdas de encabezado y datos)
            datos_fila = [columna.text.strip() for columna in columnas]  # Extraer texto de cada celda
            # Añadir a la lista si hay datos en la fila y no es un encabezado
            if datos_fila and len(datos_fila) >= 4 and datos_fila[0].isdigit():
                actor_pelicula = {
                    'actor': datos_fila[2],
                    'pelicula': datos_fila[3]
                }
                actors_movies.append(actor_pelicula)

    # Añadir los actores y películas extraídos de esta URL a la lista principal
    all_actors_movies.append({
        'actor_type': actor_type,
        'actors_movies': actors_movies
    })

# Crear un DataFrame con los datos recolectados
data = []
for table_info in all_actors_movies:
    for actor_pelicula in table_info['actors_movies']:
        data.append([actor_pelicula['actor'], actor_pelicula['pelicula']])

oscars = pd.DataFrame(data, columns=['name', 'Película'])

oscars

Unnamed: 0,name,Película
0,Brendan Fraser,The Whale
1,Will Smith,King Richard
2,Anthony Hopkins,The Father
3,Joaquin Phoenix,Joker
4,Rami Malek,Bohemian Rhapsody
...,...,...
438,Jane Darwell,The Grapes of Wrath
439,Hattie McDaniel,Lo que el viento se llevó
440,Fay Bainter,Jezabel
441,Alice Brady,In Old Chicago


In [25]:
oscars = oscars.groupby("name").size().sort_values()
oscars = oscars.reset_index()
oscars.columns = ["name", "nominations"]
oscars

Unnamed: 0,name,nominations
0,Adrien Brody,1
1,Marcia Gay Harden,1
2,Lupita Nyong'o[1]​[2]​[3]​,1
3,Louise Fletcher,1
4,"Louis Gossett, Jr.",1
...,...,...
308,Frances McDormand,3
309,Daniel Day-Lewis,3
310,Ingrid Bergman,3
311,Katharine Hepburn,4


In [26]:
df = movies.merge(oscars, on="name", how='inner')
df

Unnamed: 0,title,year,genre,name,nominations
0,Gladiator,2000,action,Russell Crowe,1
1,Gladiator,2000,drama,Russell Crowe,1
2,Gladiator,2000,action,Joaquin Phoenix,1
3,Gladiator,2000,drama,Joaquin Phoenix,1
4,Inception,2010,action,Leonardo DiCaprio,1
...,...,...,...,...,...
98,Moulin Rouge,2001,drama,Jim Broadbent,1
99,The Curious Case of Benjamin Button,2008,drama,Cate Blanchett,2
100,The Curious Case of Benjamin Button,2008,drama,Tilda Swinton,1
101,Requiem for a Dream,2000,drama,Jennifer Connelly,1


In [27]:
df.to_json("movies_actors.json")

## Género con más nominaciones:

In [28]:
df = pd.read_json("movies_actors.json")

In [29]:
df.groupby("genre")["nominations"].sum().sort_values(ascending=False)

genre
drama     55
comedy    39
action    37
Name: nominations, dtype: int64

## The genre with the most nominations is drama.

In [30]:
df = df.groupby(["title", "genre"])["nominations"].sum()

In [31]:
df = df.reset_index()
df

Unnamed: 0,title,genre,nominations
0,American Beauty,comedy,4
1,American Beauty,drama,4
2,As Good As It Gets,comedy,5
3,As Good As It Gets,drama,5
4,Batman,action,5
5,Batman Begins,action,5
6,Big Fish,drama,2
7,Fight Club,drama,2
8,Finding Nemo,comedy,1
9,Forrest Gump,comedy,4


In [32]:
#Media por nominación por cada género
df.groupby("genre")["nominations"].mean()

genre
action    2.466667
comedy    2.052632
drama     2.894737
Name: nominations, dtype: float64

In [33]:
#ratio de nominación por cada película.
df["ratio_nominations"] = df.groupby("genre")["nominations"].transform(lambda x: x/x.mean())
df

Unnamed: 0,title,genre,nominations,ratio_nominations
0,American Beauty,comedy,4,1.948718
1,American Beauty,drama,4,1.381818
2,As Good As It Gets,comedy,5,2.435897
3,As Good As It Gets,drama,5,1.727273
4,Batman,action,5,2.027027
5,Batman Begins,action,5,2.027027
6,Big Fish,drama,2,0.690909
7,Fight Club,drama,2,0.690909
8,Finding Nemo,comedy,1,0.487179
9,Forrest Gump,comedy,4,1.948718


In [34]:
df.nlargest(3, columns="ratio_nominations")

Unnamed: 0,title,genre,nominations,ratio_nominations
2,As Good As It Gets,comedy,5,2.435897
38,The Dark Knight,action,6,2.432432
39,The Dark Knight,drama,6,2.072727


In [40]:
movies[movies['title']=='As Good As It Gets']

Unnamed: 0,title,year,genre,name
672,As Good As It Gets,1997,comedy,Jack Nicholson
673,As Good As It Gets,1997,comedy,
674,As Good As It Gets,1997,comedy,Helen Hunt
675,As Good As It Gets,1997,comedy,
676,As Good As It Gets,1997,comedy,Greg Kinnear
677,As Good As It Gets,1997,comedy,
678,As Good As It Gets,1997,comedy,Cuba Gooding Jr.
679,As Good As It Gets,1997,comedy,
680,As Good As It Gets,1997,comedy,Skeet Ulrich
681,As Good As It Gets,1997,comedy,


## Movie with the greatest percentage difference compared to the average for its genre is Better... Impossible (1997) starring Jack Nicholson.


In [36]:
df.nsmallest(3, columns="ratio_nominations")

Unnamed: 0,title,genre,nominations,ratio_nominations
20,Life Is Beautiful,drama,1,0.345455
31,Schindler's List,drama,1,0.345455
44,The Pianist,drama,1,0.345455


## Films with lower percentages compared to the average for their genre are The Pianist, Schindler's List and Life is Beautiful.