In [1]:
# [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/googlecolab/colabtools/blob/master/notebooks/colab-github-demo.ipynb)


In [5]:
GENRES_IMDB = {
    28: "Action",
    12: "Adventure",
    16: "Animation",
    35: "Comedy",
    80: "Crime",
    99: "Documentary",
    18: "Drama",
    10751: "Family",
    14: "Fantasy",
    36: "History",
    27: "Horror",
    10402: "Music",
    9648: "Mystery",
    10749: "Romance",
    878: "Sci-Fi",
    10770: "TV Movie",
    53: "Thriller",
    10752: "War",
    37: "Western"
}

GENRES_IMDB_INVERTED = {
    "Action": 28,
    "Adventure": 12,
    "Animation": 16,
    "Comedy": 35,
    "Crime": 80,
    "Documentary": 99,
    "Drama": 18,
    "Family": 10751,
    "Fantasy": 14,
    "History": 36,
    "Horror": 27,
    "Music": 10402,
    "Mystery": 9648,
    "Romance": 10749,
    "Sci-Fi": 878,
    "TV Movie": 10770,
    "Thriller": 53,
    "War": 10752,
    "Western": 37
}



# Scrapping de información con API

In [3]:
import requests
import json
import time

# Configuración de la API
API_KEY = "c436a0598ba40f517d94fa3c9cc217d6"  # Reemplaza con tu API Key de TMDB
BASE_URL = "https://api.themoviedb.org/3/movie/popular"
NUM_PELICULAS = 10000  # Número total de películas a descargar
PELICULAS_POR_PAGINA = 20  # TMDB devuelve 20 películas por página
paginas_a_descargar = (NUM_PELICULAS // PELICULAS_POR_PAGINA) + 1

def obtener_peliculas(n=NUM_PELICULAS):
    peliculas = []
    for pagina in range(1, paginas_a_descargar + 1):
        url = f"{BASE_URL}?api_key={API_KEY}&language=en-EN&page={pagina}"
        respuesta = requests.get(url)
        if respuesta.status_code == 200:
            datos = respuesta.json()
            peliculas.extend(datos["results"])
        else:
            print(f"⚠ Error en la petición: {respuesta.status_code}")
            break
        # Pausa para no exceder los límites de la API (40 peticiones/10s)
        time.sleep(0.5)
        # Detener si alcanzamos el límite deseado
        if len(peliculas) >= n:
            break

    return peliculas[:n]

# Obtener las películas más populares y guardarlas en JSON
peliculas = obtener_peliculas(NUM_PELICULAS)

# Guardar en un archivo JSON
with open("peliculas.json", "w", encoding="utf-8") as f:
    json.dump(peliculas, f, indent=4, ensure_ascii=False)

print(f"✅ Se han guardado {len(peliculas)} películas en peliculas.json")

✅ Se han guardado 10000 películas en peliculas.json


# Limpieza DataSet

In [15]:
import pandas as pd

# Leer el archivo JSON
with open("peliculas.json", "r", encoding="utf-8") as f:
      peliculas = json.load(f)



# Crear un DataFrame con las columnas seleccionadas
df_peliculas = pd.DataFrame(peliculas, columns=["original_language", "title", "release_date", "overview", "genre_ids"])

# Remove duplicate rows based on all columns
# df_peliculas.drop_duplicates(inplace=True)

# Remove rows with missing values (NaN) in any column
#df_peliculas.dropna(inplace=True)

# Reset the index after removing rows
#df_peliculas.reset_index(drop=True, inplace=True)

df_peliculas.head()




Unnamed: 0,original_language,title,release_date,overview,genre_ids
0,en,Sonic the Hedgehog 3,2024-12-19,"Sonic, Knuckles, and Tails reunite against a p...","[28, 878, 35, 10751]"
1,en,Back in Action,2025-01-15,Fifteen years after vanishing from the CIA to ...,"[28, 35]"
2,en,Moana 2,2024-11-21,After receiving an unexpected call from her wa...,"[16, 12, 10751, 35, 9648]"
3,en,Kraven the Hunter,2024-12-11,Kraven Kravinoff's complex relationship with h...,"[28, 878, 12, 14, 53]"
4,en,Alarum,2025-01-16,Two married spies caught in the crosshairs of ...,"[28, 80, 53]"



# Guardar dataset en json




In [18]:
df_peliculas.to_json("./peliculas_limpias.json", orient="records", indent=4, force_ascii=False)
print("✅ Archivo JSON guardado en 'data/peliculas_limpias.json'")


✅ Archivo JSON guardado en 'data/peliculas_limpias.json'


#Funciones varias


In [12]:
def getDataFrameGenero(genero):
  if isinstance(genero, str):
    genero=GENRES_IMDB_INVERTED[genero]
  df_peliculas_genero = df_peliculas[df_peliculas['genre_ids'].apply(lambda x: genero in x)]
  return df_peliculas_genero

def getDataFrameLanguage(language):
  df_peliculas_language = df_peliculas[df_peliculas['original_language'] == language]
  return df_peliculas_language

def getDataFrameYearRange(start_year, end_year):
  """
  Filters the DataFrame to include movies released within a specified year range.

  Args:
    start_year: The starting year of the range (inclusive).
    end_year: The ending year of the range (inclusive).

  Returns:
    A filtered DataFrame containing movies released within the specified range.
  """
  df_peliculas_year_range = df_peliculas[
      df_peliculas['release_date'].str.slice(0, 4).between(str(start_year), str(end_year))
  ]
  return df_peliculas_year_range

# Example usage
df_2020_to_2023_movies = getDataFrameYearRange(2020, 2023)
# You can now work with the df_2020_to_2023_movies DataFrame

df_2020_to_2023_movies.head()


Unnamed: 0,original_language,title,release_date,overview,genre_ids
32,es,My Fault,2023-06-08,"Noah must leave her city, boyfriend, and frien...","[10749, 18]"
48,ko,Sex Game 6969,2022-01-27,Three married women had always been dissatisfi...,"[35, 18, 10749]"
52,en,Sonic the Hedgehog 2,2022-03-30,"After settling in Green Hills, Sonic is eager ...","[28, 12, 10751, 35]"
85,sv,Pleasure,2021-10-08,19 year old Linnéa leaves her small town in Sw...,[18]
89,pl,365 Days: This Day,2022-04-26,Laura and Massimo are back and hotter than eve...,"[10749, 18]"


In [38]:
import pandas as pd
import numpy as np
import torch
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

import os

class DenseRetriever:
    def __init__(self, df, model_name="sentence-transformers/all-MiniLM-L6-v2"):
        """
        Inicializa el modelo de embeddings y almacena los embeddings en memoria.
        :param df: DataFrame con las columnas ["title", "overview"].
        :param model_name: Nombre del modelo de Hugging Face.
        """
        self.df = df
        self.model = SentenceTransformer(model_name)
        self.embeddings = None

        # Concatenar "title + overview" para generar embeddings
        self.df["text"] = self.df["title"] + " - " + self.df["overview"]

        # Generar embeddings para las películas
        self._generate_embeddings("./movie_embeddings.npy") # Se puede añadir un path

    def _generate_embeddings(self, pathEmbeddings=None):
        """Genera embeddings y los almacena en memoria."""
        if pathEmbeddings and os.path.exists(pathEmbeddings):
            self.embeddings = np.load(pathEmbeddings)
            print(f"Embeddings cargados desde el archivo: {pathEmbeddings}")
        else:
          print("🔹 Generando embeddings...")
          self.embeddings = self.model.encode(self.df["text"].tolist(), convert_to_numpy=True)
          np.save("movie_embeddings.npy", self.embeddings)  # Guardar embeddings en un archivo .npy

    def save_embeddings(self, path):
        """Guarda los embeddings en un archivo .npy."""
        np.save(path, self.embeddings)
        print(f"Embeddings guardados en: {path}")


    def search(self, query, top_k=5):
        """
        Realiza una búsqueda utilizando similitud del coseno.
        :param query: Texto de búsqueda.
        :param top_k: Número de resultados a devolver.
        :return: DataFrame con los resultados ordenados por similitud.
        """
        print(f"🔍 Buscando: {query}")

        # Convertir la query en embedding
        query_embedding = self.model.encode([query], convert_to_numpy=True)

        # Calcular similitud del coseno entre la query y los embeddings de las películas
        similarities = cosine_similarity(query_embedding, self.embeddings)[0]

        # Obtener los índices de los mejores resultados
        best_indices = np.argsort(similarities)[::-1][:top_k]

        # Recuperar las películas coincidentes
        results = self.df.iloc[best_indices].copy()
        results["similarity"] = similarities[best_indices]

        return results.sort_values(by="similarity", ascending=False)



In [39]:
# prompt: carga el dataframe limpio desde ./peliculas_limpias.json

import pandas as pd

df = pd.read_json("./peliculas_limpias.json")
print(df.head())

  original_language                 title release_date  \
0                en  Sonic the Hedgehog 3   2024-12-19   
1                en        Back in Action   2025-01-15   
2                en               Moana 2   2024-11-21   
3                en     Kraven the Hunter   2024-12-11   
4                en                Alarum   2025-01-16   

                                            overview  \
0  Sonic, Knuckles, and Tails reunite against a p...   
1  Fifteen years after vanishing from the CIA to ...   
2  After receiving an unexpected call from her wa...   
3  Kraven Kravinoff's complex relationship with h...   
4  Two married spies caught in the crosshairs of ...   

                   genre_ids  
0       [28, 878, 35, 10751]  
1                   [28, 35]  
2  [16, 12, 10751, 35, 9648]  
3      [28, 878, 12, 14, 53]  
4               [28, 80, 53]  


In [40]:
denseR=DenseRetriever(df)

🔹 Generando embeddings...


In [55]:
query = "thiefs steal a casino"
results = denseR.search(query, 30)
results

🔍 Buscando: thiefs steal a casino


Unnamed: 0,original_language,title,release_date,overview,genre_ids,text,similarity
8410,ko,The Thieves,2012-07-25,A gang of South Korean thieves team up with a ...,"[28, 80]",The Thieves - A gang of South Korean thieves t...,0.632082
413,en,Dungeons & Dragons: Honor Among Thieves,2023-03-23,A charming thief and a band of unlikely advent...,"[12, 14, 35]",Dungeons & Dragons: Honor Among Thieves - A ch...,0.535732
3764,en,Diamonds Are Forever,1971-12-14,Diamonds are stolen only to be sold again in t...,"[28, 53]",Diamonds Are Forever - Diamonds are stolen onl...,0.51264
2484,zh,Breaking and Re-entering,2024-02-08,"Double-crossed after a bank heist, a team of p...","[28, 35]",Breaking and Re-entering - Double-crossed afte...,0.510807
5030,en,Honest Thief,2020-09-03,A bank robber tries to turn himself in because...,"[53, 28, 80]",Honest Thief - A bank robber tries to turn him...,0.50746
6863,en,Bullet Proof,2022-08-19,The Thief pulls off the robbery of a lifetime...,"[28, 80]",Bullet Proof - The Thief pulls off the robber...,0.500396
5566,en,Thick as Thieves,2009-01-09,A master thief recruits a notorious thief to h...,"[28, 53, 80]",Thick as Thieves - A master thief recruits a n...,0.495897
4985,en,Ocean's Eleven,1960-08-10,Danny Ocean and his gang attempt to rob the fi...,"[80, 35]",Ocean's Eleven - Danny Ocean and his gang atte...,0.495214
2857,en,Ocean's Eleven,1960-08-10,Danny Ocean and his gang attempt to rob the fi...,"[80, 35]",Ocean's Eleven - Danny Ocean and his gang atte...,0.495214
4575,en,"Lock, Stock and Two Smoking Barrels",1998-08-28,A card shark and his unwillingly-enlisted frie...,"[35, 80]","Lock, Stock and Two Smoking Barrels - A card s...",0.487557
