# Explore here

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import sqlite3
import json
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import silhouette_score, make_scorer
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import HalvingGridSearchCV
from sklearn.cluster import KMeans
from pickle import dump
import pickle



In [2]:
# Importando los datos sobre películas desde el enlace proporcionado
movie_data = pd.read_csv("https://raw.githubusercontent.com/4GeeksAcademy/k-nearest-neighbors-project-tutorial/main/tmdb_5000_movies.csv")

# Visualizando el DataFrame para inspeccionar su contenido
movie_data.head()


Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800
1,300000000,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",http://disney.go.com/disneypictures/pirates/,285,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2007-05-19,961000000,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500
2,245000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.sonypictures.com/movies/spectre/,206647,"[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...",en,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,"[{""name"": ""Columbia Pictures"", ""id"": 5}, {""nam...","[{""iso_3166_1"": ""GB"", ""name"": ""United Kingdom""...",2015-10-26,880674609,148.0,"[{""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""},...",Released,A Plan No One Escapes,Spectre,6.3,4466
3,250000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 80, ""nam...",http://www.thedarkknightrises.com/,49026,"[{""id"": 849, ""name"": ""dc comics""}, {""id"": 853,...",en,The Dark Knight Rises,Following the death of District Attorney Harve...,112.31295,"[{""name"": ""Legendary Pictures"", ""id"": 923}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2012-07-16,1084939099,165.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,The Legend Ends,The Dark Knight Rises,7.6,9106
4,260000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://movies.disney.com/john-carter,49529,"[{""id"": 818, ""name"": ""based on novel""}, {""id"":...",en,John Carter,"John Carter is a war-weary, former military ca...",43.926995,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}]","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2012-03-07,284139100,132.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"Lost in our world, found in another.",John Carter,6.1,2124


In [3]:
# Cargando los datos relacionados con créditos desde el enlace proporcionado
credits_data = pd.read_csv("https://raw.githubusercontent.com/4GeeksAcademy/k-nearest-neighbors-project-tutorial/main/tmdb_5000_credits.csv")

# Visualizando los datos cargados para revisión inicial
credits_data.head()


Unnamed: 0,movie_id,title,cast,crew
0,19995,Avatar,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,285,Pirates of the Caribbean: At World's End,"[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."
2,206647,Spectre,"[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de..."
3,49026,The Dark Knight Rises,"[{""cast_id"": 2, ""character"": ""Bruce Wayne / Ba...","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de..."
4,49529,John Carter,"[{""cast_id"": 5, ""character"": ""John Carter"", ""c...","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de..."


In [4]:
# Estableciendo la conexión con la base de datos SQLite
db_connection = sqlite3.connect("../data/movies_database.db")

# Creando las tablas en la base de datos y añadiendo los datos a dichas tablas
credits_data.to_sql('credits_table', db_connection, if_exists='replace', index=False)
movie_data.to_sql('movies_table', db_connection, if_exists='replace', index=False)


4803

In [5]:
# Realizando un merge entre las dos tablas usando 'title' como clave
join_query = """
    SELECT *
    FROM movies_table
    INNER JOIN credits_table
    ON movies_table.title = credits_table.title;
"""

# Ejecutando la consulta SQL y cargando los datos combinados en un DataFrame
complete_data = pd.read_sql_query(join_query, db_connection)

# Cerrando la conexión a la base de datos
db_connection.close()


In [6]:
# Eliminando columnas duplicadas del DataFrame combinado
final_data = complete_data.loc[:, ~complete_data.columns.duplicated()]

# Mostrando el DataFrame resultante
final_data


Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,...,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,movie_id,cast,crew
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...",...,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800,19995,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,300000000,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",http://disney.go.com/disneypictures/pirates/,285,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...",...,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500,285,"[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."
2,245000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.sonypictures.com/movies/spectre/,206647,"[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...",en,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,"[{""name"": ""Columbia Pictures"", ""id"": 5}, {""nam...",...,148.0,"[{""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""},...",Released,A Plan No One Escapes,Spectre,6.3,4466,206647,"[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de..."
3,250000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 80, ""nam...",http://www.thedarkknightrises.com/,49026,"[{""id"": 849, ""name"": ""dc comics""}, {""id"": 853,...",en,The Dark Knight Rises,Following the death of District Attorney Harve...,112.312950,"[{""name"": ""Legendary Pictures"", ""id"": 923}, {""...",...,165.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,The Legend Ends,The Dark Knight Rises,7.6,9106,49026,"[{""cast_id"": 2, ""character"": ""Bruce Wayne / Ba...","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de..."
4,260000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://movies.disney.com/john-carter,49529,"[{""id"": 818, ""name"": ""based on novel""}, {""id"":...",en,John Carter,"John Carter is a war-weary, former military ca...",43.926995,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}]",...,132.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"Lost in our world, found in another.",John Carter,6.1,2124,49529,"[{""cast_id"": 5, ""character"": ""John Carter"", ""c...","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4804,220000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 80, ""nam...",,9367,"[{""id"": 5616, ""name"": ""united states\u2013mexi...",es,El Mariachi,El Mariachi just wants to play his guitar and ...,14.269792,"[{""name"": ""Columbia Pictures"", ""id"": 5}]",...,81.0,"[{""iso_639_1"": ""es"", ""name"": ""Espa\u00f1ol""}]",Released,"He didn't come looking for trouble, but troubl...",El Mariachi,6.6,238,9367,"[{""cast_id"": 1, ""character"": ""El Mariachi"", ""c...","[{""credit_id"": ""52fe44eec3a36847f80b280b"", ""de..."
4805,9000,"[{""id"": 35, ""name"": ""Comedy""}, {""id"": 10749, ""...",,72766,[],en,Newlyweds,A newlywed couple's honeymoon is upended by th...,0.642552,[],...,85.0,[],Released,A newlywed couple's honeymoon is upended by th...,Newlyweds,5.9,5,72766,"[{""cast_id"": 1, ""character"": ""Buzzy"", ""credit_...","[{""credit_id"": ""52fe487dc3a368484e0fb013"", ""de..."
4806,0,"[{""id"": 35, ""name"": ""Comedy""}, {""id"": 18, ""nam...",http://www.hallmarkchannel.com/signedsealeddel...,231617,"[{""id"": 248, ""name"": ""date""}, {""id"": 699, ""nam...",en,"Signed, Sealed, Delivered","""Signed, Sealed, Delivered"" introduces a dedic...",1.444476,"[{""name"": ""Front Street Pictures"", ""id"": 3958}...",...,120.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,,"Signed, Sealed, Delivered",7.0,6,231617,"[{""cast_id"": 8, ""character"": ""Oliver O\u2019To...","[{""credit_id"": ""52fe4df3c3a36847f8275ecf"", ""de..."
4807,0,[],http://shanghaicalling.com/,126186,[],en,Shanghai Calling,When ambitious New York attorney Sam is sent t...,0.857008,[],...,98.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,A New Yorker in Shanghai,Shanghai Calling,5.7,7,126186,"[{""cast_id"": 3, ""character"": ""Sam"", ""credit_id...","[{""credit_id"": ""52fe4ad9c3a368484e16a36b"", ""de..."


In [7]:
# Seleccionando únicamente las columnas específicas según las instrucciones
filtered_data = final_data[['movie_id', 'title', 'overview', 'genres', 'keywords', 'cast', 'crew']]

# Visualizando el DataFrame con las columnas seleccionadas
filtered_data


Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...","[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...","[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...","[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...","[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."
2,206647,Spectre,A cryptic message from Bond’s past sends him o...,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...","[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de..."
3,49026,The Dark Knight Rises,Following the death of District Attorney Harve...,"[{""id"": 28, ""name"": ""Action""}, {""id"": 80, ""nam...","[{""id"": 849, ""name"": ""dc comics""}, {""id"": 853,...","[{""cast_id"": 2, ""character"": ""Bruce Wayne / Ba...","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de..."
4,49529,John Carter,"John Carter is a war-weary, former military ca...","[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 818, ""name"": ""based on novel""}, {""id"":...","[{""cast_id"": 5, ""character"": ""John Carter"", ""c...","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de..."
...,...,...,...,...,...,...,...
4804,9367,El Mariachi,El Mariachi just wants to play his guitar and ...,"[{""id"": 28, ""name"": ""Action""}, {""id"": 80, ""nam...","[{""id"": 5616, ""name"": ""united states\u2013mexi...","[{""cast_id"": 1, ""character"": ""El Mariachi"", ""c...","[{""credit_id"": ""52fe44eec3a36847f80b280b"", ""de..."
4805,72766,Newlyweds,A newlywed couple's honeymoon is upended by th...,"[{""id"": 35, ""name"": ""Comedy""}, {""id"": 10749, ""...",[],"[{""cast_id"": 1, ""character"": ""Buzzy"", ""credit_...","[{""credit_id"": ""52fe487dc3a368484e0fb013"", ""de..."
4806,231617,"Signed, Sealed, Delivered","""Signed, Sealed, Delivered"" introduces a dedic...","[{""id"": 35, ""name"": ""Comedy""}, {""id"": 18, ""nam...","[{""id"": 248, ""name"": ""date""}, {""id"": 699, ""nam...","[{""cast_id"": 8, ""character"": ""Oliver O\u2019To...","[{""credit_id"": ""52fe4df3c3a36847f8275ecf"", ""de..."
4807,126186,Shanghai Calling,When ambitious New York attorney Sam is sent t...,[],[],"[{""cast_id"": 3, ""character"": ""Sam"", ""credit_id...","[{""credit_id"": ""52fe4ad9c3a368484e16a36b"", ""de..."


In [8]:
def safe_json_parse(input_data):
    try:
        return json.loads(input_data) if isinstance(input_data, str) else []  # Verifica que sea una cadena
    except (json.JSONDecodeError, TypeError):
        return []  # Devuelve una lista vacía si falla la decodificación

# Aplicando transformaciones en el DataFrame usando .loc
filtered_data.loc[:, "genres"] = filtered_data["genres"].apply(
    lambda entry: [genre["name"] for genre in safe_json_parse(entry)] if isinstance(entry, str) else None
)
filtered_data.loc[:, "keywords"] = filtered_data["keywords"].apply(
    lambda entry: [keyword["name"] for keyword in safe_json_parse(entry)] if isinstance(entry, str) else None
)
filtered_data.loc[:, "cast"] = filtered_data["cast"].apply(
    lambda entry: [cast_member["name"] for cast_member in safe_json_parse(entry)][:3] if isinstance(entry, str) else None
)
filtered_data.loc[:, "crew"] = filtered_data["crew"].apply(
    lambda entry: " ".join(
        [member["name"] for member in safe_json_parse(entry) if member.get("job") == "Director"]
    )
)
filtered_data.loc[:, "overview"] = filtered_data["overview"].apply(
    lambda entry: [entry] if isinstance(entry, str) else None
)

filtered_data


Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"[In the 22nd century, a paraplegic Marine is d...","[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colon...","[Sam Worthington, Zoe Saldana, Sigourney Weaver]",James Cameron
1,285,Pirates of the Caribbean: At World's End,"[Captain Barbossa, long believed to be dead, h...","[Adventure, Fantasy, Action]","[ocean, drug abuse, exotic island, east india ...","[Johnny Depp, Orlando Bloom, Keira Knightley]",Gore Verbinski
2,206647,Spectre,[A cryptic message from Bond’s past sends him ...,"[Action, Adventure, Crime]","[spy, based on novel, secret agent, sequel, mi...","[Daniel Craig, Christoph Waltz, Léa Seydoux]",Sam Mendes
3,49026,The Dark Knight Rises,[Following the death of District Attorney Harv...,"[Action, Crime, Drama, Thriller]","[dc comics, crime fighter, terrorist, secret i...","[Christian Bale, Michael Caine, Gary Oldman]",Christopher Nolan
4,49529,John Carter,"[John Carter is a war-weary, former military c...","[Action, Adventure, Science Fiction]","[based on novel, mars, medallion, space travel...","[Taylor Kitsch, Lynn Collins, Samantha Morton]",Andrew Stanton
...,...,...,...,...,...,...,...
4804,9367,El Mariachi,[El Mariachi just wants to play his guitar and...,"[Action, Crime, Thriller]","[united states–mexico barrier, legs, arms, pap...","[Carlos Gallardo, Jaime de Hoyos, Peter Marqua...",Robert Rodriguez
4805,72766,Newlyweds,[A newlywed couple's honeymoon is upended by t...,"[Comedy, Romance]",[],"[Edward Burns, Kerry Bishé, Marsha Dietlein]",Edward Burns
4806,231617,"Signed, Sealed, Delivered","[""Signed, Sealed, Delivered"" introduces a dedi...","[Comedy, Drama, Romance, TV Movie]","[date, love at first sight, narration, investi...","[Eric Mabius, Kristin Booth, Crystal Lowe]",Scott Smith
4807,126186,Shanghai Calling,[When ambitious New York attorney Sam is sent ...,[],[],"[Daniel Henney, Eliza Coupe, Bill Paxton]",Daniel Hsia


In [10]:
# Transformando las columnas aplicando modificaciones personalizadas
filtered_data.loc[:, "genres"] = filtered_data["genres"].apply(lambda entry: [str(genre) for genre in entry])
filtered_data.loc[:, "keywords"] = filtered_data["keywords"].apply(lambda entry: [str(keyword) for keyword in entry])
filtered_data.loc[:, "cast"] = filtered_data["cast"].apply(lambda entry: [str(actor) for actor in entry])
filtered_data.loc[:, "crew"] = filtered_data["crew"].apply(lambda entry: [str(member) for member in entry])
filtered_data.loc[:, "overview"] = filtered_data["overview"].apply(lambda entry: [str(entry)])

filtered_data


Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"[['In the 22nd century, a paraplegic Marine is...","[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colon...","[Sam Worthington, Zoe Saldana, Sigourney Weaver]","[J, a, m, e, s, , C, a, m, e, r, o, n]"
1,285,Pirates of the Caribbean: At World's End,"[['Captain Barbossa, long believed to be dead,...","[Adventure, Fantasy, Action]","[ocean, drug abuse, exotic island, east india ...","[Johnny Depp, Orlando Bloom, Keira Knightley]","[G, o, r, e, , V, e, r, b, i, n, s, k, i]"
2,206647,Spectre,[['A cryptic message from Bond’s past sends hi...,"[Action, Adventure, Crime]","[spy, based on novel, secret agent, sequel, mi...","[Daniel Craig, Christoph Waltz, Léa Seydoux]","[S, a, m, , M, e, n, d, e, s]"
3,49026,The Dark Knight Rises,"[[""Following the death of District Attorney Ha...","[Action, Crime, Drama, Thriller]","[dc comics, crime fighter, terrorist, secret i...","[Christian Bale, Michael Caine, Gary Oldman]","[C, h, r, i, s, t, o, p, h, e, r, , N, o, l, ..."
4,49529,John Carter,"[[""John Carter is a war-weary, former military...","[Action, Adventure, Science Fiction]","[based on novel, mars, medallion, space travel...","[Taylor Kitsch, Lynn Collins, Samantha Morton]","[A, n, d, r, e, w, , S, t, a, n, t, o, n]"
...,...,...,...,...,...,...,...
4804,9367,El Mariachi,[['El Mariachi just wants to play his guitar a...,"[Action, Crime, Thriller]","[united states–mexico barrier, legs, arms, pap...","[Carlos Gallardo, Jaime de Hoyos, Peter Marqua...","[R, o, b, e, r, t, , R, o, d, r, i, g, u, e, z]"
4805,72766,Newlyweds,"[[""A newlywed couple's honeymoon is upended by...","[Comedy, Romance]",[],"[Edward Burns, Kerry Bishé, Marsha Dietlein]","[E, d, w, a, r, d, , B, u, r, n, s]"
4806,231617,"Signed, Sealed, Delivered","[['""Signed, Sealed, Delivered"" introduces a de...","[Comedy, Drama, Romance, TV Movie]","[date, love at first sight, narration, investi...","[Eric Mabius, Kristin Booth, Crystal Lowe]","[S, c, o, t, t, , S, m, i, t, h]"
4807,126186,Shanghai Calling,[['When ambitious New York attorney Sam is sen...,[],[],"[Daniel Henney, Eliza Coupe, Bill Paxton]","[D, a, n, i, e, l, , H, s, i, a]"


In [11]:
# Crear una copia del DataFrame original para trabajar con seguridad
processed_data = filtered_data.copy()

# Concatenar listas de manera segura, asegurando que no haya valores None
processed_data["tags"] = processed_data.apply(
    lambda row: " ".join(
        sum(
            [row["overview"], row["genres"], row["keywords"], row["cast"], row["crew"]],
            []
        )
    ) if all(isinstance(row[col], list) for col in ["overview", "genres", "keywords", "cast", "crew"]) else "",
    axis=1
)

# Eliminar columnas innecesarias después de crear la columna 'tags'
processed_data.drop(columns=["genres", "keywords", "cast", "crew", "overview"], inplace=True)

# Inspeccionando los datos en la nueva columna 'tags' para un registro específico
processed_data.iloc[0].tags


"['In the 22nd century, a paraplegic Marine is dispatched to the moon Pandora on a unique mission, but becomes torn between following orders and protecting an alien civilization.'] Action Adventure Fantasy Science Fiction culture clash future space war space colony society space travel futuristic romance space alien tribe alien planet cgi marine soldier battle love affair anti war power relations mind and soul 3d Sam Worthington Zoe Saldana Sigourney Weaver J a m e s   C a m e r o n"

In [12]:
# Exportando el DataFrame procesado a un archivo CSV
processed_data.to_csv("../data/processed/processed_data.csv", index=False)

# Estableciendo conexión con la base de datos SQLite
db_connection = sqlite3.connect("../data/movies_database.db")

# Guardando el DataFrame como una tabla en la base de datos SQLite
processed_data.to_sql("processed_movies_data", db_connection, if_exists="replace", index=False)


4809

In [13]:
# Creando el objeto TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer()

# Transformando la columna 'tags' en una matriz TF-IDF
tfidf_matrix = tfidf_vectorizer.fit_transform(processed_data["tags"])


In [14]:
# Inicializando el modelo KNN con el algoritmo automático
knn_model = NearestNeighbors(algorithm="auto")


In [15]:
def custom_silhouette_evaluator(estimator, data):
    # Generar clústeres utilizando KMeans (ya que no contamos con etiquetas)
    kmeans_model = KMeans(n_clusters=5, random_state=42, n_init=10).fit(data)
    cluster_labels = kmeans_model.labels_
    
    # Calcular el puntaje de Silhouette
    silhouette_score_value = silhouette_score(data, cluster_labels)
    return silhouette_score_value

# Crear un "scorer" personalizado basado en la evaluación de Silhouette
silhouette_metric = make_scorer(custom_silhouette_evaluator, greater_is_better=True)


In [16]:
# Definición de hiperparámetros para el modelo
parameters_grid = {
    'n_neighbors': [5, 6, 7, 8],
    'radius': [1.0, 0.8, 0.5, 0.3],
    'leaf_size': [30, 40, 50, 60],
    'metric': ['minkowski', 'cosine']
}


In [17]:
# Configurando la búsqueda HalvingGridSearchCV con el modelo y los hiperparámetros definidos
halving_search_model = HalvingGridSearchCV(
    knn_model,
    parameters_grid,
    scoring=silhouette_metric,
    cv=5
)

# Revisando la configuración del objeto HalvingGridSearchCV
halving_search_model


In [19]:
execute_halving_search = True

if execute_halving_search:
    import warnings
    warnings.filterwarnings("ignore")  # Ignorar advertencias durante la ejecución

    halving_search_model.fit(tfidf_matrix)
    print(f"Best hyperparameters: {halving_search_model.best_params_}")


Best hyperparameters: {'leaf_size': 40, 'metric': 'cosine', 'n_neighbors': 7, 'radius': 1.0}


In [20]:
# Configurando el modelo KNN ajustado con los mejores hiperparámetros obtenidos
optimized_knn_model = NearestNeighbors(
    algorithm="auto", 
    leaf_size=40, 
    metric="cosine", 
    n_neighbors=7, 
    radius=1.0
)

# Entrenando el modelo optimizado con la matriz TF-IDF
optimized_knn_model.fit(tfidf_matrix)


In [21]:
def recommend_similar_movies(movie_title):
    # Identificar el índice de la película seleccionada
    movie_idx = processed_data[processed_data["title"] == movie_title].index[0]
    
    # Obtener las distancias y los índices de las películas más similares
    distances, indices = optimized_knn_model.kneighbors(tfidf_matrix[movie_idx])
    
    # Crear una lista de películas similares con sus distancias
    similar_movies = [(processed_data["title"][i], distances[0][j]) for j, i in enumerate(indices[0])]
    return similar_movies[1:]  # Excluir la película original

# Película base para buscar recomendaciones
base_movie = "How to Train Your Dragon"
movie_recommendations = recommend_similar_movies(base_movie)

# Imprimir las recomendaciones
print(f"Recommendations based on the film: {base_movie}")
for title, distance in movie_recommendations:
    print("- Movie: {}".format(title))


Recommendations based on the film: How to Train Your Dragon
- Movie: How to Train Your Dragon 2
- Movie: Dragon Nest: Warriors' Dawn
- Movie: Pete's Dragon
- Movie: George and the Dragon
- Movie: Eragon
- Movie: Dragon Hunters


In [23]:
# Guardando el modelo KNN ajustado utilizando pickle
with open("../models/optimized_knn_model-7_algorithm-auto_metric-cosine_leaf_size-40_radius-1.0.sav", "wb") as file:
    pickle.dump(optimized_knn_model, file)
