# Proyecto K-Nearest Neighbours

In [331]:
# Librería para la declaración y uso de Data Frames:
import pandas as pd

# Librería para poder utilizar la base de datos SQLite:
import sqlite3

# Librería para poder trabajar con datos en formato json:
import json

# Librería para poder trabajar con un vectorizador tipo TfidVectorizer:
from sklearn.feature_extraction.text import TfidfVectorizer

# ALibrería para calcular la distancia de coseno:
from sklearn.metrics.pairwise import cosine_similarity

# Librería para utilizar el modelo KNN:
from sklearn.neighbors import NearestNeighbors

## Paso 1 - Lectura de Datos:

En primer lugar, es necesario **leer y guardar la información** en dos variables para poder empezar a trabajar con ellas.

Para ello, se has guaradado los archivos con todos los datos en la ruta: */workspaces/KNN-clara-ab/data/raw/tmdb_5000_credits.csv* y */workspaces/KNN-clara-ab/data/raw/tmdb_5000_movies.csv* y se han cargado en dos Data Frames:

In [332]:
# Lectura del CSV con los datos, dada la ruta donde se guarda el archivo:
df_movies = pd.read_csv ('/workspaces/KNN-clara-ab/data/raw/tmdb_5000_movies.csv');

# Configuración de pandas para mostrar todas las columnas del DataFrame sin truncarlas al visualizarlo
pd.set_option('display.max_columns', None);

# Se muestran las 2 primeras filas del Data Frame
df_movies.head(2)

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800
1,300000000,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",http://disney.go.com/disneypictures/pirates/,285,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2007-05-19,961000000,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500


In [333]:
# Lectura del CSV con los datos, dada la ruta donde se guarda el archivo:
df_credits = pd.read_csv ('/workspaces/KNN-clara-ab/data/raw/tmdb_5000_credits.csv');

# Configuración de pandas para mostrar todas las columnas del DataFrame sin truncarlas al visualizarlo
pd.set_option('display.max_columns', None);

# Se muestran las 5 primeras filas del Data Frame
df_credits.head()

Unnamed: 0,movie_id,title,cast,crew
0,19995,Avatar,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,285,Pirates of the Caribbean: At World's End,"[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."
2,206647,Spectre,"[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de..."
3,49026,The Dark Knight Rises,"[{""cast_id"": 2, ""character"": ""Bruce Wayne / Ba...","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de..."
4,49529,John Carter,"[{""cast_id"": 5, ""character"": ""John Carter"", ""c...","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de..."


## Paso 2 - Creación de una base de datos:

In [334]:
conn = sqlite3.connect("/workspaces/KNN-clara-ab/data/raw/movies_db.sqlite");  # Crea/conecta la base de datos
cursor = conn.cursor();  # Crea un cursor para ejecutar consultas SQL

In [335]:
df_movies.to_sql("movies", conn, if_exists="replace", index = False);
df_credits.to_sql("credits", conn, if_exists="replace", index = False);

In [336]:
pd.read_sql("SELECT name FROM sqlite_master WHERE type='table'", conn)

Unnamed: 0,name
0,movies
1,credits


In [337]:
query = """
    SELECT *
    FROM movies
    INNER JOIN credits
    ON movies.title = credits.title;
"""

df_final = pd.read_sql(query, conn);

In [338]:
conn.close()

In [339]:
df_final.head(2)

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,movie_id,title.1,cast,crew
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800,19995,Avatar,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,300000000,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",http://disney.go.com/disneypictures/pirates/,285,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2007-05-19,961000000,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500,285,Pirates of the Caribbean: At World's End,"[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."


In [340]:
# Seleccionamos solo las columnas que necesitamos
df_final = df_final[["movie_id", "title", "overview", "genres", "keywords", "cast", "crew"]];

df_final.head()

Unnamed: 0,movie_id,title,title.1,overview,genres,keywords,cast,crew
0,19995,Avatar,Avatar,"In the 22nd century, a paraplegic Marine is di...","[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...","[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,285,Pirates of the Caribbean: At World's End,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...","[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...","[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...","[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."
2,206647,Spectre,Spectre,A cryptic message from Bond’s past sends him o...,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...","[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de..."
3,49026,The Dark Knight Rises,The Dark Knight Rises,Following the death of District Attorney Harve...,"[{""id"": 28, ""name"": ""Action""}, {""id"": 80, ""nam...","[{""id"": 849, ""name"": ""dc comics""}, {""id"": 853,...","[{""cast_id"": 2, ""character"": ""Bruce Wayne / Ba...","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de..."
4,49529,John Carter,John Carter,"John Carter is a war-weary, former military ca...","[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 818, ""name"": ""based on novel""}, {""id"":...","[{""cast_id"": 5, ""character"": ""John Carter"", ""c...","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de..."


## Paso 3 - Transformación de los datos:

In [341]:
# Limpiar 'genres': Extraemos solo el atributo 'name' de cada género
df_final['genres'] = df_final['genres'].apply(lambda x: [genre['name'] for genre in json.loads(x)]);

 # Limpiar 'keywords': Extraemos solo el atributo 'name' de cada palabra clave
df_final['keywords'] = df_final['keywords'].apply(lambda x: [keyword['name'] for keyword in json.loads(x)]);

# Limpiar 'cast': Extraemos los tres primeros actores/actrices
df_final['cast'] = df_final['cast'].apply(lambda x: [actor['name'] for actor in json.loads(x)[:3]]);

# Limpiar 'crew': Extraemos el nombre del director(s) de la lista de miembros del equipo
df_final['crew'] = df_final['crew'].apply(lambda x: " ".join([crew_member['name'] for crew_member in json.loads(x) if crew_member['job'] == 'Director']));

# Limpiar 'overview': Convertirlo en una lista (aquí solo estamos convirtiendo a una lista de un solo string)
df_final['overview'] = df_final['overview'].apply(lambda x: [x]);

In [342]:
df_final["overview"] = df_final["overview"].apply(lambda x: [str(x)]);
df_final["genres"] = df_final["genres"].apply(lambda x: [str(genre) for genre in x]);
df_final["keywords"] = df_final["keywords"].apply(lambda x: [str(keyword) for keyword in x]);
df_final["cast"] = df_final["cast"].apply(lambda x: [str(actor) for actor in x]);
df_final["crew"] = df_final["crew"].apply(lambda x: [str(crew_member) for crew_member in x]);



In [343]:
df_final["tags"] = df_final["overview"] + df_final["genres"] + df_final["keywords"] + df_final["cast"] + df_final["crew"]

# Unir todo con comas
df_final['tags'] = df_final['tags'].apply(lambda x: ",".join(x))

# Reemplazar las comas por espacios en blanco
df_final['tags'] = df_final['tags'].apply(lambda x: x.replace(",", " "))



In [344]:
df_processed = df_final.copy();
df_processed.drop(columns = ["genres", "keywords", "cast", "crew", "overview"], inplace = True)

df_processed.iloc[0].tags

"['In the 22nd century  a paraplegic Marine is dispatched to the moon Pandora on a unique mission  but becomes torn between following orders and protecting an alien civilization.'] Action Adventure Fantasy Science Fiction culture clash future space war space colony society space travel futuristic romance space alien tribe alien planet cgi marine soldier battle love affair anti war power relations mind and soul 3d Sam Worthington Zoe Saldana Sigourney Weaver J a m e s   C a m e r o n"

## Paso 4 - Construcción KNN:

In [345]:
vectorizer = TfidfVectorizer();
vec_matrix = vectorizer.fit_transform(df_processed["tags"]);

In [346]:
knn_model = NearestNeighbors (n_neighbors = 6, algorithm = "brute", metric = "cosine");

# Entrenar el modelo con la matriz de similitud de coseno
knn_model.fit(vec_matrix);

In [347]:
# Eliminar duplicados basados en el título antes de la recomendación
#df_processed = df_processed.drop_duplicates()

# Restablecer el índice para asegurar que no haya duplicados en el índice
#df_processed = df_processed.reset_index(drop=True)
df_processed = df_processed.loc[:, ~df_processed.columns.duplicated()]

In [348]:
def recommend_movies(movie):
    # Buscar el índice de la película en el DataFrame
    movie_index = df_processed[df_processed["title"] == movie].index[0]
    
    # Obtener el vector de la película (su representación en el espacio vectorial)
    movie_vector = vec_matrix[movie_index]
    
    # Obtener los 5 vecinos más cercanos (sin incluir la película misma)
    distances, indices = knn_model.kneighbors(movie_vector)
    
    # Crear la lista de títulos recomendados (excluyendo la película misma)
    recommended_titles = []
    for i in range(1, len(indices[0])):  # Empezamos desde 1 para excluir la misma película
        recommended_titles.append(df_processed.iloc[indices[0][i]]['title'])
    
    return recommended_titles

In [349]:
recommended_movies = recommend_movies("How to Train Your Dragon")

# Mostrar las películas recomendadas
for movie in recommended_movies:
    print(movie)

How to Train Your Dragon 2
Dragon Nest: Warriors' Dawn
Pete's Dragon
George and the Dragon
Eragon
