# Explore here

In [1]:
import pandas as pd
import sqlite3
import json

In [2]:
# Paso 1: Carga del conjunto de datos
url_movies = "https://raw.githubusercontent.com/4GeeksAcademy/k-nearest-neighbors-project-tutorial/main/tmdb_5000_movies.csv"
url_credits = "https://raw.githubusercontent.com/4GeeksAcademy/k-nearest-neighbors-project-tutorial/main/tmdb_5000_credits.csv"

movies = pd.read_csv(url_movies)
credits = pd.read_csv(url_credits)

In [3]:
# Paso 2: Creación de una base de datos
# Conectar a la base de datos (o crearla si no existe)
conn = sqlite3.connect('movies_database.db')

# Almacenar los DataFrames en la base de datos como tablas
movies.to_sql('movies', conn, index=False, if_exists='replace')
credits.to_sql('credits', conn, index=False, if_exists='replace')

4803

Creamos un nuevo Dataframe

In [4]:

# Paso 3: Transforma los datos
# Unir las dos tablas con SQL
query = '''
    SELECT *
    FROM movies
    JOIN credits ON movies.title = credits.title
'''

# Crear un nuevo DataFrame con la información unificada
merged_df = pd.read_sql(query, conn)

# Cerrar la conexión a la base de datos
conn.close()

merged_df = merged_df.loc[:, ~merged_df.columns.duplicated()]
merged_df.head()

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,...,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,movie_id,cast,crew
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...",...,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800,19995,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,300000000,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",http://disney.go.com/disneypictures/pirates/,285,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...",...,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500,285,"[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."
2,245000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.sonypictures.com/movies/spectre/,206647,"[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...",en,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,"[{""name"": ""Columbia Pictures"", ""id"": 5}, {""nam...",...,148.0,"[{""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""},...",Released,A Plan No One Escapes,Spectre,6.3,4466,206647,"[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de..."
3,250000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 80, ""nam...",http://www.thedarkknightrises.com/,49026,"[{""id"": 849, ""name"": ""dc comics""}, {""id"": 853,...",en,The Dark Knight Rises,Following the death of District Attorney Harve...,112.31295,"[{""name"": ""Legendary Pictures"", ""id"": 923}, {""...",...,165.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,The Legend Ends,The Dark Knight Rises,7.6,9106,49026,"[{""cast_id"": 2, ""character"": ""Bruce Wayne / Ba...","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de..."
4,260000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://movies.disney.com/john-carter,49529,"[{""id"": 818, ""name"": ""based on novel""}, {""id"":...",en,John Carter,"John Carter is a war-weary, former military ca...",43.926995,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}]",...,132.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"Lost in our world, found in another.",John Carter,6.1,2124,49529,"[{""cast_id"": 5, ""character"": ""John Carter"", ""c...","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de..."


In [5]:
import pandas as pd
import json
def load_json_safe(json_str):
    """Intenta cargar una cadena JSON. Si falla, devuelve un valor vacío."""
    try:
        return json.loads(json_str)
    except (TypeError, json.JSONDecodeError):
        return []
def process_genres(genres_str):
    """Procesa la cadena de géneros y devuelve una lista de nombres de géneros."""
    genres_data = load_json_safe(genres_str)
    genres_list = []
    for item in genres_data:
        genres_list.append(item['name'])
    return genres_list
def process_keywords(keywords_str):
    """Procesa la cadena de palabras clave y devuelve una lista de ellas."""
    keywords_data = load_json_safe(keywords_str)
    keywords_list = []
    for item in keywords_data:
        keywords_list.append(item['name'])
    return keywords_list
def process_cast(cast_str):
    """Procesa la cadena de reparto y devuelve una lista con los nombres de los tres primeros actores."""
    cast_data = load_json_safe(cast_str)
    cast_list = []
    for item in cast_data[:3]:  # Limita a los primeros 3 actores
        cast_list.append(item['name'])
    return cast_list
def process_crew(crew_str):
    """Procesa la cadena del equipo y devuelve los nombres de los directores."""
    crew_data = load_json_safe(crew_str)
    directors = []
    for crew_member in crew_data:
        if crew_member['job'] == 'Director':
            directors.append(crew_member['name'])
    return ' '.join(directors)  # Une los nombres con espacios
def process_overview(overview_str):
    """Envuelve el resumen en una lista."""
    if overview_str:
        return [overview_str]
    else:
        return []
# Suponiendo que total_data es tu DataFrame
merged_df['genres'] = merged_df['genres'].apply(process_genres)
merged_df['keywords'] = merged_df['keywords'].apply(process_keywords)
merged_df['cast'] = merged_df['cast'].apply(process_cast)
merged_df['crew'] = merged_df['crew'].apply(process_crew)
merged_df['overview'] = merged_df['overview'].apply(process_overview)

In [6]:
merged_df["overview"] = merged_df["overview"].apply(lambda x: [str(x)])
merged_df["genres"] = merged_df["genres"].apply(lambda x: [str(genre) for genre in x])
merged_df["keywords"] = merged_df["keywords"].apply(lambda x: [str(keyword) for keyword in x])
merged_df["cast"] = merged_df["cast"].apply(lambda x: [str(actor) for actor in x])
merged_df["crew"] = merged_df["crew"].apply(lambda x: [str(crew_member) for crew_member in x])

merged_df["tags"] = merged_df["overview"] + merged_df["genres"] + merged_df["keywords"] + merged_df["cast"] + merged_df["crew"]
merged_df["tags"] = merged_df["tags"].apply(lambda x: ",".join(x).replace(",", " "))

merged_df.drop(columns = ["genres", "keywords", "cast", "crew", "overview"], inplace = True)

merged_df.iloc[0].tags

"['In the 22nd century  a paraplegic Marine is dispatched to the moon Pandora on a unique mission  but becomes torn between following orders and protecting an alien civilization.'] Action Adventure Fantasy Science Fiction culture clash future space war space colony society space travel futuristic romance space alien tribe alien planet cgi marine soldier battle love affair anti war power relations mind and soul 3d Sam Worthington Zoe Saldana Sigourney Weaver J a m e s   C a m e r o n"

In [7]:
# Visualizar el DataFrame resultante
print(merged_df.head())

      budget                                      homepage      id  \
0  237000000                   http://www.avatarmovie.com/   19995   
1  300000000  http://disney.go.com/disneypictures/pirates/     285   
2  245000000   http://www.sonypictures.com/movies/spectre/  206647   
3  250000000            http://www.thedarkknightrises.com/   49026   
4  260000000          http://movies.disney.com/john-carter   49529   

  original_language                            original_title  popularity  \
0                en                                    Avatar  150.437577   
1                en  Pirates of the Caribbean: At World's End  139.082615   
2                en                                   Spectre  107.376788   
3                en                     The Dark Knight Rises  112.312950   
4                en                               John Carter   43.926995   

                                production_companies  \
0  [{"name": "Ingenious Film Partners", "id": 289...   
1  [

In [8]:
merged_df.to_csv("../data/processed/clean_data.csv", index = False)

conn = sqlite3.connect("../data/movies_database.db")

#movies_df.to_sql("clean_movies_data", conn, if_exists = "replace", index = False)
merged_df.to_sql("clean_movies_data", conn, if_exists = "replace", index = False)

4809

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors

vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(merged_df["tags"])

model = NearestNeighbors(n_neighbors = 6, algorithm = "brute", metric = "cosine")
model.fit(tfidf_matrix)

In [10]:

def get_movie_recommendations(movie_title):
    movie_index = merged_df[merged_df["title"] == movie_title].index[0]
    distances, indices = model.kneighbors(tfidf_matrix[movie_index])
    similar_movies = [(merged_df["title"][i], distances[0][j]) for j, i in enumerate(indices[0])]
    return similar_movies[1:]


In [11]:
input_movie = "Pirates of the Caribbean: At World's End"
recommendations = get_movie_recommendations(input_movie)
print("Film recommendations '{}'".format(input_movie))
for movie, distance in recommendations:
    print("- Film: {}".format(movie))

Film recommendations 'Pirates of the Caribbean: At World's End'
- Film: Pirates of the Caribbean: Dead Man's Chest
- Film: Pirates of the Caribbean: The Curse of the Black Pearl
- Film: Pirates of the Caribbean: On Stranger Tides
- Film: The Pirates! In an Adventure with Scientists!
- Film: Nim's Island
