In [1]:
import pandas as pd
import numpy as np
import pickle

In [2]:
# Load the previously cleaned and transformed tables, together with their
# intermediate relation tables, as DataFrames so they can be joined later on.
def _read_table(parquet_path):
    """Read one parquet file into a DataFrame using the pyarrow engine."""
    return pd.read_parquet(parquet_path, engine="pyarrow")


# Entity tables.
df_movies = _read_table("DataSet/Movies.parquet")
df_genres = _read_table("DataSet/Genres.parquet")
df_actors = _read_table("DataSet/Cast_Movies.parquet")
df_directors = _read_table("DataSet/Director_Movies.parquet")
df_collentions = _read_table("DataSet/Collections.parquet")

# Relation tables linking movies to genres, cast and directors.
df_genres_movies = _read_table("DataSet/Relation_Movies_Genres.parquet")
df_movies_actors = _read_table("DataSet/Relation_Cast_Movies.parquet")
df_movies_directors = _read_table("DataSet/Relation_Director_Movies.parquet")

In [None]:
# Drop movies whose release year is earlier than 1965.
df_movies = df_movies.loc[df_movies["release_year"].ge(1965)]

In [None]:
# Drop further movies based on popularity (keep popularity >= 10.0) to avoid
# overloading the cosine-similarity matrix.
df_movies = df_movies.loc[df_movies["popularity"].ge(10.0)]

In [11]:
# Keep only the movie id, the title and the collection id.
df_movies = df_movies.loc[:, ["idMovies", "title", "idCollection"]]

In [13]:
# Start the working table as an independent copy of the filtered movies.
# NOTE(review): df_final is rebuilt from df_movies by the merge chain in the
# next cell, so this copy looks redundant — confirm before removing.
df_final = df_movies.copy(deep=True)

In [14]:
# Build one wide table: starting from the movies, left-join each relation
# table and then the entity table it points to, so that every movie row
# carries cast, crew, genre and collection names.
# NOTE(review): if a movie has several cast / crew / genre rows, successive
# left joins multiply rows per movie; the later groupby on idMovies is what
# collapses them again — confirm the row count stays manageable.
df_final = (
    df_movies
    .merge(df_movies_actors, how="left", on="idMovies")
    .merge(df_actors, how="left", on="idCast")
    .merge(df_movies_directors, how="left", on="idMovies")
    .merge(df_directors, how="left", on="idCrew")
    .merge(df_genres_movies, how="left", on="idMovies")
    .merge(df_genres, how="left", on="idGenres")
    .merge(df_collentions, how="left", on="idCollection")
)

In [None]:
# Collapse the joined rows into one row per movie: actors, directors and
# genres are each gathered into a single string without repeated values.
def _join_unique(values):
    """Join the distinct non-null entries of ``values`` with single spaces."""
    return " ".join(values.dropna().unique())


df_final = (
    df_final
    .groupby("idMovies")
    .agg({
        "title": "first",
        "nameCollection": "first",
        "nameCast": _join_unique,
        "nameCrew": _join_unique,
        "nameGenres": _join_unique,
    })
    .reset_index()
)

In [16]:
# Replace any remaining missing values with empty strings.
df_final = df_final.fillna(value="")

In [None]:
# Combine the text features into a single column, separated by spaces;
# leading and trailing whitespace is stripped from the result.
df_final["combined_features"] = (
    df_final["title"]
    .str.cat(
        df_final[["nameCollection", "nameCast", "nameCrew", "nameGenres"]],
        sep=" ",
    )
    .str.strip()
)

In [18]:
# Build the final DataFrame that will be vectorized with TF-IDF: just the
# title and the combined text features.
df_to_ML = df_final.loc[:, ["title", "combined_features"]]

In [19]:
# Persist the ML input table as parquet, without writing the index.
df_to_ML.to_parquet(
    "DataSet/Data-to-ML.parquet",
    index=False,
    engine="pyarrow",
)

In [20]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Vectorize the text of the combined_features column with TF-IDF, using
# scikit-learn's built-in English stop-word list.
tfidf = TfidfVectorizer(stop_words="english")
tfidf_matrix = tfidf.fit_transform(raw_documents=df_to_ML["combined_features"])

In [21]:
from sklearn.metrics.pairwise import cosine_similarity

# Compute the cosine similarity between every pair of rows of the TF-IDF
# matrix (with a single argument, the matrix is compared against itself).
cosine_sim = cosine_similarity(tfidf_matrix)

In [22]:
# Serialize the similarity matrix to disk with pickle.
# NOTE: pickle files must only ever be loaded from trusted sources.
with open('DataSet/cosine_similarity_matrix.pkl', 'wb') as matrix_file:
    pickle.dump(cosine_sim, matrix_file)