In [1]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
import string
from collections import Counter
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
# Definir las rutas de los archivos correctamente como cadenas de texto
ruta_movies_dataset = r"C:\Users\claud\OneDrive\Escritorio\Henry 1erPI-ML\ETL\movies_cleaned.parquet"
credit_crew_dataset = r"C:\Users\claud\OneDrive\Escritorio\Henry 1erPI-ML\ETL\credit_crew_cleaned.parquet"
credit_cast_dataset = r"C:\Users\claud\OneDrive\Escritorio\Henry 1erPI-ML\ETL\credit_cast_cleaned.parquet"

# Leer los archivos parquet
df_movies = pd.read_parquet(ruta_movies_dataset)
df_credit_crew = pd.read_parquet(credit_crew_dataset)
df_credit_cast = pd.read_parquet(credit_cast_dataset)


In [3]:
df_movies

Unnamed: 0,budget,id,overview,popularity,release_date,revenue,runtime,title,vote_average,vote_count,release_year,return,genres_name,production_companies_name,production_countries_name,spoken_languages_name
0,30000000,862,"Led by Woody, Andy's toys live happily in his ...",21.946943,1995-10-30,373554033,81.0,Toy Story,7.7,5415,1995,12.451801,Animation,Pixar Animation Studios,United States of America,English
1,30000000,862,"Led by Woody, Andy's toys live happily in his ...",21.946943,1995-10-30,373554033,81.0,Toy Story,7.7,5415,1995,12.451801,Comedy,Pixar Animation Studios,United States of America,English
2,30000000,862,"Led by Woody, Andy's toys live happily in his ...",21.946943,1995-10-30,373554033,81.0,Toy Story,7.7,5415,1995,12.451801,Family,Pixar Animation Studios,United States of America,English
3,65000000,8844,When siblings Judy and Peter discover an encha...,17.015539,1995-12-15,262797249,104.0,Jumanji,6.9,2413,1995,4.043035,Adventure,TriStar Pictures,United States of America,English
4,65000000,8844,When siblings Judy and Peter discover an encha...,17.015539,1995-12-15,262797249,104.0,Jumanji,6.9,2413,1995,4.043035,Adventure,TriStar Pictures,United States of America,Français
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
414481,0,67758,"When one of her hits goes wrong, a professiona...",0.903007,2003-08-01,0,90.0,Betrayal,3.8,6,2003,0.000000,Action,American World Pictures,United States of America,English
414482,0,67758,"When one of her hits goes wrong, a professiona...",0.903007,2003-08-01,0,90.0,Betrayal,3.8,6,2003,0.000000,Drama,American World Pictures,United States of America,English
414483,0,67758,"When one of her hits goes wrong, a professiona...",0.903007,2003-08-01,0,90.0,Betrayal,3.8,6,2003,0.000000,Thriller,American World Pictures,United States of America,English
414484,0,227506,"In a small town live two brothers, one a minis...",0.003503,1917-10-21,0,87.0,Satan Triumphant,0.0,0,1917,0.000000,,Yermoliev,Russia,


Filtrar los datos:

Filtra las películas por año de lanzamiento, idioma y país de producción.

In [4]:
# Filtrar por año, idiomas y países principales
df_movies = df_movies[df_movies['release_year'] >= 1985]

principales_idiomas = ['English', 'Français', 'Deutsch', 'Español', 'Italiano']
df_movies = df_movies[df_movies['spoken_languages_name'].isin(principales_idiomas)]

paises_principales = ['United States of America', 'United Kingdom', 'France', 'Canada', 
                      'Japan', 'Germany', 'Italy', 'Russia', 'India', 'Spain', 'Argentina']
df_movies = df_movies[df_movies['production_countries_name'].isin(paises_principales)]


Unir DataFrames:

Se combina los DataFrames de películas, géneros, directores y actores.

In [5]:
# Transformar las columnas en listas
genero = (df_movies[['id', 'genres_name']]
          .drop_duplicates()
          .groupby('id')['genres_name']
          .apply(list)
          .reset_index(name='generos'))

directores = (df_credit_crew[['id', 'crew_name']]
              .drop_duplicates()
              .groupby('id')['crew_name']
              .apply(list)
              .reset_index(name='directores'))

actores = (df_credit_cast[['id', 'cast_name']]
           .drop_duplicates()
           .groupby('id')['cast_name']
           .apply(list)
           .reset_index(name='actores'))

# Combinar los DataFrames en uno solo
col_movies = df_movies[['id', 'title', 'overview']]
df_tags = pd.merge(col_movies, genero, on='id', how='inner')
df_tags = pd.merge(df_tags, directores, on='id', how='inner')
df_tags = pd.merge(df_tags, actores, on='id', how='inner')


Transformaciones:

Transforma y limpia los datos para prepararlos para la vectorización.

In [6]:
genero = (df_movies[['id', 'genres_name']]
                               .drop_duplicates()
                               .groupby('id')['genres_name']
                               .apply(list)
                               .reset_index(name='generos'))

In [7]:
directores = (df_credit_crew[['id', 'crew_name']]
                               .drop_duplicates()
                               .groupby('id')['crew_name']
                               .apply(list)
                               .reset_index(name='directores'))

In [8]:
actores = (df_credit_cast[['id', 'cast_name']]
                               .drop_duplicates()
                               .groupby('id')['cast_name']
                               .apply(list)
                               .reset_index(name='actores'))

In [9]:
col_movies = df_movies[['id', 'title','overview']]
col_genero = genero[['id', 'generos']]
col_directores = directores[['id', 'directores']]
col_actores = actores[['id', 'actores']]

# Realizar merge secuencial para combinar los DataFrames
df_tags = pd.merge(col_movies, col_genero, on='id', how='inner')
df_tags = pd.merge(df_tags, col_directores, on='id', how='inner')
df_tags = pd.merge(df_tags, col_actores, on='id', how='inner')

In [10]:
df_tags

Unnamed: 0,id,title,overview,generos,directores,actores
0,862,Toy Story,"Led by Woody, Andy's toys live happily in his ...","[Animation, Comedy, Family]",[John Lasseter],"[Tom Hanks, Tim Allen, Don Rickles, Jim Varney..."
1,862,Toy Story,"Led by Woody, Andy's toys live happily in his ...","[Animation, Comedy, Family]",[John Lasseter],"[Tom Hanks, Tim Allen, Don Rickles, Jim Varney..."
2,862,Toy Story,"Led by Woody, Andy's toys live happily in his ...","[Animation, Comedy, Family]",[John Lasseter],"[Tom Hanks, Tim Allen, Don Rickles, Jim Varney..."
3,8844,Jumanji,When siblings Judy and Peter discover an encha...,"[Adventure, Fantasy, Family]",[Joe Johnston],"[Robin Williams, Jonathan Hyde, Kirsten Dunst,..."
4,8844,Jumanji,When siblings Judy and Peter discover an encha...,"[Adventure, Fantasy, Family]",[Joe Johnston],"[Robin Williams, Jonathan Hyde, Kirsten Dunst,..."
...,...,...,...,...,...,...
184083,30840,Robin Hood,"Yet another version of the classic epic, with ...","[Drama, Action, Romance]",[John Irvin],"[Patrick Bergin, Uma Thurman, David Morrissey,..."
184084,30840,Robin Hood,"Yet another version of the classic epic, with ...","[Drama, Action, Romance]",[John Irvin],"[Patrick Bergin, Uma Thurman, David Morrissey,..."
184085,67758,Betrayal,"When one of her hits goes wrong, a professiona...","[Action, Drama, Thriller]",[Mark L. Lester],"[Erika Eleniak, Adam Baldwin, Julie du Page, J..."
184086,67758,Betrayal,"When one of her hits goes wrong, a professiona...","[Action, Drama, Thriller]",[Mark L. Lester],"[Erika Eleniak, Adam Baldwin, Julie du Page, J..."


In [11]:
df_tags.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 184088 entries, 0 to 184087
Data columns (total 6 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   id          184088 non-null  int64 
 1   title       184088 non-null  object
 2   overview    184088 non-null  object
 3   generos     184088 non-null  object
 4   directores  184088 non-null  object
 5   actores     184088 non-null  object
dtypes: int64(1), object(5)
memory usage: 8.4+ MB


 Se transforma y limpia la información para consolidarla en una columna llamada tags, que luego será vectorizada para calcular la similitud de coseno.

In [12]:
# Eliminar filas vacías en la columna 'overview'
df_tags.dropna(subset=['overview'], inplace=True)
# Asegurarse de que todos los valores en 'overview' sean cadenas antes de aplicar '.split()'
df_tags['overview'] = df_tags['overview'].apply(lambda x: " ".join(x) if isinstance(x, list) else x)
# Aplicar separación solo a las cadenas de texto
df_tags['overview'] = df_tags['overview'].apply(lambda x: x.split() if isinstance(x, str) else x)
# Verificar el resultado
print(df_tags['overview'].head())


0    [Led, by, Woody,, Andy's, toys, live, happily,...
1    [Led, by, Woody,, Andy's, toys, live, happily,...
2    [Led, by, Woody,, Andy's, toys, live, happily,...
3    [When, siblings, Judy, and, Peter, discover, a...
4    [When, siblings, Judy, and, Peter, discover, a...
Name: overview, dtype: object


In [13]:
# Definir la función para eliminar los espacios
def eliminar_espacios(cadena):
    if isinstance(cadena, list):
        return [i.replace(" ", "") for i in cadena if isinstance(i, str)]
    else:
        return []  # Devuelve una lista vacía si el valor es None o no es una lista

# Reemplazar valores None por listas vacías antes de aplicar la función
df_tags['generos'] = df_tags['generos'].apply(lambda x: x if isinstance(x, list) else [])
df_tags['directores'] = df_tags['directores'].apply(lambda x: x if isinstance(x, list) else [])
df_tags['actores'] = df_tags['actores'].apply(lambda x: x if isinstance(x, list) else [])

# Aplicar la función eliminar_espacios a las columnas correspondientes
df_tags['generos'] = df_tags['generos'].apply(eliminar_espacios)
df_tags['directores'] = df_tags['directores'].apply(eliminar_espacios)
df_tags['actores'] = df_tags['actores'].apply(eliminar_espacios)

# Verificar el resultado
print(df_tags[['generos', 'directores', 'actores']].head())


                        generos      directores  \
0   [Animation, Comedy, Family]  [JohnLasseter]   
1   [Animation, Comedy, Family]  [JohnLasseter]   
2   [Animation, Comedy, Family]  [JohnLasseter]   
3  [Adventure, Fantasy, Family]   [JoeJohnston]   
4  [Adventure, Fantasy, Family]   [JoeJohnston]   

                                             actores  
0  [TomHanks, TimAllen, DonRickles, JimVarney, Wa...  
1  [TomHanks, TimAllen, DonRickles, JimVarney, Wa...  
2  [TomHanks, TimAllen, DonRickles, JimVarney, Wa...  
3  [RobinWilliams, JonathanHyde, KirstenDunst, Br...  
4  [RobinWilliams, JonathanHyde, KirstenDunst, Br...  


In [14]:
df_tags['etiquetas'] = df_tags['overview'] + df_tags['generos'] + df_tags['directores'] + df_tags['actores']

In [15]:
#Eliminamos las columnas que ya no nos sirven.
df_tags = df_tags.drop(columns = ['overview', 'generos', 'directores', 'actores','id'])

In [16]:
#la línea de código convierte cada lista de strings en la columna 'etiquetas' del DataFrame df_tags en una cadena única de texto, donde cada elemento está separado por espacios, 
# utilizando una función lambda con apply().
df_tags['etiquetas'] = df_tags ['etiquetas'].apply(lambda x: " ".join(x))

In [17]:
df_tags = df_tags.drop_duplicates()

In [18]:
df_tags

Unnamed: 0,title,etiquetas
0,Toy Story,"Led by Woody, Andy's toys live happily in his ..."
3,Jumanji,When siblings Judy and Peter discover an encha...
21,Grumpier Old Men,A family wedding reignites the ancient feud be...
25,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom..."
28,Father of the Bride Part II,Just when George Banks has recovered from his ...
...,...,...
184030,The Morning After,The Morning After is a feature film that consi...
184034,The Burkittsville 7,A film archivist revisits the story of Rustin ...
184036,Caged Heat 3000,It's the year 3000 AD. The world's most danger...
184037,Robin Hood,"Yet another version of the classic epic, with ..."


In [19]:
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\claud\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Modelo de Recomendacion

In [20]:
df_tags.reset_index(drop=True, inplace=True)

In [21]:
cv = CountVectorizer(max_features = 10000 , stop_words= 'english')

In [22]:
vector = cv.fit_transform(df_tags['etiquetas']).toarray()

In [23]:
vector.shape

(19030, 10000)

In [24]:
similitud_coseno = cosine_similarity(vector)

similitud_coseno

array([[1.        , 0.04170288, 0.04914732, ..., 0.        , 0.        ,
        0.        ],
       [0.04170288, 1.        , 0.07071068, ..., 0.03885143, 0.        ,
        0.        ],
       [0.04914732, 0.07071068, 1.        , ..., 0.        , 0.03042903,
        0.        ],
       ...,
       [0.        , 0.03885143, 0.        , ..., 1.        , 0.02507849,
        0.06142951],
       [0.        , 0.        , 0.03042903, ..., 0.02507849, 1.        ,
        0.08164966],
       [0.        , 0.        , 0.        , ..., 0.06142951, 0.08164966,
        1.        ]])

In [25]:
def recomendacion(titulo):
    try:
        indice = df_tags[df_tags['title'] == titulo].index[0]
    
        distancia = sorted(list(enumerate(similitud_coseno[indice])), reverse = True, key = lambda x: x[1])

        recomendadas = [df_tags.iloc[i[0]].title for i in distancia[1:6]]
        
        print(f"Porque viste '{titulo}', tal vez te guste:")
        for pelicula in recomendadas:
            print(pelicula)
    except IndexError:
        print(f"No se encontró la película '{titulo}' en la base de datos.")

In [26]:
# Mostrar los nombres de las columnas en el DataFrame
print(df_tags.columns)


Index(['title', 'etiquetas'], dtype='object')


In [29]:
recomendacion("Toy Story")

Porque viste 'Toy Story', tal vez te guste:
Toy Story 2
Toy Story 3
The 40 Year Old Virgin
Andy Peters: Exclamation Mark Question Point
Hawaiian Vacation


In [30]:
#Descargamos la base de datos que utilizamos para crear la función de recomendación. 
df_tags.to_parquet('tags_ML.parquet', index=False)