# Imports

In [33]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
import numpy as np
import ast
import nltk
import joblib

# Preprocesamiento de los datos

In [2]:
df_movies_modified = pd.read_csv('movies_dataset_modified.csv')

In [3]:
df_movies_modified.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45346 entries, 0 to 45345
Data columns (total 22 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   belongs_to_collection  4485 non-null   object 
 1   budget                 45346 non-null  int64  
 2   genres                 45346 non-null  object 
 3   id                     45346 non-null  int64  
 4   original_language      45335 non-null  object 
 5   overview               44405 non-null  object 
 6   popularity             45346 non-null  float64
 7   production_companies   45346 non-null  object 
 8   production_countries   45346 non-null  object 
 9   release_date           45346 non-null  object 
 10  revenue                45346 non-null  float64
 11  runtime                45100 non-null  float64
 12  spoken_languages       45346 non-null  object 
 13  status                 45266 non-null  object 
 14  tagline                20387 non-null  object 
 15  ti

## Extracción de columnas que se creen necesarias para el modelo de recomendación

In [4]:
df_movies_recomendacion = df_movies_modified[['id', 'title','genres', 'overview', 'tagline']]

## Creación de columnas nuevas y eliminación de columnas

### Generos

In [5]:
lista_generos = []
generos = []
generos_unicos = []

for i in df_movies_recomendacion['genres']:
    valores = ast.literal_eval(i)
    
    for j in range(len(valores)):
        lista_generos.append(valores[j]['name'])
        if valores[j]['name'] not in generos_unicos:
            generos_unicos.append(valores[j]['name'])
            
    generos.append(lista_generos)
    lista_generos = []

In [6]:
df_movies_recomendacion['generos'] = generos

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_movies_recomendacion['generos'] = generos


In [7]:
lista_generos = []
generos_concatenados = ''

for i in range(len(df_movies_recomendacion['generos'])):
    if len(df_movies_recomendacion['generos'][i]) == 1:
        lista_generos.append(df_movies_recomendacion['generos'][i][0])
    else:
        generos_concatenados = ', '.join(df_movies_recomendacion['generos'][i])
        lista_generos.append(generos_concatenados)
        generos_concatenados = ''

lista_generos

['Animation, Comedy, Family',
 'Adventure, Fantasy, Family',
 'Romance, Comedy',
 'Comedy, Drama, Romance',
 'Comedy',
 'Action, Crime, Drama, Thriller',
 'Comedy, Romance',
 'Action, Adventure, Drama, Family',
 'Action, Adventure, Thriller',
 'Adventure, Action, Thriller',
 'Comedy, Drama, Romance',
 'Comedy, Horror',
 'Family, Animation, Adventure',
 'History, Drama',
 'Action, Adventure',
 'Drama, Crime',
 'Drama, Romance',
 'Crime, Comedy',
 'Crime, Comedy, Adventure',
 'Action, Comedy, Crime',
 'Comedy, Thriller, Crime',
 'Drama, Thriller',
 'Action, Adventure, Crime, Thriller',
 'Drama, Fantasy, Science Fiction, Thriller',
 'Drama, Romance',
 'Drama',
 'Comedy, Drama, Family',
 'Drama, Romance',
 'Fantasy, Science Fiction, Adventure',
 'Drama, Crime',
 'Drama, Crime',
 'Science Fiction, Thriller, Mystery',
 'Romance, Adventure',
 'Fantasy, Drama, Comedy, Family',
 'History, Drama, Romance',
 'Drama',
 'Adventure, History, Drama, Family',
 'Comedy, Family, Romance',
 'Comedy, Dram

In [8]:
df_movies_recomendacion['generos_concatenados'] = lista_generos

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_movies_recomendacion['generos_concatenados'] = lista_generos


In [9]:
df_movies_recomendacion.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45346 entries, 0 to 45345
Data columns (total 7 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   id                    45346 non-null  int64 
 1   title                 45346 non-null  object
 2   genres                45346 non-null  object
 3   overview              44405 non-null  object
 4   tagline               20387 non-null  object
 5   generos               45346 non-null  object
 6   generos_concatenados  45346 non-null  object
dtypes: int64(1), object(6)
memory usage: 2.4+ MB


### Eliminación de características repetidas

In [10]:
df_movies_recomendacion.drop(['genres', 'generos'], axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_movies_recomendacion.drop(['genres', 'generos'], axis=1, inplace=True)


In [11]:
df_movies_recomendacion.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45346 entries, 0 to 45345
Data columns (total 5 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   id                    45346 non-null  int64 
 1   title                 45346 non-null  object
 2   overview              44405 non-null  object
 3   tagline               20387 non-null  object
 4   generos_concatenados  45346 non-null  object
dtypes: int64(1), object(4)
memory usage: 1.7+ MB


## Juntar todo en una sola columna

In [12]:
print("Hay valores nulos en la columna de title?", df_movies_recomendacion['title'].isnull().any())
print("Hay valores nulos en la columna de overview?", df_movies_recomendacion['overview'].isnull().any())
print("Hay valores nulos en la columna de tagline?", df_movies_recomendacion['tagline'].isnull().any())
print("Hay valores nulos en la columna de generos_concatenados?", df_movies_recomendacion['generos_concatenados'].isnull().any())

Hay valores nulos en la columna de title? False
Hay valores nulos en la columna de overview? True
Hay valores nulos en la columna de tagline? True
Hay valores nulos en la columna de generos_concatenados? False


In [13]:
df_movies_recomendacion['overview'] = df_movies_recomendacion['overview'].fillna('')
df_movies_recomendacion['tagline'] = df_movies_recomendacion['tagline'].fillna('')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_movies_recomendacion['overview'] = df_movies_recomendacion['overview'].fillna('')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_movies_recomendacion['tagline'] = df_movies_recomendacion['tagline'].fillna('')


In [14]:
print("Hay valores nulos en la columna de overview?", df_movies_recomendacion['overview'].isnull().any())
print("Hay valores nulos en la columna de tagline?", df_movies_recomendacion['tagline'].isnull().any())

Hay valores nulos en la columna de overview? False
Hay valores nulos en la columna de tagline? False


In [63]:
#texto_concatenado = df_movies_recomendacion['title'] + '. ' + df_movies_recomendacion['overview'] + '. ' + df_movies_recomendacion['tagline'] + '. ' + df_movies_recomendacion['generos_concatenados']

texto_concatenado = df_movies_recomendacion['title'][0:20001] + '. ' + df_movies_recomendacion['overview'][0:20001] + '. ' + df_movies_recomendacion['tagline'][0:20001]

#texto_concatenado = df_movies_recomendacion['title'] + '. ' + df_movies_recomendacion['overview']

#texto_concatenado = df_movies_recomendacion['overview'][0:30001]

In [65]:
texto_concatenado

0        Toy Story. Led by Woody, Andy's toys live happ...
1        Jumanji. When siblings Judy and Peter discover...
2        Grumpier Old Men. A family wedding reignites t...
3        Waiting to Exhale. Cheated on, mistreated and ...
4        Father of the Bride Part II. Just when George ...
                               ...                        
19996    Warm Bodies. After a zombie becomes involved w...
19997    The Gay Bed and Breakfast of Terror. Helen and...
19998    Dororo. The hero, Hyakkimaru is a wandering "d...
19999    Let Fury Have the Hour. A documentary chronicl...
20000    Asylum Blackout. A group of cooks at an asylum...
Length: 20001, dtype: object

## Modelo de recomendación

In [17]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\javig\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [18]:
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\javig\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [19]:
stopwords_english = set(stopwords.words('english'))
stopwords_english.update(['.', ',', "'s", '--', '!', '..', "'ll", '``', "''", '...', ':', '....'])

In [64]:
textos_filtrados = []

for i in texto_concatenado:
    palabras_filtradas = [palabra for palabra in word_tokenize(i) if palabra.lower() not in stopwords_english]
    textos_filtrados.append(' '.join(palabras_filtradas))

In [66]:
vectorizador = TfidfVectorizer(max_features = 100)
#vectorizador2 = CountVectorizer()

matriz_tfidf = vectorizador.fit_transform(textos_filtrados)
#matriz_count = vectorizador2.fit_transform(textos_filtrados)

In [67]:
similitud_coseno = cosine_similarity(matriz_tfidf)

In [68]:
joblib.dump(matriz_tfidf, 'matriz_tfidf.pkl')

['matriz_tfidf.pkl']

In [57]:
df_movies_recomendacion.to_csv('movies_dataset_recomendacion.csv', index=False)