In [17]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import PCA
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.metrics import pairwise_distances
from sklearn.metrics.pairwise import cosine_distances

In [18]:
# Cargar el dataset
df = pd.read_csv('DATASET\Movies_EDA_ILB.csv', encoding='utf-8')

In [19]:
df.head(3)

Unnamed: 0,budget,id,original_language,overview,popularity,release_date,revenue,runtime,status,title,vote_average,vote_count,return,release_year,genres_name,idgenres,iso_spokelang,namescast,idcast,directors
0,30.0,862,en,"Led by Woody, Andys toys live happily in his r...",21.946943,1995-10-30,373.554033,81.0,Released,Toy Story,7.7,5415.0,1.2e-05,1995,"Animation, Comedy, Family","16, 35, 10751",en,"Tom Hanks, Tim Allen, Don Rickles, Jim Varney,...","31, 12898, 7167, 12899, 12900, 7907, 8873, 111...",John Lasseter
1,65.0,8844,en,When siblings Judy and Peter discover an encha...,17.01554,1995-12-15,262.797249,104.0,Released,Jumanji,6.9,2413.0,4e-06,1995,"Adventure, Fantasy, Family","12, 14, 10751","en, fr","Robin Williams, Jonathan Hyde, Kirsten Dunst, ...","2157, 8537, 205, 145151, 5149, 10739, 58563, 1...",Joe Johnston
2,0.0,15602,en,A family wedding reignites the ancient feud be...,11.7129,1995-12-22,0.0,101.0,Released,Grumpier Old Men,6.5,92.0,0.0,1995,"Romance, Comedy","10749, 35",en,"Walter Matthau, Jack Lemmon, Ann-Margret, Soph...","6837, 3151, 13567, 16757, 589, 16523, 7166",Howard Deutch


In [20]:
model_df=df[['overview','title','genres_name','namescast','directors']]

In [21]:
def convert_columns_to_lowercase(df):
    for column in df.select_dtypes(include='object'):
        df[column] = df[column].str.lower()
    return df

In [22]:
model_df = convert_columns_to_lowercase(model_df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[column] = df[column].str.lower()


In [23]:
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to C:\Users\Alarcon
[nltk_data]     Ilbert\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to C:\Users\Alarcon
[nltk_data]     Ilbert\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [24]:
def create_text(model_data):
    text = model_data['title']
    for column in ['genres_name', 'overview', 'namescast', 'directors']:
        if pd.notnull(model_data[column]) and isinstance(model_data[column], str):
            text += ' ' + model_data[column].lower()
    return text

In [25]:
stop_words = set(stopwords.words('english'))

def extract_keywords(overview):
    if pd.notnull(overview) and isinstance(overview, str):
        tokens = word_tokenize(overview.lower())  # Tokenizar y convertir a minúsculas
        keywords = [token for token in tokens if token.isalpha() and token not in stop_words]  # Filtrar palabras clave
        return keywords
    else:
        return []

# Iterar sobre el dataset y encontrar las palabras clave para cada fila
model_df['keywords'] = None  # Crear la columna 'keywords' inicialmente con valores nulos
for index, row in model_df.iterrows():
    plot = row['overview']
    keywords = extract_keywords(plot)
    
    model_df.at[index, 'keywords'] = keywords

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  model_df['keywords'] = None  # Crear la columna 'keywords' inicialmente con valores nulos


In [26]:
model_df.head(3)

Unnamed: 0,overview,title,genres_name,namescast,directors,keywords
0,"led by woody, andys toys live happily in his r...",toy story,"animation, comedy, family","tom hanks, tim allen, don rickles, jim varney,...",john lasseter,"[led, woody, andys, toys, live, happily, room,..."
1,when siblings judy and peter discover an encha...,jumanji,"adventure, fantasy, family","robin williams, jonathan hyde, kirsten dunst, ...",joe johnston,"[siblings, judy, peter, discover, enchanted, b..."
2,a family wedding reignites the ancient feud be...,grumpier old men,"romance, comedy","walter matthau, jack lemmon, ann-margret, soph...",howard deutch,"[family, wedding, reignites, ancient, feud, ne..."


In [27]:
model_df.drop(columns=['overview'],inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  model_df.drop(columns=['overview'],inplace=True)


In [28]:
def remove_commas(text):
    if isinstance(text, str):
        return text.replace(',', '')
    else:
        return ''

In [29]:
def clean_keywords(keywords):
    if isinstance(keywords, list):
        cleaned_keywords = ' '.join(keywords)
        cleaned_keywords = cleaned_keywords.replace('[', '').replace(']', '')
        return cleaned_keywords
    else:
        return ''

# Aplicar la función clean_keywords a la columna 'keywords'
model_df['keywords'] = model_df['keywords'].apply(clean_keywords)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  model_df['keywords'] = model_df['keywords'].apply(clean_keywords)


In [30]:
model_df['texto'] = (
    model_df['title'].apply(remove_commas) + ' ' +
    model_df['genres_name'].apply(remove_commas) + ' ' +
    model_df['keywords'].apply(remove_commas) + ' ' +
    model_df['namescast'].apply(remove_commas) + ' ' +
    model_df['directors'].apply(remove_commas) + ' '
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  model_df['texto'] = (


In [31]:
model_df.head(3)

Unnamed: 0,title,genres_name,namescast,directors,keywords,texto
0,toy story,"animation, comedy, family","tom hanks, tim allen, don rickles, jim varney,...",john lasseter,led woody andys toys live happily room andys b...,toy story animation comedy family led woody an...
1,jumanji,"adventure, fantasy, family","robin williams, jonathan hyde, kirsten dunst, ...",joe johnston,siblings judy peter discover enchanted board g...,jumanji adventure fantasy family siblings judy...
2,grumpier old men,"romance, comedy","walter matthau, jack lemmon, ann-margret, soph...",howard deutch,family wedding reignites ancient feud neighbor...,grumpier old men romance comedy family wedding...


In [32]:
model_df.to_csv('DATASET\Movies_prueba_ILB.csv',index=False)

In [33]:
model_df.drop(columns=['keywords','genres_name','namescast','directors'], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  model_df.drop(columns=['keywords','genres_name','namescast','directors'], inplace=True)


In [34]:
model_df=model_df.reset_index(drop=True)

In [35]:
model_df.head(3)

Unnamed: 0,title,texto
0,toy story,toy story animation comedy family led woody an...
1,jumanji,jumanji adventure fantasy family siblings judy...
2,grumpier old men,grumpier old men romance comedy family wedding...


In [36]:
# Reemplazar los valores np.nan con una cadena vacía
model_df['texto'] = model_df['texto'].fillna('')

In [37]:
# Crear una instancia del CountVectorizer con los parámetros deseados
cv = CountVectorizer(stop_words='english', max_features=10000)  # Ajusta el valor de max_features según tus necesidades

In [38]:
# Definir el tamaño de lote para el cálculo de similitud
batch_size = 1000

In [39]:
# Obtener el número total de muestras y calcular el número de lotes
n_samples = len(model_df)
n_batches = (n_samples // batch_size) + 1

cosine_sim_list = []

for i in range(n_batches):
    start = i * batch_size
    end = min((i + 1) * batch_size, n_samples)

    # Obtener el subconjunto de datos
    subset_data = model_df.iloc[start:end]

    # Ajustar y transformar el CountVectorizer en el subconjunto de datos
    cv_matrix = cv.fit_transform(subset_data['texto'])

    # Calcular la similitud coseno entre los textos
    similarity_matrix = cosine_similarity(cv_matrix, cv_matrix)

    # Agregar la matriz de similitud a la lista
    cosine_sim_list.append(similarity_matrix)

In [40]:
# Encontrar la dimensión máxima a lo largo del eje 1
max_dim_1 = max(matrix.shape[1] for matrix in cosine_sim_list)

In [41]:
# Ajustar las dimensiones de las matrices al número máximo de columnas
cosine_sim_list = [np.pad(matrix, ((0, 0), (0, max_dim_1 - matrix.shape[1])), mode='constant') for matrix in cosine_sim_list]

In [42]:
# Combinar las matrices de similitud coseno en una única matriz
cosine_sim = np.concatenate(cosine_sim_list, axis=0)

In [43]:
def recomendaciones(titulo, cosine_sim = cosine_sim):
    # Getting the index of the movie that matches the title
    idx = model_df[model_df['title'] == str(titulo).lower()].index[0]
    # Getting the similarity scores
    sim_scores = list(enumerate(cosine_sim[idx]))
    #Sorting the movies based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Getting the top 5 recommendations
    sim_scores = sim_scores[1:6]
    movie_indices = [i[0] for i in sim_scores]
    recommendations=list(model_df['title'].iloc[movie_indices].str.title())
    return {'lista recomendada': recommendations} 

In [44]:
recomendaciones('superman')

{'lista recomendada': ['Fear Of A Black Hat',
  'Lost Horizon',
  'Short Cuts',
  'Touki Bouki',
  'The Boys Of St. Vincent']}