# Probamos varios modelos de recomendación

# 1.

In [33]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import hstack, csr_matrix
from sklearn.preprocessing import StandardScaler



def recomendacion(titulo):
    """Recommend up to five movies similar to ``titulo``.

    Each movie is described by the TF-IDF vector of its title, the one-hot
    encoding of its ``genre_name`` and its standardized ``popularity``.
    Candidates are ranked by cosine similarity to the requested movie.

    Args:
        titulo: Movie title to look up; the comparison is case-insensitive.

    Returns:
        A list with the titles of the (at most) five most similar movies,
        most similar first, or the string "Película no encontrada." when no
        row matches ``titulo``.
    """
    # The data is loaded inside the function on every call
    movies_df = pd.read_parquet('../Datasets/movies.parquet')

    # Case-insensitive lookup of the requested movie
    coincide = movies_df['title'].str.lower() == titulo.lower()
    if not coincide.any():
        return "Película no encontrada."

    # Row POSITION of the first match. The feature matrix below is addressed
    # by position, so the DataFrame index label must not be used here: the
    # two only coincide when the index is a default RangeIndex.
    idx = int(coincide.to_numpy().nonzero()[0][0])

    # Title features
    title_vectors = TfidfVectorizer(stop_words='english').fit_transform(movies_df['title'])

    # Genre features (the split is computed once and reused).
    # NOTE(review): OneHotEncoder needs every row to have the same number of
    # entries and treats each list position as its own feature; if genre_name
    # can hold several '|'-separated genres, MultiLabelBinarizer is probably
    # the intended encoder - confirm against the data.
    genre_lists = movies_df['genre_name'].apply(lambda x: x.split('|')).tolist()
    genre_encoder = OneHotEncoder(sparse_output=True, handle_unknown='ignore')
    genre_vectors = genre_encoder.fit_transform(genre_lists)

    # Standardized popularity as a single sparse column
    popularity = movies_df['popularity'].values.reshape(-1, 1)
    popularity = csr_matrix(StandardScaler().fit_transform(popularity))

    # Concatenate all features; CSR is requested explicitly because the
    # matrix is sliced by row below
    combined_vectors = hstack([title_vectors, genre_vectors, popularity]).tocsr()

    # Cosine similarity between the requested movie and every movie
    cosine_sim = cosine_similarity(combined_vectors[idx:idx + 1], combined_vectors).flatten()

    # Rank by similarity and drop the requested movie explicitly instead of
    # assuming it is the first element (ties at the top could displace it)
    ranking = sorted(enumerate(cosine_sim), key=lambda par: par[1], reverse=True)
    movie_indices = [posicion for posicion, _ in ranking if posicion != idx][:5]

    # Titles of the recommended movies
    return movies_df['title'].iloc[movie_indices].tolist()

# Usage example
# movies_df = pd.read_csv('path_to_movies_data.csv')
print(recomendacion('toy story'))



['Toy Story 2', 'Toy Story 3', 'Up', '9', 'Your Name.']


# 2.

In [9]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import hstack
import numpy as np

# Load the movie data
movies_df = pd.read_parquet('../Datasets/movies.parquet')

# Collapse the rows to one per title to avoid duplicates: the genres become a
# list of distinct values and every other column keeps the first value of its
# group.
movies_df_grouped = (
    movies_df
    .groupby('title')
    .agg({
        'genre_name': lambda generos: list(set(generos)),
        **dict.fromkeys(
            [
                'popularity',
                'release_date',
                'budget',
                'revenue',
                'runtime',
                'vote_average',
                'vote_count',
                'release_year',
                'id_movie',
            ],
            'first',
        ),
    })
    .reset_index()
)

def recomendacion(titulo, movies_df_grouped, randomize=True, genre_weight=3,
                  min_similarity=0.7, top_n=5):
    """Recommend movies similar to ``titulo`` using title and genre features.

    Every row of ``movies_df_grouped`` is described by the TF-IDF vector of
    its title concatenated with its multi-hot genre vector multiplied by
    ``genre_weight``. Rows whose cosine similarity to the requested movie is
    above ``min_similarity`` are the candidates.

    Args:
        titulo: Movie title to look up; the comparison is case-insensitive.
        movies_df_grouped: DataFrame with a ``title`` column and a
            ``genre_name`` column holding an iterable of genres per row.
        randomize: When True and there are at least ``top_n`` candidates,
            ``top_n`` of them are drawn at random (without replacement).
            When False the ``top_n`` most similar candidates are returned.
        genre_weight: Factor applied to the genre vector before it is
            combined with the title vector.
        min_similarity: Cosine similarity a movie must exceed to be a
            candidate.
        top_n: Maximum number of recommendations.

    Returns:
        A list of recommended titles, or one of the strings
        "Película no encontrada." (no row matches ``titulo``) and
        "No hay suficientes recomendaciones disponibles." (no candidate
        passes the threshold).
    """
    coincide = movies_df_grouped['title'].str.lower() == titulo.lower()
    if not coincide.any():
        return "Película no encontrada."

    # Row POSITION of the first match: the feature matrix and .iloc below are
    # positional, so the index label is only valid for a default RangeIndex.
    idx = int(coincide.to_numpy().nonzero()[0][0])

    # Title features
    title_vectors = TfidfVectorizer(stop_words='english').fit_transform(movies_df_grouped['title'])

    # Genre features, scaled so that they weigh more than the title
    genre_vectors = MultiLabelBinarizer().fit_transform(movies_df_grouped['genre_name'])
    genre_vectors = genre_vectors * genre_weight

    # Combine title and genre features
    combined_vectors = hstack([title_vectors, genre_vectors]).tocsr()

    # Cosine similarity between the requested movie and every row
    cosine_sim = cosine_similarity(combined_vectors[idx:idx + 1], combined_vectors).flatten()

    # Positions of the candidates, most similar first, without the input movie
    ranking = sorted(enumerate(cosine_sim), key=lambda par: par[1], reverse=True)
    candidatos = [
        posicion for posicion, similitud in ranking
        if posicion != idx and similitud > min_similarity
    ]

    # `randomize` used to be ignored: the random draw was always taken when
    # enough candidates existed. It is now honoured.
    if randomize and len(candidatos) >= top_n:
        seleccion = [int(p) for p in np.random.choice(candidatos, top_n, replace=False)]
    else:
        seleccion = candidatos[:top_n]

    # Drop repeated titles while keeping the selection order (a set would
    # scramble it)
    titulos = movies_df_grouped['title'].iloc[seleccion]
    final_recommendations = list(dict.fromkeys(titulos))

    if not final_recommendations:
        return "No hay suficientes recomendaciones disponibles."

    return final_recommendations

# Usage example
print(recomendacion('GoldenEye', movies_df_grouped, randomize=True, genre_weight=3))


['Krakatoa, East of Java', 'The Son of Monte Cristo', 'Blackbeard, the Pirate', 'X-Men: The Last Stand', 'Bang Bang!']


# 3. Este fue el mejor modelo de los 3

In [7]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import hstack
import numpy as np


def recomendacion(titulo, randomize=True, genre_weight=3, min_similarity=0.7, top_n=5):
    """Recommend movies similar to ``titulo`` using title and genre features.

    The movie data is read from disk and grouped to one row per title. Each
    title is described by its TF-IDF vector concatenated with its multi-hot
    genre vector multiplied by ``genre_weight``. Titles whose cosine
    similarity to the requested movie is above ``min_similarity`` are the
    candidates.

    Args:
        titulo: Movie title to look up; the comparison is case-insensitive.
        randomize: When True and there are at least ``top_n`` candidates,
            ``top_n`` of them are drawn at random (without replacement).
            When False the ``top_n`` most similar candidates are returned.
        genre_weight: Factor applied to the genre vector before it is
            combined with the title vector.
        min_similarity: Cosine similarity a movie must exceed to be a
            candidate.
        top_n: Maximum number of recommendations.

    Returns:
        A list of recommended titles, or one of the strings
        "Película no encontrada." (no title matches ``titulo``) and
        "No hay suficientes recomendaciones disponibles." (no candidate
        passes the threshold).
    """
    # Data is loaded on every call.
    # NOTE(review): this path ('movies_df.parquet') differs from the
    # '../Datasets/movies.parquet' file used by the other cells - confirm it
    # is intentional.
    movies_df = pd.read_parquet('../Datasets/movies_df.parquet')

    # Group the genres by title to avoid duplicated titles
    movies_df_grouped = movies_df.groupby('title').agg({
        'genre_name': lambda x: list(set(x))
    }).reset_index()

    coincide = movies_df_grouped['title'].str.lower() == titulo.lower()
    if not coincide.any():
        return "Película no encontrada."

    # Row position of the first match (the feature matrix is positional)
    idx = int(coincide.to_numpy().nonzero()[0][0])

    # Title features
    title_vectors = TfidfVectorizer(stop_words='english').fit_transform(movies_df_grouped['title'])

    # Genre features, scaled so that they weigh more than the title
    genre_vectors = MultiLabelBinarizer().fit_transform(movies_df_grouped['genre_name'])
    genre_vectors = genre_vectors * genre_weight

    # Combine title and genre features
    combined_vectors = hstack([title_vectors, genre_vectors]).tocsr()

    # Cosine similarity between the requested movie and every title
    cosine_sim = cosine_similarity(combined_vectors[idx:idx + 1], combined_vectors).flatten()

    # Positions of the candidates, most similar first, without the input movie
    ranking = sorted(enumerate(cosine_sim), key=lambda par: par[1], reverse=True)
    candidatos = [
        posicion for posicion, similitud in ranking
        if posicion != idx and similitud > min_similarity
    ]

    # `randomize` used to be ignored: the random draw was always taken when
    # enough candidates existed. It is now honoured.
    if randomize and len(candidatos) >= top_n:
        seleccion = [int(p) for p in np.random.choice(candidatos, top_n, replace=False)]
    else:
        seleccion = candidatos[:top_n]

    # Keep the selection order (a set would scramble it)
    titulos = movies_df_grouped['title'].iloc[seleccion]
    final_recommendations = list(dict.fromkeys(titulos))

    if not final_recommendations:
        return "No hay suficientes recomendaciones disponibles."

    return final_recommendations

# Correct usage example
pelicula_recomendada = "Toy Story"  # The movie title is simply assigned directly here
print(recomendacion(pelicula_recomendada))


['Space Chimps', 'Batman Beyond: Return of the Joker', 'Ice Age: The Great Egg-Scapade', "It's the Great Pumpkin, Charlie Brown", 'Rhinoceros']


# Pruebas de precisión

In [26]:
import pandas as pd

def evaluate_recommendations(test_df, recommender_func, verbose=True):
    """Compute per-title precision and recall of a recommender.

    Args:
        test_df: DataFrame with a ``title`` column and an
            ``expected_recommendations`` column holding an iterable of the
            titles considered relevant for that row.
        recommender_func: Callable that receives a title and returns an
            iterable of recommended titles. A string return value is treated
            as an error message, i.e. as no recommendations.
        verbose: When True (default) the intermediate values of every row
            are printed for debugging.

    Returns:
        A DataFrame with the columns ``title``, ``precision`` and ``recall``,
        one row per row of ``test_df``. The columns are present even when
        ``test_df`` is empty.
    """
    results = []

    for title, expected in zip(test_df['title'], test_df['expected_recommendations']):
        expected_recommendations = set(expected)
        recommended_titles = recommender_func(title)

        # A string result is an error message, not a list of titles
        if isinstance(recommended_titles, str):
            recommended_titles = set()
        else:
            recommended_titles = set(recommended_titles)

        # Precision and recall; 0.0 when the denominator would be zero
        true_positive = len(expected_recommendations & recommended_titles)
        precision = true_positive / len(recommended_titles) if recommended_titles else 0.0
        recall = true_positive / len(expected_recommendations) if expected_recommendations else 0.0

        if verbose:
            print(f"Title: {title}")
            print(f"Expected Recommendations: {expected_recommendations}")
            print(f"Recommended Titles: {recommended_titles}")
            print(f"True Positives: {true_positive}")
            print(f"Precision Calculation: {true_positive} / {len(recommended_titles)}")
            print(f"Recall Calculation: {true_positive} / {len(expected_recommendations)}")

        results.append({
            'title': title,
            'precision': precision,
            'recall': recall
        })

    # Explicit columns keep the result schema stable for an empty test_df
    return pd.DataFrame(results, columns=['title', 'precision', 'recall'])

# Evaluate the recommendations.
# evaluate_recommendations takes the test cases first and the recommender
# second; the previous call passed only the recommender and raised TypeError.
# NOTE(review): test_df is not defined in the visible cells - it must provide
# the 'title' and 'expected_recommendations' columns read by the function.
evaluation_results = evaluate_recommendations(test_df, recomendacion)
print(evaluation_results)


Similitudes para goldeneye:
A Dangerous Man: 0.9642857142857144
A View to a Kill: 0.9642857142857144
A Viking Saga: The Darkest Day: 0.9642857142857144
Angel of Destruction: 0.9642857142857144
Arabesque: 0.9642857142857144
As Good As Dead: 0.9642857142857144
Ballistic: Ecks vs. Sever: 0.9642857142857144
Beatdown: 0.9642857142857144
Behind Enemy Lines II: Axis of Evil: 0.9642857142857144
Black Dog: 0.9642857142857144
Title: GoldenEye
Expected Recommendations: {'Tomorrow Never Dies', 'The World Is Not Enough', 'Die Another Day'}
True Positives: 3
Precision Calculation: 3 / 773
Recall Calculation: 3 / 3
Similitudes para toy story:
Toy Story 2: 1.0
Toy Story 3: 1.0
Toy Story of Terror!: 0.9936096415994076
A Boy Named Charlie Brown: 0.9642857142857144
A Close Shave: 0.9642857142857144
A Flintstones Christmas Carol: 0.9642857142857144
A Matter of Loaf and Death: 0.9642857142857144
Ali Baba Bunny: 0.9642857142857144
Amazon Jack 3: Jungo Goes Bananas: 0.9642857142857144
An All Dogs Christmas C

In [25]:
from sklearn.metrics import precision_score, recall_score, f1_score

def evaluate_recommendations_extended(test_df, recommender_func, movies_df_grouped):
    """Compute pooled precision, recall and F1 of a recommender.

    For every row of ``test_df`` each title of ``movies_df_grouped`` gets a
    true label (1 when it is among the row's ``expected_recommendations``)
    and a predicted label (1 when the recommender returned it). The labels
    of all rows are concatenated and scored together.

    Args:
        test_df: DataFrame with ``title`` and ``expected_recommendations``
            columns.
        recommender_func: Callable that receives a title and returns the
            recommended titles; a string result counts as no recommendations.
        movies_df_grouped: DataFrame whose ``title`` column defines the set
            of titles that are labelled.

    Returns:
        The tuple ``(precision, recall, f1)``.
    """
    # Every distinct title in the movie DataFrame
    catalogo = set(movies_df_grouped['title'])

    y_true, y_pred = [], []

    for _, caso in test_df.iterrows():
        consulta = caso['title']
        relevantes = set(caso['expected_recommendations'])
        sugeridos = recommender_func(consulta)

        # A string result is an error message, not a list of titles
        sugeridos = set() if isinstance(sugeridos, str) else set(sugeridos)

        # One true/predicted label per catalogue title
        y_true += [int(pelicula in relevantes) for pelicula in catalogo]
        y_pred += [int(pelicula in sugeridos) for pelicula in catalogo]

    # Precision, recall and F1 over the pooled labels
    return tuple(
        metrica(y_true, y_pred, average='binary')
        for metrica in (precision_score, recall_score, f1_score)
    )

# Evaluate the recommendations with the extended metrics.
# evaluate_recommendations_extended takes the test cases, the recommender and
# the grouped movie DataFrame, in that order; the previous call omitted the
# test cases and raised TypeError.
# NOTE(review): test_df is not defined in the visible cells - it must provide
# the 'title' and 'expected_recommendations' columns read by the function.
precision, recall, f1 = evaluate_recommendations_extended(test_df, recomendacion, movies_df_grouped)
print(f"Precisión: {precision}")
print(f"Recuperación: {recall}")
print(f"F1-Score: {f1}")


Similitudes para goldeneye:
A Dangerous Man: 0.9642857142857144
A View to a Kill: 0.9642857142857144
A Viking Saga: The Darkest Day: 0.9642857142857144
Angel of Destruction: 0.9642857142857144
Arabesque: 0.9642857142857144
As Good As Dead: 0.9642857142857144
Ballistic: Ecks vs. Sever: 0.9642857142857144
Beatdown: 0.9642857142857144
Behind Enemy Lines II: Axis of Evil: 0.9642857142857144
Black Dog: 0.9642857142857144
Similitudes para toy story:
Toy Story 2: 1.0
Toy Story 3: 1.0
Toy Story of Terror!: 0.9936096415994076
A Boy Named Charlie Brown: 0.9642857142857144
A Close Shave: 0.9642857142857144
A Flintstones Christmas Carol: 0.9642857142857144
A Matter of Loaf and Death: 0.9642857142857144
Ali Baba Bunny: 0.9642857142857144
Amazon Jack 3: Jungo Goes Bananas: 0.9642857142857144
An All Dogs Christmas Carol: 0.9642857142857144
Similitudes para jumanji:
Adventures in Dinosaur City: 0.9642857142857144
City of Ember: 0.9642857142857144
Cry Wilderness: 0.9642857142857144
Fantastic Beasts and