In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity, linear_kernel
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error
import warnings
import gzip
import json
import re
import urllib.request
import os
from collections import defaultdict, Counter
from openai import OpenAI
from getpass import getpass
from langchain_openai import OpenAIEmbeddings

In [2]:
# OPENAI_API_KEY
print("Insertar el API Key de OpenAI")
openai_key = getpass()

Insertar el API Key de OpenAI


In [28]:
reviews_final = pd.read_csv('reviews_final.csv')
metadata_final = pd.read_csv('metadata_final.csv', dtype={10: str})

In [49]:
from scipy.sparse import csr_matrix

class ContentBasedFiltering:
    def __init__(self, metadata_df, reviews_df):
        """
        Content-based filtering using TF-IDF on movie features

        Parameters:
        - metadata_df: DataFrame with movie metadata
        - reviews_df: DataFrame with ratings for building user profiles
        """
        self.metadata_df = metadata_df.copy()
        self.reviews_df = reviews_df
        self.tfidf_vectorizer = None
        self.content_matrix = None
        self.movie_similarity_matrix = None

        self.embeddings_model = OpenAIEmbeddings(
            model="text-embedding-3-small",#text-embedding-3-small", #"text-embedding-3-large",
            openai_api_key=openai_key
        )

    def preprocess_content(self):
        """Preprocess and combine content features"""
        print("Preprocessing content features...")

        def convert_to_string(value):
            """Convert value to string, handling lists, arrays and None values"""
            # Handle numpy arrays and pandas arrays first
            if hasattr(value, '__array__') or isinstance(value, (list, tuple)):
                try:
                    # Convert to list if it's an array
                    if hasattr(value, 'tolist'):
                        value_list = value.tolist()
                    else:
                        value_list = list(value)
                    return ', '.join([str(movie) for movie in value_list if movie is not None])
                except:
                    return str(value)

            # Handle scalar values
            try:
                if pd.isna(value) or value is None:
                    return ''
                else:
                    return str(value)
            except (ValueError, TypeError):
                # If pd.isna() fails, just convert to string
                return str(value) if value is not None else ''

        # Convert all columns to strings, handling lists
        self.metadata_df['title'] = self.metadata_df['title'].apply(convert_to_string)
        self.metadata_df['overview'] = self.metadata_df['overview'].apply(convert_to_string)
        self.metadata_df['genres'] = self.metadata_df['genres'].apply(convert_to_string)

        # Combine all text features
        # Give different weights to different features
        self.metadata_df['combined_features'] = (
            self.metadata_df['title'] * 3 + ' ' +  # Title is most important
            self.metadata_df['overview'] * 2 + ' ' +  # genres are moderately important
            self.metadata_df['genres']  # overview provides context
        )

        # Clean the text
        self.metadata_df['combined_features'] = (
            self.metadata_df['combined_features']
            .str.lower()
            .str.replace(r'[^\w\s]', ' ', regex=True)
            .str.replace(r'\s+', ' ', regex=True)
            .str.strip()
        )

        print(f"Combined features created for {len(self.metadata_df)} movies")

    def build_model(self, max_features=5000, min_df=2, max_df=0.8):
        """Build Embeddings and calculate movie similarities"""
        if 'combined_features' not in self.metadata_df.columns:
            self.preprocess_content()

        print("Getting embeddings...")

        embeddings = self.embeddings_model.embed_documents(self.metadata_df['combined_features'])

        print("Calculating embeddings...")
        embeddings = self.embeddings_model.embed_documents(self.metadata_df['combined_features'].tolist())
        self.content_matrix = np.array(embeddings)

        print("Calculating cosine similarities...")
        self.movie_similarity_matrix = cosine_similarity(self.content_matrix)

        # Create mapping from movie_id to matrix index
        self.movie_to_idx = {movie_id: idx for idx, movie_id in enumerate(self.metadata_df['movie_id'])}
        self.idx_to_movie = {idx: movie_id for movie_id, idx in self.movie_to_idx.items()}

        print("Content-based model built successfully!")

    def get_content_similarities(self, movie_id, top_k=10):
        """Get movies most similar to a given movie based on content"""
        if self.movie_similarity_matrix is None:
            raise ValueError("Model not built yet. Call build_model() first.")

        if movie_id not in self.movie_to_idx:
            return f"Movie {movie_id} not found in metadata"

        idx = self.movie_to_idx[movie_id]
        similarities = self.movie_similarity_matrix[idx]

        # Get top similar movies (excluding the movie itself)
        similar_indices = similarities.argsort()[::-1][1:top_k+1]

        results = []
        for similar_idx in similar_indices:
            similar_movie_id = self.idx_to_movie[similar_idx]
            similarity_score = similarities[similar_idx]
            results.append({
                'movie_id': similar_movie_id,
                'similarity': similarity_score
            })

        return results

    def build_user_profile(self, user_id, min_rating=3.0):
        """Build user profile based on highly rated movies"""
        user_ratings = self.reviews_df[
            (self.reviews_df['user_id'] == user_id) &
            (self.reviews_df['rating'] >= min_rating)
        ]

        if len(user_ratings) == 0:
            return None

        # Get content vectors for user's liked movies
        liked_movies = user_ratings['movie_id'].tolist()
        liked_indices = [self.movie_to_idx[movie] for movie in liked_movies if movie in self.movie_to_idx]

        if len(liked_indices) == 0:
            return None

        # Weight by rating and create weighted average profile
        profile_vector = np.zeros(self.content_matrix.shape[1])
        total_weight = 0

        for movie_id, rating in zip(user_ratings['movie_id'], user_ratings['rating']):
            if movie_id in self.movie_to_idx:
                idx = self.movie_to_idx[movie_id]
                weight = rating
                profile_vector += weight * self.content_matrix[idx].flatten()
                total_weight += weight

        if total_weight > 0:
            profile_vector /= total_weight

        return profile_vector

    def recommend_movies(self, user_id, n_recommendations=10, exclude_rated=False):
        """Generate content-based recommendations for a user"""
        if self.movie_similarity_matrix is None:
            raise ValueError("Model not built yet. Call build_model() first.")

        # Build user profile
        user_profile = self.build_user_profile(user_id)

        if user_profile is None:
            # Fallback: recommend popular movies
            popular_movies = (
                self.reviews_df.groupby('movie_id')
                .agg({'rating': ['count', 'mean']})
                .round(2)
            )
            popular_movies.columns = ['count', 'mean_rating']
            popular_movies = popular_movies.sort_values(['mean_rating', 'count'], ascending=False)
            return popular_movies.head(n_recommendations).reset_index()

        # Calculate similarities between user profile and all movies
        profile_similarities = cosine_similarity([user_profile], self.content_matrix).flatten()

        # Get movies user has rated (for exclusion)
        rated_movies = set(self.reviews_df[self.reviews_df['user_id'] == user_id]['movie_id']) if exclude_rated else set()

        # Create recommendations
        recommendations = []
        for idx, similarity in enumerate(profile_similarities):
            movie_id = self.idx_to_movie[idx]
            if exclude_rated and movie_id in rated_movies:
                continue

            recommendations.append({
                'movie_id': movie_id,
                'similarity_score': similarity,
                'title': self.metadata_df[self.metadata_df['movie_id'] == movie_id]['title'].iloc[0]
            })

        # Sort by similarity and return top N
        recommendations = sorted(recommendations, key=lambda x: x['similarity_score'], reverse=True)

        return recommendations[:n_recommendations]

    def get_feature_importance(self, movie_id, top_k=10):
        """Get most influential words in the embedding space for a given movie"""

        if self.content_matrix is None:
            raise ValueError("Model not built yet. Call build_model() first.")

        if movie_id not in self.movie_to_idx:
            return f"Movie {movie_id} not found"

        idx = self.movie_to_idx[movie_id]
        movie_vector = self.content_matrix[idx]

        # Extraer texto combinado del metadata
        text = self.metadata_df.loc[self.metadata_df['movie_id'] == movie_id, 'combined_features'].values[0]
        words = list(set(text.split()))  # palabras únicas

        if not words:
            return []

        # Embeddings de cada palabra
        word_embeddings = self.embeddings_model.embed_documents(words)

        # Calcular similitud coseno entre movie_vector y cada palabra
        sims = cosine_similarity([movie_vector], word_embeddings).flatten()

        # Top-k palabras más similares
        top_indices = sims.argsort()[::-1][:top_k]

        results = [
            {"feature": words[i], "similarity": sims[i]}
            for i in top_indices
        ]

        return results

In [30]:
# Initialize and build the content-based model
print("CONTENT-BASED FILTERING")
print("="*30)

content_cf = ContentBasedFiltering(metadata_final, reviews_final)
content_cf.build_model(max_features=3000)

CONTENT-BASED FILTERING
Preprocessing content features...
Combined features created for 7437 movies
Getting embeddings...
Calculating embeddings...
Calculating cosine similarities...
Content-based model built successfully!


In [31]:
print(f"\nMODEL STATISTICS")
print("-"*20)
print(f"movies in content matrix: {content_cf.content_matrix.shape[0]}")


MODEL STATISTICS
--------------------
movies in content matrix: 7437


In [33]:
# Example: Get recommendations for a user with complete profile
sample_user = reviews_final['user_id'].iloc[25]  # Choose user with reasonable history
user_history = reviews_final[reviews_final['user_id'] == sample_user].sort_values('rating', ascending=False)
content_recommendations = content_cf.recommend_movies(sample_user)

print(f"\n\nUSER PROFILE FOR CONTENT-BASED RECOMMENDATIONS:")
print("="*60)
print(f"USER ID: {sample_user}")
print(f"Total ratings: {len(user_history)}")
print(f"Average rating: {user_history['rating'].mean():.2f}")

# Analyze user's genres preferences
user_genres = []
high_rated_movies = user_history[user_history['rating'] >= 4]
for _, rating_row in high_rated_movies.iterrows():
    movie_cats = metadata_final[metadata_final['movie_id'] == rating_row['movie_id']]['genres']
    if not movie_cats.empty:
        try:
            # Try to check if value is not NA, handle arrays
            cat_value = movie_cats.iloc[0]
            if hasattr(cat_value, '__array__') or isinstance(cat_value, (list, tuple)):
                is_valid = len(cat_value) > 0 if hasattr(cat_value, '__len__') else True
            else:
                is_valid = pd.notna(cat_value) and cat_value != ''
            if is_valid:
                cats = str(movie_cats.iloc[0]).split(',')
                user_genres.extend([cat.strip() for cat in cats])
        except:
            pass  # Skip this movie if there's an issue processing genres

from collections import Counter
if user_genres:
    top_genres = Counter(user_genres).most_common(3)
    print(f"Preferred genres: {', '.join([f'{cat} ({count})' for cat, count in top_genres])}")

print(f"\nUSER'S HIGHLY RATED movieS (4+ stars):")
print("-"*40)
for i, (_, rating_row) in enumerate(high_rated_movies.head(5).iterrows(), 1):
    try:
        movie_info = metadata_final[metadata_final['movie_id'] == rating_row['movie_id']].iloc[0]
        print(f"{i}. Rating: {rating_row['rating']}/5")
        print(f"   Title: {movie_info['title']}")
        print(f"   genres: {movie_info['genres']}")
    except:
        print(f"{i}. movie {rating_row['movie_id']}: {rating_row['rating']}/5")

print(f"\nCONTENT-BASED RECOMMENDATIONS FOR USER {sample_user}:")
print("="*65)
for i, rec in enumerate(content_recommendations, 1):
    try:
        # Get detailed movie information
        movie_info = metadata_final[metadata_final['movie_id'] == rec['movie_id']].iloc[0]
        avg_rating = reviews_final[reviews_final['movie_id'] == rec['movie_id']]['rating'].mean()
        total_reviews = len(reviews_final[reviews_final['movie_id'] == rec['movie_id']])

        print(f"\n{i}. SIMILARITY SCORE: {rec['similarity_score']:.3f}")
        print(f"   movie ID: {rec['movie_id']}")
        print(f"   Title: {rec['title']}")
        print(f"   genres: {movie_info['genres']}")
        print(f"   Description: {str(movie_info['overview'])[:150]}...")
        print(f"   Community rating: {avg_rating:.2f}/5 ({total_reviews} reviews)")

        # Show why this movie was recommended (top content features)
        important_features = content_cf.get_feature_importance(rec['movie_id'], 3)
        if important_features:
            print(f"   Key features: {', '.join([f['feature'] for f in important_features])}")

    except Exception as e:
        print(f"\n{i}. {rec['title'][:50]}... - Score: {rec['similarity_score']:.3f}")



USER PROFILE FOR CONTENT-BASED RECOMMENDATIONS:
USER ID: 2
Total ratings: 17
Average rating: 3.24
Preferred genres: ['Drama' (3), 'Romance'] (3), 'Action' (2)

USER'S HIGHLY RATED movieS (4+ stars):
----------------------------------------
1. Rating: 5.0/5
   Title: Night on Earth
   genres: ['Comedy', 'Drama']
2. Rating: 4.0/5
   Title: A Nightmare on Elm Street
   genres: ['Horror']
3. Rating: 4.0/5
   Title: Hero
   genres: ['Drama', 'Adventure', 'Action', 'History']
4. Rating: 4.0/5
   Title: The 39 Steps
   genres: ['Action', 'Thriller', 'Mystery']
5. Rating: 4.0/5
   Title: Talk to Her
   genres: ['Drama', 'Romance']

CONTENT-BASED RECOMMENDATIONS FOR USER 2:

1. SIMILARITY SCORE: 0.670
   movie ID: 65642
   Title: The Story of a Cheat
   genres: ['Comedy', 'Drama']
   Description: Life story of a charming scoundrel, with little dialogue other than the star/director's witty narration. As a boy, only he survives a family tragedy w...
   Community rating: 3.74/5 (1166 reviews)
  

Evaluacion del Modelo

In [60]:
import numpy as np
from sklearn.metrics import ndcg_score
import random

def evaluate_content_cf_sampled(model, reviews_df, K=10, n_users=50, min_ratings=5, random_state=42):
    """
    Evaluación de Content-Based Filtering sobre una muestra de usuarios.
    
    Parameters:
    - model: instancia de ContentBasedFiltering ya inicializada
    - reviews_df: DataFrame con ratings
    - K: top-K recomendaciones
    - n_users: cantidad de usuarios a evaluar
    - min_ratings: mínimo de ratings por usuario para incluirlos
    - random_state: semilla para reproducibilidad
    """
    np.random.seed(random_state)
    random.seed(random_state)

    precision_list = []
    recall_list = []
    ndcg_list = []


    # Seleccionar usuarios con suficiente historial
    print("Seleccionando usuarios con suficiente historial...")
    users = reviews_df['user_id'].value_counts()
    eligible_users = users[users >= min_ratings].index.tolist()

    if len(eligible_users) == 0:
        print("No hay usuarios con suficiente historial.")
        return None

    # Elegir aleatoriamente n usuarios
    print(f"Evaluando {n_users} usuarios de un total de {len(eligible_users)} elegibles...")
    sampled_users = random.sample(eligible_users, min(n_users, len(eligible_users)))

    # Evaluar cada usuario
    print("Evaluando usuarios...")
    print("-"*50)
    for user_id in sampled_users:
        user_ratings = reviews_df[reviews_df['user_id'] == user_id]
        train = user_ratings.sample(frac=0.7, random_state=random_state)
        test = user_ratings.drop(train.index)

        # Películas relevantes en test (rating >= 4)
        relevant_ids = test[test['rating'] >= 4]['movie_id'].tolist()
        if not relevant_ids:
            continue  # saltar usuario sin relevantes en test

        # Obtener recomendaciones
        print(f"Recomendando para usuario {user_id}...")
        recommendations = model.recommend_movies(user_id, n_recommendations=K, exclude_rated=False)
        recommended_ids = [rec['movie_id'] for rec in recommendations]
        print(f"Recomendaciones: {recommended_ids}")
        print(f"Relevantes: {relevant_ids}")
        print("-"*50)

        # Precision y Recall
        hits = len(set(recommended_ids) & set(relevant_ids))
        precision = hits / K
        recall = hits / len(relevant_ids)

        # NDCG
        relevance_vector = [1 if movie_id in relevant_ids else 0 for movie_id in recommended_ids]
        ndcg = ndcg_score([relevance_vector], [relevance_vector])

        precision_list.append(precision)
        recall_list.append(recall)
        ndcg_list.append(ndcg)

    # Resultados promedio
    results = {
        "Precision@K": np.mean(precision_list) if precision_list else 0,
        "Recall@K": np.mean(recall_list) if recall_list else 0,
        "NDCG@K": np.mean(ndcg_list) if ndcg_list else 0,
        "Evaluated Users": len(precision_list)
    }

    return results

# ---------------------------
# Ejemplo de uso
# ---------------------------
metrics = evaluate_content_cf_sampled(content_cf, reviews_final, K=10, n_users=30)
print("RESULTADOS DE EVALUACIÓN CONTENT-BASED (MUESTRA)")
print("="*50)
for metric, value in metrics.items():
    print(f"{metric}: {value:.3f}")

Seleccionando usuarios con suficiente historial...
Evaluando 30 usuarios de un total de 226831 elegibles...
Evaluando usuarios...
--------------------------------------------------
Recomendando para usuario 101684...
Recomendaciones: [171045, 65642, 4034, 93006, 94204, 73818, 592, 5731, 105406, 4031]
Relevantes: [296, 480, 586, 626, 653, 708, 780, 786, 805, 809, 832, 849]
--------------------------------------------------
Recomendando para usuario 82059...
Recomendaciones: [171045, 4034, 65642, 93006, 105406, 4031, 94204, 1126, 116231, 6171]
Relevantes: [17, 96, 187, 319, 342, 457, 465, 480, 497, 508, 527, 539, 562, 613, 638, 767, 838, 858, 892, 920, 953, 955, 1059, 1249, 1394, 1396, 1404, 1413, 1680, 1807, 1934, 1951, 1967, 2155, 2160, 2171, 2291, 2302, 2617, 2890, 2978, 3087]
--------------------------------------------------
Recomendando para usuario 189932...
Recomendaciones: [171045, 65642, 158, 5731, 8224, 87417, 84300, 8848, 4031, 111443]
Relevantes: [153, 235, 434, 581]
-------