## PRUEBAS PARA EL STAR RATING RECOMMENDER


IMPORTS

In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
import numpy as np
import os
import ipdb

CLASES

In [28]:
class MovieRecommenderBase:
    """
    Base class for content-based movie recommenders.

    On construction it loads a movie CSV, derives clean ``title`` / ``year``
    columns, builds a TF-IDF matrix over the ``genres`` column together with
    the pairwise similarity matrix, and indexes rows by title. Child classes
    supply the scoring logic through ``_process_user_preferences``.
    """

    def __init__(self, data_path='data/movie.csv'):
        """
        Base class for movie recommenders

        Parameters:
        - data_path: path of the CSV file read with ``pd.read_csv``; the
                     preprocessing reads its ``title`` and ``genres`` columns
        """
        self.movies = self._load_and_preprocess_data(data_path)
        self.tfidf_matrix, self.sim_matrix = self._build_similarity_matrix()
        self.title_to_indices = self._create_title_to_indices_mapping()

    def _load_and_preprocess_data(self, data_path):
        """Read the CSV at ``data_path`` and apply title and genre cleaning."""
        movies = pd.read_csv(data_path)
        movies = self._clean_movie_titles(movies)
        movies = self._clean_movie_genres(movies)
        return movies

    def _clean_movie_titles(self, movies):
        """
        Split the raw ``title`` column into ``title_year``, ``title``, ``year``.

        Parameters:
        - movies: DataFrame with a string ``title`` column such as
                  'Toy Story (1995)'

        Returns:
        - a new DataFrame (the input is left untouched) where the raw value is
          kept, stripped, in ``title_year`` and the parsed parts are stored in
          ``title`` and ``year``
        """
        # rename() without inplace returns a new frame, so the caller's
        # DataFrame is not modified as a side effect.
        movies = movies.rename(columns={'title': 'title_year'})
        movies['title_year'] = movies['title_year'].apply(lambda raw: raw.strip())
        movies['title'] = movies['title_year'].apply(self._extract_title)
        movies['year'] = movies['title_year'].apply(self._extract_year)
        return movies

    @staticmethod
    def _split_title_year(title):
        """
        Split 'Name (YYYY)' into ('Name', YYYY).

        Returns ``(title, None)`` unchanged when the string does not end with
        a parenthesised four-digit year. Checking the parentheses explicitly
        keeps strings that merely have digits near the end (e.g. '12345')
        from being truncated and given a bogus year.
        """
        suffix = title[-6:]
        digits = suffix[1:5]
        ends_with_year = (
            len(suffix) == 6
            and suffix[0] == '('
            and suffix[5] == ')'
            and digits.isascii()
            and digits.isdigit()
        )
        if not ends_with_year:
            return title, None
        return title[:-6].rstrip(), int(digits)

    def _extract_title(self, title):
        """Helper to extract movie title (the text before a trailing '(YYYY)')"""
        return self._split_title_year(title)[0]

    def _extract_year(self, title):
        """Helper to extract movie year as int, or ``np.nan`` when absent"""
        year = self._split_title_year(title)[1]
        return np.nan if year is None else year

    def _clean_movie_genres(self, movies):
        """
        Clean and standardize movie genres

        Rewrites the hyphenated genres 'Sci-Fi' and 'Film-Noir' as 'SciFi' and
        'Noir' (presumably so the TF-IDF tokenizer keeps each genre as a
        single token) and drops rows whose genres are exactly
        '(no genres listed)'.

        Returns:
        - a new DataFrame with a fresh 0..n-1 index
        """
        # regex=False: these are literal substitutions, independent of the
        # pandas version's default for the ``regex`` argument.
        genres = movies['genres'].str.replace('Sci-Fi', 'SciFi', regex=False)
        genres = genres.str.replace('Film-Noir', 'Noir', regex=False)
        movies = movies.assign(genres=genres)
        has_genres = movies['genres'] != '(no genres listed)'
        # The index is reset so that row labels line up with row positions,
        # which the similarity matrix rows and ``iloc`` lookups rely on.
        return movies[has_genres].reset_index(drop=True)

    def _build_similarity_matrix(self):
        """
        Build similarity matrices

        Returns:
        - tfidf_matrix: TF-IDF representation of ``self.movies['genres']``
        - sim_matrix: pairwise ``linear_kernel`` of the TF-IDF rows, one row
                      and one column per movie
        """
        tfidf_vector = TfidfVectorizer(stop_words='english')
        tfidf_matrix = tfidf_vector.fit_transform(self.movies['genres'])
        # NOTE: this is an n_movies x n_movies matrix, so memory grows
        # quadratically with the size of the catalogue.
        sim_matrix = linear_kernel(tfidf_matrix, tfidf_matrix)
        return tfidf_matrix, sim_matrix

    def _create_title_to_indices_mapping(self):
        """
        Create title to indices mapping

        Returns:
        - dict mapping each title to the list of row labels carrying that
          title (several rows can share one title)
        """
        # ``.groups`` yields the same label lists as the previous
        # ``groupby(...).apply(lambda x: list(x.index))`` without triggering
        # the pandas deprecation warning for apply on grouping columns.
        groups = self.movies.groupby('title').groups
        return {title: labels.tolist() for title, labels in groups.items()}

    def _process_user_preferences(self, user_preferences):
        """
        Process user preferences (to be overridden by child classes)
        """
        raise NotImplementedError("This method should be implemented by child classes")

    def _get_all_rated_movies(self, user_preferences):
        """
        Get all rated movies

        Parameters:
        - user_preferences: dict with optional 'liked_movies' and
                            'disliked_movies' lists of titles

        Returns:
        - set of the listed titles that exist in ``self.title_to_indices``
        """
        rated = user_preferences.get('liked_movies', []) + user_preferences.get('disliked_movies', [])
        return {movie for movie in rated if movie in self.title_to_indices}

    def get_movie_details(self, movie_title):
        """
        Get details for a specific movie

        Returns:
        - DataFrame with the 'title', 'genres' and 'year' columns of every row
          matching ``movie_title``, or None when the title is unknown
        """
        if movie_title not in self.title_to_indices:
            return None
        indices = self.title_to_indices[movie_title]
        return self.movies.iloc[indices][['title', 'genres', 'year']]


In [None]:
class StarRatingRecommender(MovieRecommenderBase):
    """
    Recommender that uses 5-star rating system

    Ratings are supplied per movie ID; each rated movie pulls similar movies
    up (more than 3 stars) or pushes them down (fewer than 3 stars) in the
    final ranking.
    """

    def __init__(self, data_path='data/movie.csv'):
        """
        Recommender that uses 5-star rating system

        Parameters:
        - data_path: path of the movie CSV, forwarded to the base class
        """
        super().__init__(data_path)

    def _convert_id_to_title(self, user_preferences):
        """
        Convert movie IDs in user preferences to titles

        Parameters:
        - user_preferences: dict with 'ratings' as a list of tuples
                            (movie_id, rating); ``int(movie_id) + 1`` is
                            matched against the ``movieId`` column, i.e. the
                            incoming IDs are treated as zero-based

        Returns:
        - user_preferences_filtered: dict with movie titles instead of IDs.
          IDs that match no row (unknown, or removed during preprocessing)
          are skipped, mirroring how unknown titles are ignored when scoring.
        """
        movie_ids = self.movies['movieId']
        titles = self.movies['title']

        converted = []
        for raw_id, rating in user_preferences.get('ratings', []):
            matches = titles[movie_ids == int(raw_id) + 1]
            if matches.empty:
                # Previously this raised IndexError on ``.values[0]``.
                continue
            converted.append((matches.iloc[0], rating))

        return {'ratings': converted}

    def _process_user_preferences(self, user_preferences):
        """
        Process user preferences with 5-star ratings

        Parameters:
        - user_preferences: dict with 'ratings' as a list of tuples (movie_title, rating)
                            where rating is 1-5 stars

        Returns:
        - combined_sim_scores: numpy array with one score per movie row; the
                               influence-weighted sum of the rated movies'
                               similarity rows, divided by total_weight
        - total_weight: sum of the absolute influences of all ratings used
        """
        combined_sim_scores = np.zeros(self.sim_matrix.shape[0])
        total_weight = 0

        for movie_title, rating in user_preferences.get('ratings', []):
            if movie_title not in self.title_to_indices:
                continue

            # Map 1-5 stars onto 0-1, then onto -1..+1 so that 3 stars is
            # neutral, 5 stars is +1 and 1 star is -1.
            weight = (rating - 1) / 4
            influence = (weight - 0.5) * 2

            # A title may map to several rows; each one contributes.
            for idx in self.title_to_indices[movie_title]:
                combined_sim_scores += self.sim_matrix[idx] * influence
                total_weight += abs(influence)

        # Normalize by total weight if any non-neutral ratings were processed
        if total_weight > 0:
            combined_sim_scores /= total_weight

        return combined_sim_scores, total_weight

    def get_personalized_recommendations(self, user_preferences, top_n=5, genre_diversity=True):
        """
        Get recommendations based on star ratings

        Parameters:
        - user_preferences: dict with 'ratings' list of (movie_id, rating)
                            tuples; IDs are converted to titles with
                            ``_convert_id_to_title``
        - top_n: maximum number of recommendations
        - genre_diversity: when True, a candidate is only accepted if it adds
                           at least one genre not covered by the movies
                           already selected, so fewer than ``top_n`` rows may
                           be returned

        Returns:
        - DataFrame with the 'movieId', 'title', 'genres' and 'year' of the
          recommended movies, best score first, genres joined with ', '
        """
        user_preferences_filtered = self._convert_id_to_title(user_preferences)
        combined_sim_scores, _ = self._process_user_preferences(user_preferences_filtered)

        # Titles the user already rated are excluded from the output.
        all_rated_movies = {
            movie_title
            for movie_title, _ in user_preferences_filtered.get('ratings', [])
            if movie_title in self.title_to_indices
        }

        # Row positions by descending score. A stable sort keeps tied rows in
        # their original order, matching a stable list sort with reverse=True.
        ranking = np.argsort(-combined_sim_scores, kind='stable')

        recommendations = []
        selected_genres = set()

        for position in ranking:
            if len(recommendations) >= top_n:
                break

            movie_data = self.movies.iloc[position]

            # Skip already rated movies
            if movie_data['title'] in all_rated_movies:
                continue

            genres = set(movie_data['genres'].split('|'))

            # Apply genre diversity if enabled
            if genre_diversity and genres.issubset(selected_genres):
                continue

            recommendations.append(int(position))
            selected_genres.update(genres)

        # Format output. copy() so the column assignment below acts on an
        # independent frame; regex=False because '|' is a regex metacharacter
        # and must be replaced literally.
        result = self.movies.iloc[recommendations][['movieId', 'title', 'genres', 'year']].copy()
        result['genres'] = result['genres'].str.replace('|', ', ', regex=False)
        return result.reset_index(drop=True)

PROVES

In [58]:
data_path = '../src/data/movie.csv'

# 1. Demonstrate Star Rating Recommender
print("\n=== STAR RATING RECOMMENDER ===")
star_rec = StarRatingRecommender(data_path=data_path)

# Example ratings as (movie_id, rating) tuples, where rating is 1-5 stars.
# get_personalized_recommendations() expects zero-based IDs, not titles: it
# looks up movieId == int(movie_id) + 1. A title-keyed dict would fail in that
# int() conversion, so only the ID-keyed form is kept here.
# 1 star gives the strongest negative influence and 5 stars the strongest
# positive one. Titles in the comments follow the earlier title-keyed example.
user_ratings = {
    'ratings': [
        ('0', 1),   # Toy Story   - hated it
        ('1', 2),   # Jumanji     - didn't like it
        ('9', 4),   # GoldenEye   - liked it
        ('40', 5),  # Richard III - loved it
    ]
}

print("\nTop 5 Recommended Movies (based on star ratings):")
star_recommendations = star_rec.get_personalized_recommendations(user_ratings)
print(star_recommendations)


=== STAR RATING RECOMMENDER ===


  return self.movies.groupby('title').apply(lambda x: list(x.index)).to_dict()



Top 5 Recommended Movies (based on star ratings):
{'ratings': [('Toy Story', 1), ('Jumanji', 2), ('GoldenEye', 4), ('Richard III', 5)]}
{'ratings': [('0', 1), ('1', 2), ('9', 4), ('40', 5)]}
   movieId                title                genres    year
0       41          Richard III            Drama, War  1995.0
1      161         Crimson Tide  Drama, Thriller, War  1995.0
2      110           Braveheart    Action, Drama, War  1995.0
3      665          Underground    Comedy, Drama, War  1995.0
4      389  Colonel Chabert, Le   Drama, Romance, War  1994.0


In [47]:
# Title of the first row whose movieId equals 2.
star_rec.movies.loc[star_rec.movies['movieId'] == 2, 'title'].values[0]

'Jumanji'

In [27]:
# Print every (title, indices) entry whose index list contains row 9.
for title, row_indices in star_rec.title_to_indices.items():
    if 9 not in row_indices:
        continue
    print((title, row_indices))

('GoldenEye', [9])


In [42]:
# Display the preprocessed movie table held by the recommender
# (raw 'title_year' plus the derived 'title' and 'year' columns).
star_rec.movies

Unnamed: 0,movieId,title_year,genres,title,year
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story,1995.0
1,2,Jumanji (1995),Adventure|Children|Fantasy,Jumanji,1995.0
2,3,Grumpier Old Men (1995),Comedy|Romance,Grumpier Old Men,1995.0
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,Waiting to Exhale,1995.0
4,5,Father of the Bride Part II (1995),Comedy,Father of the Bride Part II,1995.0
...,...,...,...,...,...
27027,131252,Forklift Driver Klaus: The First Day on the Jo...,Comedy|Horror,Forklift Driver Klaus: The First Day on the Job,2001.0
27028,131254,Kein Bund für's Leben (2007),Comedy,Kein Bund für's Leben,2007.0
27029,131256,"Feuer, Eis & Dosenbier (2002)",Comedy,"Feuer, Eis & Dosenbier",2002.0
27030,131258,The Pirates (2014),Adventure,The Pirates,2014.0


In [None]:
# NOTE(review): the recommender classes shown in this notebook only set
# `movies`, `tfidf_matrix`, `sim_matrix` and `title_to_indices`; no
# `id_to_indices` attribute is defined, so this cell presumably relies on an
# earlier version of the class - confirm before re-running.
len(star_rec.id_to_indices)

27032