In [2]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import mean_squared_error

In [3]:
movie_data = pd.read_csv('data/movies_grouplens/movies.dat',
                         delimiter='::', header=None, engine='python', encoding='latin1')
rating_data = pd.read_csv('data/movies_grouplens/ratings.dat',
                          delimiter='::', header=None, engine='python', encoding='latin1')
user_data = pd.read_csv('data/movies_grouplens/users.dat',
                        delimiter='::', header=None, engine='python', encoding='latin1')

In [4]:
movie_data.columns = ['MovieID', 'Title', 'Genres']
rating_data.columns = ['UserID', 'MovieID', 'Rating', 'Timestamp']
user_data.columns = ['UserID', 'Gender', 'Age', 'Occupation', 'Zip-code']

In [5]:
movie_data['Genres'] = movie_data['Genres'].fillna('')

tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(movie_data['Genres'])

In [6]:
user_movie_data = rating_data.merge(movie_data, on='MovieID')

In [7]:
movie_index = pd.Series(
    movie_data.index, index=movie_data['MovieID']).drop_duplicates()

In [8]:
user_profiles = {}

In [9]:
for user_id in rating_data['UserID'].unique():
    user_ratings = user_movie_data[user_movie_data['UserID'] == user_id]

    tfidf_weights = tfidf_matrix[user_ratings['MovieID'].apply(
        lambda x: movie_index[x])]
    user_profile = np.dot(
        user_ratings['Rating'], tfidf_weights.toarray()) / user_ratings['Rating'].sum()

    user_profile = user_profile / np.linalg.norm(user_profile)

    user_profiles[user_id] = user_profile

In [10]:
def recommend_movies_for_user(user_id, top_n=10):
    user_profile = user_profiles[user_id]

    sim_scores = cosine_similarity([user_profile], tfidf_matrix)[0]

    movie_indices = sim_scores.argsort()[-top_n:][::-1]

    return movie_data['Title'].iloc[movie_indices]

In [11]:
def calculate_mse_for_user(user_id):
    user_ratings = user_movie_data[user_movie_data['UserID'] == user_id]
    mse_scores = []

    for movie_id in user_ratings['MovieID']:
        movie_tfidf = tfidf_matrix[movie_index[movie_id]].toarray()[0]
        user_profile = user_profiles[user_id]

        mse = mean_squared_error(user_profile, movie_tfidf)
        mse_scores.append(mse)

    return np.mean(mse_scores)

In [12]:
user_id = 582
print(f"Recommendations for User {user_id}:")
print(recommend_movies_for_user(user_id))

Recommendations for User 582:
3158    Not Love, Just Frenzy (Más que amor, frenesí) ...
1602                             House of Yes, The (1997)
2630                                 Arachnophobia (1990)
1807                                   Deep Impact (1998)
1525                              Head Above Water (1996)
3823                                   Nurse Betty (2000)
411                               Another Stakeout (1993)
1776                                   Zero Effect (1998)
1792                                     Junk Mail (1997)
2280                                     Mona Lisa (1986)
Name: Title, dtype: object
