In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import mean_squared_error

In [2]:
movie_data = pd.read_csv('data/movies_grouplens/movies.dat',
                         delimiter='::', header=None, engine='python', encoding='latin1')
rating_data = pd.read_csv('data/movies_grouplens/ratings.dat',
                          delimiter='::', header=None, engine='python', encoding='latin1')
user_data = pd.read_csv('data/movies_grouplens/users.dat',
                        delimiter='::', header=None, engine='python', encoding='latin1')

In [3]:
movie_data.columns = ['MovieID', 'Title', 'Genres']
rating_data.columns = ['UserID', 'MovieID', 'Rating', 'Timestamp']
user_data.columns = ['UserID', 'Gender', 'Age', 'Occupation', 'Zip-code']

In [23]:
movie_data.head()

Unnamed: 0,MovieID,Title,Genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [24]:
user_data.head()

Unnamed: 0,UserID,Gender,Age,Occupation,Zip-code
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,2460
4,5,M,25,20,55455


In [25]:
rating_data.head()

Unnamed: 0,UserID,MovieID,Rating,Timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [27]:
len(rating_data['MovieID'].unique())

3706

In [4]:
movie_data['Genres'] = movie_data['Genres'].fillna('')

tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(movie_data['Genres'])

len(tfidf_matrix.toarray()[0])

20

In [5]:
user_movie_data = rating_data.merge(movie_data, on='MovieID')

In [6]:
movie_index = pd.Series(
    movie_data.index, index=movie_data['MovieID']).drop_duplicates()

In [7]:
user_profiles = {}

In [8]:
for user_id in rating_data['UserID'].unique():
    user_ratings = user_movie_data[user_movie_data['UserID'] == user_id]

    tfidf_weights = tfidf_matrix[user_ratings['MovieID'].apply(
        lambda x: movie_index[x])]
    user_profile = np.dot(
        user_ratings['Rating'], tfidf_weights.toarray()) / user_ratings['Rating'].sum()

    user_profile = user_profile / np.linalg.norm(user_profile)

    user_profiles[user_id] = user_profile

In [9]:
def recommend_movies_for_user(user_id, top_n=10):
    user_profile = user_profiles[user_id]

    sim_scores = cosine_similarity([user_profile], tfidf_matrix)[0]

    movie_indices = sim_scores.argsort()[-top_n:][::-1]

    return movie_data['Title'].iloc[movie_indices]

### Not Correct, MSE Calculation

In [10]:
def calculate_mse_for_user(user_id):
    user_ratings = user_movie_data[user_movie_data['UserID'] == user_id]
    mse_scores = []

    for movie_id in user_ratings['MovieID']:
        movie_tfidf = tfidf_matrix[movie_index[movie_id]].toarray()[0]
        user_profile = user_profiles[user_id]

        mse = mean_squared_error(user_profile, movie_tfidf)
        mse_scores.append(mse)

    return np.mean(mse_scores)

In [11]:
user_id = 181
print(f"Recommendations for User {user_id}:")
print(recommend_movies_for_user(user_id))

Recommendations for User 181:
20                                      Get Shorty (1995)
1847                                    Buffalo 66 (1998)
386                   Faster Pussycat! Kill! Kill! (1965)
3115                                       Montana (1998)
1931                                 Lethal Weapon (1987)
1932                               Lethal Weapon 2 (1989)
1933                               Lethal Weapon 3 (1992)
3197    Man Bites Dog (C'est arrivé près de chez vous)...
1849                               Lethal Weapon 4 (1998)
1445                                      Best Men (1997)
Name: Title, dtype: object


In [14]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import mean_squared_error
import numpy as np


def predict_rating(user_profile, movie_tfidf):
    # Predict a rating based on similarity (cosine similarity can be interpreted as a weight)
    similarity = cosine_similarity([user_profile], [movie_tfidf])[0][0]
    # Scaling similarity to the rating range (e.g., from 0 to 5)
    print(similarity)
    return similarity * 5


def calculate_mse_for_user(user_id):
    # Get the user's actual ratings
    user_ratings = user_movie_data[user_movie_data['UserID'] == user_id]

    predicted_ratings = []
    actual_ratings = []

    for _, row in user_ratings.iterrows():
        movie_id = row['MovieID']
        actual_rating = row['Rating']

        # Get the TF-IDF vector for this movie
        movie_tfidf = tfidf_matrix[movie_index[movie_id]].toarray()[0]

        # Predict the rating based on user's profile and movie's TF-IDF
        user_profile = user_profiles[user_id]
        predicted_rating = predict_rating(user_profile, movie_tfidf)

        # Append to lists
        predicted_ratings.append(predicted_rating)
        actual_ratings.append(actual_rating)

    # Calculate MSE between predicted and actual ratings
    mse = mean_squared_error(actual_ratings, predicted_ratings)
    return mse

In [15]:
# Example user_id you want to calculate MSE for
user_id = 582  # Replace with a valid user ID from your dataset

# Call the function to calculate MSE for this user
mse_score = calculate_mse_for_user(user_id)

# Print the result
print(f"The MSE for user {user_id} is: {mse_score}")

0.10468005407906097
0.33044994931254923
0.45539749868118823
0.46989112244028564
0.46989112244028564
0.5625667563660809
0.3040685092621638
0.21110630045428067
0.4075614105488988
0.6275015360417028
0.32023226318653025
0.25850814994274934
0.5139825313108635
0.4219243237871921
0.702689845379634
0.5520927361586284
0.5036536431944061
0.6688911808419717
0.6275015360417028
0.5139825313108635
0.5072723926706089
0.5520927361586284
0.5520927361586284
0.46989112244028564
0.46989112244028564
0.7013120916638118
0.6019068115756046
0.5738244751963557
0.4219243237871921
0.46989112244028564
0.5647598924392907
0.4337955007696814
0.5520927361586284
0.5139825313108635
0.28729555928709444
0.5520927361586284
0.33498818925126916
0.28729555928709444
0.45539749868118823
0.4219243237871921
0.7013120916638118
0.536887645262828
0.5520927361586284
0.4337955007696814
0.37755933392691726
0.31074943853296144
0.376204384384116
0.4337955007696814
0.5520927361586284
0.46989112244028564
0.1682615225890151
0.43873770281464