# Question 3



In [1]:
import pandas as pd
import numpy as np #provides numerical arrays and functions to manipulate the arrays efficiently
from sklearn.decomposition import TruncatedSVD


In [2]:
movies_df = pd.read_csv("movies.csv")
ratings_df = pd.read_csv("ratings.csv")

df_D_NA = ratings_df.pivot(
    index='userId',
    columns='movieId',
    values='rating'
)

df_D_NA.head()

μ = np.mean(pd.unique(ratings_df["rating"]))

df_D = df_D_NA.fillna(μ)  #fill unobserved entries with μ
df_D.columns = df_D.columns.astype(int)
Id_NA = df_D_NA.isna().to_numpy()
print(Id_NA.shape)
assert Id_NA.shape == df_D.shape #I checked manually that they have same shape but it's in the HW2 file so it'll be here too
print(df_D.shape)
df_D_NA.head()



(610, 9724)
(610, 9724)


movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,,4.0,,,4.0,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,4.0,,,,,,,,,,...,,,,,,,,,,


In [3]:
#useful functions for navigating the df
def search_movie_by_title(query):
    return movies_df.loc[movies_df['title'].str.contains(query, na=False, regex=False)]

def get_movie_index_D(df, movieId):
    return df.columns.get_loc(movieId)

def get_user_index_D(df,userId):
    return df.index.get_loc(userId)

def get_index_D(df, movieId, userId):
    return (get_user_index_D(df,userId), get_movie_index_D(df, movieId))

def get_movieId(df, data_index):
    return df.columns[data_index]

def get_userId(df, data_index):
    return df.index[data_index]

def get_movies(movieIds):
    return movies_df.loc[movies_df["movieId"].isin(movieIds)]

In [4]:
    
    
def truncated_svd(D, r):
    user_mean = D.mean(axis=1).values.reshape(-1, 1)  # Mean per user (column vector)
    print(user_mean)
    df_D_centered = D - user_mean  # Subtract the mean for each user (row-wise centering)
    
    # Apply Truncated SVD to the centered data
    svd = TruncatedSVD(n_components=r)
    U_sigma = svd.fit_transform(df_D_centered)  # U * Sigma
    Vt = svd.components_  # V^T (right singular vectors)
    sigma = svd.singular_values_  # Singular values
    
    # Diagonalize the singular values
    truncated_sigma = np.diag(sigma)
    return U_sigma, truncated_sigma, Vt, user_mean

# check the dimensions
n_components = 5
truncated_U, truncated_sigma, truncated_Vt, user_mean = truncated_svd(df_D, n_components)

print(truncated_U, truncated_sigma, truncated_Vt)
#assert truncated_U.shape == (df_D.shape[0], n_components)
#assert truncated_sigma.shape == (n_components, n_components)
#assert truncated_Vt.shape == (n_components, df_D.shape[1])

[[2.78856438]
 [2.75357363]
 [2.74874023]
 [2.76789387]
 [2.7540107 ]
 [2.77401275]
 [2.7575072 ]
 [2.75398499]
 [2.7524167 ]
 [2.75761004]
 [2.75678733]
 [2.75539901]
 [2.75285376]
 [2.75318799]
 [2.75969251]
 [2.75982106]
 [2.76575998]
 [2.8006993 ]
 [2.73969046]
 [2.7709276 ]
 [2.77326717]
 [2.74781469]
 [2.76118367]
 [2.760181  ]
 [2.75550185]
 [2.75105409]
 [2.76108083]
 [2.7658371 ]
 [2.76159502]
 [2.75694159]
 [2.75601604]
 [2.76054093]
 [2.76665981]
 [2.7559132 ]
 [2.75316228]
 [2.74928013]
 [2.75300802]
 [2.7537536 ]
 [2.76285479]
 [2.76077232]
 [2.76123509]
 [2.78691896]
 [2.77113328]
 [2.75298231]
 [2.79620012]
 [2.75539901]
 [2.75437063]
 [2.75434492]
 [2.75326512]
 [2.75097696]
 [2.78787022]
 [2.77308721]
 [2.75462773]
 [2.75095125]
 [2.75023139]
 [2.75498766]
 [2.78146853]
 [2.76326615]
 [2.76766248]
 [2.75221102]
 [2.75521905]
 [2.80013369]
 [2.77455265]
 [2.80417009]
 [2.75447347]
 [2.7950689 ]
 [2.75452489]
 [2.81267997]
 [2.75766146]
 [2.76002674]
 [2.75305944]
 [2.75

In [5]:
# Reconstruct the approximated ratings matrix
df_D_approx_centered = np.dot(truncated_U, np.dot(truncated_sigma, truncated_Vt))
df_D_approx = df_D_approx_centered + user_mean
# Helper function to get the predicted rating for a specific user and movie
def get_predicted_rating(userId, movieId, df_D_approx, df_D):
    user_idx = df_D.index.get_loc(userId)
    movie_idx = df_D.columns.get_loc(movieId)
    return df_D_approx[user_idx, movie_idx]

print(df_D_approx)



[[143.60255779  42.47137111  19.07798182 ...  -2.31633564  -2.31633564
   -2.43256684]
 [  9.05660691   5.6963931    1.32273398 ...   2.30899039   2.30899039
    2.47784808]
 [ -6.50239908   0.78768054   1.18276488 ...   3.12209792   3.12209792
    3.07737991]
 ...
 [123.61979454  37.83417936   3.20878111 ...  -3.33945809  -3.33945809
   -2.41640864]
 [ 18.05198115   8.17930275   5.62743928 ...   2.41792025   2.41792025
    2.46787866]
 [169.58699159  51.63364305 -13.65016006 ...  -7.51841855  -7.51841855
   -6.12260634]]


In [6]:
# Question 3a: Terminator 2 recommendation
movie_id_terminator = search_movie_by_title("Terminator 2: Judgment Day (1991)")
print(movie_id_terminator) #shows movieID =589
# Identify users who have not rated Terminator 2 (missing values)
unrated_users_terminator = df_D_NA[589].isna()


# Define the movieId and required variables for Terminator 2
movie_id_terminator = 589  # Terminator 2 movieId
user_ids = df_D.index  # All user IDs

# Identify users who have not rated Terminator 2
unrated_users_terminator = df_D_NA[movie_id_terminator].isna()

# Initialize a list to store predicted ratings for users who haven't rated Terminator 2
predicted_ratings_unrated_users = []

# Loop through all users and get the predicted rating for Terminator 2
for user_id in user_ids:
    if unrated_users_terminator[user_id]:  # If user hasn't rated Terminator 2
        predicted_rating = get_predicted_rating(user_id, movie_id_terminator, df_D_approx, df_D)
        predicted_ratings_unrated_users.append(predicted_rating)

# Calculate the percentage of users who have a predicted rating above 2.75
recommendation_percentage = np.mean(np.array(predicted_ratings_unrated_users) > 2.75) * 100

# Print the result
recommendation_percentage # shows >95%
#since 95.85% != 93.44%, FALSE

     movieId                              title         genres
507      589  Terminator 2: Judgment Day (1991)  Action|Sci-Fi


95.85492227979275

In [7]:
#3b
#Some recommendations for the user with userId=10 would be "GoldenEye (1995)" and "Aliens (1986)"
movie_id_GE = search_movie_by_title("GoldenEye (1995)")
movie_id_Aliens = search_movie_by_title("Aliens (1986)")
print(movie_id_GE) #id = 10
print(movie_id_Aliens)#id = 1200
movie_ids = [10,1200]

high_rating_GE = get_predicted_rating(10, 10, df_D_approx, df_D) > 2.75 
high_rating_Aliens = get_predicted_rating(10, 1200, df_D_approx, df_D) > 2.75 
print(high_rating_GE)
print(high_rating_Aliens)
#False, only goldenEye is a good recommendation

   movieId             title                     genres
9       10  GoldenEye (1995)  Action|Adventure|Thriller
     movieId          title                          genres
902     1200  Aliens (1986)  Action|Adventure|Horror|Sci-Fi
True
False


In [8]:
#3c
#Two of the favorite movies of the tenth user (userId=10) are "Spectre (2015)" and "The Intern (2015)". 
#As the low-rank matrix approximation approximates the original data, it maintains a strong recommendation
#for these movies, with predicted rating above 4
movie_id_Spectre = search_movie_by_title("Spectre (2015)")
movie_id_Intern= search_movie_by_title("The Intern (2015)")
print(movie_id_Spectre) #id = 136020
print(movie_id_Intern)#id = 140110
print(get_predicted_rating(10, 136020, df_D_approx, df_D))
print(get_predicted_rating(10, 140110, df_D_approx, df_D))
#False, only The Intern is has a predicted rating above 4

      movieId           title                  genres
8929   136020  Spectre (2015)  Action|Adventure|Crime
      movieId              title  genres
9006   140110  The Intern (2015)  Comedy
3.849826274473096
4.979018470959764


In [9]:
#3d
#Over ten users get an extremely high rating (above 4.9) for an unseen movie.
def users_with_extremely_high_ratings(threshold, df_D_approx, df_D_NA):
    # Find the maximum predicted rating for each user for unseen movies (NaN in the original matrix)
    extremely_high_ratings = []
    for user_id in df_D.index:
        # Get the unseen movies for this user (where the original data is NaN)
        unseen_movies = df_D_NA.loc[user_id].isna()
        # Get the predicted ratings for the unseen movies for this user
        predicted_ratings_unseen = df_D_approx[df_D.index.get_loc(user_id), unseen_movies]
        # Check if any of the unseen movies have a predicted rating above the threshold
        if np.any(predicted_ratings_unseen > threshold):
            extremely_high_ratings.append(user_id)
    # Return the number of users with extremely high ratings
    return len(extremely_high_ratings)

# Set threshold for extremely high rating
threshold = 4.9

print(users_with_extremely_high_ratings(threshold, df_D_approx, df_D_NA)) #it says 602 which is almost all users. 


602


In [10]:
#3e
#The two movies with the highest recommendation score for the user with userID=300 are: 
#"Pulp Fiction(1994)" and "Dark Knight, The (2008)".
movie_id_PulpFiction = search_movie_by_title("Pulp Fiction (1994)")
movie_id_DarkKnight= search_movie_by_title("Dark Knight, The (2008)")
print(movie_id_PulpFiction) #id = 296
print(movie_id_DarkKnight) #id = 58559

def get_top_2_recommendations(userId, df_D_approx, df_D):
    # Get predicted ratings for all movies for the user
    predicted_ratings = df_D_approx[df_D.index.get_loc(userId), :]
    
    # Get the indices of the top 2 movies with the highest predicted ratings
    top_2_movie_indices = np.argsort(predicted_ratings)[-2:]
    
    # Get the movie IDs for the top 2 recommended movies
    top_2_movie_ids = df_D.columns[top_2_movie_indices]
    
    return top_2_movie_ids

top_2_movie_ids = get_top_2_recommendations(300, df_D_approx, df_D)

print(top_2_movie_ids)
#False, the two movies with the highest reccomendation score for userID=300, are 296 and 318

     movieId                title                       genres
257      296  Pulp Fiction (1994)  Comedy|Crime|Drama|Thriller
      movieId                    title                   genres
6710    58559  Dark Knight, The (2008)  Action|Crime|Drama|IMAX
Index([296, 318], dtype='int32', name='movieId')


# Question 6


did these by hand. If I have the time I can write a script to double check these
### 6a
0.378
### 6b
0.387
### 6c
-0.375
