In [1]:
movies_path = r"C:\Users\Hamed\asgmnt 1 (IRS)\dataset\movies.dat"
ratings_path = r"C:\Users\Hamed\asgmnt 1 (IRS)\dataset\ratings.dat"
users_path = r"C:\Users\Hamed\asgmnt 1 (IRS)\dataset\users.dat"


In [2]:
import pandas as pd

movies = pd.read_csv(
    movies_path, 
    delimiter='::', 
    engine='python', 
    encoding='ISO-8859-1', 
    names=['MovieID', 'Title', 'Genres']
)

ratings = pd.read_csv(
    ratings_path, 
    delimiter='::', 
    engine='python', 
    encoding='ISO-8859-1', 
    names=['UserID', 'MovieID', 'Rating', 'Timestamp']
)

users = pd.read_csv(
    users_path, 
    delimiter='::', 
    engine='python', 
    encoding='ISO-8859-1', 
    names=['UserID', 'Gender', 'Age', 'Occupation', 'Zip-code']
)


In [3]:
movies.to_csv(r"C:\Users\Hamed\asgmnt 1 (IRS)\dataset\movies.csv", index=False)
ratings.to_csv(r"C:\Users\Hamed\asgmnt 1 (IRS)\dataset\ratings.csv", index=False)
users.to_csv(r"C:\Users\Hamed\asgmnt 1 (IRS)\dataset\users.csv", index=False)

# 3.2 part 1 (PCA Method with Mean-Filling)

3.2.1: Calculate the average rating for each of the target items (I1 and I2)

In [4]:
item_avg_ratings = ratings.groupby('MovieID')['Rating'].mean()
I1, I2 = item_avg_ratings.nsmallest(2).index
I1_avg_rating = item_avg_ratings.loc[I1]
I2_avg_rating = item_avg_ratings.loc[I2]
print(f"Average rating for I1 (MovieID: {I1}): {I1_avg_rating}")
print(f"Average rating for I2 (MovieID: {I2}): {I2_avg_rating}")

Average rating for I1 (MovieID: 127): 1.0
Average rating for I2 (MovieID: 133): 1.0


3.2.2: Use the mean-filling method to replace the unspecified ratings of each of the target items (I1 and I2) with its corresponding mean value

In [5]:
ratings_filled = ratings.copy()

ratings_filled.loc[ratings_filled['MovieID'] == I1, 'Rating'] = ratings_filled.loc[ratings_filled['MovieID'] == I1, 'Rating'].fillna(I1_avg_rating)
ratings_filled.loc[ratings_filled['MovieID'] == I2, 'Rating'] = ratings_filled.loc[ratings_filled['MovieID'] == I2, 'Rating'].fillna(I2_avg_rating)
print(ratings_filled.head())

   UserID  MovieID  Rating  Timestamp
0       1     1193       5  978300760
1       1      661       3  978302109
2       1      914       3  978301968
3       1     3408       4  978300275
4       1     2355       5  978824291


3.2.3: Calculate the average rating for each item

In [6]:
item_avg_ratings_filled = ratings_filled.groupby('MovieID')['Rating'].mean()

print(item_avg_ratings_filled.head())

MovieID
1    4.146846
2    3.201141
3    3.016736
4    2.729412
5    3.006757
Name: Rating, dtype: float64


3.2.4: For each item, calculate the difference between ratings and the mean rating of the item

In [7]:
ratings_filled['Rating_Diff'] = ratings_filled['Rating'] - ratings_filled['MovieID'].map(item_avg_ratings_filled)

print(ratings_filled[['UserID', 'MovieID', 'Rating', 'Rating_Diff']].head())

   UserID  MovieID  Rating  Rating_Diff
0       1     1193       5     0.609275
1       1      661       3    -0.464762
2       1      914       3    -1.154088
3       1     3408       4     0.136122
4       1     2355       5     1.145625


3.2.5: Compute the covariance for each pair of items

In [8]:
ratings_matrix = ratings_filled.pivot(index='UserID', columns='MovieID', values='Rating')
covariance_matrix = ratings_matrix.cov()
print(covariance_matrix.head())

MovieID      1         2         3         4         5         6         7     \
MovieID                                                                         
1        0.726499  0.154599  0.145391  0.334540  0.139716  0.041210  0.117566   
2        0.154599  0.966627  0.142157  0.062013  0.335331  0.068046  0.221806   
3        0.145391  0.142157  1.148566  0.278689  0.451023  0.176220  0.225447   
4        0.334540  0.062013  0.278689  1.026940  0.440977 -0.011202  0.075448   
5        0.139716  0.335331  0.451023  0.440977  1.050802  0.073866  0.253444   

MovieID      8         9         10    ...      3943      3944      3945  \
MovieID                                ...                                 
1        0.138824 -0.035088  0.108824  ... -0.099915  0.083333  0.154839   
2        0.266888  0.401786  0.224986  ...  0.010526 -0.500000  0.187500   
3        0.327485  0.173950  0.252696  ... -0.133333  0.333333 -0.266667   
4        0.071429 -0.660256  0.009351  ...  0.785714

3.2.6: Generate the covariance matrix

In [9]:
covariance_matrix = ratings_matrix.cov()

print(covariance_matrix.head())

MovieID      1         2         3         4         5         6         7     \
MovieID                                                                         
1        0.726499  0.154599  0.145391  0.334540  0.139716  0.041210  0.117566   
2        0.154599  0.966627  0.142157  0.062013  0.335331  0.068046  0.221806   
3        0.145391  0.142157  1.148566  0.278689  0.451023  0.176220  0.225447   
4        0.334540  0.062013  0.278689  1.026940  0.440977 -0.011202  0.075448   
5        0.139716  0.335331  0.451023  0.440977  1.050802  0.073866  0.253444   

MovieID      8         9         10    ...      3943      3944      3945  \
MovieID                                ...                                 
1        0.138824 -0.035088  0.108824  ... -0.099915  0.083333  0.154839   
2        0.266888  0.401786  0.224986  ...  0.010526 -0.500000  0.187500   
3        0.327485  0.173950  0.252696  ... -0.133333  0.333333 -0.266667   
4        0.071429 -0.660256  0.009351  ...  0.785714

3.2.7: Determine the top 5-peers and top 10-peers for each of the target items (I1 and I2) using the transformed representation (covariance matrix)

In [10]:
top_5_peers_I1 = covariance_matrix[I1].sort_values(ascending=False).iloc[1:6]  
top_5_peers_I2 = covariance_matrix[I2].sort_values(ascending=False).iloc[1:6] 

top_10_peers_I1 = covariance_matrix[I1].sort_values(ascending=False).iloc[1:11] 
top_10_peers_I2 = covariance_matrix[I2].sort_values(ascending=False).iloc[1:11]  

print(f"Top 5 peers for I1 (MovieID: {I1}): {top_5_peers_I1}")
print(f"Top 5 peers for I2 (MovieID: {I2}): {top_5_peers_I2}")
print(f"Top 10 peers for I1 (MovieID: {I1}): {top_10_peers_I1}")
print(f"Top 10 peers for I2 (MovieID: {I2}): {top_10_peers_I2}")

Top 5 peers for I1 (MovieID: 127): MovieID
2   NaN
3   NaN
4   NaN
5   NaN
6   NaN
Name: 127, dtype: float64
Top 5 peers for I2 (MovieID: 133): MovieID
2   NaN
3   NaN
4   NaN
5   NaN
6   NaN
Name: 133, dtype: float64
Top 10 peers for I1 (MovieID: 127): MovieID
2    NaN
3    NaN
4    NaN
5    NaN
6    NaN
7    NaN
8    NaN
9    NaN
10   NaN
11   NaN
Name: 127, dtype: float64
Top 10 peers for I2 (MovieID: 133): MovieID
2    NaN
3    NaN
4    NaN
5    NaN
6    NaN
7    NaN
8    NaN
9    NaN
10   NaN
11   NaN
Name: 133, dtype: float64


3.2.8: Determine reduced dimensional space for each user in case of using the top 5-peers

In [11]:
from sklearn.decomposition import PCA

top_5_peers_I1_and_I2 = list(set(top_5_peers_I1.index) | set(top_5_peers_I2.index))  
ratings_reduced_5 = ratings_matrix[top_5_peers_I1_and_I2].fillna(0)

pca_5 = PCA(n_components=2)
ratings_reduced_5_pca = pca_5.fit_transform(ratings_reduced_5)

print("Reduced dimensional space (Top 5 peers):")
print(ratings_reduced_5_pca[:5])  

Reduced dimensional space (Top 5 peers):
[[-0.70905081 -0.27783914]
 [-0.70905081 -0.27783914]
 [-0.70905081 -0.27783914]
 [-0.70905081 -0.27783914]
 [ 1.21437332 -0.82227991]]


3.2.9: Compute the rating predictions of the original missing rating for each of the target items (I1 and I2) using the top 5-peers

In [12]:
predicted_rating_I1_5 = ratings_reduced_5[top_5_peers_I1.index].mean(axis=1)
predicted_rating_I2_5 = ratings_reduced_5[top_5_peers_I2.index].mean(axis=1)

print(f"Predicted rating for I1 (using top 5 peers): {predicted_rating_I1_5[:5]}")
print(f"Predicted rating for I2 (using top 5 peers): {predicted_rating_I2_5[:5]}")

Predicted rating for I1 (using top 5 peers): UserID
1    0.0
2    0.0
3    0.0
4    0.0
5    0.4
dtype: float64
Predicted rating for I2 (using top 5 peers): UserID
1    0.0
2    0.0
3    0.0
4    0.0
5    0.4
dtype: float64


3.2.10: Determine reduced dimensional space for each user in case of using the top 10-peers

In [13]:
top_10_peers_I1_and_I2 = list(set(top_10_peers_I1.index) | set(top_10_peers_I2.index))  
ratings_reduced_10 = ratings_matrix[top_10_peers_I1_and_I2].fillna(0)


pca_10 = PCA(n_components=2) 
ratings_reduced_10_pca = pca_10.fit_transform(ratings_reduced_10)

print("Reduced dimensional space (Top 10 peers):")
print(ratings_reduced_10_pca[:5]) 

Reduced dimensional space (Top 10 peers):
[[-1.15850786  0.00702471]
 [-1.15850786  0.00702471]
 [-1.15850786  0.00702471]
 [-1.15850786  0.00702471]
 [-0.12305775 -1.27991222]]


3.2.11: Compute the rating predictions of the original missing rating for each of the target items (I1 and I2) using the top 10-peers

In [14]:
predicted_rating_I1_10 = ratings_reduced_10[top_10_peers_I1.index].mean(axis=1)
predicted_rating_I2_10 = ratings_reduced_10[top_10_peers_I2.index].mean(axis=1)

print(f"Predicted rating for I1 (using top 10 peers): {predicted_rating_I1_10[:5]}")
print(f"Predicted rating for I2 (using top 10 peers): {predicted_rating_I2_10[:5]}")

Predicted rating for I1 (using top 10 peers): UserID
1    0.0
2    0.0
3    0.0
4    0.0
5    0.2
dtype: float64
Predicted rating for I2 (using top 10 peers): UserID
1    0.0
2    0.0
3    0.0
4    0.0
5    0.2
dtype: float64


3.2.12: Compare the results of point 3.2.9 with results of point 3.2.11. Comment on your answer

In [15]:
print(f"Comparison of predicted ratings for I1:\nTop 5: {predicted_rating_I1_5[:5]}\nTop 10: {predicted_rating_I1_10[:5]}")
print(f"Comparison of predicted ratings for I2:\nTop 5: {predicted_rating_I2_5[:5]}\nTop 10: {predicted_rating_I2_10[:5]}")

Comparison of predicted ratings for I1:
Top 5: UserID
1    0.0
2    0.0
3    0.0
4    0.0
5    0.4
dtype: float64
Top 10: UserID
1    0.0
2    0.0
3    0.0
4    0.0
5    0.2
dtype: float64
Comparison of predicted ratings for I2:
Top 5: UserID
1    0.0
2    0.0
3    0.0
4    0.0
5    0.4
dtype: float64
Top 10: UserID
1    0.0
2    0.0
3    0.0
4    0.0
5    0.2
dtype: float64


# Part2: PCA Method with Maximum Likelihood Estimation 

3.3.1: Generate the covariance matrix

In [16]:
ratings_matrix_mle = ratings_filled.pivot(index='UserID', columns='MovieID', values='Rating')
covariance_matrix_mle = ratings_matrix_mle.cov()

print("Covariance Matrix (MLE):")
print(covariance_matrix_mle.head())

Covariance Matrix (MLE):
MovieID      1         2         3         4         5         6         7     \
MovieID                                                                         
1        0.726499  0.154599  0.145391  0.334540  0.139716  0.041210  0.117566   
2        0.154599  0.966627  0.142157  0.062013  0.335331  0.068046  0.221806   
3        0.145391  0.142157  1.148566  0.278689  0.451023  0.176220  0.225447   
4        0.334540  0.062013  0.278689  1.026940  0.440977 -0.011202  0.075448   
5        0.139716  0.335331  0.451023  0.440977  1.050802  0.073866  0.253444   

MovieID      8         9         10    ...      3943      3944      3945  \
MovieID                                ...                                 
1        0.138824 -0.035088  0.108824  ... -0.099915  0.083333  0.154839   
2        0.266888  0.401786  0.224986  ...  0.010526 -0.500000  0.187500   
3        0.327485  0.173950  0.252696  ... -0.133333  0.333333 -0.266667   
4        0.071429 -0.660256

3.3.2: Determine the top 5-peers and top 10-peers for each of the target items (I1 and I2) using the transformed representation (covariance matrix)

In [17]:
top_5_peers_mle_I1 = covariance_matrix_mle[I1].sort_values(ascending=False).iloc[1:6]  
top_5_peers_mle_I2 = covariance_matrix_mle[I2].sort_values(ascending=False).iloc[1:6] 

top_10_peers_mle_I1 = covariance_matrix_mle[I1].sort_values(ascending=False).iloc[1:11]  
top_10_peers_mle_I2 = covariance_matrix_mle[I2].sort_values(ascending=False).iloc[1:11]  

print(f"Top 5 peers for I1 (MovieID: {I1}): {top_5_peers_mle_I1}")
print(f"Top 5 peers for I2 (MovieID: {I2}): {top_5_peers_mle_I2}")
print(f"Top 10 peers for I1 (MovieID: {I1}): {top_10_peers_mle_I1}")
print(f"Top 10 peers for I2 (MovieID: {I2}): {top_10_peers_mle_I2}")

Top 5 peers for I1 (MovieID: 127): MovieID
2   NaN
3   NaN
4   NaN
5   NaN
6   NaN
Name: 127, dtype: float64
Top 5 peers for I2 (MovieID: 133): MovieID
2   NaN
3   NaN
4   NaN
5   NaN
6   NaN
Name: 133, dtype: float64
Top 10 peers for I1 (MovieID: 127): MovieID
2    NaN
3    NaN
4    NaN
5    NaN
6    NaN
7    NaN
8    NaN
9    NaN
10   NaN
11   NaN
Name: 127, dtype: float64
Top 10 peers for I2 (MovieID: 133): MovieID
2    NaN
3    NaN
4    NaN
5    NaN
6    NaN
7    NaN
8    NaN
9    NaN
10   NaN
11   NaN
Name: 133, dtype: float64


3.3.3: Determine reduced dimensional space for each user in case of using the top 5-peers

In [18]:
top_5_peers_I1_and_I2_mle = list(set(top_5_peers_mle_I1.index) | set(top_5_peers_mle_I2.index)) 
ratings_reduced_5_mle = ratings_matrix_mle[top_5_peers_I1_and_I2_mle].fillna(0)
pca_5_mle = PCA(n_components=2)
ratings_reduced_5_mle_pca = pca_5_mle.fit_transform(ratings_reduced_5_mle)

print("Reduced dimensional space (Top 5 peers - MLE):")
print(ratings_reduced_5_mle_pca[:5])  

Reduced dimensional space (Top 5 peers - MLE):
[[-0.70905081 -0.27783914]
 [-0.70905081 -0.27783914]
 [-0.70905081 -0.27783914]
 [-0.70905081 -0.27783914]
 [ 1.21437332 -0.82227991]]


3.3.4: Use the results from point 3.3.3 to compute the rating predictions of the original missing rating for each of the target items (I1 and I2) using the top 5-peers

In [19]:
predicted_rating_I1_5_mle = ratings_reduced_5_mle[top_5_peers_mle_I1.index].mean(axis=1)
predicted_rating_I2_5_mle = ratings_reduced_5_mle[top_5_peers_mle_I2.index].mean(axis=1)

print(f"Predicted rating for I1 (using top 5 peers - MLE): {predicted_rating_I1_5_mle[:5]}")
print(f"Predicted rating for I2 (using top 5 peers - MLE): {predicted_rating_I2_5_mle[:5]}")

Predicted rating for I1 (using top 5 peers - MLE): UserID
1    0.0
2    0.0
3    0.0
4    0.0
5    0.4
dtype: float64
Predicted rating for I2 (using top 5 peers - MLE): UserID
1    0.0
2    0.0
3    0.0
4    0.0
5    0.4
dtype: float64


3.3.5: Determine reduced dimensional space for each user in case of using the top 10-peers

In [20]:

top_10_peers_I1_and_I2_mle = list(set(top_10_peers_mle_I1.index) | set(top_10_peers_mle_I2.index)) 
ratings_reduced_10_mle = ratings_matrix_mle[top_10_peers_I1_and_I2_mle].fillna(0)

pca_10_mle = PCA(n_components=2)
ratings_reduced_10_mle_pca = pca_10_mle.fit_transform(ratings_reduced_10_mle)

print("Reduced dimensional space (Top 10 peers - MLE):")
print(ratings_reduced_10_mle_pca[:5])  

Reduced dimensional space (Top 10 peers - MLE):
[[-1.15850786  0.00702471]
 [-1.15850786  0.00702471]
 [-1.15850786  0.00702471]
 [-1.15850786  0.00702471]
 [-0.12305775 -1.27991222]]


3.3.6: Use the results from point 3.3.5 to compute the rating predictions of the original missing rating for each of the target items (I1 and I2) using the top 10-peers

In [21]:

predicted_rating_I1_10_mle = ratings_reduced_10_mle[top_10_peers_mle_I1.index].mean(axis=1)
predicted_rating_I2_10_mle = ratings_reduced_10_mle[top_10_peers_mle_I2.index].mean(axis=1)

print(f"Predicted rating for I1 (using top 10 peers - MLE): {predicted_rating_I1_10_mle[:5]}")
print(f"Predicted rating for I2 (using top 10 peers - MLE): {predicted_rating_I2_10_mle[:5]}")


Predicted rating for I1 (using top 10 peers - MLE): UserID
1    0.0
2    0.0
3    0.0
4    0.0
5    0.2
dtype: float64
Predicted rating for I2 (using top 10 peers - MLE): UserID
1    0.0
2    0.0
3    0.0
4    0.0
5    0.2
dtype: float64


3.3.7: Compare the results of point 3.3.3 with results of point 3.3.6. Comment on your answer

In [22]:

print(f"Comparison of predicted ratings for I1:\nTop 5: {predicted_rating_I1_5_mle[:5]}\nTop 10: {predicted_rating_I1_10_mle[:5]}")
print(f"Comparison of predicted ratings for I2:\nTop 5: {predicted_rating_I2_5_mle[:5]}\nTop 10: {predicted_rating_I2_10_mle[:5]}")


Comparison of predicted ratings for I1:
Top 5: UserID
1    0.0
2    0.0
3    0.0
4    0.0
5    0.4
dtype: float64
Top 10: UserID
1    0.0
2    0.0
3    0.0
4    0.0
5    0.2
dtype: float64
Comparison of predicted ratings for I2:
Top 5: UserID
1    0.0
2    0.0
3    0.0
4    0.0
5    0.4
dtype: float64
Top 10: UserID
1    0.0
2    0.0
3    0.0
4    0.0
5    0.2
dtype: float64


3.3.8: Compare the results of point 3.2.9 with results of point 3.3.4. Comment on your answer

In [23]:

print(f"Comparison of predicted ratings for I1 (PCA vs MLE):\nPCA Top 5: {predicted_rating_I1_5[:5]}\nMLE Top 5: {predicted_rating_I1_5_mle[:5]}")
print(f"Comparison of predicted ratings for I2 (PCA vs MLE):\nPCA Top 5: {predicted_rating_I2_5[:5]}\nMLE Top 5: {predicted_rating_I2_5_mle[:5]}")


Comparison of predicted ratings for I1 (PCA vs MLE):
PCA Top 5: UserID
1    0.0
2    0.0
3    0.0
4    0.0
5    0.4
dtype: float64
MLE Top 5: UserID
1    0.0
2    0.0
3    0.0
4    0.0
5    0.4
dtype: float64
Comparison of predicted ratings for I2 (PCA vs MLE):
PCA Top 5: UserID
1    0.0
2    0.0
3    0.0
4    0.0
5    0.4
dtype: float64
MLE Top 5: UserID
1    0.0
2    0.0
3    0.0
4    0.0
5    0.4
dtype: float64


3.3.9: Compare the results of point 3.2.11 with results of point 3.3.6. Comment on your answer

In [24]:

print(f"Comparison of predicted ratings for I1 (PCA vs MLE):\nPCA Top 10: {predicted_rating_I1_10[:5]}\nMLE Top 10: {predicted_rating_I1_10_mle[:5]}")
print(f"Comparison of predicted ratings for I2 (PCA vs MLE):\nPCA Top 10: {predicted_rating_I2_10[:5]}\nMLE Top 10: {predicted_rating_I2_10_mle[:5]}")


Comparison of predicted ratings for I1 (PCA vs MLE):
PCA Top 10: UserID
1    0.0
2    0.0
3    0.0
4    0.0
5    0.2
dtype: float64
MLE Top 10: UserID
1    0.0
2    0.0
3    0.0
4    0.0
5    0.2
dtype: float64
Comparison of predicted ratings for I2 (PCA vs MLE):
PCA Top 10: UserID
1    0.0
2    0.0
3    0.0
4    0.0
5    0.2
dtype: float64
MLE Top 10: UserID
1    0.0
2    0.0
3    0.0
4    0.0
5    0.2
dtype: float64


# Part 3: Singular Value Decomposition (SVD) method 

3.4.1 Calculate the average rating for each item

In [27]:
average_item_ratings = ratings_filled.groupby('MovieID')['Rating'].mean()

print("Average ratings for each item:")
print(average_item_ratings.head())

Average ratings for each item:
MovieID
1    4.146846
2    3.201141
3    3.016736
4    2.729412
5    3.006757
Name: Rating, dtype: float64


3.4.2 Use the mean-filling method to replace unspecified ratings for each item

In [28]:

ratings_filled_mean = ratings_filled.copy()
ratings_filled_mean['Rating'] = ratings_filled_mean.apply(
    lambda row: row['Rating'] if pd.notna(row['Rating']) else average_item_ratings[row['MovieID']], axis=1)

ratings_matrix_filled = ratings_filled_mean.pivot(index='UserID', columns='MovieID', values='Rating')

print("Ratings matrix after mean-filling:")
print(ratings_matrix_filled.head())

Ratings matrix after mean-filling:
MovieID  1     2     3     4     5     6     7     8     9     10    ...  \
UserID                                                               ...   
1         5.0   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  ...   
2         NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  ...   
3         NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  ...   
4         NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  ...   
5         NaN   NaN   NaN   NaN   NaN   2.0   NaN   NaN   NaN   NaN  ...   

MovieID  3943  3944  3945  3946  3947  3948  3949  3950  3951  3952  
UserID                                                               
1         NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  
2         NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  
3         NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  
4         NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  
5         Na

3.4.3: Compute the eigenvalues (λ1, λ2, λ3, ...) and their corresponding eigenvectors (U1, U2, U3, ...) of the ratings matrix

In [31]:
import numpy as np
U, S, Vt = np.linalg.svd(ratings_matrix_filled.fillna(0), full_matrices=False)

eigenvalues = S

eigenvectors_u = U
eigenvectors_v = Vt

print("Eigenvalues:", eigenvalues[:5])
print("First 5 eigenvectors (U):", eigenvectors_u[:, :5])
print("First 5 eigenvectors (Vt):", eigenvectors_v[:5, :])


Eigenvalues: [1893.21055869  671.34356538  574.85275997  518.08422502  444.85478082]
First 5 eigenvectors (U): [[-0.00471786  0.00164551  0.00267141  0.00137288  0.01872592]
 [-0.00928856 -0.00269782  0.00038215 -0.00709548 -0.00676515]
 [-0.00501018 -0.00334292 -0.00334366 -0.0030347   0.01176653]
 ...
 [-0.00138885  0.00181339 -0.00011879  0.0005557   0.00488343]
 [-0.00700793  0.0187647  -0.01071225  0.00858309  0.0219811 ]
 [-0.01896102  0.04080244 -0.00304316 -0.01862121 -0.00312631]]
First 5 eigenvectors (Vt): [[-0.07013714 -0.02354382 -0.01376584 ... -0.00261526 -0.00116636
  -0.01325659]
 [-0.02094015 -0.02979245 -0.0167039  ...  0.0018744   0.00226511
   0.00502213]
 [ 0.03016472 -0.01018907  0.01257242 ...  0.00178319  0.00352092
   0.02235768]
 [-0.00486156  0.03109033  0.02714274 ... -0.00036543 -0.0011091
  -0.01275768]
 [ 0.12477815  0.00958079  0.0058329  ... -0.00526835 -0.00086585
  -0.01589169]]


3.4.4: Check if the set of eigenvectors are mutually orthogonal

In [33]:

orthogonality_check = np.dot(eigenvectors_u.T, eigenvectors_u)

print("Orthogonality check (should be close to 0 off-diagonal):")
print(orthogonality_check)


Orthogonality check (should be close to 0 off-diagonal):
[[ 1.00000000e+00 -5.55111512e-17  6.50521303e-17 ... -1.06251813e-17
   8.00954355e-18 -7.58941521e-19]
 [-5.55111512e-17  1.00000000e+00 -2.32236105e-16 ...  2.11419424e-18
   3.36102673e-18  3.30681663e-18]
 [ 6.50521303e-17 -2.32236105e-16  1.00000000e+00 ...  1.08420217e-18
   2.16840434e-18 -8.67361738e-19]
 ...
 [-1.06251813e-17  2.11419424e-18  1.08420217e-18 ...  1.00000000e+00
  -5.70290343e-17  3.90312782e-17]
 [ 8.00954355e-18  3.36102673e-18  2.16840434e-18 ... -5.70290343e-17
   1.00000000e+00  3.10081821e-17]
 [-7.58941521e-19  3.30681663e-18 -8.67361738e-19 ...  3.90312782e-17
   3.10081821e-17  1.00000000e+00]]


3.4.5: Perform Vector Normalization if the eigenvectors are not orthogonal

In [35]:

if not np.allclose(orthogonality_check, np.eye(len(orthogonality_check))):
    eigenvectors_u = np.linalg.qr(eigenvectors_u)[0]  

    print("Eigenvectors after orthogonalization:")
    print(eigenvectors_u[:5, :])


3.4.6: Check if the eigenvectors are orthonormal (each vector must have magnitude 1)

In [36]:

norm_check = np.linalg.norm(eigenvectors_u, axis=0)

print("Magnitude of eigenvectors (should be 1):")
print(norm_check)


Magnitude of eigenvectors (should be 1):
[1. 1. 1. ... 1. 1. 1.]


3.4.7: Apply Gram-Schmidt method to convert the eigenvectors into an orthonormal set of vectors

In [37]:

def gram_schmidt_process(vectors):
    orthonormal_vectors = []
    for v in vectors.T:
        for u in orthonormal_vectors:
            v -= np.dot(v, u) * u  
        v /= np.linalg.norm(v)  
        orthonormal_vectors.append(v)
    return np.array(orthonormal_vectors).T

eigenvectors_u_orthonormal = gram_schmidt_process(eigenvectors_u)

print("Eigenvectors after Gram-Schmidt orthonormalization:")
print(eigenvectors_u_orthonormal[:5, :])


Eigenvectors after Gram-Schmidt orthonormalization:
[[-0.00471786  0.00164551  0.00267141 ... -0.01776587  0.00369828
  -0.00422689]
 [-0.00928856 -0.00269782  0.00038215 ...  0.01474961 -0.00965117
   0.01119521]
 [-0.00501018 -0.00334292 -0.00334366 ... -0.00230599 -0.00307979
  -0.00582591]
 [-0.00267746 -0.00129703 -0.00767513 ... -0.03719729  0.00644174
   0.02108675]
 [-0.00889607  0.00382517  0.02209941 ... -0.00329437 -0.00147973
   0.00681981]]


3.4.8: Construct the predicted waiting matrix Z from the eigenvalues on the main diagonal

In [38]:

Z = np.diag(eigenvalues)

print("Predicted waiting matrix (Z):")
print(Z[:5, :5])  


Predicted waiting matrix (Z):
[[1893.21055869    0.            0.            0.            0.        ]
 [   0.          671.34356538    0.            0.            0.        ]
 [   0.            0.          574.85275997    0.            0.        ]
 [   0.            0.            0.          518.08422502    0.        ]
 [   0.            0.            0.            0.          444.85478082]]


3.4.9: Construct the items matrix 𝑉 whose columns are the set of orthonormal vectors

In [39]:

items_matrix = eigenvectors_v.T

print("Items matrix (V):")
print(items_matrix[:5, :5])  


Items matrix (V):
[[-0.07013714 -0.02094015  0.03016472 -0.00486156  0.12477815]
 [-0.02354382 -0.02979245 -0.01018907  0.03109033  0.00958079]
 [-0.01376584 -0.0167039   0.01257242  0.02714274  0.0058329 ]
 [-0.0053234  -0.00296276  0.01235569  0.01507195 -0.0026277 ]
 [-0.00971651 -0.01348858  0.01247469  0.02888498  0.00431142]]


3.4.10: Construct the predicted user matrix 𝑈 whose columns are the predicted vectors

In [40]:

users_matrix = eigenvectors_u_orthonormal

print("Users matrix (U):")
print(users_matrix[:5, :5])  


Users matrix (U):
[[-0.00471786  0.00164551  0.00267141  0.00137288  0.01872592]
 [-0.00928856 -0.00269782  0.00038215 -0.00709548 -0.00676515]
 [-0.00501018 -0.00334292 -0.00334366 -0.0030347   0.01176653]
 [-0.00267746 -0.00129703 -0.00767513 -0.00727068  0.0020769 ]
 [-0.00889607  0.00382517  0.02209941 -0.0184782  -0.00481466]]


3.4.11: Use the results from points 3.4.8 to 3.4.10 to construct the newly reduced rating matrix 

In [41]:

R_reduced = np.dot(np.dot(users_matrix, Z), items_matrix)

print("Reconstructed rating matrix (R = U Z V^T):")
print(R_reduced[:5, :5]) 


Reconstructed rating matrix (R = U Z V^T):
[[ 1.26112513  0.60642308 -0.05374419  1.01412053 -0.4537866 ]
 [ 1.93248762 -0.32874299 -0.45476319  0.81237333 -2.06747525]
 [ 0.7226417  -0.24510323 -0.94350653 -0.94320407 -0.71339379]
 [ 0.24257233  0.60698915  0.04299004 -0.41757629 -0.28051981]
 [ 1.74019636 -0.5079428  -1.08920025 -0.34960232 -2.10334771]]


3.4.12: Use the results from point 3.4.11 to find missing ratings in the original rating matrix for each of the target items (I1 and I2)


In [45]:
user_indices = ratings_matrix_filled.index - 1  

predicted_rating_I1_svd = R_reduced[user_indices, I1_zero_indexed]
predicted_rating_I2_svd = R_reduced[user_indices, I2_zero_indexed]

print(f"Predicted ratings for I1 (MovieID: {I1}) using SVD:")
print(predicted_rating_I1_svd[:5]) 

print(f"Predicted ratings for I2 (MovieID: {I2}) using SVD:")
print(predicted_rating_I2_svd[:5])

Predicted ratings for I1 (MovieID: 127) using SVD:
[1.50975508 1.64953463 0.78586411 0.52828894 1.56169789]
Predicted ratings for I2 (MovieID: 133) using SVD:
[ 0.53995994  1.3911938  -0.09262471  0.05804123 -1.17768289]
