In [4]:
!pip install scikit-surprise

Collecting scikit-surprise
  Downloading scikit_surprise-1.1.4.tar.gz (154 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m154.4/154.4 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (pyproject.toml) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.4-cp310-cp310-linux_x86_64.whl size=2357253 sha256=c39918ff14feefdb954d314221ba337aab36803c729613b7d87f29d27125b2ff
  Stored in directory: /root/.cache/pip/wheels/4b/3f/df/6acbf0a40397d9bf3ff97f582cc22fb9ce66adde75bc71fd54
Successfully built scikit-surprise
Installing collected packages: scikit-surprise
Successfully installed scikit-surprise-1.1.4


In [8]:
from surprise import Dataset
from surprise import SVD, SVDpp, NMF
from surprise.model_selection import cross_validate

data = Dataset.load_builtin('ml-100k')

In [9]:
svd = SVD()
svdpp = SVDpp()
nmf = NMF()

# Крос-валідація для кожного алгоритму
svd_results = cross_validate(svd, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)
svdpp_results = cross_validate(svdpp, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)
nmf_results = cross_validate(nmf, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)
print("SVD results:", svd_results)
print("SVD++ results:", svdpp_results)
print("NMF results:", nmf_results)

# Вибір оптимального алгоритму на основі RMSE
best_algorithm = None
best_rmse = float('inf')

for algo, results in zip(['SVD', 'SVD++', 'NMF'], [svd_results, svdpp_results, nmf_results]):
    mean_rmse = results['test_rmse'].mean()
    if mean_rmse < best_rmse:
        best_rmse = mean_rmse
        best_algorithm = algo

print(f"Best algorithm based on RMSE: {best_algorithm} with RMSE: {best_rmse}")

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9314  0.9432  0.9334  0.9401  0.9349  0.9366  0.0044  
MAE (testset)     0.7350  0.7427  0.7357  0.7402  0.7362  0.7380  0.0030  
Fit time          1.42    2.19    1.48    1.50    1.44    1.61    0.29    
Test time         0.13    0.40    0.12    0.21    0.17    0.20    0.10    
Evaluating RMSE, MAE of algorithm SVDpp on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9204  0.9220  0.9137  0.9218  0.9164  0.9189  0.0033  
MAE (testset)     0.7184  0.7210  0.7168  0.7258  0.7227  0.7209  0.0032  
Fit time          27.12   29.10   29.49   27.31   27.70   28.15   0.96    
Test time         5.68    4.73    4.89    5.58    5.04    5.18    0.38    
Evaluating RMSE, MAE of algorithm NMF on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (tests

In [10]:
# Поекспериментуем з різними параметрами алгоритмів
svd = SVD(n_factors=100, n_epochs=20, lr_all=0.005, reg_all=0.02)
svdpp = SVDpp(n_factors=50, n_epochs=15, lr_all=0.007, reg_all=0.01)
nmf = NMF(n_factors=20, n_epochs=30, reg_pu=0.06, reg_qi=0.06)

# Крос-валідація для кожного алгоритму
svd_results = cross_validate(svd, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)
svdpp_results = cross_validate(svdpp, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)
nmf_results = cross_validate(nmf, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)
print("SVD results:", svd_results)
print("SVD++ results:", svdpp_results)
print("NMF results:", nmf_results)

# Вибір оптимального алгоритму на основі RMSE
best_algorithm = None
best_rmse = float('inf')

for algo, results in zip(['SVD', 'SVD++', 'NMF'], [svd_results, svdpp_results, nmf_results]):
    mean_rmse = results['test_rmse'].mean()
    if mean_rmse < best_rmse:
        best_rmse = mean_rmse
        best_algorithm = algo

print(f"Best algorithm based on RMSE: {best_algorithm} with RMSE: {best_rmse}")

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9382  0.9393  0.9349  0.9387  0.9343  0.9371  0.0021  
MAE (testset)     0.7415  0.7397  0.7368  0.7405  0.7361  0.7389  0.0021  
Fit time          2.31    1.44    1.46    1.45    1.44    1.62    0.34    
Test time         0.21    0.21    0.12    0.21    0.17    0.18    0.04    
Evaluating RMSE, MAE of algorithm SVDpp on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9216  0.9224  0.9204  0.9282  0.9154  0.9216  0.0041  
MAE (testset)     0.7237  0.7239  0.7204  0.7276  0.7205  0.7232  0.0027  
Fit time          47.15   47.70   52.96   47.96   47.06   48.57   2.22    
Test time         4.75    5.65    5.19    5.66    5.66    5.38    0.36    
Evaluating RMSE, MAE of algorithm NMF on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (tests

In [21]:
# Додаткове завдання з зірочкою

import numpy as np
import pandas as pd
from scipy.io import loadmat

def loadMovieList():
    """
    Reads the fixed movie list in movie_ids.txt and returns a list of movie names.
    Returns
    -------
    movieNames : list
        A list of strings, representing all movie names.
    """
    with open('/content/movie_ids.txt', encoding='ISO-8859-1') as fid:
        movies = fid.readlines()

    movieNames = []
    for movie in movies:
        parts = movie.split()
        movieNames.append(' '.join(parts[1:]).strip())
    return movieNames

movies = loadMovieList()
data = loadmat('/content/movies.mat')

ratings = data['Y']
rated = data['R']

# Перетворення масиву рейтингів у DataFrame
num_users, num_movies = ratings.shape
user_ids = np.arange(1, num_users + 1)
movie_ids = np.arange(1, num_movies + 1)

ratings_df = pd.DataFrame(ratings, index=user_ids, columns=movie_ids)
rated_df = pd.DataFrame(rated, index=user_ids, columns=movie_ids)

# Створення DataFrame з ідентифікаторами та назвами фільмів
movies_df = pd.DataFrame({'movieId': range(1, len(movies) + 1), 'movieName': movies})

# Об'єднання датасету рейтингів з DataFrame назв фільмів
merged_df = ratings_df.stack().reset_index()
merged_df.columns = ['userId', 'movieId', 'rating']
merged_df = pd.merge(merged_df, movies_df, on='movieId')

# Ініціалізація параметрів моделі
num_users = merged_df['userId'].nunique()
num_movies = merged_df['movieId'].nunique()
num_factors = 10  # Кількість факторів

P = np.random.rand(num_users, num_factors) * 0.1
Q = np.random.rand(num_movies, num_factors) * 0.1

# Функція втрат
def loss_function(P, Q, user_movie_matrix, lambda_reg=0.02):
    predicted = np.dot(P, Q.T)
    error = user_movie_matrix - predicted
    loss = np.sum(error ** 2) + lambda_reg * (np.sum(P ** 2) + np.sum(Q ** 2))
    return loss

# Розрахунок градієнтів
def compute_gradients(P, Q, user_movie_matrix, lambda_reg=0.02):
    predicted = np.dot(P, Q.T)
    error = user_movie_matrix - predicted
    P_grad = -2 * np.dot(error, Q) + 2 * lambda_reg * P
    Q_grad = -2 * np.dot(error.T, P) + 2 * lambda_reg * Q
    return P_grad, Q_grad

# Навчання моделі
learning_rate = 0.0001
num_epochs = 100

user_movie_matrix = merged_df.pivot_table(index='userId', columns='movieId', values='rating')

for epoch in range(num_epochs):
    P_grad, Q_grad = compute_gradients(P, Q, user_movie_matrix.fillna(0), lambda_reg=0.02)
    P -= learning_rate * P_grad
    Q -= learning_rate * Q_grad
    loss = loss_function(P, Q, user_movie_matrix.fillna(0), lambda_reg=0.02)
    print(f'Epoch {epoch + 1}, Loss: {loss}')

# Оцінка моделі
predicted_ratings = np.dot(P, Q.T)
predicted_ratings_df = pd.DataFrame(predicted_ratings, index=user_ids, columns=movie_ids)

# Об'єднання передбачених рейтингів з DataFrame назв фільмів
predicted_ratings_long = predicted_ratings_df.stack().reset_index()
predicted_ratings_long.columns = ['userId', 'movieId', 'predicted_rating']
predicted_ratings_long = pd.merge(predicted_ratings_long, movies_df, on='movieId')

# Виведення результатів
print(predicted_ratings_long)

# Збереження результатів у CSV-файл
predicted_ratings_long.to_csv('predicted_ratings.csv', index=False)

Epoch 1, Loss: movieId
1      3914.771988
2       903.319188
3       496.366741
4       466.938000
5      1744.626398
          ...     
939     926.530816
940    1364.160225
941     374.884058
942    1467.393113
943    2189.254544
Length: 943, dtype: float64
Epoch 2, Loss: movieId
1      3894.753488
2       900.522425
3       495.669098
4       466.273957
5      1738.809077
          ...     
939     924.803185
940    1358.625384
941     374.072600
942    1463.823074
943    2181.220944
Length: 943, dtype: float64
Epoch 3, Loss: movieId
1      3869.527800
2       897.387483
3       494.936538
4       465.585562
5      1731.688253
          ...     
939     922.884514
940    1351.968198
941     373.221827
942    1459.527417
943    2171.181827
Length: 943, dtype: float64
Epoch 4, Loss: movieId
1      3837.968104
2       893.878584
3       494.176992
4       464.879977
5      1723.011171
          ...     
939     920.762468
940    1343.972636
941     372.334448
942    1454.372728
943    