In [3]:
from surprise import Dataset, Reader
from surprise import SVD, SVDpp, NMF
from surprise.model_selection import cross_validate, GridSearchCV
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

ModuleNotFoundError: No module named 'surprise'

In [2]:
# Built-in MovieLens-100k ratings, used for every Surprise experiment below.
data = Dataset.load_builtin('ml-100k')

# Candidate factorization models, each with its library-default hyper-parameters.
algorithms = {}
algorithms['SVD'] = SVD()
algorithms['SVD++'] = SVDpp()
algorithms['NMF'] = NMF()

# results[model name] -> {'RMSE': mean over folds, 'MAE': mean over folds}
results = {}
for name in algorithms:
    print(f"Evaluating {name}...")
    fold_scores = cross_validate(
        algorithms[name],
        data,
        measures=['RMSE', 'MAE'],
        cv=5,
        verbose=False,
    )
    mean_rmse = np.mean(fold_scores['test_rmse'])
    mean_mae = np.mean(fold_scores['test_mae'])
    results[name] = {'RMSE': mean_rmse, 'MAE': mean_mae}

# Summary table: one line per model, errors rounded to four decimals.
print("\nРезультати крос-валідації:")
for name, metrics in results.items():
    print(f"{name}: RMSE = {metrics['RMSE']:.4f}, MAE = {metrics['MAE']:.4f}")

# Hyper-parameter grid for SVD: 3 * 3 * 2 * 2 = 36 combinations, each scored
# with 5-fold cross-validation.
param_grid = dict(
    n_factors=[20, 50, 100],
    n_epochs=[20, 30, 50],
    lr_all=[0.002, 0.005],
    reg_all=[0.02, 0.1],
)

# Exhaustive search; note that the algorithm class itself is passed, not an instance.
grid_search = GridSearchCV(
    SVD,
    param_grid,
    measures=['rmse', 'mae'],
    cv=5,
)
grid_search.fit(data)

# Report the combination with the lowest RMSE and the score it achieved.
best_rmse_params = grid_search.best_params['rmse']
best_rmse_score = grid_search.best_score['rmse']

print("\nНайкращі параметри для SVD:")
print(best_rmse_params)
print("\nНайкраще RMSE:", best_rmse_score)

# Load the MovieTweetings ratings file. It has no header row and uses '::' as
# the field separator, which requires the slower pure-Python parser.
df = pd.read_csv('https://raw.githubusercontent.com/sidooms/MovieTweetings/master/latest/ratings.dat',
                 sep='::', names=['userId', 'movieId', 'rating', 'timestamp'], engine='python')

n_users = df['userId'].nunique()
n_items = df['movieId'].nunique()

# Fixed seed so the train/test split (and every number derived from it) is
# reproducible across kernel restarts.
train_data, test_data = train_test_split(df, test_size=0.2, random_state=42)

# Both matrices must share the same axes: row r / column c has to denote the
# same user / movie in train and test. Pivoting each split independently
# yields different shapes and orderings, which makes a mask built from the
# test matrix unusable against predictions built from the train matrix.
# Re-indexing onto the ids of the full data set aligns the two.
user_ids = np.sort(df['userId'].unique())
movie_ids = np.sort(df['movieId'].unique())

# NOTE(review): these are dense (n_users x n_items) arrays; check that the
# product fits in memory for this data set before running.
# Missing ratings are encoded as 0.
train_matrix = (
    train_data.pivot(index='userId', columns='movieId', values='rating')
    .reindex(index=user_ids, columns=movie_ids)
    .fillna(0)
    .values
)
test_matrix = (
    test_data.pivot(index='userId', columns='movieId', values='rating')
    .reindex(index=user_ids, columns=movie_ids)
    .fillna(0)
    .values
)

class MatrixFactorization:
    """Regularized matrix factorization trained with stochastic gradient descent.

    Approximates the rating matrix ``R`` by ``P @ Q.T``. Only entries with
    ``R[i, j] > 0`` are treated as observed ratings; every other cell is
    skipped during training.

    Parameters
    ----------
    R : array-like, 2-D
        Rating matrix (rows and columns are called users and items here).
    K : int
        Number of latent factors.
    steps : int, default 5000
        Number of full passes over the observed entries.
    alpha : float, default 0.0002
        Learning rate.
    beta : float, default 0.02
        Regularization weight applied to both factor matrices.
    seed : int or None, default None
        Seed for the factor initialization. ``None`` draws from NumPy's
        global random state, as before.
    """

    def __init__(self, R, K, steps=5000, alpha=0.0002, beta=0.02, seed=None):
        self.R = R
        self.K = K
        self.steps = steps
        self.alpha = alpha
        self.beta = beta
        self.seed = seed

    def train(self):
        """Fit the factor matrices and return them.

        Returns
        -------
        (P, Q) : tuple of numpy.ndarray
            ``P`` has shape ``(n_users, K)`` and ``Q`` has shape
            ``(n_items, K)``; both are also stored on the instance.
        """
        R = np.asarray(self.R)
        n_users, n_items = R.shape

        rng = np.random if self.seed is None else np.random.RandomState(self.seed)
        self.P = rng.rand(n_users, self.K)
        self.Q = rng.rand(n_items, self.K)

        # The set of observed cells never changes, so locate it once instead
        # of rescanning the whole matrix on every pass. np.nonzero returns
        # indices in row-major order, i.e. the same order as nested i/j loops.
        rows, cols = np.nonzero(R > 0)
        observed = list(zip(rows.tolist(), cols.tolist(), R[rows, cols].tolist()))

        for _ in range(self.steps):
            for i, j, rating in observed:
                # Snapshot the user factors: both updates must use the values
                # the error was computed from, otherwise the item update is
                # taken at a different point than the user update.
                p_old = self.P[i, :].copy()
                error = rating - np.dot(p_old, self.Q[j, :])
                self.P[i, :] += self.alpha * (2 * error * self.Q[j, :] - self.beta * p_old)
                self.Q[j, :] += self.alpha * (2 * error * p_old - self.beta * self.Q[j, :])

        return self.P, self.Q

# Fit the from-scratch factorization on the training matrix with 10 latent factors.
mf = MatrixFactorization(train_matrix, K=10)
P, Q = mf.train()

# Reconstruct the full rating matrix and score it on held-out ratings only
# (cells of the test matrix that are greater than zero).
predicted_matrix = P @ Q.T
held_out = test_matrix > 0
squared_errors = (test_matrix[held_out] - predicted_matrix[held_out]) ** 2
mse = np.mean(squared_errors)

print("\nMSE для матричної факторизації з нуля:", mse)

# Bar chart of the cross-validation metrics: one group per algorithm,
# one bar per metric.
metrics_df = pd.DataFrame(results).T
ax = metrics_df.plot(kind='bar', figsize=(10, 6))
ax.set_title('Порівняння алгоритмів за RMSE та MAE')
ax.set_ylabel('Середнє значення')
ax.set_xlabel('Алгоритм')
plt.setp(ax.get_xticklabels(), rotation=0)
plt.show()

NameError: name 'Dataset' is not defined