In [42]:
import numpy as np
import pandas as pd

ratings_df = pd.read_csv('ratings.csv')
ratings_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [43]:
n = 100000
ratings_df_sample = ratings_df[:n]

n_users = len(ratings_df_sample['userId'].unique())
n_movies = len(ratings_df_sample['movieId'].unique())
print("Количество уникальных пользователей:", n_users, "\n","Количество уникальных фильмов:", n_movies)

Количество уникальных пользователей: 610 
 Количество уникальных фильмов: 9569


In [44]:
#отмасштабируем идентификаторы фильмов, чтобы они начинались с 1 и заканчивались на n_movies:
movie_ids = ratings_df_sample['movieId'].unique()

def scale_movie_id(movie_id):
    scaled = np.where(movie_ids == movie_id)[0][0] + 1
    return scaled

ratings_df_sample['movieId'] = ratings_df_sample['movieId'].apply(scale_movie_id)
ratings_df_sample.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ratings_df_sample['movieId'] = ratings_df_sample['movieId'].apply(scale_movie_id)


Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,2,4.0,964981247
2,1,3,4.0,964982224
3,1,4,5.0,964983815
4,1,5,5.0,964982931


In [45]:
#разделим датасет на два других сета - обучающий и тестовый
from sklearn.model_selection import train_test_split

train_data, test_data = train_test_split(ratings_df_sample, test_size=0.25)

print('Train shape: {}'.format(train_data.shape))
print('Test shape: {}'.format(test_data.shape))


Train shape: (75000, 4)
Test shape: (25000, 4)


In [46]:
from sklearn.metrics import mean_squared_error
from math import sqrt

def rmse(prediction, ground_truth):
    # Оставим оценки, предсказанные алгоритмом, только для соотвествующего набора данных

    prediction = np.nan_to_num(prediction)[ground_truth.nonzero()].flatten()
    # Оставим оценки, которые реально поставил пользователь, только для соотвествующего набора данных

    ground_truth = np.nan_to_num(ground_truth)[ground_truth.nonzero()].flatten()

    mse = mean_squared_error(prediction, ground_truth)
    return sqrt(mse)

In [47]:
train_data_matrix = np.zeros((n_users, n_movies))
for line in train_data.itertuples():
    train_data_matrix[line[1] - 1, line[2] - 1] = line[3]

test_data_matrix = np.zeros((n_users, n_movies))
for line in test_data.itertuples():
    test_data_matrix[line[1] - 1, line[2] - 1] = line[3]

In [48]:
from  sklearn.metrics.pairwise import pairwise_distances

# расчет косинусного расстояния

user_similarity = pairwise_distances(train_data_matrix, metric='cosine')
item_similarity = pairwise_distances(train_data_matrix.T, metric='cosine')

In [49]:
from scipy.spatial import distance

print(distance.cosine([3,3],[2,3]))
print(distance.cosine([3, 3],[1, 1.5]))
print(distance.cosine([3, 3],[1, 3]))

0.01941932430907989
0.01941932430907989
0.10557280900008414


In [50]:
from  sklearn.metrics.pairwise import pairwise_distances

demo_data = [[5,5,5,0,0], [4,1,0,5,3], [1,0,0,5,0], [5,0,5,0,4]]
pairwise_distances(demo_data, metric='cosine')

array([[1.11022302e-16, 5.95773958e-01, 8.86772297e-01, 2.89330945e-01],
       [5.95773958e-01, 1.11022302e-16, 2.03609197e-01, 4.48439797e-01],
       [8.86772297e-01, 2.03609197e-01, 0.00000000e+00, 8.79298863e-01],
       [2.89330945e-01, 4.48439797e-01, 8.79298863e-01, 2.22044605e-16]])

In [51]:
def naive_predict(top):

    top_similar_ratings = np.zeros((n_users, top, n_movies))

    for i in range(n_users):

        top_sim_users = user_similarity[i].argsort()[1:top + 1]

        top_similar_ratings[i] = train_data_matrix[top_sim_users]

    pred = np.zeros((n_users, n_movies))
    for i in range(n_users):
        pred[i] = top_similar_ratings[i].sum(axis=0) / top

    return pred


def naive_predict_item(top):
    top_similar_ratings = np.zeros((n_movies, top, n_users))

    for i in range(n_movies):
        top_sim_movies = item_similarity[i].argsort()[1:top + 1]
        top_similar_ratings[i] = train_data_matrix.T[top_sim_movies]

    pred = np.zeros((n_movies, n_users))
    for i in range(n_movies):
        pred[i] = top_similar_ratings[i].sum(axis=0) / top

    return pred.T

naive_pred = naive_predict(7)
print('User-based RMSE: ', rmse(naive_pred, test_data_matrix))

naive_pred_item = naive_predict_item(7)
print('Item-based RMSE: ', rmse(naive_pred_item, test_data_matrix))


User-based CF RMSE:  2.8235506666867054
Item-based CF RMSE:  3.0215278605773106


In [52]:
def k_fract_predict(top):
    top_similar = np.zeros((n_users, top))

    for i in range(n_users):
        user_sim = user_similarity[i]
        top_sim_users = user_sim.argsort()[1:top + 1]#[-top:]


        for j in range(top):
            top_similar[i, j] = top_sim_users[j]

    abs_sim = np.abs(user_similarity)
    pred = np.zeros((n_users, n_movies))

    for i in range(n_users):
        indexes = top_similar[i].astype(np.int)
        numerator = user_similarity[i][indexes]

        product = numerator.dot(train_data_matrix[indexes])

        denominator = abs_sim[i][top_similar[i].astype(np.int)].sum()

        pred[i] = product / denominator

    return pred


def k_fract_predict_item(top):
    flag = True
    top_similar = np.zeros((n_movies, top))

    for i in range(n_movies):
        movies_sim = item_similarity[i]
        top_sim_movies = movies_sim.argsort()[1:top + 1]

        for j in range(top):
            top_similar[i, j] = top_sim_movies.T[j]

    abs_sim = np.abs(item_similarity)
    pred = np.zeros((n_movies, n_users))


    for i in range(n_users):
        indexes = top_similar[i].astype(np.int_)
        numerator = item_similarity[i][indexes]

        product = numerator.dot(train_data_matrix.T[indexes])

        denominator = abs_sim[i][indexes].sum()
        denominator = denominator if denominator != 0 else 1

        pred[i] = product / denominator

    return pred.T


k_predict = k_fract_predict(7)
print('User-based RMSE: ', rmse(k_predict, test_data_matrix))

k_predict_item = k_fract_predict_item(7)
print('Item-based RMSE: ', rmse(k_predict_item, test_data_matrix))


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  indexes = top_similar[i].astype(np.int)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  denominator = abs_sim[i][top_similar[i].astype(np.int)].sum()


User-based CF RMSE:  2.8244057029955214
Item-based CF RMSE:  3.355662470857755


In [53]:
def k_fract_mean_predict(top):
    top_similar = np.zeros((n_users, top))

    for i in range(n_users):
        user_sim = user_similarity[i]
        top_sim_users = user_sim.argsort()[1:top + 1]

        for j in range(top):
            top_similar[i, j] = top_sim_users[j]

    abs_sim = np.abs(user_similarity)
    pred = np.zeros((n_users, n_movies))

    for i in range(n_users):
        indexes = top_similar[i].astype(np.int)
        numerator = user_similarity[i][indexes]

        mean_rating = np.array([x for x in train_data_matrix[i] if x > 0]).mean()
        diff_ratings = train_data_matrix[indexes] - train_data_matrix[indexes].mean()
        numerator = numerator.dot(diff_ratings)
        denominator = abs_sim[i][top_similar[i].astype(np.int)].sum()

        pred[i] = mean_rating + numerator / denominator

    return pred

def k_fract_mean_predict_item(top):
    top_similar = np.zeros((n_movies, top))

    for i in range(n_movies):
        movie_sim = item_similarity[i]
        top_sim_movies = movie_sim.argsort()[1:top + 1]

        for j in range(top):
            top_similar[i, j] = top_sim_movies[j]

    abs_sim = np.abs(item_similarity)
    pred = np.zeros((n_movies, n_users))

    for i in range(n_movies):
        indexes = top_similar[i].astype(np.int_)
        numerator = item_similarity[i][indexes]

        diff_ratings = train_data_matrix.T[indexes] - train_data_matrix.T[indexes].mean()
        numerator = numerator.dot(diff_ratings)
        denominator = abs_sim[i][top_similar[i].astype(np.int)].sum()
        denominator = denominator if denominator != 0 else 1

        mean_rating = np.array([x for x in train_data_matrix.T[i] if x > 0]).mean()
        mean_rating = 0 if np.isnan(mean_rating) else mean_rating
        pred[i] = mean_rating + numerator / denominator

    return pred.T

k_predict = k_fract_mean_predict(7)
print('User-based RMSE: ', rmse(k_predict, test_data_matrix))

k_predict_item = k_fract_mean_predict_item(7)
print('Item-based RMSE: ', rmse(k_predict_item, test_data_matrix))


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  indexes = top_similar[i].astype(np.int)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  denominator = abs_sim[i][top_similar[i].astype(np.int)].sum()


User-based CF RMSE:  1.466365032397601


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  denominator = abs_sim[i][top_similar[i].astype(np.int)].sum()
  mean_rating = np.array([x for x in train_data_matrix.T[i] if x > 0]).mean()
  ret = ret.dtype.type(ret / rcount)


Item-based CF RMSE:  1.4288150670459723
