<a href="https://colab.research.google.com/github/EgorSolovei/VK-recommendation-system/blob/main/create_user_movie_matrix.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Filtering

In [None]:
import pandas as pd
import numpy as np
import time 
from google.colab import drive

# Mount Google Drive (Colab only) so files under /content/drive/ can be read and written.
drive.mount('/content/drive/')

Mounted at /content/drive/


In [None]:
# Read the raw ratings table from the mounted Google Drive folder.
ratings_csv_path = '/content/drive/MyDrive/Colab_Notebooks/ML_&_AI/Pratice/VK recomd/data/ratings.csv'
ratings = pd.read_csv(ratings_csv_path)

In [None]:
# The timestamp column is not used anywhere below, so drop it.
# Rebinding the name (instead of inplace=True) together with errors='ignore'
# keeps this cell idempotent: re-running it no longer raises KeyError once the
# column is already gone.
ratings = ratings.drop(columns=['timestamp'], errors='ignore')
ratings.head()

Unnamed: 0,userId,movieId,rating
0,1,307,3.5
1,1,481,3.5
2,1,1091,1.5
3,1,1257,4.5
4,1,1449,4.5


In [None]:
# There is not enough memory to build a matrix of this size, so
# low-information profiles have to be removed first.
# Shapes of the unique movie ids and unique user ids:
ratings['movieId'].unique().shape, ratings['userId'].unique().shape

((53889,), (283228,))

In [None]:
# Distribution of the number of ratings per user.
# Users who gave at least 50 ratings will be kept.
(
    ratings
    .groupby('userId')['movieId']
    .count()
    .reset_index(name='count_film')
    .describe()
)

Unnamed: 0,userId,count_film
count,283228.0,283228.0
mean,141614.5,97.989761
std,81761.025358,212.760722
min,1.0,1.0
25%,70807.75,15.0
50%,141614.5,30.0
75%,212421.25,95.0
max,283228.0,23715.0


In [None]:
# Distribution of the number of ratings per movie.
# Movies that have at least 48 ratings will be kept.
(
    ratings
    .groupby('movieId')['userId']
    .count()
    .reset_index(name='count_score')
    .describe()
)

Unnamed: 0,movieId,count_score
count,53889.0,53889.0
mean,109110.13795,515.011301
std,60910.551686,2934.758939
min,1.0,1.0
25%,66310.0,2.0
50%,123105.0,7.0
75%,160770.0,48.0
max,193886.0,97999.0


In [None]:
# Filter the data: drop sparse user profiles, then sparse movie profiles.

# Minimum number of ratings a profile needs in order to be kept.
MIN_RATINGS_PER_USER = 50
MIN_RATINGS_PER_MOVIE = 48

# Step 1: keep users with at least MIN_RATINGS_PER_USER ratings.
# transform('size') broadcasts each group's row count back onto its rows, which
# selects the same rows as groupby(...).filter(lambda x: len(x) >= n) without
# calling a Python lambda once per group.
user_rating_counts = ratings.groupby('userId')['userId'].transform('size')
active_user_ratings = ratings[user_rating_counts >= MIN_RATINGS_PER_USER]

# Step 2: among the remaining rows, keep movies with at least
# MIN_RATINGS_PER_MOVIE ratings. Because this runs after step 1 and removes
# rows, a user's final rating count can end up below MIN_RATINGS_PER_USER.
movie_rating_counts = active_user_ratings.groupby('movieId')['movieId'].transform('size')
user_movie_df = (
    active_user_ratings[movie_rating_counts >= MIN_RATINGS_PER_MOVIE]
    .reset_index(drop=True)  # fresh 0..N-1 index, no leftover 'index' column
)

# Free the intermediates; they are large and not needed afterwards.
del user_rating_counts, movie_rating_counts, active_user_ratings

user_movie_df.head()

Unnamed: 0,userId,movieId,rating
0,4,1,4.0
1,4,2,4.0
2,4,5,2.0
3,4,6,4.5
4,4,10,4.0


In [None]:
# Shapes of the unique movie ids and unique user ids left after filtering.
user_movie_df['movieId'].unique().shape, user_movie_df['userId'].unique().shape

((13338,), (109672,))

In [None]:
# Memory taken by the filtered frame, in megabytes (bytes / (1024 * 1024)).
user_movie_df.memory_usage(index=True).sum() / (1024 * 1024)

553.5987243652344

In [None]:
# (rows, columns) of the filtered ratings frame.
user_movie_df.shape

(24187092, 3)

In [None]:
# Column dtypes of the filtered ratings frame.
user_movie_df.dtypes

userId       int64
movieId      int64
rating     float64
dtype: object

In [None]:
# Ratings-per-user summary, recomputed on the filtered frame.
(
    user_movie_df
    .groupby('userId')['movieId']
    .count()
    .reset_index(name='count_film')
    .describe()
)

Unnamed: 0,userId,count_film
count,109672.0,109672.0
mean,141931.087534,220.540266
std,81624.528656,280.878359
min,4.0,9.0
25%,71226.75,76.0
50%,142316.5,126.0
75%,212467.0,246.0
max,283228.0,9404.0


In [None]:
# Ratings-per-movie summary, recomputed on the filtered frame.
(
    user_movie_df
    .groupby('movieId')['userId']
    .count()
    .reset_index(name='count_score')
    .describe()
)

Unnamed: 0,movieId,count_score
count,13338.0,13338.0
mean,41332.142825,1813.397211
std,49522.650888,4663.156555
min,1.0,48.0
25%,3760.25,109.0
50%,8139.0,320.0
75%,72556.25,1250.0
max,192307.0,69322.0


In [None]:
# Cast the columns to narrower dtypes to shrink the frame in memory.
compact_dtypes = {'userId': 'int32', 'movieId': 'int32', 'rating': 'float16'}
user_movie_df = user_movie_df.astype(compact_dtypes)
user_movie_df.dtypes

userId       int32
movieId      int32
rating     float16
dtype: object

In [None]:
# Memory taken by the frame after the dtype change, in megabytes (bytes / 2**20).
user_movie_df.memory_usage(index=True).sum() / 2**20

230.66620635986328

In [None]:
# Shapes of the unique movie ids and unique user ids after the dtype change.
user_movie_df['movieId'].unique().shape, user_movie_df['userId'].unique().shape

((13338,), (109672,))

In [None]:
# Remove the raw ratings frame from the namespace; the cells below only use user_movie_df.
del ratings

In [None]:
# Save the filtered ratings to a local CSV, then copy the file to Google Drive.
# This data is intended for building user and movie features later.
# Note: to_csv is called with its defaults, so the row index is written as the first column.
user_movie_df.to_csv('user_movie_raw.csv')
!cp user_movie_raw.csv "/content/drive/MyDrive/Colab_Notebooks/ML_&_AI/Pratice/VK recomd/data"

# Create matrix 


Преобразовать данные в один массив с помощью библиотек не получилось — не хватает оперативной памяти. Поэтому будем делать «в лоб»: для каждого пользователя создаём отдельный нулевой вектор и по индексу фильма записываем в него оценку этого пользователя. Считать эту матрицу придётся долго, порядка 8 часов.

In [None]:
# создадим нужные названия колонок
col = ['userId']
for movie_id in list(user_movie_df.movieId.unique()):
    col.append(f'{movie_id}')

user_movie = pd.DataFrame(columns=col)

Поменял структуру данных с pandas.Series на обычный список. Причина: все элементы Series имеют один тип данных, и он приводится к float16 (для экономии памяти). Но float16 точно представляет целые числа только до 2048, поэтому при приведении userId к float16 разные userId превращаются в одно и то же значение — возникают коллизии.


In [None]:
# Build the dense user x movie rating matrix in one vectorized pass.
#
# Why this replaces the per-user Python loop:
#   * the loop only flushed `temp_lst` when count % 1000 == 0, so the trailing
#     partial batch of user profiles was never added to `user_movie`;
#   * every user triggered a boolean scan of the whole ratings frame and every
#     rating a second scan, while pd.concat re-copied the accumulated frame on
#     each flush.
# Writing the ratings straight into a preallocated float16 array gives the same
# cell values, covers every user and needs a single allocation.
# Difference to be aware of: the result has a plain 0..N-1 row index, whereas
# the loop's concat repeated 0..999 for every batch.

MAX_USERS = None  # set to an int to build only the first N user profiles

start = time.time()

# Keep the first rating of any repeated (userId, movieId) pair, as the loop did
# by taking `.values[0]`.
rating_rows = user_movie_df.drop_duplicates(subset=['userId', 'movieId'], keep='first')

# Row position of each rating: users are numbered in order of first appearance,
# which is the order of user_movie_df.userId.unique().
user_codes, unique_user_ids = pd.factorize(rating_rows['userId'])

# Column position of each rating inside col[1:] (col[0] is 'userId').
movie_codes = pd.Index([int(label) for label in col[1:]]).get_indexer(rating_rows['movieId'])
assert (movie_codes >= 0).all(), '`col` does not cover every movieId - re-run the cell that builds it'

rating_values = rating_rows['rating'].to_numpy(dtype=np.float16)
del rating_rows

if MAX_USERS is not None:
    selected = user_codes < MAX_USERS
    user_codes = user_codes[selected]
    movie_codes = movie_codes[selected]
    rating_values = rating_values[selected]
    unique_user_ids = unique_user_ids[:MAX_USERS]

lst_unique_userId = list(unique_user_ids)
count = len(lst_unique_userId)

# Unrated movies stay 0; rated cells get the rating as float16, the dtype the
# loop converted every value to.
ratings_matrix = np.zeros((count, len(col) - 1), dtype=np.float16)
ratings_matrix[user_codes, movie_codes] = rating_values

# userId is kept in its own integer column: casting ids to float16 would make
# distinct ids collapse to the same value.
user_movie = pd.DataFrame(ratings_matrix, columns=col[1:], copy=False)
user_movie.insert(0, 'userId', np.asarray(unique_user_ids))

# Same sanity check the loop performed after each batch.
assert user_movie['userId'].is_unique, 'Повторяющиеся userId!!!'

print(f'Построено {count} профилей пользователей. Уникальных профилей: {user_movie.userId.unique().shape[0]}')
print(f'Время обработки: {(time.time() - start):.02f} секунд')
print(f'Размер матрицы user_movie: {(user_movie.memory_usage(index=True).sum() / 1048576):.02f} мб\n')

In [None]:
# (user profiles built, number of columns in `col`: userId + one per movie).
user_movie.shape

(3000, 13339)

In [None]:
# Preview the first five user profiles.
user_movie.head(5)

Unnamed: 0,userId,1,2,5,6,10,11,16,19,20,...,32906,60551,26676,47868,103449,69498,130044,166149,6588,4955
0,4,4.0,4.0,2.0,4.5,4.0,3.5,4.0,2.0,2.5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,10,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,14,4.5,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,15,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# Save the user x movie matrix to a local CSV, then copy the file to Google Drive.
# Note: to_csv is called with its defaults, so the row index is written as the first column.
user_movie.to_csv('user_movie_small.csv')
!cp user_movie_small.csv "/content/drive/MyDrive/Colab_Notebooks/ML_&_AI/Pratice/VK recomd/data"