In [1]:
%pylab inline

import numpy as np
import pandas as pd
import math
import sklearn

from tqdm import tqdm_notebook

%pylab is deprecated, use %matplotlib inline and import the required libraries.
Populating the interactive namespace from numpy and matplotlib


In [2]:
df_info = pd.read_csv('Music Info.csv')
df_users = pd.read_csv('User Listening History.csv')

In [None]:
df_info = df_info.dropna(subset=['tags', 'genre'], how='all')

In [None]:
unique_genres = df_info['genre'].dropna().unique()
unique_genres = sorted(unique_genres)

tags_to_genre = {tag.lower(): tag for tag in unique_genres}

# Функция для назначения жанра на основе тегов
def assign_genre_based_on_tags_case_sensitive(tags_str, existing_genres):
    if pd.notna(tags_str) and tags_str != "":
        first_tag = tags_str.split(',')[0].strip().lower()

        if first_tag in existing_genres:
            return existing_genres[first_tag]
    return "Other"

df_info['genre'] = df_info.apply(
    lambda row: row['genre'] if pd.notna(row['genre']) and row['genre'] in unique_genres else assign_genre_based_on_tags_case_sensitive(row['tags'], tags_to_genre),
    axis=1
)

In [3]:
grouped_users = df_users.groupby(['user_id', 'track_id']).sum().reset_index()
grouped_users.head()

Unnamed: 0,user_id,track_id,playcount
0,00000b722001882066dff9d2da8a775658053ea0,TRQEBOU128F425D087,1
1,00001638d6189236866af9bbf309ae6c2347ffdc,TRBCDMC128F1452976,1
2,0000175652312d12576d9e6b84f600caa24c4715,TRHVDTF128F428D64C,1
3,00001cf0dce3fb22b0df0f3a1d9cd21e38385372,TREZYWT128F93191B7,2
4,00001cf0dce3fb22b0df0f3a1d9cd21e38385372,TRGEIDA128F933B4B8,1


In [None]:
# Объединение датасетов
df_merged = pd.merge(grouped_users, df_info, on='track_id', how='left')
df_merged.head()
df_merged.info()

In [5]:
# Выборка подмножества пользователей
sampled_users = grouped_users['user_id'].drop_duplicates().sample(frac=0.04)
df_sampled = grouped_users[grouped_users['user_id'].isin(sampled_users)]

In [6]:
# Определение числа тестовых образцов
num_test_samples = 10

# Использование groupby с tail и head для получения тестовых и тренировочных наборов
test = df_sampled.groupby('user_id').tail(num_test_samples)
train = df_sampled.drop(test.index)

# Вывод размеров тренировочного и тестового наборов
print(train.shape, test.shape)


(163088, 3) (221815, 3)


In [6]:
train = train.dropna(subset=['name'], how='all')
test = test.dropna(subset=['name'], how='all')

KeyError: ['name']

In [7]:
interactions = (
    train
    .groupby('user_id')['track_id'].agg(lambda x: list(x))
    .reset_index()
    .rename(columns={'track_id': 'true_train'})
    .set_index('user_id')
)

interactions['true_test'] = (
    test
    .groupby('user_id')['track_id'].agg(lambda x: list(x))
)

# заполнение пропусков пустыми списками
interactions.loc[pd.isnull(interactions.true_test), 'true_test'] = [
    [''] for x in range(len(interactions.loc[pd.isnull(interactions.true_test), 'true_test']))]

interactions.head(5)

Unnamed: 0_level_0,true_train,true_test
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1
0005eb11fd1dad47e6e6719a4db30340073a9e38,"[TRCJAHJ128E07815B6, TRCKWGF12903CD2DCD, TRCXW...","[TRPWIGO128F931BAEB, TRQPSHM128F92F29ED, TRRLG..."
000d22baa9fe50874f9482f8eaf57f00aa32a755,"[TRABFDT12903CADD73, TRAWRKT128E0788857, TRBKF...","[TRPFYYL128F92F7144, TRPGPDK12903CCC651, TRREL..."
000ef25cc955ad5841c915d269432eea41f4a1a5,"[TRCCRQF128E0786B9E, TRENOXQ128E0786BBC, TRFSK...","[TRQVZKY128E0786BA1, TRQYJKD128F92F418E, TRRMN..."
000f295d2b9ddd4ed0bfe674f010dbc4376270f7,"[TRAFUBQ128F92EB825, TRDVSCH128F931B5E0, TREEZ...","[TRIQHAU128F42775E4, TRJKAKF128F92F50DF, TRONW..."
00185cb7e8455bcce9a17f0ee9f6f772b6f4f8b9,"[TRCPXID128F92D5D3C, TRCRILW128F428AC12, TRCXJ...","[TRPGPDK12903CCC651, TRPXTDY128F4257BED, TRRLP..."


In [8]:
def calc_precision(column):
    return (
        interactions
        .apply(
            lambda row:
            len(set(row['true_test']).intersection(
                set(row[column]))) /
            min(len(row['true_test']) + 0.001, 10.0),
            axis=1)).mean()

In [9]:
ratings = pd.pivot_table(
    train,
    values='playcount',
    index='user_id',
    columns='track_id').fillna(0)

ratings

track_id,TRAAAED128E0783FAB,TRAABJS128F9325C99,TRAACER128F4290F96,TRAACKM12903CE5BE9,TRAACPH12903CF5F14,TRAADCQ128F932EC14,TRAADNN128F42B1D94,TRAADQW128F427CE68,TRAADQX128F422B4CF,TRAADVO128E07999E9,...,TRYWUKH128F423CC6B,TRYWXOD128F931A560,TRYXAQC128F92C5A77,TRYXNBH128F427FCF9,TRYYEVU128E0789673,TRYYJHJ128F9340B19,TRZALGW128F4296174,TRZANYY128E0781F8E,TRZCLHB128F4288791,TRZDDRC128F9346B9C
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0005eb11fd1dad47e6e6719a4db30340073a9e38,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
000d22baa9fe50874f9482f8eaf57f00aa32a755,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
000ef25cc955ad5841c915d269432eea41f4a1a5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
000f295d2b9ddd4ed0bfe674f010dbc4376270f7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
00185cb7e8455bcce9a17f0ee9f6f772b6f4f8b9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
fff206000d102a7671de0e82965f6f2f89fcf192,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
fff5487257d5401605dbecfe6555baabcc224b60,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
fff5a0a0022842a6c713ee78935c347df31d6a0b,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
fff5fdcc0a663f1efd5417dfeae1e26b3363a3d2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [10]:
ratings_m = ratings.values

In [11]:
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import csr_matrix

# Создание разреженной матрицы оценок
sparse_ratings = csr_matrix(ratings)

# Вычисление сходства
similarity_users = cosine_similarity(ratings)


In [12]:
# Вычисление суммарных рейтингов один раз
sum_ratings = np.argsort(ratings_m.sum(axis=0))[::-1]
sorted_columns = ratings.columns[sum_ratings]

# Предварительное вычисление рекомендаций для всех пользователей
all_recommendations = np.array([sorted_columns[~np.in1d(sorted_columns, interactions.iloc[i])][:10]
                                for i in range(len(similarity_users))])

# Создание маски для фильтрации пользователей с ненулевым сходством
mask = (similarity_users > 0).sum(axis=1) > 0

# Применение маски
prediction_user_based = [list(all_recommendations[i]) if mask[i] else [] for i in range(len(similarity_users))]

# Добавление предсказаний в DataFrame
interactions['prediction_user_based'] = prediction_user_based


In [13]:
calc_precision('prediction_user_based')

0.01086990111248455

In [11]:
from scipy.linalg import svd

In [12]:
U, sigma, V = svd(ratings)

print(ratings.shape, U.shape, sigma.shape, V.shape)

(10589, 16157) (10589, 10589) (10589,) (16157, 16157)


In [13]:
Sigma = np.zeros((10589, 16157))
Sigma[:10589, :10589] = np.diag(sigma)

new_ratings = U.dot(Sigma).dot(V)

print(sum(sum((new_ratings - ratings.values) ** 2)))

7.446633426855672e-23


In [15]:
K = 100

sigma[K:] = 0
Sigma = np.zeros((10589, 16157))
Sigma[:10589, :10589] = np.diag(sigma)

In [16]:
new_ratings = U.dot(Sigma).dot(V)

print(sum(sum((new_ratings - ratings.values) ** 2)))
print(sum(sum((ratings.values.mean() - ratings.values) ** 2)))

2029763.557201076
3434096.122016915


In [17]:
new_ratings = pd.DataFrame(new_ratings, index=ratings.index, columns=ratings.columns)

predictions = []

for personId in tqdm_notebook(interactions.index):
    prediction = (
        new_ratings
        .loc[personId]
        .sort_values(ascending=False)
        .index.values
    )

    predictions.append(
        list(prediction[~np.in1d(
            prediction,
            interactions.loc[personId, 'true_train'])])[:10])

interactions['prediction_svd'] = predictions

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for personId in tqdm_notebook(interactions.index):


  0%|          | 0/10589 [00:00<?, ?it/s]

In [18]:
calc_precision('prediction_svd')

0.02476154499952781

In [None]:
interactions.head()

In [None]:
interactions.to_csv('1.csv')

In [20]:
import numpy as np
import pandas as pd
from scipy.linalg import svd

def generate_recommendations(user_track_ids, ratings, new_ratings, num_recommendations=10):
    user_preference_vector = create_preference_vector(user_track_ids, ratings.columns)
    user_predicted_ratings = np.dot(new_ratings, user_preference_vector).flatten()

    # Отфильтровываем уже выбранные треки и сортируем оставшиеся
    recommendations = sort_and_filter_tracks(user_predicted_ratings, user_track_ids, ratings.columns)

    return recommendations[:num_recommendations]

def create_preference_vector(user_track_ids, all_track_ids):
    # Вектор предпочтений, где 1 означает, что пользователь выбрал этот трек
    preference_vector = np.zeros(len(all_track_ids))
    track_indices = [list(all_track_ids).index(track_id) for track_id in user_track_ids if track_id in all_track_ids]
    preference_vector[track_indices] = 1
    return preference_vector

def sort_and_filter_tracks(user_predicted_ratings, user_track_ids, all_track_ids):
    track_ratings = {track_id: rating for track_id, rating in zip(all_track_ids, user_predicted_ratings) if track_id not in user_track_ids}
    sorted_tracks = sorted(track_ratings.items(), key=lambda x: x[1], reverse=True)
    return [track[0] for track in sorted_tracks]

# Пример использования:
user_track_ids = ['TRIOREW128F424EAF0', 'TRLNZBD128F935E4D8']
recommendations = generate_recommendations(user_track_ids, ratings, new_ratings, num_recommendations=10)
print(recommendations)


['TRAAAED128E0783FAB', 'TRAABJS128F9325C99', 'TRAACER128F4290F96', 'TRAACKM12903CE5BE9', 'TRAACPH12903CF5F14', 'TRAADCQ128F932EC14', 'TRAADNN128F42B1D94', 'TRAADQW128F427CE68', 'TRAADQX128F422B4CF', 'TRAADVO128E07999E9']


In [22]:
import numpy as np

# Предположим, что U, sigma и Vt - это ваши матрицы, полученные из SVD
# Сохранение матриц U, Sigma и Vt
np.save('U_matrix.npy', U)
np.save('Sigma_values.npy', sigma)
np.save('Vt_matrix.npy', V)

In [23]:
interactions.to_csv('interactions.csv')
ratings.to_csv('ratings.csv')
new_ratings.to_csv('new_ratings.csv')