In [84]:
import pandas as pd
import numpy as np
import json
import os
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import roc_auc_score
from sklearn.metrics import precision_score, recall_score

# Подсасываем датасет

In [2]:
train = pd.read_csv("prep_train_data.csv",index_col=[0])

In [4]:
train.shape

(3314966, 20)

In [18]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3314966 entries, 0 to 3314965
Data columns (total 20 columns):
 #   Column                  Dtype 
---  ------                  ----- 
 0   msno                    int64 
 1   song_id                 int64 
 2   source_system_tab       int64 
 3   source_screen_name      int64 
 4   source_type             int64 
 5   target                  int64 
 6   city                    int64 
 7   bd                      int64 
 8   gender                  object
 9   registered_via          int64 
 10  registration_init_time  object
 11  expiration_date         object
 12  registration_year       int64 
 13  registration_month      int64 
 14  registration_day        int64 
 15  expiration_year         int64 
 16  expiration_month        int64 
 17  expiration_day          int64 
 18  nogender_noage          bool  
 19  membership_duration     int64 
dtypes: bool(1), int64(16), object(3)
memory usage: 509.0+ MB


In [5]:
train.head()

Unnamed: 0,msno,song_id,source_system_tab,source_screen_name,source_type,target,city,bd,gender,registered_via,registration_init_time,expiration_date,registration_year,registration_month,registration_day,expiration_year,expiration_month,expiration_day,nogender_noage,membership_duration
0,7640,60186,1,7,7,1,1,27,3,7,2012-01-02,2017-10-05,2012,1,2,2017,10,5,True,2103
1,16139,179889,4,8,5,1,13,24,female,9,2011-05-25,2017-09-11,2011,5,25,2017,9,11,False,2301
2,16139,97284,4,8,5,1,13,24,female,9,2011-05-25,2017-09-11,2011,5,25,2017,9,11,False,2301
3,16139,19141,4,8,5,1,13,24,female,9,2011-05-25,2017-09-11,2011,5,25,2017,9,11,False,2301
4,7640,26909,1,7,7,1,1,27,3,7,2012-01-02,2017-10-05,2012,1,2,2017,10,5,True,2103


# Пробуем ALS

In [59]:
import implicit
from scipy.sparse import csr_matrix
from sklearn.model_selection import train_test_split

## Создание sparse матрицы

In [66]:
user_ids = train["msno"].values
song_ids = train["song_id"].values
target = train["target"].values

matrix = csr_matrix((target, (user_ids, song_ids)),
                    shape=(len(train['msno'].unique()), len(train['song_id'].unique())))

# Разделение данных на train/test
train_data, test_data = train_test_split(train, test_size=0.2, random_state=42, shuffle=True)

## Обучение ALS

In [67]:
model = implicit.als.AlternatingLeastSquares(
    factors=64,       # Количество латентных факторов
    iterations=15,    # Количество итераций
    regularization=0.1, 
    random_state=42
)

model.fit(matrix)

  0%|          | 0/15 [00:00<?, ?it/s]

### Предскажем треки для рандомного пользователя

In [81]:
def recommend(user_id, n=10):
    recommendations = model.recommend(user_id, matrix[user_id], N=n)
    song_ids = recommendations[0]
    return song_ids

user_example = train.sample(1)['msno'].values[0]
print(f"Рекомендации для {user_example}: {recommend(user_example)}")

Рекомендации для 28530: [113266 234916 107549  37054 215222 199823 180395  28393 167597  71477]


### Посчитаем метрики

In [88]:
test_users = test_data["msno"]
test_songs = test_data["song_id"]

user_factors = model.user_factors[test_users]
song_factors = model.item_factors[test_songs]
predicted_scores = np.sum(user_factors * song_factors, axis=1)

auc = roc_auc_score(test_data["target"], predicted_scores)
precision = precision_score(test_data['target'], predicted_scores>0.6)
recall = recall_score(test_data['target'], predicted_scores>0.6)
print("AUC: ", auc)
print("precision: ", precision)
print("recall: ", recall)

AUC:  0.7454892185230693
precision:  0.8439884687129049
recall:  0.20541238995367975
