In [1]:
import pandas as pd
import numpy as np
import json
import os
from pathlib import Path
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression

# Подсасываем датасеты

In [2]:
train_df = pd.read_csv("train.csv")
# train, test = train_test_split(train_df, test_size=0.2)

In [3]:
members_df = pd.read_csv("preprocessed_members.csv")
songs_df = pd.read_csv("songs.csv")
song_extra_df = pd.read_csv("song_extra_info.csv")

data = pd.merge(train_df, songs_df, on="song_id", how="left")
data = pd.merge(data, members_df, on="msno", how="left")
data = pd.merge(data, song_extra_df, on="song_id", how="left")

# Небольшая предобработка

In [6]:
data["song_pop"] = data.groupby("song_id")["target"].transform("mean")
data["artist_pop"] = data.groupby("artist_name")["target"].transform("mean")
data["genre_pop"] = data.groupby("genre_ids")["target"].transform("mean")

data["song_length"] = data["song_length"] / 60000  # В минутах
data["num_genres"] = data["genre_ids"].str.count("\|") + 1  # Количество жанров
data["release_year"] = data["isrc"].str[5:7].astype(float) + 1900  # Год выпуска из ISRC

# ohe
data = pd.get_dummies(data, columns=["source_system_tab", "source_type"], dummy_na=True)

  data["num_genres"] = data["genre_ids"].str.count("\|") + 1  # Количество жанров


In [8]:
features = [
    # Популярность
    "song_pop", "artist_pop", "genre_pop",
    
    # Пользователи
    "gender", "bd", "city", "membership_duration",
    
    # Песни
    "song_length", "language", "num_genres", "release_year",
    
    # Контекст
    "source_system_tab_discover", "source_system_tab_my_library",
    "source_type_top-hits-for-you", "source_type_local-playlist"
]


features = [f for f in features if f in data.columns]
data[features] = data[features].fillna(data[features].median())


In [12]:
song_popularity = data.groupby("song_id")["target"].mean().reset_index(name="song_pop")
artist_popularity = data.groupby("artist_name")["target"].mean().reset_index(name="artist_pop")
genre_popularity = data.groupby("genre_ids")["target"].mean().reset_index(name="genre_pop")

# Обучение логистической регрессии, которая предсказывает вероятность послушает пользователь трек или нет

In [10]:
features = ["song_pop", "artist_pop", "genre_pop", "membership_duration"]
model = LogisticRegression()
model.fit(data[features], data["target"])

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


# Проверяем работоспособность модели

In [13]:
# Список всех треков и топ-100 популярных
all_songs = songs_df["song_id"].unique()
top_100_songs = song_popularity.sort_values("song_pop", ascending=False).head(100)["song_id"].values

# Функция генерации кандидатов
def generate_candidates(n_top=10, n_random=10):
    top = list(top_100_songs[:n_top])
    random = np.random.choice(list(set(all_songs) - set(top_100_songs)), n_random, replace=False)
    return list(top) + list(random)

# Пример для одного пользователя
user_id = members_df['msno'].sample(1).values[0]
candidate_songs = generate_candidates()

In [14]:
candidates = pd.DataFrame({"msno": [user_id]*20, "song_id": candidate_songs})

candidates = pd.merge(candidates, songs_df, on="song_id", how="left")
candidates = pd.merge(candidates, members_df[members_df["msno"] == user_id], on="msno", how="left")

candidates["song_pop"] = candidates["song_id"].map(song_popularity.set_index("song_id")["song_pop"])
candidates["artist_pop"] = candidates["artist_name"].map(artist_popularity.set_index("artist_name")["artist_pop"])
candidates["genre_pop"] = candidates["genre_ids"].map(genre_popularity.set_index("genre_ids")["genre_pop"])
candidates.fillna(data[candidates.columns].select_dtypes(include='number').drop(['Unnamed: 0'], axis=1).median(), inplace=True)

candidates["prob"] = model.predict_proba(candidates[features])[:, 1]

recommended_songs = candidates.sort_values("prob", ascending=False).head(10)
print(recommended_songs[["song_id", "prob"]])

                                        song_id      prob
7  srJNksUrcETWixC7biZnzKZKrFJXQlQqKUOailvg0j0=  0.925849
6  GqcGdygdhDc1iw+CjaSPnK97x+TIrwVKaMn8qA87aPo=  0.925513
5  GqXkAz33+Lm46o3THB/YPzVMEioU9W2vQ9Dww+ramFo=  0.924463
2  wXcF2UkQiHDsq1F2FwIdyMzTitZEqVo9lpKeV2AKQ8E=  0.922653
1  o+NBOFI7HMxBcOcN2aXy32lmJ3fm9jvWjShObQva5nU=  0.921243
4  srLUOCTC3RVKSgrwokiws1g1x4KErG97Ii0cvwsis5Y=  0.920898
9  70p5ev2lrGSgpdngnVUG2wDbXCryZG/vCDpbGjC9POg=  0.920713
3  70rZK783Q6A1/QEfy0Llo0Ls//n8D/1IH+oHlMJfM8Q=  0.915197
0  DgTYktrctK/EUYgxIEkha33l8LO6mYqWIdekMksFNOo=  0.914949
8  WaqFLIlRxeHCYF0cSBIA+XFtwSR48hzt1E2H90kblkM=  0.914260


In [18]:
songs_df.head()

Unnamed: 0,song_id,song_length,genre_ids,artist_name,composer,lyricist,language
0,CXoTN1eb7AI+DntdU1vbcwGRV4SCIDxZu+YD8JP8r4E=,247640,465,張信哲 (Jeff Chang),董貞,何啟弘,3.0
1,o0kFgae9QtnYgRkVPqLJwa05zIhRlUjfF7O1tDw0ZDU=,197328,444,BLACKPINK,TEDDY| FUTURE BOUNCE| Bekuh BOOM,TEDDY,31.0
2,DwVvVurfpuz+XPuFvucclVQEyPqcpUkHR0ne1RQzPs0=,231781,465,SUPER JUNIOR,,,31.0
3,dKMBWoZyScdxSkihKG+Vf47nc18N9q4m58+b4e7dSSE=,273554,465,S.H.E,湯小康,徐世珍,3.0
4,W3bqWd3T+VeHFzHAUfARgW9AvVRaF4N5Yzm4Mr6Eo/o=,140329,726,貴族精選,Traditional,Traditional,52.0


# Тестируем на 10 случайных пользователей

In [None]:
# Выбор 10 случайных пользователей
sampled_users = data['msno'].drop_duplicates().sample(10, random_state=42).tolist()

# получаем 20 различных песен (топ 10 и 10 случайных)
def prepare_user_candidates(user_id, n_top=10, n_random=10):
    listened_songs = data[data['msno'] == user_id]['song_id'].unique()
    # исключаем прослушанные
    top_songs = song_popularity[~song_popularity['song_id'].isin(listened_songs)].sort_values('song_pop', ascending=False).head(n_top)['song_id'].tolist()
        
    random_pool = songs_df[~songs_df['song_id'].isin(listened_songs)]['song_id'].unique()
    random_songs = np.random.choice(random_pool, size=n_random, replace=False).tolist()
    
    return top_songs + random_songs

# 3. Генерация рекомендаций
recommendations = []
for user in sampled_users:
    candidates = prepare_user_candidates(user)
    
    # Создание DataFrame для предсказания
    candidate_df = pd.DataFrame({
        'msno': [user] * len(candidates),
        'song_id': candidates
    })
    
    # Добавление признаков песен
    candidate_df = candidate_df.merge(songs_df, on='song_id', how='left')
    
    # Добавление признаков пользователя
    user_features = data[data['msno'] == user][['msno', 'gender_male', 'gender_female', 'gender_3', 'bd', 'membership_duration']].iloc[0]
    candidate_df = candidate_df.assign(**user_features.to_dict())
    
    # Добавление популярностей
    candidate_df = candidate_df.merge(song_popularity, on='song_id', how='left')
    candidate_df = candidate_df.merge(artist_popularity, on='artist_name', how='left')
    candidate_df = candidate_df.merge(genre_popularity, on='genre_ids', how='left')
    
    # Заполнение пропусков
    candidate_df.fillna({
        'song_pop': song_popularity['song_pop'].median(),
        'artist_pop': artist_popularity['artist_pop'].median(),
        'genre_pop': genre_popularity['genre_pop'].median()
    }, inplace=True)
    
    # Предсказание вероятностей
    features = ["song_pop", "artist_pop", "genre_pop", "membership_duration"]
    candidate_df['prob'] = model.predict_proba(candidate_df[features])[:, 1]
    
    # выдаем рекомендации
    top_recommendations = candidate_df.sort_values('prob', ascending=False).head(10)
    recommendations.append(top_recommendations[['msno', 'song_id', 'prob']])

# 4. Итоговый результат
final_recommendations = pd.concat(recommendations)

Итого получаем для каждого из 10 пользователей 20 песен, с вероятностями, которые он послушает

In [30]:
final_recommendations

Unnamed: 0,msno,song_id,prob
0,5hebwI9Q5dTNBNh81QjKXhv2y+2AbR+4hAFZcCQgL0g=,TzRI0qj+h30CErbiW/P+GpVD8tk5IDaynZw31YD8TLE=,0.921651
8,5hebwI9Q5dTNBNh81QjKXhv2y+2AbR+4hAFZcCQgL0g=,KA3Mp0J/e50dKXyi6V55T0VpPP4X3o020FePoU6ycpo=,0.918948
1,5hebwI9Q5dTNBNh81QjKXhv2y+2AbR+4hAFZcCQgL0g=,K+3PXLsnICTbIy4S7Iyo4oB5DPDqOHvuesWWkUi1LMU=,0.918934
6,5hebwI9Q5dTNBNh81QjKXhv2y+2AbR+4hAFZcCQgL0g=,KAAvktd7ShU/LrAqYeqBgEW+rLiY/2cPxwKylEXi9gs=,0.918641
7,5hebwI9Q5dTNBNh81QjKXhv2y+2AbR+4hAFZcCQgL0g=,KA3j/dwh8VijIdDYCIJP7+RVrW1HLCZFhNEDuXqTIzI=,0.917987
...,...,...,...
2,4MUe9plN4YPCaTAjwlCOHF0B9qbdm/KEFMeNTIfPWKw=,W4zzn8dJfrBdGm7dcNgfqSboQIY7WABInOe3y8k8tdU=,0.916620
7,4MUe9plN4YPCaTAjwlCOHF0B9qbdm/KEFMeNTIfPWKw=,AaypyoEJCyUgKXjTPw7A004/dLoEIAvGyO8EEewqEcU=,0.915163
0,4MUe9plN4YPCaTAjwlCOHF0B9qbdm/KEFMeNTIfPWKw=,DgTqRuqKGtn4W78th328k3wENm/gsRbMoLUnLFJroTA=,0.915138
9,4MUe9plN4YPCaTAjwlCOHF0B9qbdm/KEFMeNTIfPWKw=,W50IiC+UjTtQqiPqFg9kSMURFxBoBFejzZ5VZZZDvYc=,0.910630


# Посчитаем метрики

In [101]:
from sklearn.metrics import precision_score, recall_score
from sklearn.metrics import roc_auc_score
from tqdm import tqdm

Ввиду того, что в качестве 20 рекомендованных треков исключаются те, которые пользователь уже послушал, оценивать качество модели будем на треках, которые он уже послушал??

In [82]:
sampled_users = data['msno'].drop_duplicates().sample(10, random_state=42).tolist()

In [106]:
def prepare_user_candidates_for_metric(user):
    temp = data[(data['msno'] == user) & (data['target'] == 1)]['song_id']
    if len(temp) < 20:
        return temp
    return temp.sample(20)
def get_recoms(sampled_users):
    recommendations = []
    for user in tqdm(sampled_users):
        candidates = prepare_user_candidates_for_metric(user)
        if len(candidates) == 0: continue
        candidate_df = pd.DataFrame({
            'msno': [user] * len(candidates),
            'song_id': candidates
        })

        candidate_df = candidate_df.merge(songs_df, on='song_id', how='left')
        
        user_features = data[data['msno'] == user][['msno', 'gender_male', 'gender_female', 'gender_3', 'bd', 'membership_duration']].iloc[0]
        candidate_df = candidate_df.assign(**user_features.to_dict())
        
        candidate_df = candidate_df.merge(song_popularity, on='song_id', how='left')
        candidate_df = candidate_df.merge(artist_popularity, on='artist_name', how='left')
        candidate_df = candidate_df.merge(genre_popularity, on='genre_ids', how='left')
        
        candidate_df.fillna({
            'song_pop': song_popularity['song_pop'].median(),
            'artist_pop': artist_popularity['artist_pop'].median(),
            'genre_pop': genre_popularity['genre_pop'].median()
        }, inplace=True)
        
        features = ["song_pop", "artist_pop", "genre_pop", "membership_duration"]
        candidate_df['prob'] = model.predict_proba(candidate_df[features])[:, 1]
        
        top_recommendations = candidate_df.sort_values('prob', ascending=False).head(10)
        recommendations.append(top_recommendations[['msno', 'song_id', 'prob']])
    return recommendations
final_recommendations = pd.concat(get_recoms(sampled_users))

100%|██████████| 10/10 [00:08<00:00,  1.12it/s]


In [89]:
final_recommendations

Unnamed: 0,msno,song_id,prob
6,5hebwI9Q5dTNBNh81QjKXhv2y+2AbR+4hAFZcCQgL0g=,GqcGdygdhDc1iw+CjaSPnK97x+TIrwVKaMn8qA87aPo=,0.922619
7,5hebwI9Q5dTNBNh81QjKXhv2y+2AbR+4hAFZcCQgL0g=,srJNksUrcETWixC7biZnzKZKrFJXQlQqKUOailvg0j0=,0.921357
5,5hebwI9Q5dTNBNh81QjKXhv2y+2AbR+4hAFZcCQgL0g=,GqXkAz33+Lm46o3THB/YPzVMEioU9W2vQ9Dww+ramFo=,0.919895
10,5hebwI9Q5dTNBNh81QjKXhv2y+2AbR+4hAFZcCQgL0g=,SI9Ueo6tFEsEdzN1JuvMIxEI0nI9pg74AffwFX16JKI=,0.918951
2,5hebwI9Q5dTNBNh81QjKXhv2y+2AbR+4hAFZcCQgL0g=,wXcF2UkQiHDsq1F2FwIdyMzTitZEqVo9lpKeV2AKQ8E=,0.917985
...,...,...,...
4,4MUe9plN4YPCaTAjwlCOHF0B9qbdm/KEFMeNTIfPWKw=,srLUOCTC3RVKSgrwokiws1g1x4KErG97Ii0cvwsis5Y=,0.915738
9,4MUe9plN4YPCaTAjwlCOHF0B9qbdm/KEFMeNTIfPWKw=,70p5ev2lrGSgpdngnVUG2wDbXCryZG/vCDpbGjC9POg=,0.915542
3,4MUe9plN4YPCaTAjwlCOHF0B9qbdm/KEFMeNTIfPWKw=,70rZK783Q6A1/QEfy0Llo0Ls//n8D/1IH+oHlMJfM8Q=,0.909701
0,4MUe9plN4YPCaTAjwlCOHF0B9qbdm/KEFMeNTIfPWKw=,DgTYktrctK/EUYgxIEkha33l8LO6mYqWIdekMksFNOo=,0.909439


In [84]:
def evaluate_recommendations(user, recommended_songs, data):
    # Истинные положительные взаимодействия пользователя
    true_positive = data[(data['msno'] == user) & (data['target'] == 1)]['song_id'].tolist()
    
    # Рекомендованные песни
    recommended = recommended_songs['song_id'].tolist()
    
    # Бинарные метки: 1 если рекомендованная песня была прослушана, иначе 0
    y_true = [1 if song in true_positive else 0 for song in recommended]
    
    # Precision@10 и Recall@10
    precision = precision_score([1]*len(y_true), y_true) if sum(y_true) > 0 else 0
    recall = recall_score([1]*len(y_true), y_true) if sum(y_true) > 0 else 0
    
    return precision, recall

Посчитаем метрику для 10 пользователей

In [102]:
precisions, recalls = [], []

for user in tqdm(sampled_users):
    # Получаем рекомендации для пользователя (из предыдущего кода)
    user_recommendations = final_recommendations[final_recommendations['msno'] == user]

    # Вычисляем метрики
    prec, rec = evaluate_recommendations(user, user_recommendations, data)
    
    precisions.append(prec)
    recalls.append(rec)

# 4. Усреднение результатов
mean_precision = np.mean(precisions)
mean_recall = np.mean(recalls)

print(f"Mean Precision@10: {mean_precision:.4f}")
print(f"Mean Recall@10: {mean_recall:.4f}")

100%|██████████| 10/10 [00:02<00:00,  4.74it/s]

Mean Precision@10: 0.8000
Mean Recall@10: 0.8000





Посчитаем для всех

In [None]:
temp_data = data[data['target'] == 1]
def prepare_user_candidates_for_metric(user):
    temp = temp_data[temp_data['msno'] == user]['song_id']
    if len(temp) < 20:
        return temp
    return temp.sample(20)
def get_recoms(sampled_users):
    recommendations = []
    for user in tqdm(sampled_users):
        candidates = prepare_user_candidates_for_metric(user)
        candidate_df = pd.DataFrame({
            'msno': [user] * len(candidates),
            'song_id': candidates
        })

        candidate_df = candidate_df.merge(songs_df, on='song_id', how='left')
        
        user_features = data[data['msno'] == user][['msno', 'gender_male', 'gender_female', 'gender_3', 'bd', 'membership_duration']].iloc[0]
        candidate_df = candidate_df.assign(**user_features.to_dict())
        
        candidate_df = candidate_df.merge(song_popularity, on='song_id', how='left')
        candidate_df = candidate_df.merge(artist_popularity, on='artist_name', how='left')
        candidate_df = candidate_df.merge(genre_popularity, on='genre_ids', how='left')
        
        candidate_df.fillna({
            'song_pop': song_popularity['song_pop'].median(),
            'artist_pop': artist_popularity['artist_pop'].median(),
            'genre_pop': genre_popularity['genre_pop'].median()
        }, inplace=True)
        
        features = ["song_pop", "artist_pop", "genre_pop", "membership_duration"]
        candidate_df['prob'] = model.predict_proba(candidate_df[features])[:, 1]
        
        top_recommendations = candidate_df.sort_values('prob', ascending=False).head(10)
        recommendations.append(top_recommendations[['msno', 'song_id', 'prob']])
    return recommendations
final_recommendations = pd.concat(get_recoms(sampled_users))

In [107]:
%%time
users = data['msno'].drop_duplicates()
precisions, recalls = [], []
final_recommendations = get_recoms(users)
for user in tqdm(users):
    user_recommendations = final_recommendations[final_recommendations['msno'] == user]

    prec, rec = evaluate_recommendations(user, user_recommendations, data)
    
    precisions.append(prec)
    recalls.append(rec)

mean_precision = np.mean(precisions)
mean_recall = np.mean(recalls)

print(f"Mean Precision: {mean_precision:.4f}")
print(f"Mean Recall: {mean_recall:.4f}")

  2%|▏         | 613/30755 [11:08<9:08:11,  1.09s/it] 


KeyboardInterrupt: 

# ________________________________________________________________________________________________________________

In [62]:
train_df[(train_df['msno'] == "FGtllVqz18RPiwJj/edr2gV78zirAiY/9SmYvia+kCg=")]

Unnamed: 0,msno,song_id,source_system_tab,source_screen_name,source_type,target
0,FGtllVqz18RPiwJj/edr2gV78zirAiY/9SmYvia+kCg=,BBzumQNXUHKdEBOB7mAJuzok+IJA1c2Ryg/yzTF6tik=,explore,Explore,online-playlist,1
4,FGtllVqz18RPiwJj/edr2gV78zirAiY/9SmYvia+kCg=,3qm6XTZ6MOCU11x8FIVbAGH5l5uMkT3/ZalWG1oo2Gc=,explore,Explore,online-playlist,1
5,FGtllVqz18RPiwJj/edr2gV78zirAiY/9SmYvia+kCg=,3Hg5kugV1S0wzEVLAEfqjIV5UHzb7bCrdBRQlGygLvU=,explore,Explore,online-playlist,1
7,FGtllVqz18RPiwJj/edr2gV78zirAiY/9SmYvia+kCg=,bPIvRTzfHxH5LgHrStll+tYwSQNVV8PySgA3M1PfTgc=,explore,Explore,online-playlist,1
6721,FGtllVqz18RPiwJj/edr2gV78zirAiY/9SmYvia+kCg=,XUz4Z4wPPChz+OIwkwj7HJ8teIjW3rEMdEATM80rDxM=,explore,Explore,online-playlist,1
...,...,...,...,...,...,...
7351223,FGtllVqz18RPiwJj/edr2gV78zirAiY/9SmYvia+kCg=,bCv44kWOnfTwkj2qg8RTnNvi+cCXrahngqDrmjIyd2o=,radio,Radio,radio,0
7353615,FGtllVqz18RPiwJj/edr2gV78zirAiY/9SmYvia+kCg=,+BuoFaAw+sbIYkgPW7ReKkPQpJbW9zvT8+UU3bU7iKU=,radio,Radio,radio,0
7353617,FGtllVqz18RPiwJj/edr2gV78zirAiY/9SmYvia+kCg=,+ChJrJNQu/z9JLoAyhielYRDiG99yhzo9m5CzQNCOD8=,radio,Radio,radio,0
7353619,FGtllVqz18RPiwJj/edr2gV78zirAiY/9SmYvia+kCg=,X8uGX+w3qTMTG8S5wauveoBnE81OP99ywklEVcXRHYE=,radio,Radio,radio,0


In [72]:
temp = train_df[train_df.duplicated(subset=['song_id'], keep=False)]
temp[(temp['song_id'] == 'BBzumQNXUHKdEBOB7mAJuzok+IJA1c2Ryg/yzTF6tik=')]

Unnamed: 0,msno,song_id,source_system_tab,source_screen_name,source_type,target
0,FGtllVqz18RPiwJj/edr2gV78zirAiY/9SmYvia+kCg=,BBzumQNXUHKdEBOB7mAJuzok+IJA1c2Ryg/yzTF6tik=,explore,Explore,online-playlist,1
35532,e5Ezre9HPuPos+CXQXtmo32E/hHIZTMmo6jG3yRf6UA=,BBzumQNXUHKdEBOB7mAJuzok+IJA1c2Ryg/yzTF6tik=,my library,Local playlist more,local-playlist,1
39773,pouJqjNRmZOnRNzzMWWkamTKkIGHyvhl/jo4HgbncnM=,BBzumQNXUHKdEBOB7mAJuzok+IJA1c2Ryg/yzTF6tik=,discover,Online playlist more,online-playlist,0
95899,sSexP400TJOZRhx3JB+0s9cqrCnqrlV51B9njoKR1II=,BBzumQNXUHKdEBOB7mAJuzok+IJA1c2Ryg/yzTF6tik=,discover,Online playlist more,online-playlist,0
137360,hKdGiUKHVqKkXGHLrc+EzdSW6q0ERAJ2Cs7/L1N0Ae4=,BBzumQNXUHKdEBOB7mAJuzok+IJA1c2Ryg/yzTF6tik=,discover,,online-playlist,0
...,...,...,...,...,...,...
7132328,Lk0UToUeLSYkRuIZn0oi3acZp4pklkywNGLEBKRO++w=,BBzumQNXUHKdEBOB7mAJuzok+IJA1c2Ryg/yzTF6tik=,my library,Local playlist more,local-library,0
7247033,q00NnZSmIltLiXDL04zVoJcX6k5j+fgz9qzVAJvXm6A=,BBzumQNXUHKdEBOB7mAJuzok+IJA1c2Ryg/yzTF6tik=,discover,Online playlist more,song-based-playlist,0
7263586,hbiDVON3wLLm4zxxDhe4uPlHAASAalkd4ZRpfkP/xyU=,BBzumQNXUHKdEBOB7mAJuzok+IJA1c2Ryg/yzTF6tik=,my library,Local playlist more,local-playlist,1
7270117,wOP2RYdLECfcc07RHA+2tQf/PN11lfOWay4y1zY1H9s=,BBzumQNXUHKdEBOB7mAJuzok+IJA1c2Ryg/yzTF6tik=,my library,Local playlist more,local-library,1
