In [109]:
import pandas as pd
import numpy as np
import json
import os
from pathlib import Path
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score, recall_score
from sklearn.metrics import accuracy_score

# Подсасываем датасеты

In [2]:
train_df = pd.read_csv("train.csv")
members_df = pd.read_csv("preprocessed_members.csv")
songs_df = pd.read_csv("songs.csv")
song_extra_df = pd.read_csv("song_extra_info.csv")

data = pd.merge(train_df, songs_df, on="song_id", how="left")
data = pd.merge(data, members_df, on="msno", how="left")
data = pd.merge(data, song_extra_df, on="song_id", how="left")

In [3]:
data.columns

Index(['msno', 'song_id', 'source_system_tab', 'source_screen_name',
       'source_type', 'target', 'song_length', 'genre_ids', 'artist_name',
       'composer', 'lyricist', 'language', 'Unnamed: 0', 'bd',
       'registration_init_time', 'expiration_date', 'registration_year',
       'registration_month', 'registration_day', 'expiration_year',
       'expiration_month', 'expiration_day', 'nogender_noage',
       'membership_duration', 'is_long_term', 'registered_via_3',
       'registered_via_4', 'registered_via_7', 'registered_via_9',
       'registered_via_13', 'registered_via_16', 'gender_3', 'gender_female',
       'gender_male', 'city_1', 'city_3', 'city_4', 'city_5', 'city_6',
       'city_7', 'city_8', 'city_9', 'city_10', 'city_11', 'city_12',
       'city_13', 'city_14', 'city_15', 'city_16', 'city_17', 'city_18',
       'city_19', 'city_20', 'city_21', 'city_22', 'name', 'isrc'],
      dtype='object')

# Popularity-based рекомендации

In [28]:
train, test = train_test_split(train_df, test_size=0.2)

# средний target
song_popularity = train.groupby("song_id")["target"].mean().reset_index()
song_popularity.columns = ["song_id", "song_pop"]

In [29]:
song_popularity.head()

Unnamed: 0,song_id,song_pop
0,++/ACCkEN/+VtgrJxEqeRgRmV4y8pcarDJ9T/yRAi1E=,0.0
1,++/lJNswCU+za2pYB0cWIbGL5UzWIKtfweX20+GImZA=,0.0
2,++4Ihbdp0juQ9ldp9DysOL1WTLHIiawg7cnBTn55I/k=,0.0
3,++6SwJ+aXGV4LLqJmgEogoeEC0DxEdyus0MzD3iuveA=,0.0
4,++732ZgaVBo177j83D3Iht3ZeHUctfXg/y47RKvmc3k=,0.0


In [15]:
test_with_pop = pd.merge(test, song_popularity, on='song_id', how='left')
test_with_pop['song_pop'] = test_with_pop['song_pop'].fillna(train_df['target'].mean())

In [21]:
test_with_pop[['target', 'song_pop']].head()

Unnamed: 0,target,song_pop
0,1,0.57506
1,0,0.445087
2,1,0.505942
3,0,0.546758
4,0,0.609756


In [24]:
# Оценка бейзлайна Popularity-based рекомендации
roc_score = roc_auc_score(test_with_pop['target'], test_with_pop['song_pop'])
print(f'AUC: {roc_score:.4f}')
print("precision: ", precision_score(test_with_pop['target'], test_with_pop['song_pop']>0.6))
print("recall: ", recall_score(test_with_pop['target'], test_with_pop['song_pop']>0.6))

AUC: 0.6151
precision:  0.6292748223280205
recall:  0.3065923500500898


# Popularity-based топ 10 для каждого пользователя

In [34]:
from sklearn.preprocessing import LabelEncoder

In [36]:
le_msno = LabelEncoder()
le_song_id = LabelEncoder()
temp_df = train_df.copy()
temp_df['msno'] = le_msno.fit_transform(temp_df['msno'])
temp_df['song_id'] = le_song_id.fit_transform(temp_df['song_id'])

train, test = train_test_split(temp_df, test_size=0.2)

In [47]:
song_popularity = train.groupby("song_id")['target'].mean().reset_index()
song_popularity.columns = ["song_id", "song_pop"]
top_ten = song_popularity.sort_values(by='song_pop',ascending=False)['song_id'][:10].reset_index().drop('index', axis=1)

In [39]:
test.head()

Unnamed: 0,msno,song_id,source_system_tab,source_screen_name,source_type,target
5324257,13708,249555,discover,Online playlist more,online-playlist,0
1491696,7959,62578,explore,Explore,topic-article-playlist,1
918842,27139,232083,my library,Local playlist more,local-playlist,1
6807472,2476,87671,my library,Local playlist more,local-library,0
110860,16620,330181,my library,Local playlist more,local-library,1


In [158]:
test_users = test['msno'].unique()
listened_tracks = [test[(test['msno']==user) & (test['target'] == 1)]['song_id'].unique() for user in test_users]

test_user_his_songs = pd.concat([pd.DataFrame(test_users, columns=['msno']), pd.DataFrame().assign(tracks=listened_tracks)], axis=1)

top_max = song_popularity.sort_values(by='song_pop',ascending=False)['song_id'][:max(test_user_his_songs['tracks'].apply(len))].reset_index().drop('index', axis=1)
test_user_his_songs = test_user_his_songs.assign(top_max = [*top_max.T.values]*len(test_user_his_songs))

In [159]:
test_user_his_songs.head()

Unnamed: 0,msno,tracks,top_max
0,13708,"[155979, 181045, 52412, 151490, 236660, 176256...","[218734, 61170, 233101, 305450, 143658, 61165,..."
1,7959,"[62578, 308848, 304420, 86658, 58602, 51546, 2...","[218734, 61170, 233101, 305450, 143658, 61165,..."
2,27139,"[232083, 145492, 201657, 86658, 315526, 185080...","[218734, 61170, 233101, 305450, 143658, 61165,..."
3,2476,"[295291, 194376, 183649, 4641, 131070, 293570,...","[218734, 61170, 233101, 305450, 143658, 61165,..."
4,16620,"[330181, 81636, 140667, 199854, 312782, 72904,...","[218734, 61170, 233101, 305450, 143658, 61165,..."


In [196]:
def compute_metrics(row, k=100):

    recommended = set(row['top_max'])
    relevant = set(row['tracks'])
    
    intersection_size = len(recommended & relevant )# Вычисляем пересечение
    
    precision = intersection_size / k  
    recall = intersection_size / len(relevant) if len(relevant) > 0 else np.nan
    hit = 1 if intersection_size > 0 else 0
    
    return pd.Series({'precision': precision, 'recall': recall, 'hit': hit})

metrics_df = test_user_his_songs.apply(compute_metrics, axis=1)

In [197]:
metrics_df.dropna().mean()

precision    0.000023
recall       0.000066
hit          0.002255
dtype: float64