In [1]:
import pandas as pd
import numpy as np
import matplotlib
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
#from surprise import Dataset, Reader, SVD
#from surprise.model_selection import train_test_split
#from surprise import accuracy

In [2]:
RANDOM_STATE = 42
TEST_SIZE = 0.2

In [3]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
songs = pd.read_csv('songs.csv')
song_extra_info = pd.read_csv('song_extra_info.csv')
members = pd.read_csv('members.csv')

In [4]:
# Добавим к тренировочным и тестовым данным информацию о песнях и пользователях

train = train.merge(songs, on='song_id', how='left')
train = train.merge(members, on='msno', how='left')

test = test.merge(songs, on='song_id', how='left')
test = test.merge(members, on='msno', how='left')

In [5]:
train = train.merge(song_extra_info, on='song_id', how='left')
test = test.merge(song_extra_info, on='song_id', how='left')

In [6]:
def values_info(data):
    dtypes = pd.DataFrame(data.dtypes,columns=['Data Type'])
    dtypes['Unique Values']=data.nunique().sort_values(ascending=True)
    dtypes['Null Values']=data.isnull().sum()
    dtypes['% null Values']=data.isnull().sum()/len(data)
    return dtypes.sort_values(by='Null Values' , ascending=False).style.background_gradient()

In [7]:
values_info(train)

Unnamed: 0,Data Type,Unique Values,Null Values,% null Values
lyricist,object,33887,3178812,0.430884
gender,object,2,2961479,0.401425
composer,object,76064,1675706,0.22714
isrc,object,269760,577858,0.078328
source_screen_name,object,20,414804,0.056226
genre_ids,object,572,118455,0.016056
source_system_tab,object,8,24849,0.003368
source_type,object,12,21539,0.00292
name,object,234144,1457,0.000197
language,float64,10,150,2e-05


Считаю, что пол, автор текста и композитор не несут значимой информации, при этом имеют большое количество пропусков, данные столбцы можно дропнуть.

Столбцы isrc и source_screen_name также можем удалить, так как isrc у некоторых песен могут быть одинаковыми

Так же следует удалить столбцы с датами, они нам не понадобятся.

Подумаем что нужно оставить. Оставим язык, исполнителя, жанр и название песни, и возможно точку входа.

In [8]:
train_clear = train.copy()
target_train = train_clear['target']
train_clear = train_clear[['genre_ids', 'source_type', 'name', 'language', 'artist_name', 'song_id', 'msno']]

test_clear = test.copy()
test_clear = test_clear[['genre_ids', 'source_type', 'name', 'language', 'artist_name', 'song_id', 'msno']]

In [9]:
train_clear.head()

Unnamed: 0,genre_ids,source_type,name,language,artist_name,song_id,msno
0,359,online-playlist,Good Grief,52.0,Bastille,BBzumQNXUHKdEBOB7mAJuzok+IJA1c2Ryg/yzTF6tik=,FGtllVqz18RPiwJj/edr2gV78zirAiY/9SmYvia+kCg=
1,1259,local-playlist,Lords of Cardboard,52.0,Various Artists,bhp/MpSNoqoxOIB+/l8WPqu6jldth4DIpCm3ayXnJqM=,Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=
2,1259,local-playlist,Hip Hop Is Dead(Album Version (Edited)),52.0,Nas,JNWfrrC7zNN7BdMpsISKa4Mw+xVJYNnxXh3/Epw7QgY=,Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=
3,1019,local-playlist,Disco Africa,-1.0,Soundway,2A87tzfnJTSWqD7gIZHisolhe4DMdzkbd6LzO1KHjNs=,Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=
4,1011,online-playlist,Sleep Without You,52.0,Brett Young,3qm6XTZ6MOCU11x8FIVbAGH5l5uMkT3/ZalWG1oo2Gc=,FGtllVqz18RPiwJj/edr2gV78zirAiY/9SmYvia+kCg=


Заполним строки с пропущенными цзначениями на самые часто встречаемые

In [10]:
values_info(train_clear)

Unnamed: 0,Data Type,Unique Values,Null Values,% null Values
genre_ids,object,572,118455,0.016056
source_type,object,12,21539,0.00292
name,object,234144,1457,0.000197
language,float64,10,150,2e-05
artist_name,object,40582,114,1.5e-05
song_id,object,359966,0,0.0
msno,object,30755,0,0.0


In [11]:
missing_pipe = Pipeline(
    [
        (
            'simpleImputer_missing', 
            SimpleImputer(missing_values=np.nan, strategy='most_frequent')
        )
    ]
)

In [12]:
train_full = missing_pipe.fit_transform(train_clear)
train_full = pd.DataFrame(train_full, columns=['genre_ids', 'source_type', 'name', 'language', 'artist_name', 'song_id', 'msno'])
train_full.head()

Unnamed: 0,genre_ids,source_type,name,language,artist_name,song_id,msno
0,359,online-playlist,Good Grief,52.0,Bastille,BBzumQNXUHKdEBOB7mAJuzok+IJA1c2Ryg/yzTF6tik=,FGtllVqz18RPiwJj/edr2gV78zirAiY/9SmYvia+kCg=
1,1259,local-playlist,Lords of Cardboard,52.0,Various Artists,bhp/MpSNoqoxOIB+/l8WPqu6jldth4DIpCm3ayXnJqM=,Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=
2,1259,local-playlist,Hip Hop Is Dead(Album Version (Edited)),52.0,Nas,JNWfrrC7zNN7BdMpsISKa4Mw+xVJYNnxXh3/Epw7QgY=,Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=
3,1019,local-playlist,Disco Africa,-1.0,Soundway,2A87tzfnJTSWqD7gIZHisolhe4DMdzkbd6LzO1KHjNs=,Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=
4,1011,online-playlist,Sleep Without You,52.0,Brett Young,3qm6XTZ6MOCU11x8FIVbAGH5l5uMkT3/ZalWG1oo2Gc=,FGtllVqz18RPiwJj/edr2gV78zirAiY/9SmYvia+kCg=


In [13]:
test_full = missing_pipe.transform(test_clear)
test_full = pd.DataFrame(test_full, columns=['genre_ids', 'source_type', 'name', 'language', 'artist_name', 'song_id', 'msno'])
test_full.head()

Unnamed: 0,genre_ids,source_type,name,language,artist_name,song_id,msno
0,458,local-library,愛其實很殘忍,3.0,梁文音 (Rachel Liang),WmHKgKMlp1lQMecNdNvDMkvIycZYHnFwDT72I5sIssc=,V8ruy7SGk7tDm3zA51DPpn6qutt+vmKMBKa21dp54uM=
1,465,local-library,她說,3.0,林俊傑 (JJ Lin),y/rsZ9DC7FwK5F2PK2D5mj+aOBUJAjuu3dZ14NgE0vM=,V8ruy7SGk7tDm3zA51DPpn6qutt+vmKMBKa21dp54uM=
2,2022,song-based-playlist,subarashiki nichijo,17.0,Yu Takahashi (高橋優),8eZLFOdGVdXBSqoAv5nsLigeH2BvKXzTQYtUM53I0k4=,/uQAlrAkaczV+nWCd2sPF2ekvXPRipV7q0l+gbLuxjw=
3,465,radio,Hold Me| Thrill Me| Kiss Me| Kill Me,52.0,U2,ztCf8thYsS4YN3GcIL/bvoxLm/T5mYBVKOO4C9NiVfQ=,1a6oo/iXKatxQx4eS9zTVD+KlSVaAFbTIqVvwLC1Y0k=
4,873,radio,Om Yoga,-1.0,Yoga Mr Sound,MKVMpslKcQhMaFEgcEQhEfi5+RZhMYlU3eRDpySrH8Y=,1a6oo/iXKatxQx4eS9zTVD+KlSVaAFbTIqVvwLC1Y0k=


In [14]:
train_full.duplicated().sum()

0

In [15]:
train_full

Unnamed: 0,genre_ids,source_type,name,language,artist_name,song_id,msno
0,359,online-playlist,Good Grief,52.0,Bastille,BBzumQNXUHKdEBOB7mAJuzok+IJA1c2Ryg/yzTF6tik=,FGtllVqz18RPiwJj/edr2gV78zirAiY/9SmYvia+kCg=
1,1259,local-playlist,Lords of Cardboard,52.0,Various Artists,bhp/MpSNoqoxOIB+/l8WPqu6jldth4DIpCm3ayXnJqM=,Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=
2,1259,local-playlist,Hip Hop Is Dead(Album Version (Edited)),52.0,Nas,JNWfrrC7zNN7BdMpsISKa4Mw+xVJYNnxXh3/Epw7QgY=,Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=
3,1019,local-playlist,Disco Africa,-1.0,Soundway,2A87tzfnJTSWqD7gIZHisolhe4DMdzkbd6LzO1KHjNs=,Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=
4,1011,online-playlist,Sleep Without You,52.0,Brett Young,3qm6XTZ6MOCU11x8FIVbAGH5l5uMkT3/ZalWG1oo2Gc=,FGtllVqz18RPiwJj/edr2gV78zirAiY/9SmYvia+kCg=
...,...,...,...,...,...,...,...
7377413,1259,local-playlist,Still D.R.E.,52.0,Dr. Dre,VJTxizih/o28kXCbtPbIyWXScoXGvxyYtl6R+0YB5JM=,6xdFzPlrasIDD95mQWXVC3Bg4ptnGYtBl4ztVEZMddU=
7377414,465,song,Locked Away,52.0,R. City feat. Adam Levine,z1mqaU9YOX7T/PFDvUoWozdFq7rC3KwaQP7nFVprjMI=,ZxbVmt3Kh/XOH+h58c2Kdj6SjFZk+wnUO006IgWzMQE=
7377415,465,song,FLY OUT,3.0,兄弟本色G.U.T.S. (姚中仁、張震嶽、頑童MJ116),750RprmFfLV0bymtDH88g24pLZGVi5VpBAI300P6UOA=,ZxbVmt3Kh/XOH+h58c2Kdj6SjFZk+wnUO006IgWzMQE=
7377416,2122,online-playlist,Wonderwall,52.0,Brad Mehldau Trio,G8wgqObgeAMER/rVCIlgcNeQ8mm0CzF/GsxiMK8TTnA=,0aH4Hd3ziPSRHClRX8rkeOEaAG5EPPkW1mKGCdXEok0=


In [16]:
values_info(train_full)

Unnamed: 0,Data Type,Unique Values,Null Values,% null Values
genre_ids,object,572,0,0.0
source_type,object,12,0,0.0
name,object,234144,0,0.0
language,object,10,0,0.0
artist_name,object,40582,0,0.0
song_id,object,359966,0,0.0
msno,object,30755,0,0.0


In [17]:
values_info(test_full)

Unnamed: 0,Data Type,Unique Values,Null Values,% null Values
genre_ids,object,501,0,0.0
source_type,object,12,0,0.0
name,object,154716,0,0.0
language,object,10,0,0.0
artist_name,object,27563,0,0.0
song_id,object,224753,0,0.0
msno,object,25131,0,0.0


In [18]:
list_columns = ['genre_ids', 'source_type', 'name', 'language', 'artist_name']
for name in list_columns:
    train_full[name] = train_full[name].astype(str)

In [19]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


In [20]:

# Создание матрицы TF-IDF на основе текстовых признаков
text_features = ['genre_ids', 'source_type', 'name', 'language', 'artist_name']
text_data = train_full[text_features].apply(lambda x: ' '.join(x), axis=1)
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(text_data)


In [42]:

# Функция рекомендации на основе схожести между пользователями
def recommend(user_id, num_recommendations=20):
    user_index = train_full[train_full['msno'] == user_id].index[0]
    user_profile = tfidf_matrix[user_index]
    similarity_scores = cosine_similarity(tfidf_matrix, user_profile)
    similar_users = similarity_scores.argsort(axis=0)[::-1][:num_recommendations].flatten()
    recommended_songs = []
    for index in similar_users:
        if index != user_index and train_full.iloc[index]['song_id'] not in recommended_songs:
            recommended_songs.append(train_full.iloc[index]['song_id'])
    return recommended_songs


In [48]:
import random

In [58]:
# Пример использования:
user_id = random.choice(test_full['msno'])
recommendations = recommend(user_id)
print("Рекомендации для пользователя", user_id, ":", recommendations)

Рекомендации для пользователя dltpI9IaXapOUbDzGoTeUz2zY4F70jC7D+wdrcgZA7c= : ['sDa4M+ukVOvZHES+jdGCnjRkpIGr4TJv5I5fn//3FTM=']


In [26]:
# Создаем пустой словарь
songs_dict = {}

# Итерируемся по строкам DataFrame
for index, row in train_full.iterrows():
    # Получаем значения признаков
    song_id = row['song_id']
    genre_ids = row['genre_ids']
    source_type = row['source_type']
    name = row['name']
    language = row['language']
    artist_name = row['artist_name']
    
    # Добавляем данные в словарь с использованием song_id в качестве ключа
    songs_dict[song_id] = {
        'genre_ids': genre_ids,
        'source_type': source_type,
        'name': name,
        'language': language,
        'artist_name': artist_name
    }

# Выводим первый элемент словаря в качестве примера
print(songs_dict[next(iter(songs_dict))])

{'genre_ids': '359', 'source_type': 'radio', 'name': 'Good Grief', 'language': '52.0', 'artist_name': 'Bastille'}


In [55]:
songs_dict['mJjY3Aq2ziKriR8beib4g7okk6qwf2BPK5rd/g7lF7Q=']

{'genre_ids': '465',
 'source_type': 'top-hits-for-artist',
 'name': '這是你的歌',
 'language': '3.0',
 'artist_name': '周湯豪 (NICKTHEREAL)'}

In [53]:
songs_dict['AlEP3SDleCdt6gP1jxB7GZPch277MfWzxFaZ8xIZLsA=']

{'genre_ids': '444',
 'source_type': 'top-hits-for-artist',
 'name': 'Baboya',
 'language': '31.0',
 'artist_name': 'Kim Jong Kook'}