## 推荐系统
- 音乐数据处理
- 基于商品相似性的推荐-- 协同过滤
- 基于SVD矩阵分解的推荐-- 隐语义模型 

In [19]:
import pandas as pd
import numpy as np
import time
import sqlite3

data_home = './'

## 数据读取


In [20]:
# Load the raw listening log. The file has no header row, so the three
# tab-separated columns are named explicitly.
# NOTE(review): the output recorded for this cell is a MemoryError raised while
# allocating a 48,373,586-element object array, so the full table may not fit
# in RAM on every machine.
triplet_dataset = pd.read_csv(filepath_or_buffer=data_home+'train_triplets.txt',
                             sep='\t', header=None, names= ['user', 'song', 'play_count'])

MemoryError: Unable to allocate array with shape (48373586,) and data type object

In [None]:
triplet_dataset.shape

In [None]:
triplet_dataset.info()

**tip here**:对于int64或float64可以采用astype()转换为int32或float32来减小内存

In [None]:
triplet_dataset.head()

## 对原始数据处理
- 计算每个用户的播放总量

In [None]:
# Total play count per user, accumulated in a single streaming pass over the
# raw triplet file so the whole table never has to be held in memory.
output_dict = {}
with open(data_home+'train_triplets.txt') as f:
    for line in f:
        # Columns are tab-separated: user, song, play_count. Split only once.
        fields = line.split('\t')
        user = fields[0]
        # int() tolerates the trailing newline on the last field.
        output_dict[user] = output_dict.get(user, 0) + int(fields[2])
output_list = [{'user':k, 'play_count':v} for k, v in output_dict.items()]
# Explicit columns keep the frame well-formed (and sortable) even when no
# rows were read.
play_count_df = pd.DataFrame(output_list, columns=['user', 'play_count'])
# Heaviest listeners first.
play_count_df = play_count_df.sort_values(by = 'play_count', ascending=False)

In [None]:
play_count_df.to_csv(path_or_buf='user_playcount_df.csv', index=False)

- 对每首歌计算其播放总量

In [None]:
# Total play count per song, accumulated in a single streaming pass over the
# raw triplet file so the whole table never has to be held in memory.
output_dict = {}
with open(data_home+'train_triplets.txt') as f:
    for line in f:
        # Columns are tab-separated: user, song, play_count. Split only once.
        fields = line.split('\t')
        song = fields[1]
        # int() tolerates the trailing newline on the last field.
        output_dict[song] = output_dict.get(song, 0) + int(fields[2])
output_list = [{'song':k, 'play_count':v} for k, v in output_dict.items()]
# Explicit columns keep the frame well-formed (and sortable) even when no
# rows were read.
song_count_df = pd.DataFrame(output_list, columns=['song', 'play_count'])
# Most-played songs first.
song_count_df = song_count_df.sort_values(by='play_count', ascending=False)

In [None]:
song_count_df.to_csv(path_or_buf='song_playcount_df.csv', index=False)

In [None]:
play_count_df = pd.read_csv(filepath_or_buffer='user_playcount_df.csv')
play_count_df.head()

In [None]:
song_count_df = pd.read_csv('song_playcount_df.csv')
song_count_df.head()

### 排序的目的：
tip:惰性用户：播放数较少、只是体验一下的用户，没有推荐的必要，将其过滤掉

    取10W用户，3W歌

In [None]:
# Total number of plays across all songs. Series.sum() is vectorised, whereas
# the built-in sum() walks the column one element at a time.
total_play_count = song_count_df.play_count.sum()
total_play_count

In [None]:
# Keep the first 100,000 users of play_count_df and print the percentage of
# all plays that they account for.
play_count_subset = play_count_df.head(100000)
print(float(play_count_subset['play_count'].sum()/total_play_count)*100)

In [None]:
# Keep the first 30,000 songs of song_count_df and show the percentage of
# all plays that they account for.
song_count_subsest = song_count_df.head(30000)
float(song_count_subsest['play_count'].sum()/total_play_count*100)

### 过滤其他用户

In [None]:
# Plain Python lists of the retained user ids and song ids; they serve as
# the isin() filters in the cells below.
user_subset = play_count_subset['user'].tolist()
song_subset = song_count_subsest['song'].tolist()

In [None]:
# del(triplet_dataset)

In [None]:
# triplet_dataset = pd.read_csv(filepath_or_buffer=data_home+'train_triplets.txt', sep='\t', header=None, names=['user', 'song', 'play_count'])
# triplet_dataset_ = pd.read_csv(filepath_or_buffer=data_home+'train_triplets.txt', sep='\t', header=None, names= ['user', 'song', 'play_count'])

In [None]:
# Keep only the listening records that belong to the selected users.
if 'triplet_dataset' in globals():
    # The full table is already in memory: filter it, then release it.
    triplet_subset = triplet_dataset[triplet_dataset.user.isin(user_subset)]
    del triplet_dataset
else:
    # The one-shot load of the full table can fail with MemoryError, leaving
    # `triplet_dataset` undefined. Stream the file in chunks instead and keep
    # only the matching rows of each chunk.
    triplet_chunks = pd.read_csv(data_home+'train_triplets.txt', sep='\t', header=None,
                                 names=['user', 'song', 'play_count'], chunksize=10**6)
    triplet_subset = pd.concat(chunk[chunk.user.isin(user_subset)] for chunk in triplet_chunks)

In [None]:
# Narrow the user-filtered records to the selected songs, then release the
# intermediate frame.
triplet_subset_subsong = triplet_subset.loc[triplet_subset['song'].isin(song_subset)]
del triplet_subset

In [None]:
triplet_subset_subsong.to_csv('triplet_subset_subsong.csv', index=False)

## 我们生成的数据集

In [None]:
triplet_subset_subsong.shape

In [None]:
triplet_subset_subsong.head(10)

## 数据集整合

    - 补全音乐名字：sqlite3， .db转换为.csv

In [None]:
# Open the track-metadata SQLite database and list the tables it contains.
conn = sqlite3.connect(data_home+'track_metadata.db')
# Connection.execute() creates a cursor, runs the statement and returns it.
cur = conn.execute("SELECT name FROM sqlite_master WHERE type='table'")
cur.fetchall()

In [None]:
# Run the SQL query through pandas: load every row of the `songs` table
# into a DataFrame over the open connection.
track_metadata_df = pd.read_sql(sql='select * from songs', con=conn)
track_metadata_df.head()

In [None]:
track_metadata_df_sub = track_metadata_df[track_metadata_df.song_id.isin(song_subset)]
# 在元数据中的所有歌曲名取出

In [None]:
# Save the filtered metadata to CSV.
# NOTE(review): index=False is not passed here, unlike the other to_csv calls
# in this notebook, so the row index is written as an extra column — confirm
# this is intended.
track_metadata_df_sub.to_csv('track_metadata_df_sub.csv')

In [None]:
track_metadata_df_sub.shape  # 有重复的

### 整合
pandas的merge操作：song_id(track_metadata_df_sub)和song(triplet_subset_subsong)

In [None]:
track_metadata_df_sub.head()

In [None]:
# Drop the unused metadata: keep only the id plus the descriptive columns.
wanted_columns = ['song_id', 'title', 'release', 'artist_name', 'year']
track_data = track_metadata_df_sub.loc[:, wanted_columns]

In [None]:
track_data.shape

In [None]:
# The metadata was filtered to the ~30k selected songs; keep the first row
# for each song_id so that every song appears exactly once.
track_data = track_data.drop_duplicates(subset='song_id')
track_data.shape

In [None]:
# Merge the two tables: attach track metadata to every listening record.
# how='left' keeps all rows of triplet_subset_subsong; a record whose song has
# no match in track_data gets NaN in the metadata columns.
triplet_dataset_sub_song_merged = pd.merge(triplet_subset_subsong, track_data, 
                                           how='left', left_on='song', right_on='song_id')

In [None]:
# 改名
triplet_dataset_sub_song_merged = triplet_dataset_sub_song_merged.rename(columns={'play_count':'listen_count'})

In [None]:
triplet_dataset_sub_song_merged.head()

In [None]:
# DataFrame.drop returns a new frame, so the result has to be assigned back
# for the metadata-side join key 'song_id' to actually be removed.
# errors='ignore' keeps the cell safe to re-run once the column is gone.
triplet_dataset_sub_song_merged = triplet_dataset_sub_song_merged.drop(
    ['song_id'], axis=1, errors='ignore')
triplet_dataset_sub_song_merged.head()

## EDA

In [None]:
# Most popular songs: total listen count per song title.
popular_songs = (
    triplet_dataset_sub_song_merged
    .groupby('title')['listen_count']
    .sum()
    .reset_index()
)

In [None]:
popular_songs.shape

In [None]:
popular_songs_top_20 = popular_songs.sort_values(by='listen_count', ascending=False).head(20)
popular_songs_top_20

In [None]:
import matplotlib.pyplot as plt

# Inputs for the top-20 songs bar chart: tick labels, bar positions and
# bar heights.
objects = list(popular_songs_top_20['title'])
performance = list(popular_songs_top_20['listen_count'])
x_pos = np.arange(len(objects))


In [None]:
# Bar chart of the listen counts of the top-20 song titles.
plt.bar(x_pos, performance, align='center', alpha=0.5)
# Label each bar with its song title, drawn vertically.
plt.xticks(x_pos, objects, rotation='vertical')
plt.ylabel('Item count')
plt.title('Most popular songs')

plt.show()

In [None]:
# Most popular artists: total listen count per artist name.
popular_artists = (
    triplet_dataset_sub_song_merged[['artist_name', 'listen_count']]
    .groupby('artist_name')
    .sum()
    .reset_index()
)

In [None]:
popular_artists.head()

In [None]:
popular_artists_top_20 = popular_artists.sort_values(by='listen_count', ascending=False).head(20)

In [None]:
# Inputs for the top-20 artists bar chart: tick labels, bar positions and
# bar heights.
objects = list(popular_artists_top_20['artist_name'])
x_pos = list(np.arange(len(popular_artists_top_20)))
listen = list(popular_artists_top_20['listen_count'])

# Bar chart of the listen counts, one bar per artist.
plt.bar(x_pos, listen, align='center', alpha=0.5)
# Label each bar with its artist name, drawn vertically.
plt.xticks(x_pos, objects, rotation='vertical')
plt.ylabel('listen_counts')
plt.title('most popular artists')

plt.show()

### 用户听歌数量的分布


In [None]:
# Number of songs listened to per user (count of non-null titles),
# largest first.
user_song_count_distribution = (
    triplet_dataset_sub_song_merged[['user', 'title']]
    .groupby('user')
    .count()
    .reset_index()
    .sort_values(by='title', ascending=False)
)

In [None]:
user_song_count_distribution.title.describe()

In [None]:
user_song_count_distribution.head()

In [None]:
# Per-user song counts (the `title` column holds the count after the groupby).
x = user_song_count_distribution.title

# Histogram with 50 bins; hist returns the bin counts, bin edges and the
# drawn patches.
n, bins, patched = plt.hist(x, 50)
plt.xlabel('play counts')
plt.ylabel('Nums of users')
# Raw string so the backslashes reach matplotlib's mathtext unchanged.
plt.title(r'$\mathrm{Histogram\ of\ User \ Play\ Count\ Distribution\ }$')
plt.grid(True)
plt.show()

## **Core**:推荐系统


In [None]:
import Reconmmenders as Reconmmenders
from sklearn.model_selection import train_test_split

In [None]:
# Split the merged listening records into training (70%) and test (30%) sets.
# random_state is fixed so the split is reproducible across runs.
# The frame is named `triplet_...`; the previous `triple_...` spelling raised
# NameError.
triplet_dataset_sub_song_merged_set = triplet_dataset_sub_song_merged
train_data, test_data = train_test_split(triplet_dataset_sub_song_merged_set, test_size=0.3, random_state=0)

In [None]:
def create_popularity_recommendation(train_data, user_id, item_id, top_n=20):
    """Return the most popular items, scored by how many rows mention them.

    Args:
        train_data: DataFrame containing the ``user_id`` and ``item_id`` columns.
        user_id: Name of the user column; its non-null entries are counted
            per item to produce the score.
        item_id: Name of the item column to group by.
        top_n: Number of top-ranked items to return (default 20).

    Returns:
        DataFrame with columns ``[item_id, 'score', 'rank']``, sorted by
        descending score, where rank 1.0 is the most popular item. At most
        ``top_n`` rows are returned.
    """
    # Score each item by the number of non-null user_id entries recorded for it.
    scores = train_data.groupby(item_id)[user_id].count().reset_index(name='score')

    # A stable sort keeps tied items in their grouped order, which is the same
    # order rank(method='first') uses to break ties, so the rows come out in
    # strictly increasing rank.
    ranked = scores.sort_values(by='score', ascending=False, kind='mergesort')
    ranked['rank'] = ranked['score'].rank(ascending=False, method='first')

    return ranked.head(top_n)