# Last.fm数据集的检索与推荐

## 名词解释
变量名：
    1. t_a_w：tag_artist_whole
    2. t_a：tag_artist
    3. u_a：user_artist
    4. r_u_a：recommend_user_artist
    5. u_f：user_friend
    6. t_a_id_list：tag_artist_id_list
    7. t_n：tag_name
    8. a_t_w：artist_tag_whole
    9. a_t：artist_tag
    10. a_n：artist_name

函数名：
    1. s_b_n：search_by_name
    2. s_b_t：search_by_tag
    3. r_b_w：recommend_by_weight
    4. r_b_f：recommend_by_friend
    5. r_b_s_a：recommend_by_searching_artist

## 1.导包

In [None]:
import pandas as pd
import numpy as np
import csv
import difflib
from pprint import pprint
from gensim.models import TfidfModel
from gensim.corpora import Dictionary

## 2.根据音乐艺术家名字进行检索的设计代码

In [180]:
# Load artists.dat (tab-separated; quote characters are kept as literal text).
artists = pd.read_csv(
    'C:\\Users\\ASUS\\Desktop\\homework\\hetrec2011-lastfm-2k\\artists.dat',
    encoding='utf-8',
    delimiter="\t",
    quoting=csv.QUOTE_NONE,
)

# Plain Python list holding every value of the `name` column, in table order.
# It is the candidate pool for the fuzzy artist-name search.
artists_list = list(artists['name'])

def s_b_n(sname):
    """Search artists by name (search_by_name) and print the matches.

    Fuzzy-matches ``sname`` against the module-level ``artists_list`` with
    ``difflib.get_close_matches`` and prints the resulting list: at most 10
    names whose similarity ratio is at least 0.1 (the ratio ranges over 0..1).
    """
    print(difflib.get_close_matches(sname, artists_list, n=10, cutoff=0.1))

## 3.根据标签进行检索的设计代码

In [173]:
# Load tags.dat and the first three columns of user_taggedartists.dat.
tags = pd.read_csv(
    'C:\\Users\\ASUS\\Desktop\\homework\\hetrec2011-lastfm-2k\\tags.dat',
    encoding='gbk',
    delimiter="\t",
    quoting=csv.QUOTE_NONE,
)
user_taggedartists = pd.read_csv(
    'C:\\Users\\ASUS\\Desktop\\homework\\hetrec2011-lastfm-2k\\user_taggedartists.dat',
    encoding='gbk',
    delimiter="\t",
    quoting=csv.QUOTE_NONE,
    usecols=[0, 1, 2],
)

# Join the three tables: tags -> taggings on tagID, then taggings -> artists
# on artistID == id.
t_a_w = pd.merge(
    pd.merge(tags, user_taggedartists, on='tagID'),
    artists,
    left_on='artistID',
    right_on='id',
)  # tag_artist_whole

# Keep only the (tag text, artist name) relation and drop repeated pairs.
t_a = t_a_w[['tagValue', 'name']].drop_duplicates()

# Distinct tag texts in first-seen order; dict keys give an order-preserving
# de-duplication without rescanning a list for every row.
tags_list = list(dict.fromkeys(t_a['tagValue']))

def s_b_t(stag):
    """Search artists by tag (search_by_tag) and print one table per tag.

    Fuzzy-matches ``stag`` against the module-level ``tags_list`` (at most 10
    tags, similarity ratio of at least 0.1 on a 0..1 scale). For every matched
    tag, the rows of ``t_a`` carrying exactly that tag text are printed.
    """
    for close_tag in difflib.get_close_matches(stag, tags_list, n=10, cutoff=0.1):
        rows_for_tag = t_a.loc[t_a['tagValue'] == close_tag]
        print(rows_for_tag)

## 4.基于用户感兴趣程度的推荐的设计代码

In [174]:
# Load user_artists.dat.
u_a = pd.read_csv(
    'C:\\Users\\ASUS\\Desktop\\homework\\hetrec2011-lastfm-2k\\user_artists.dat',
    encoding='gbk',
    delimiter="\t",
    quoting=csv.QUOTE_NONE,
)

# First five rows of every user, in file order.
# NOTE(review): this only yields each user's five favourite artists if the
# file is already ordered by interest within each user - confirm.
first_five_per_user = u_a.groupby('userID').head(5)

# Attach artist details (left join keeps rows whose artist is unknown), then
# keep just the user id -> artist name relation.  # recommend_user_artist
r_u_a = first_five_per_user.merge(
    artists, left_on='artistID', right_on='id', how='left'
)[['userID', 'name']]

def r_b_w(uid):
    """Recommend by weight: print the rows of ``r_u_a`` whose userID is ``uid``."""
    belongs_to_user = r_u_a['userID'] == uid
    print(r_u_a.loc[belongs_to_user])

## 5.基于用户好友的推荐的设计代码

In [175]:
# Load user_friends.dat (tab-separated; quote characters are kept as literal text).
u_f = pd.read_csv(
    'C:\\Users\\ASUS\\Desktop\\homework\\hetrec2011-lastfm-2k\\user_friends.dat',
    encoding='gbk',
    delimiter="\t",
    quoting=csv.QUOTE_NONE,
)

def r_b_f(uid):
    """Recommend by friends: run ``r_b_w`` for every friend of user ``uid``.

    Friends are the ``friendID`` values of the ``u_f`` rows whose ``userID``
    equals ``uid``; one table is printed per friend.
    """
    friend_ids = u_f.loc[u_f['userID'] == uid, 'friendID']
    for friend_id in friend_ids:
        r_b_w(friend_id)

## 6.根据用户查询的具体歌手进行相似的歌手推荐

In [176]:
# Load tags.dat again, this time indexed by tagID.
tags_id_list = pd.read_csv(
    'C:\\Users\\ASUS\\Desktop\\homework\\hetrec2011-lastfm-2k\\tags.dat',
    encoding='gbk',
    delimiter="\t",
    quoting=csv.QUOTE_NONE,
    index_col='tagID',
)
# For every tagID, collect the names of the artists carrying that tag into a
# list (one entry per tagging row, so repeated names are kept).
t_a_id_list = t_a_w[['tagID', 'name']].groupby("tagID").agg(list)

# tagIDs present in both tables.
artists_index = set(tags_id_list.index) & set(t_a_id_list.index)
# Artist-name lists restricted to those tagIDs.
new_t_a_id_list = t_a_id_list.loc[list(artists_index)]
# Attach the artist-name lists to the tag table (left join on the index).
t_a_id_list = tags_id_list.join(new_t_a_id_list)
# Tags without any artist come out of the join as a missing value; give them
# an empty list. Checking for "is a list" avoids an identity comparison with
# np.nan, which only works while the missing marker is that exact object.
t_a_id_list = pd.DataFrame(
    [
        (row[0], row[1], row[2] if isinstance(row[2], list) else [])
        for row in t_a_id_list.itertuples()
    ],
    columns=["tagID", "tagValue", "name"],
)
# Index the result by tagID.
t_a_id_list.set_index("tagID", inplace=True)

# One "document" per tag: the list of artist names attached to that tag.
dataset = t_a_id_list['name'].values

# Vocabulary mapping every distinct artist name to an integer token id.
dct = Dictionary(dataset)

# Bag-of-words form of every document.
corpus = list(map(dct.doc2bow, dataset))

# TF-IDF model fitted on the bag-of-words corpus.
model = TfidfModel(corpus)

# Build one profile per tag: its 30 highest-weighted artist names.
tag_profile = []
for row, bow in zip(t_a_id_list.itertuples(), corpus):
    # Tag id (the frame's index) and tag text.
    tag_id = row[0]
    t_n = row[1]
    # TF-IDF vector of this tag's document: (token id, weight) pairs.
    vector = model[bow]
    # Keep the 30 entries with the largest weight, heaviest first.
    artist_tags = sorted(vector, key=lambda pair: pair[1], reverse=True)[:30]
    # Map each kept token back to its artist name: {artist name: weight}.
    topN_tags_weights = {dct[token_id]: weight for token_id, weight in artist_tags}

    topN_tags = list(topN_tags_weights)
    # Store the tag's own text (t_n) alongside its id.
    tag_profile.append((tag_id, t_n, topN_tags, topN_tags_weights))

tag_profile = pd.DataFrame(tag_profile, columns=["tagID", "t_n", "profile", "weights"])
# Index the profile table by tagID.
tag_profile.set_index("tagID", inplace=True)

# Invert the tag profiles: artist name -> list of (tagID, weight) pairs, one
# pair for every tag whose profile contains that artist.
artist_table = {}
# Series.items() is used because Series.iteritems() no longer exists in
# pandas 2.x.
for tag_id, weights in tag_profile["weights"].items():
    for artist_name, weight in weights.items():
        artist_table.setdefault(artist_name, []).append((tag_id, weight))


In [177]:
# Load artists.dat again, this time indexed by artist id.
artists_id_list = pd.read_csv(
    'C:\\Users\\ASUS\\Desktop\\homework\\hetrec2011-lastfm-2k\\artists.dat',
    encoding='utf-8',
    delimiter="\t",
    quoting=csv.QUOTE_NONE,
    index_col='id',
)
# Join artists -> (tags joined with taggings) on id == artistID.
a_t_w = pd.merge(
    artists,
    pd.merge(tags, user_taggedartists, on='tagID'),
    left_on='id',
    right_on='artistID',
)
a_t = a_t_w[['id', 'tagValue']]
# For every artist id, collect the tag texts applied to that artist into a
# list (one entry per tagging row, so repeated tags are kept).
a_t_id_list = a_t.groupby("id").agg(list)

# Artist ids present in both tables.
artists_index = set(artists_id_list.index) & set(a_t_id_list.index)
# Tag lists restricted to those artist ids.
new_a_t = a_t_id_list.loc[list(artists_index)]
# Attach the tag lists to the artist table (left join on the index) and keep
# only the name and tag-list columns.
a_t_id_list = artists_id_list.join(new_a_t)
a_t_id_list = a_t_id_list[['name', 'tagValue']]
# Artists without any tag come out of the join as a missing value; give them
# an empty list. Checking for "is a list" avoids an identity comparison with
# np.nan, which only works while the missing marker is that exact object.
a_t_id_list = pd.DataFrame(
    [
        (row[0], row[1], row[2] if isinstance(row[2], list) else [])
        for row in a_t_id_list.itertuples()
    ],
    columns=["id", "name", "tagValue"],
)
# Index the result by artist id.
a_t_id_list.set_index("id", inplace=True)

# One "document" per artist: the list of tag texts applied to that artist.
dataset = a_t_id_list['tagValue'].values

# Vocabulary mapping every distinct tag text to an integer token id.
dct = Dictionary(dataset)

# Bag-of-words form of every document.
corpus = list(map(dct.doc2bow, dataset))

# TF-IDF model fitted on the bag-of-words corpus.
model = TfidfModel(corpus)

# Build one profile per artist: its 30 highest-weighted tags.
artist_profile = []
for row, bow in zip(a_t_id_list.itertuples(), corpus):
    # Artist id (the frame's index) and artist name. A dedicated name is used
    # for the id so the builtin id() is not rebound at module level.
    artist_id = row[0]
    a_n = row[1]
    # TF-IDF vector of this artist's document: (token id, weight) pairs.
    vector = model[bow]
    # Keep the 30 entries with the largest weight, heaviest first.
    artist_tags = sorted(vector, key=lambda pair: pair[1], reverse=True)[:30]
    # Map each kept token back to its tag text: {tag text: weight}.
    topN_tags_weights = {dct[token_id]: weight for token_id, weight in artist_tags}

    topN_tags = list(topN_tags_weights)
    artist_profile.append((artist_id, a_n, topN_tags, topN_tags_weights))

artist_profile = pd.DataFrame(artist_profile, columns=["id", "a_n", "profile", "weights"])
# Index the profile table by artist id.
artist_profile.set_index("id", inplace=True)

# Invert the artist profiles: tag text -> list of (artist id, weight) pairs,
# one pair for every artist whose profile contains that tag.
tag_table = {}
# Series.items() is used because Series.iteritems() no longer exists in
# pandas 2.x.
for artist_id, weights in artist_profile["weights"].items():
    for tag, weight in weights.items():
        tag_table.setdefault(tag, []).append((artist_id, weight))

In [178]:
def r_b_s_a(sname):
    """Recommend by searched artist: print artists similar to ``sname``.

    The lookup goes artist -> best tag -> other artists:

    1. ``artist_table[sname]`` gives the (tagID, weight) pairs for the artist;
       the tagID with the greatest weight is selected.
    2. ``tags_id_list`` translates that tagID into its tag text.
    3. ``tag_table[tag text]`` gives (artist id, weight) pairs; they are
       ranked by weight, highest first, and the names of the top 20 are
       printed one per line via ``artists_id_list``.

    Args:
        sname: Artist name, exactly as it appears as a key of ``artist_table``.

    Raises:
        KeyError: If ``sname`` is not a key of ``artist_table`` or the selected
            tag text is not a key of ``tag_table``.
    """
    related_tags = artist_table[sname]
    # Highest-weighted tag of the searched artist; on ties the first one wins.
    best_tag_id, _ = max(related_tags, key=lambda pair: pair[1])
    best_tag_name = tags_id_list['tagValue'].loc[best_tag_id]

    # Accumulated weight per candidate artist.
    scores = {}
    for artist_id, relate_weight in tag_table[best_tag_name]:
        # Entries with a weight of exactly 1 are left out.
        # NOTE(review): presumably these are artists whose profile consists of
        # this single tag - confirm the intent.
        if relate_weight == 1:
            continue
        scores[artist_id] = scores.get(artist_id, 0) + relate_weight

    # Top 20 candidates, heaviest first.
    ranked = sorted(scores.items(), key=lambda item: item[1], reverse=True)[:20]
    for artist_id, _ in ranked:
        print(artists_id_list['name'].loc[artist_id])

## 7.输出检查

In [179]:
# Smoke test: call each search / recommendation entry point once.
# Fuzzy artist-name search.
s_b_n('雅')
# Fuzzy tag search; prints one table per matched tag.
s_b_t('metal')
# Artists recommended for user 2 from that user's own rows.
r_b_w(2)
# Artists recommended for user 2 from each friend's rows.
r_b_f(2)
# Artists similar to the given artist.
r_b_s_a('奥井雅美')

['奥井雅美', '上田雅美', '雅-MIYAVI-']
      tagValue                     name
0        metal                 Megadeth
148      metal            Paradise Lost
174      metal         System of a Down
367      metal  Bullet for My Valentine
448      metal                In Flames
...        ...                      ...
22216    metal             Cliff Burton
22235    metal                Crematory
22244    metal                     Doro
22251    metal                   Diablo
22255    metal      Dance Club Massacre

[638 rows x 2 columns]
       tagValue    name
3154     metall  Slayer
161766   metall  Психея
      tagValue        name
66521   mental  Iron Jesus
     tagValue       name
1200     meta  Metallica
      tagValue                 name
22887  j-metal  Maximum the Hormone
       tagValue           name
8857   us metal         Dokken
16207  us metal       W.A.S.P.
18103  us metal  Armored Saint
        tagValue                      name
337     nu-metal          System of a Down
634     