In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
from pyquery import PyQuery as pq
import json
import re

In [None]:
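# Reference links:
# https://segmentfault.com/a/1190000014948845
# https://www.jianshu.com/p/f3764544f6d6
# https://cloud.tencent.com/developer/article/1543945

# Playlist ids of NetEase Cloud Music charts:
# id=19723756: 云音乐飙升榜 (Cloud Music Soaring Chart)
# id=3779629: 云音乐新歌榜 (Cloud Music New Songs Chart)
# id=3778678: 云音乐热歌榜 (Cloud Music Hot Songs Chart)
# id=2250011882: 抖音排行榜 (Douyin Chart)
# id=5059661515: 云音乐民谣榜 (Cloud Music Folk Chart)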

## User defined functions 

### webpage capture function

In [2]:
# Browser-like request headers shared by every scraping function in this notebook.
# The User-Agent is built from adjacent string literals: a backslash continuation
# *inside* a literal would embed the next line's indentation in the header value.
headers = {
    'User-Agent': ('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 '
                   '(KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'),
    'Referer': 'https://music.163.com/',
    'Host': 'music.163.com',
}

def pageCapture(url, timeout=10):
    """Download ``url`` and return its body parsed as a BeautifulSoup document.

    Parameters
    ----------
    url : str
        Address to fetch. The module-level ``headers`` dict is sent with the request.
    timeout : float or tuple, optional
        Passed to ``requests.get``; without it a stalled connection would block
        the notebook indefinitely. Defaults to 10 seconds.

    Returns
    -------
    bs4.BeautifulSoup
        The raw response bytes parsed with the built-in ``html.parser``.

    Raises
    ------
    requests.RequestException
        Propagated unchanged when the request fails or times out.
    """
    response = requests.get(url, headers=headers, timeout=timeout)
    return BeautifulSoup(response.content, 'html.parser')

### get playlist function 

In [3]:
def _select_text(soup, selector):
    """Return the stripped text of the first element matching ``selector``, or None."""
    found = soup.select(selector)
    return found[0].get_text().strip() if found else None


def _select_data_count(soup, selector):
    """Return the digits of the first ``data-count="..."`` among matches of ``selector``, or None."""
    match = re.search(r'data-count="(\d+?)"', str(soup.select(selector)))
    return match.group(1) if match else None


def getPlaylistInfor(pages, order, cat):
    """Scrape playlist listing pages and the detail page of every playlist found.

    Parameters
    ----------
    pages : int
        Number of listing pages to request; page ``p`` uses ``offset = p * 35``.
    order : str
        Value placed in the ``order`` query parameter (the notebook passes ``"hot"``).
    cat : str
        Value placed in the ``cat`` query parameter. It is inserted verbatim,
        so it must already be URL-encoded.

    Returns
    -------
    pandas.DataFrame
        One row per playlist id with columns ``playlstID``, ``playlstTitle``,
        ``playlstStream``, ``playlstCreateDate``, ``playlstCollections``,
        ``playlstReposts`` and ``playlstComments``. Values are the strings
        scraped from the page; a field that cannot be located is ``None``.

    Raises
    ------
    requests.RequestException
        If any listing or detail request fails or times out.
    """
    columns = ["playlstID", "playlstTitle", "playlstStream",
               "playlstCreateDate", "playlstCollections", "playlstReposts",
               "playlstComments"]

    # Step 1: collect playlist ids from the listing pages.
    playlst_ids = []
    for page in range(pages):
        # Adjacent literals keep the query string free of stray whitespace.
        url = ("http://music.163.com/discover/playlist/"
               "?order={}&cat={}&limit=35&offset={}").format(order, cat, page * 35)
        response = requests.get(url, headers=headers, timeout=10)
        playlst_ids.extend(re.findall(r'playlist\?id=(\d+?)" class="msk"', response.text))

    # Step 2: visit each playlist page. Rows are built whole so that a missing
    # field can never shift one column against the others.
    records = []
    for idx in playlst_ids:
        mainpage = requests.get('http://music.163.com/playlist?id={}'.format(idx),
                                headers=headers, timeout=10)
        mainsoup = BeautifulSoup(mainpage.text, 'html.parser')

        title_match = re.search(r'>(.+?) - 歌单 - 网易云音乐', mainpage.text)
        playlst_title = title_match.group(1) if title_match else None

        created = _select_text(mainsoup, 'span.time.s-fc4')
        # Only the first 10 characters of the creation text are kept.
        playlst_create_date = created[0:10] if created is not None else None

        records.append((
            idx,
            playlst_title,
            _select_text(mainsoup, 'strong#play-count.s-fc6'),
            playlst_create_date,
            _select_data_count(mainsoup, 'a.u-btni.u-btni-fav'),
            _select_data_count(mainsoup, 'a.u-btni.u-btni-share'),
            _select_text(mainsoup, 'span#cnt_comment_count'),
        ))

    return pd.DataFrame(data=records, columns=columns)

In [3]:
def getTrendingSongsInfor(playlistid):
    """Fetch the tracks of a playlist from the ``/api/playlist/detail`` endpoint.

    Parameters
    ----------
    playlistid : int or str
        Playlist id substituted into the API URL.

    Returns
    -------
    pandas.DataFrame
        One row per entry of ``result.tracks`` with columns ``songID``,
        ``songTitle``, ``artists`` (name of the first listed artist only),
        ``albums``, ``songType`` (album ``subType``), ``publishTime``
        (album ``publishTime``) and ``company`` (album ``company``).

    Raises
    ------
    requests.RequestException
        If the request fails or times out.
    ValueError
        If the response body is not valid JSON.
    KeyError, IndexError
        If the payload lacks one of the fields read below.
    """
    url = 'https://music.163.com/api/playlist/detail?id={}'.format(playlistid)
    # Decode the body as JSON directly. Passing it through an HTML parser and
    # back to text re-escapes characters such as '&' and corrupts names.
    response = requests.get(url, headers=headers, timeout=10)
    json_dict = response.json()

    rows = []
    for song in json_dict['result']['tracks']:
        album = song['album']
        rows.append((
            song['id'],
            song['name'],
            song['artists'][0]['name'],
            album['name'],
            album['subType'],
            album['publishTime'],
            album['company'],
        ))

    df_trending_songs = pd.DataFrame(
        data=rows,
        columns=["songID", "songTitle", "artists", "albums", "songType",
                 "publishTime", "company"])
    return df_trending_songs

### get song function 

In [4]:
def getSongs(playlst_id):
    """Scrape the songs listed on a playlist page and look up artist and album.

    Parameters
    ----------
    playlst_id : int or str
        Playlist id substituted into the playlist page URL.

    Returns
    -------
    pandas.DataFrame
        Columns ``playlstID``, ``songID``, ``songTitle``, ``artists`` (first
        listed artist) and ``albums``. When the detail lookup of a song fails,
        that song keeps its row with ``None`` in ``artists`` and ``albums``.

    Raises
    ------
    requests.RequestException
        If the playlist page itself cannot be fetched.
    """
    response = requests.get('http://music.163.com/playlist?id={}'.format(playlst_id),
                            headers=headers, timeout=10)
    # Ids and titles come from two separate patterns over the same page text.
    # zip() below truncates to the shorter list if they ever disagree.
    songID = re.findall(r'song\?id=(\d+?)".+?</a>', response.text)
    songTitle = re.findall(r'song\?id=\d+?">(.+?)</a>', response.text)

    artists = []
    albums = []

    for idx in songID:
        songInfor = "http://music.163.com/api/song/detail/?id={}&ids=%5B{}%5D".format(idx, idx)
        try:
            # Parse the API body as JSON directly; an HTML parser round-trip
            # would re-escape '&', '<' and '>' inside names.
            detail = requests.get(songInfor, headers=headers, timeout=10).json()
            first_song = detail['songs'][0]
            artist = first_song['artists'][0]['name']
            album = first_song['album']['name']
        except Exception:
            # Best effort: keep the song and mark the lookup as missing.
            artist = None
            album = None

        # Always append, so artists/albums stay index-aligned with songID.
        artists.append(artist)
        albums.append(album)

    df_songs = pd.DataFrame(data=list(zip([playlst_id] * len(songID), songID,
                                          songTitle, artists, albums)),
                            columns=["playlstID", "songID", "songTitle", "artists", "albums"])

    return df_songs

### get comment function 

In [5]:
def getComments(songid, pages):
    """Collect the ``hotComments`` entries returned by the comment API for a song.

    Parameters
    ----------
    songid : int or str
        Song id substituted into the ``R_SO_4_<id>`` resource name.
    pages : int
        Number of pages to request; page ``p`` uses ``limit=50`` and
        ``offset = p * 50``.

    Returns
    -------
    pandas.DataFrame
        Columns ``songID``, ``userID``, ``userName``, ``commentID``,
        ``comments``, ``commentDateTime`` and ``commentLike``. Pages that fail
        to download or decode, and comments lacking an expected field, are skipped.

    Notes
    -----
    Only the ``hotComments`` list of each response is read.
    NOTE(review): if the API returns ``hotComments`` only for the first page,
    later pages contribute no rows. Confirm whether that is intended.
    """
    columns = ['songID', "userID", "userName", "commentID", "comments",
               "commentDateTime", "commentLike"]
    rows = []

    for page in range(pages):
        # Adjacent literals: a backslash continuation inside the string would
        # put whitespace in front of "limit" and garble the query parameter.
        url = ("http://music.163.com/api/v1/resource/comments/R_SO_4_{}"
               "?limit=50&offset={}").format(songid, page * 50)
        try:
            # Decode JSON directly rather than through an HTML parser, which
            # would re-escape '&', '<' and '>' in comment text.
            json_dict = requests.get(url, headers=headers, timeout=10).json()
            for item in json_dict['hotComments']:
                try:
                    row = (
                        songid,
                        item['user']['userId'],
                        item['user']['nickname'],
                        item['commentId'],
                        item['content'],
                        item['time'],
                        item['likedCount'],
                    )
                except (KeyError, TypeError, IndexError):
                    # Incomplete comment record: skip it entirely.
                    continue
                rows.append(row)
        except Exception:
            # Best effort per page. Exception (not a bare except) still lets a
            # kernel interrupt stop a long scrape.
            continue

    df_comment = pd.DataFrame(data=rows, columns=columns)

    return df_comment

### get user profile 

In [6]:
def userInfor(userid):
    """Fetch selected profile fields of a user from ``/api/v1/user/detail``.

    Parameters
    ----------
    userid : int or str
        User id substituted into the API URL.

    Returns
    -------
    list or None
        ``[userid, level, listenSongs, followeds, follows, gender, province,
        city]`` on success. ``None`` when the request fails or the payload
        lacks any of those fields; a ``page error`` line is printed in that case.
    """
    url = "https://music.163.com/api/v1/user/detail/{}".format(userid)
    try:
        # Decode JSON directly rather than through an HTML parser round-trip.
        json_dict = requests.get(url, headers=headers, timeout=10).json()
        profile = json_dict['profile']
        return [
            userid,
            json_dict['level'],
            json_dict['listenSongs'],
            profile['followeds'],
            profile['follows'],
            profile['gender'],
            profile['province'],
            profile['city'],
        ]
    except Exception as err:
        # Report the id that failed; the previous message referenced a name
        # that is not defined in this function and raised NameError instead.
        print('page error ', userid, err)
        return None

## Data collection - web scraping

### Chinese Songs 华语歌单 

In [7]:
# Scrape 20 listing pages ordered by "hot" for the category 华语 (Chinese-language).
# The category is given URL-encoded because getPlaylistInfor inserts it verbatim.
ChineseSonglists = getPlaylistInfor(
    pages=20,
    order="hot",
    cat="%E5%8D%8E%E8%AF%AD",
)
ChineseSonglists.head()

Unnamed: 0,playlstID,playlstTitle,playlstStream,playlstCreateDate,playlstCollections,playlstReposts,playlstComments
0,2829883282,[华语私人订制] 最懂你的华语推荐 每日更新35首,359956960,2019-06-05,3018942,14389,36412
1,5035498001,拜拜上半年的不顺，下半年愿你苦尽甘来,228831,2020-05-26,882,91,13
2,2909007895,刷题背书写作业轻音乐（致正在努力的你）,275896,2019-07-31,7706,50,160
3,3134064854,网易云比你懂我,3686969,2019-12-24,45285,250,666
4,2618915549,你搜不到的土嗨神曲,61153232,2019-01-11,1264625,10083,3154


In [7]:
# (rows, columns) of the scraped playlist table.
ChineseSonglists.shape

(700, 7)

In [None]:
# Persist the playlist table; index=False leaves the positional index out of the sheet.
ChineseSonglists.to_excel(
    excel_writer="ChineseSonglists华语歌单.xlsx",
    index=False,
)

In [9]:
# Scrape the track list of every playlist collected above.
# To resume an interrupted run, slice ChineseSonglists.playlstID.values so that
# it starts after the last playlist id that was completed.
# Frames are gathered first and concatenated once: calling pd.concat inside the
# loop copies all previously collected rows on every iteration.
song_frames = [getSongs(i) for i in ChineseSonglists.playlstID.values]
df_songs = pd.concat(song_frames, axis=0) if song_frames else pd.DataFrame([])

print(df_songs.shape)
df_songs.head()

(6999, 5)


Unnamed: 0,playlstID,songID,songTitle,artists,albums
0,2829883282,1318234987,贝贝,李荣浩,耳朵
1,2829883282,167696,火烧的寂寞,信,趁我
2,2829883282,1425886269,听到就爱上的二十秒即兴,特污兔,听到就爱上的二十秒即兴
3,2829883282,66282,浮夸,陈奕迅,U-87
4,2829883282,150361,三国恋,Tank,Fighting!生存之道


In [10]:
# Persist the playlist-to-song table without the positional index.
df_songs.to_excel(
    excel_writer="df_songs.xlsx",
    index=False,
)

In [7]:
# Fetch up to 10 comment pages for every song row in df_songs.
# Frames are collected in a list and concatenated once; concatenating inside
# the loop re-copies all earlier rows on each iteration.
comment_frames = [getComments(i, 10) for i in df_songs.songID.values]
df_comment = pd.concat(comment_frames, axis=0) if comment_frames else pd.DataFrame([])

print(df_comment.shape)
df_comment.head()

(89404, 7)


Unnamed: 0,songID,userID,userName,commentID,comments,commentDateTime,commentLike
0,1318234987,368558326,大米思蜜达,1274553078,当你看完这条评论的时候，你已经在听下一首歌了,1539707347260,752273
1,1318234987,260139318,假若我年少有为,1274602859,虽然只有短短的四秒，但是却展现了李荣浩高超的作词及作曲能力，戛然而止，而意犹未尽，让人浮想联...,1539710970774,498506
2,1318234987,622891340,亲爱的巴恩斯中士,1274703747,眼睛不大，胆子不小[大哭],1539738634623,449685
3,1318234987,103645133,在与枕头的婚礼中入睡,1274709846,周杰伦：能不能给我一首歌的时间\n李荣浩：给你一首《贝贝》,1539740602675,375818
4,1318234987,415034968,松岛阿森,1274666303,请李先生尊重一下2G网用户\n缓冲一分钟就一句拜拜？？[大笑],1539736736859,349181


In [8]:
# Persist the comment table without the positional index.
df_comment.to_excel(
    excel_writer="df_comment.xlsx",
    index=False,
)

In [None]:
# Look up the profile of every distinct commenter. Failed lookups are reported
# and skipped. Catching Exception (not a bare except) means a kernel interrupt
# can still stop this long-running loop.
users = []
for i in list(set(df_comment.userID.values)):
    try:
        users.append(userInfor(i))
    except Exception as err:
        print('page error ', i, err)
        continue

# page error  [1644563139]
# page error  [1950356667]
# page error  [1644566105]
# page error  [510404522]
# page error  [1644577261]
# page error  [1893752281]
# page error  [516321657]
# page error  [1894286550]
# page error  [1929419306]
# page error  [1321384722]
# page error  [1522449851]
# page error  [9753203]
# page error  [38987663]
# page error  [582543308]
# page error  [1894313543]
# page error  [251851501]
# page error  [1894316508]
# page error  [1894316531]
# page error  [291704061]
# page error  [1650007669]
# page error  [1644505911]
# page error  [1641632295]
# page error  [1719236828]
# page error  [1893706841]
# page error  [628339165]
# page error  [1893322967]
# page error  [1636946197]
# page error  [1923469918]
# page error  [113628079]
# page error  [1893719852]
# page error  [1893719853]
# page error  [1315300598]
# page error  [1644555133]
# page error  [486274606]
# page error  [124123185]
# page error  [1894251701]

In [12]:
# Drop the failed lookups (None entries) and tabulate the remaining profiles.
userprofile = [profile for profile in users if profile is not None]

df_user = pd.DataFrame(data=userprofile,
                       columns=["userID", "userLevel", "listenSongs", "followeds", "follows",
                                'gender', 'province', 'city'])
print(df_user.shape)
df_user.head()

(48127, 8)


Unnamed: 0,userID,userLevel,listenSongs,followeds,follows,gender,province,city
0,1,9,14564,99999,456,1,110000,110101
1,1365770243,6,607,94,21,0,0,100
2,1334181892,6,973,3,9,1,0,100
3,1809842188,8,9766,4196,52,1,0,100
4,40370188,9,10426,22,4,1,0,100


In [13]:
# Persist the user profile table without the positional index.
df_user.to_excel(
    excel_writer="df_user.xlsx",
    index=False,
)

### 云音乐飙升榜 (Cloud Music Soaring Chart)

In [4]:
# Playlist id 19723756 is the 云音乐飙升榜 chart listed at the top of the notebook.
TrendingSongs = getTrendingSongsInfor(playlistid=19723756)
print(TrendingSongs.shape)
TrendingSongs.head()

(100, 7)


Unnamed: 0,songID,songTitle,artists,albums,songType,publishTime,company
0,1463349975,烟花易冷 (live),周深,流淌的歌声 第二季 第4期,现场版,1594915200000,广东卫视
1,1463362956,成全,买辣椒也用券,成全,录音室版,1595001600000,智慧大狗 × 网易云音乐
2,1456286877,爱，存在,旺仔小乔,爱，存在（正式版）,,1595001600000,
3,1460682363,爱，存在,卢卢快闭嘴,爱，存在,录音室版,1593964800000,中视鸣达
4,1450574147,情人,蔡徐坤,情人,录音室版,1590249600000,上海圣臻文化发展有限公司


In [5]:
# Snapshot of the chart as scraped; the date is part of the file name.
TrendingSongs.to_excel(
    excel_writer="TrendingSongs_20200719.xlsx",
    index=False,
)

In [None]:
# Reload the saved chart snapshot so this cell can run without re-scraping it.
TrendingSongs = pd.read_excel("TrendingSongs_20200719.xlsx")

# Fetch up to 100 comment pages for each distinct song on the chart.
# Frames are concatenated once at the end; concatenating inside the loop
# re-copies every previously collected row on each iteration.
trending_frames = [getComments(i, 100) for i in list(set(TrendingSongs.songID.values))]
df_trending_comment = (pd.concat(trending_frames, axis=0)
                       if trending_frames else pd.DataFrame([]))

print(df_trending_comment.shape)
df_trending_comment.head()