# 使用Google APIs - YouTube Data API v3 取得電影預告片留言內容

In [1]:
import pandas as pd
import googleapiclient.discovery
import datetime

#### api_keys：可更改api_keys中的金鑰內容

In [2]:
import os

# API keys, tried in order; change_key() moves to the next entry of this list.
# NOTE(security): a literal key stored in the notebook is readable by anyone
# who can open the file. Keys can instead be supplied through the
# YOUTUBE_API_KEYS environment variable (comma-separated); the literal below is
# only the fallback used when that variable is unset or empty.
_env_api_keys = [k.strip() for k in os.environ.get("YOUTUBE_API_KEYS", "").split(",") if k.strip()]
api_keys = _env_api_keys or ["AIzaSyA2lO3oKFkCiEb84LxMssuPUBD15S1xcok"]
key_number = 0  # index into api_keys of the key currently in use
current_key = api_keys[key_number]
current_key

'AIzaSyA2lO3oKFkCiEb84LxMssuPUBD15S1xcok'

#### change_key(): 當目前使用API key每日流量超出限制時，改變使用的金鑰

In [3]:
def change_key():
    """Switch to the next entry of ``api_keys`` and rebuild the API client.

    Updates the module-level ``key_number``, ``current_key`` and ``youtube``.
    When the last key is already in use, nothing is changed and a notice is
    printed instead.
    """
    global api_keys, key_number, current_key, youtube

    last_index = len(api_keys) - 1
    if key_number == last_index:
        print("no more api_keys")
        return

    key_number = key_number + 1
    current_key = api_keys[key_number]
    # The client is bound to one developer key, so it is rebuilt for the new one.
    youtube = googleapiclient.discovery.build("youtube", "v3", developerKey=current_key)
    print("change API Key to:{}".format(current_key))

#### get_comments(response): 抓取影片所有留言(commentThreads)內容所需的值
- 影片ID(videoId)
- 留言時間(publishedAt)
- 留言者名稱(authorDisplayName)
- 留言內容(textOriginal)
- 留言按讚數(likeCount)
- 留言回覆數(commentsReplyCount)

In [3]:
def get_comments(response):
    """Flatten one ``commentThreads`` response page into a DataFrame.

    Parameters
    ----------
    response : dict
        Parsed API response. Only entries of ``response["items"]`` whose
        ``kind`` is ``"youtube#commentThread"`` are used; a response without
        an ``items`` key is treated as an empty page.

    Returns
    -------
    pandas.DataFrame
        One row per comment thread with the columns ``videoid``,
        ``publishedAt``, ``authorDisplayName``, ``textOriginal``,
        ``likeCount`` and ``commentsReplyCount``. The columns are present
        even when there are no rows.

    Raises
    ------
    KeyError
        If an item lacks ``kind`` or a comment thread lacks one of the
        fields read below.
    """
    columns = ['videoid', 'publishedAt', 'authorDisplayName',
               'textOriginal', 'likeCount', 'commentsReplyCount']
    rows = []

    for item in response.get("items", []):
        if item["kind"] != "youtube#commentThread":
            continue
        thread = item["snippet"]
        top_comment = thread["topLevelComment"]["snippet"]
        # Building one record per thread keeps all fields of a row together,
        # so no after-the-fact length check across parallel lists is needed.
        rows.append({
            'videoid': top_comment["videoId"],
            'publishedAt': top_comment["publishedAt"],
            'authorDisplayName': top_comment["authorDisplayName"],
            'textOriginal': top_comment["textOriginal"],
            'likeCount': top_comment["likeCount"],
            'commentsReplyCount': thread["totalReplyCount"],
        })

    return pd.DataFrame(rows, columns=columns)

#### get_trailer_comments(videoId): 傳入影片ID(videoId)會取得所抓取的所有評論的內容
- API限定每一次request只能100筆，換頁要帶nextPageToken參數給下次request使用
- trailer_comments：儲存評論內容，設為全域變數，有exception發生之前抓取的資料還在裡面！可以先匯出成csv檔存檔！
- execution_log：是儲存執行與exception發生的資訊

In [4]:
def _log_execution(videoId, page, nextPageToken, message):
    """Append one record to the global ``execution_log`` DataFrame."""
    global execution_log
    entry = pd.DataFrame({'videoid': [videoId], 'page': [page],
                          'nextPageToken': [nextPageToken], 'message': [message]})
    # DataFrame.append was removed in pandas 2.0; pd.concat gives the same result.
    execution_log = pd.concat([execution_log, entry], ignore_index=True)


def get_trailer_comments(videoId):
    """Download every comment-thread page of one video.

    Pages are requested with ``maxResults=100`` and ``order="time"`` until the
    response carries no ``nextPageToken``. Parsed comments are concatenated to
    the global ``trailer_comments`` and one record per request (success, empty
    page, or error text) is added to the global ``execution_log``, so data
    collected before a failure stays available.

    Parameters
    ----------
    videoId : str
        YouTube video ID passed to ``commentThreads().list``.

    Side effects
    ------------
    - Sets the global ``flag`` to ``False`` for an inactive key, a server
      processing failure, or an unrecognised error.
    - Calls ``change_key()`` when the daily limit is exceeded and another key
      is still available; the same page is then requested again.
    """
    global trailer_comments, flag

    nextPageToken = ""
    page = 0
    while nextPageToken is not None:
        # Counts request attempts, so a retried page gets a new number.
        page += 1
        request = youtube.commentThreads().list(videoId=videoId, pageToken=nextPageToken, part="snippet",
                                                maxResults=100, order="time", moderationStatus="published")
        try:
            response = request.execute()
            nextPageToken = response.get('nextPageToken')
            if len(response["items"]) != 0:
                tmp = get_comments(response)
                trailer_comments = pd.concat([trailer_comments, tmp], ignore_index=True)
                _log_execution(videoId, page, nextPageToken, "success append!")
            else:
                _log_execution(videoId, page, nextPageToken, "No available comments")
        except Exception as e1:
            # Errors are classified by message text below, so the handler is
            # deliberately broad; the full text is always written to the log.
            error_text = str(e1)
            _log_execution(videoId, page, nextPageToken, error_text)
            if "Daily Limit Exceeded" in error_text:
                print("Daily Limit Exceeded!")
                if current_key == api_keys[-1]:
                    _log_execution(videoId, page, nextPageToken, "all API Keys used")
                    print("No more API keys...QAQ...")
                    break
                # nextPageToken is unchanged, so the next iteration retries
                # the same page with the new key.
                change_key()
            elif "parameter has disabled comments" in error_text:
                print("Video comments disabled!")
                break
            elif "parameter could not be found" in error_text:
                print("Video link is invalid!")
                break
            elif "not found and cannot be used for API calls" in error_text:
                print("API key didn't be activated!...QAQ...")
                flag = False
                break
            elif "API server failed to successfully process the request" in error_text:
                print("processingFailure by API server...QAQ...")
                flag = False
                # NOTE(review): there is no break here, so the same page is
                # retried without any limit while the server keeps failing —
                # confirm whether this is intended.
            else:
                print("unknown error...QAQ...")
                print(error_text)
                flag = False
                break

### 設定連YouTube API所使用的金鑰

In [6]:
# Build the YouTube Data API v3 client with the currently selected key.
youtube = googleapiclient.discovery.build(
    serviceName="youtube",
    version="v3",
    developerKey=current_key,
)
current_key  # echo the key in use as the cell output

'AIzaSyA2lO3oKFkCiEb84LxMssuPUBD15S1xcok'

#### 對照youtube id檔案

In [7]:
# Load the trailer list; the first CSV column is used as the row index.
df_trailer = pd.read_csv(
    filepath_or_buffer='D:/Movie/MovieList/trailer_data_with_year.csv',
    index_col=0,
    encoding='UTF-8',
)
df_trailer.head()

Unnamed: 0,title_id,youtube_id,release_time,release_year
0,19995,5PSNL1qE6VY,9-Nov-09,2009
1,8373,uH3STHC63hU,1-May-09,2009
2,767,JYLdTuL9Wjw,12-Jun-09,2009
3,767,Jrx8zZveyVU,8-Oct-17,2017
4,767,lcNtt0f6KdM,8-Oct-17,2017


在 year 變數設定各自負責的年份，以篩選出該年份的預告片資料

In [8]:
# Keep only the trailers of the chosen release year and renumber the rows from 0.
year = 2019
df_trailer = df_trailer.loc[df_trailer['release_year'].eq(year)].reset_index(drop=True)
df_trailer.head()

Unnamed: 0,title_id,youtube_id,release_time,release_year
0,19585,AD_fezWiaZ0,8-Feb-19,2019
1,10138,LebmJcJX3Ck,10-Jun-19,2019
2,10138,qMObsIYkqxU,10-Jun-19,2019
3,10138,uLUqN8YG-s4,10-Jun-19,2019
4,26022,nGH2mTu7AhU,24-Jan-19,2019


In [10]:
# Number of rows currently in df_trailer.
len(df_trailer)

357

In [11]:
# Row count once the two listed video IDs are left out.
len(df_trailer[~df_trailer['youtube_id'].isin(['TcMBFSGVi1c', 'AMSITikqKiM'])])

355

In [12]:
# Drop the two listed video IDs and renumber the rows from 0.
df_trailer = df_trailer[~df_trailer['youtube_id'].isin(['TcMBFSGVi1c', 'AMSITikqKiM'])].reset_index(drop=True)

In [37]:
# Show the last rows of df_trailer.
df_trailer.tail()

Unnamed: 0,title_id,youtube_id,release_time,release_year
350,555295,pXww9NuqUDU,18-Mar-19,2019
351,595985,u61ymqblUU0,5-Jun-19,2019
352,595985,XKev4lpHMcE,5-Jun-19,2019
353,480429,4MD16iUK4CQ,31-Jan-19,2019
354,515676,LaNUEShKr6c,18-Jan-19,2019


#### 建立trailer_comments與execution_log，寫入csv檔案後記得再執行清空，不然內容會重複寫入

In [113]:
# Start from two separate empty frames: one for collected comments and one for
# the per-request log. Re-run after exporting so rows are not written twice.
trailer_comments, execution_log = pd.DataFrame(), pd.DataFrame()

In [114]:
# Run flag checked by the main loop; get_trailer_comments sets it to False on
# errors after which the loop should stop.
flag = True

#### 設定抓取youtube id的起始資料列(start)與結束資料列(end)

In [115]:
# Slice bounds into df_trailer['youtube_id'] used by the main loop: [start:end].
start, end = 200, 355

### 執行主程式，Good Luck!

若出現 <b>QAQ error</b> 表示可能API Key有問題或是其他不知名錯誤，for迴圈會停止，<b>請注意已抓資料以及對錯誤訊息進行處理</b>

In [116]:
%%time
for query_videoId in df_trailer['youtube_id'][start:end]:
    if flag:
        get_trailer_comments(query_videoId)
    else:
        print(f'QAQ error! Current key= {current_key}')
        break

Video comments disabled!
Video comments disabled!
Video link is invalid!
Video comments disabled!
Wall time: 5min 18s


### 檢查執行後成果

看每一輪最後一筆video_id, 若與execution_log最後一筆id相同代表抓取成功。<br>
若出現QAQ error，建議下次抓取時從上次最後一筆開始抓，檢查是否有抓完整。

In [117]:
# Show the last rows of execution_log, i.e. the most recently logged requests.
execution_log.tail()

Unnamed: 0,videoid,page,nextPageToken,message
553,u61ymqblUU0,4,QURTSl9pMVcySVBfeGtweFJGdzFmdG1WN09qNmVZdHVtZ1...,success append!
554,u61ymqblUU0,5,,success append!
555,XKev4lpHMcE,1,,success append!
556,4MD16iUK4CQ,1,,success append!
557,LaNUEShKr6c,1,,success append!


In [118]:
# youtube_id at index label end-1, the last video covered by the [start:end]
# slice (labels equal positions here because the index was reset).
df_trailer['youtube_id'][end-1]

'LaNUEShKr6c'

檢查trailer_comments(抓取的評論筆數), execution_log(執行次數)

In [119]:
# (number of collected comments, number of log records)
(trailer_comments.shape[0], execution_log.shape[0])

(43698, 558)

檢查已執行過哪些video_id

In [120]:
# Number of log records per video ID, most frequent first.
execution_log['videoid'].value_counts()

xMJNm24n4Ok    206
aejAkKGiimk     44
cQFFnUg0u70     36
EXNP731K1qE     24
M7zrHiqoJ6k     19
7afc9gTbVFI     18
i0gTo-qpxSI      7
0QzILYONVT0      7
w47IJjuEV8o      6
p4m7wgSiDhM      5
B5HI4VsXvIQ      5
u61ymqblUU0      5
Kas0BKHZwEQ      5
pkaTGGwwvSI      4
VDlQQD2354A      4
DovnHrIwfTY      4
BxY2vnJiByw      4
hzrFqZcosI0      3
HIMO1IheTpA      3
NYCTxoXx-H8      3
UDaYckxMgpM      2
GjEPOMZ6sf0      2
se9n853lBNo      2
FjfUPLEKZtI      2
PMlHDNdLGU8      2
OitVw2gE6aQ      2
FhyaYaK9lC4      2
WVZBNT0Ap-A      2
dmtfpB6MUi0      2
KPSKkxTQFEU      2
              ... 
72mPcb_nLq0      1
gYko3cN_KZk      1
l-MzHGRVG4k      1
oB8PpdeSXRQ      1
l43skoy-_qw      1
dPGwJmSMoPs      1
JsNBLkO2Itg      1
3x0aZS6uY7E      1
dlxYlvNlj8o      1
3_yobkhvzJk      1
ejZ75QFesgE      1
BLqAGR8eFt8      1
XKev4lpHMcE      1
xOKfu8X48Hs      1
z1JICa6kiOQ      1
A3NebXesuFc      1
BrDNYXW9a_Y      1
UsFGeBRn5SI      1
QwG3b6e1ARU      1
sBQF9t1qKkA      1
63vgOHam-kg      1
ljz7WxgTpx0 

檢查已執行過video_id的數量

In [121]:
# Number of distinct video IDs that appear in the log.
execution_log['videoid'].nunique()

155

檢查每筆video_id留言數量

In [122]:
# Number of collected comments per video ID, most frequent first.
trailer_comments['videoid'].value_counts()

xMJNm24n4Ok    20513
aejAkKGiimk     4388
cQFFnUg0u70     3593
EXNP731K1qE     2330
M7zrHiqoJ6k     1888
7afc9gTbVFI     1756
i0gTo-qpxSI      669
0QzILYONVT0      643
w47IJjuEV8o      508
p4m7wgSiDhM      471
B5HI4VsXvIQ      457
u61ymqblUU0      420
Kas0BKHZwEQ      402
pkaTGGwwvSI      363
VDlQQD2354A      332
DovnHrIwfTY      322
BxY2vnJiByw      306
NYCTxoXx-H8      289
HIMO1IheTpA      270
hzrFqZcosI0      259
KPSKkxTQFEU      199
xxLBTRcDVfU      194
dmtfpB6MUi0      165
se9n853lBNo      143
OitVw2gE6aQ      134
UDaYckxMgpM      134
FjfUPLEKZtI      125
WVZBNT0Ap-A      124
FhyaYaK9lC4      112
PMlHDNdLGU8      104
               ...  
fh9dImdUYIY        6
sBQF9t1qKkA        5
OPUZKqaWQAg        4
8XdDTM2zcNE        4
qqCw6DqGbKA        4
ydoaqi2Whkk        3
l43skoy-_qw        3
Fk1iGpUFcjY        3
NmNT2emdUGY        3
LByUBimVyJA        3
LaNUEShKr6c        3
JsNBLkO2Itg        2
qVcn6cQ6VeM        2
72mPcb_nLq0        2
A6DVFhvs0AM        2
UHNEtSJIep0        2
XMoo2SjBRug  

檢查有評論的影片數量

In [123]:
# Number of distinct video IDs that have at least one collected comment.
trailer_comments['videoid'].nunique()

130

##### 檢查QAQ error原因與需要重抓的nextPageToken

In [74]:
# Locate the row of the video whose download failed.
df_trailer.loc[df_trailer['youtube_id'].eq('3On0BXzGnuI')]

Unnamed: 0,title_id,youtube_id,release_time,release_year
134,508791,3On0BXzGnuI,6-Feb-19,2019


In [99]:
# Rows at positions 134 and 135: the failed video and the one after it.
df_trailer.iloc[134:136]

Unnamed: 0,title_id,youtube_id,release_time,release_year
134,508791,3On0BXzGnuI,6-Feb-19,2019
135,611291,AwrClxIhsYc,3-Jul-19,2019


In [96]:
# Log records whose message contains the literal text 'HttpError'.
execution_log[execution_log['message'].str.contains('HttpError', regex=False)]

Unnamed: 0,videoid,page,nextPageToken,message
77,Ap0NRJD-2ts,1,,<HttpError 403 when requesting https://www.goo...
78,#NAME?,1,,<HttpError 404 when requesting https://www.goo...
139,3On0BXzGnuI,41,QURTSl9pM3NkbHI5WXF2UnZPTWlvUGJJTXVuZG1zak4wb3...,<HttpError 400 when requesting https://www.goo...


In [95]:
# nextPageToken stored in the log record at position 138 (the record just
# before the HTTP 400 failure), needed to resume from that page.
execution_log['nextPageToken'].iloc[138]

'QURTSl9pM3NkbHI5WXF2UnZPTWlvUGJJTXVuZG1zak4wb3I0TkJreHE3SmZkMnI2UjI3VWFPZnVMQUJpM3VCMUVEV1U0RWlRZnI5UXF0UVp0alFQY0QzaDVlVm81cjJZNDdFQ1RYd3dVVHBKZG8zakxZRmphWTJJM2lzWWZlWlo1Ym9SMkdkMWJoTm1JcEduLVE='

In [80]:
# Show the API key currently in use.
current_key

'AIzaSyDJSwg-6x_Ytukfh4tK90BCS9fCHwk8404'

In [17]:
# Manually switch to the next key in api_keys.
change_key()

change API Key to:AIzaSyD3v6ml3YZpIPxz5EdLSyZMHR068YGdQMM


### 將抓取的資料與執行log檔匯出成csv：檔名後自動帶入目前執行日期時間

In [124]:
# Export the collected comments and the execution log as CSV files whose names
# carry the export date and time. The timestamp is taken once so that both
# files of one export share the same suffix, even across a minute boundary.
export_stamp = datetime.datetime.now().strftime('%Y%m%d_%H%M')
trailer_comments.to_csv(f'trailerComments_{export_stamp}.csv', encoding="utf-8", index=False)
execution_log.to_csv(f'executionLog_{export_stamp}.csv', encoding="utf-8", index=False)

In [21]:
# Display the execution log.
execution_log

Unnamed: 0,videoid,page,nextPageToken,message
0,AD_fezWiaZ0,1,,No available comments
1,LebmJcJX3Ck,1,,No available comments
2,qMObsIYkqxU,1,,No available comments
3,uLUqN8YG-s4,1,,No available comments
4,nGH2mTu7AhU,1,,No available comments
5,7TavVZMewpY,1,QURTSl9pMWVqTjBQZjU3dm5EcVpuaWFIYzhyQkhPdFNPZl...,success append!
6,7TavVZMewpY,2,QURTSl9pMDJkZExWRi13Sy14WDVFQkVZT0RiR0tyRkU3Zl...,success append!
7,7TavVZMewpY,3,QURTSl9pMWt4Z1dEOThJRTRSMkNjYVdmOE1OYVY1SFo3bW...,success append!
8,7TavVZMewpY,4,QURTSl9pMTNCQWpPNlVsMUlOY1E5S3VKQVBYRmZncnd0Y1...,success append!
9,7TavVZMewpY,5,QURTSl9pMS1jU3RfN1dMb3BrN05MRl9JZmNHbGw0MXdDb0...,success append!


In [22]:
# Display the collected comments.
trailer_comments

Unnamed: 0,videoid,publishedAt,authorDisplayName,textOriginal,likeCount,commentsReplyCount
0,7TavVZMewpY,2019-10-03T00:26:05.000Z,Daniel Miles,When I saw the trailer of the remake Lion King...,0,0
1,7TavVZMewpY,2019-10-02T19:40:14.000Z,fede manu,LIFE KING,0,1
2,7TavVZMewpY,2019-10-02T18:34:22.000Z,Александър Божков,Scar is my favourite hero,1,0
3,7TavVZMewpY,2019-10-02T13:56:33.000Z,Mirthe Bananas,Can anyone please say if the animals talk in t...,0,1
4,7TavVZMewpY,2019-10-01T23:43:37.000Z,melissa padovan,/,0,0
5,7TavVZMewpY,2019-10-01T22:40:44.000Z,Loick Marie Louise,the lion king is real the magic is real with t...,0,1
6,7TavVZMewpY,2019-10-01T15:19:33.000Z,Prabhjot singh,👌👌,0,0
7,7TavVZMewpY,2019-09-30T18:59:07.000Z,Srividhya Jayaraman,Life \nOther\nFlamingo Disney \nGazelle Run\nR...,0,0
8,7TavVZMewpY,2019-09-29T16:09:40.000Z,Srividhya Jayaraman,Life’s \nOther\nFlamingo Disney\nRun Away Simb...,2,0
9,7TavVZMewpY,2019-09-29T11:21:10.000Z,Seventy Seven,Im still mad that Beyonce voices nasa because ...,0,0
