In [34]:
import pandas as pd
from datetime import *
from csv import writer
from apiclient.discovery import build
from apiclient.errors import HttpError

In [61]:
# Name the output CSV after the moment this cell runs (year-month-day-hour-minute-second).
op_filename = f"{datetime.now():%Y-%m-%d-%H-%M-%S}.csv"
op_filename

'2021-10-11-13-11-22.csv'

In [53]:
# Load the list of videos to look up; the preview shows youtubeId, movieId and title columns.
# The preview also shows an 'Unnamed: 0' column — presumably an index column that was
# saved into the CSV; TODO confirm.
df = pd.read_csv('vdoLinks.csv')
df.head()

Unnamed: 0,youtubeId,movieId,title
0,K26_sDKnvMU,1,Toy Story (1995)
1,3LPANjHlPxo,2,Jumanji (1995)
2,rEnOoWs3FuA,3,Grumpier Old Men (1995)
3,j9xml1CxgXI,4,Waiting to Exhale (1995)
4,ltwvKLnj1B4,5,Father of the Bride Part II (1995)


In [54]:
def get_start_index() -> int:
    """Return the row index at which processing should resume.

    The index is read from the first line of ``record_processed.txt``, the
    checkpoint file that ``update_start_index`` writes.

    Returns:
        The stored index, or 0 when the checkpoint file does not exist yet or
        its first line is blank (i.e. nothing has been processed so far).

    Raises:
        ValueError: If the first line holds something other than an integer.
    """
    try:
        with open('record_processed.txt') as f:
            first_line = f.readline().strip()
    except FileNotFoundError:
        # Fresh run: no checkpoint has been written yet, so start from the top.
        return 0

    # A blank checkpoint is treated like a missing one instead of crashing int().
    return int(first_line) if first_line else 0

def update_start_index(index: int) -> None:
    """Persist ``index`` as the resume checkpoint.

    Overwrites ``record_processed.txt`` so that it contains only the text
    form of ``index``.
    """
    with open('record_processed.txt', 'w') as checkpoint_file:
        checkpoint_text = str(index)
        checkpoint_file.write(checkpoint_text)

def get_apikey() -> str:
    """Return the API key stored on the first line of the ``apikey`` file.

    Surrounding whitespace is stripped: ``readline`` keeps the trailing
    newline, which must not become part of the key handed to the API client.

    Raises:
        FileNotFoundError: If the ``apikey`` file does not exist.
    """
    with open('apikey') as f:
        return f.readline().strip()

In [55]:
def log_error(index, youtubeId, method, error):
    """Append one tab-separated record to ``error_log.txt``.

    The record holds, in order, ``index``, ``youtubeId``, ``method`` and
    ``error``, and is terminated by a newline.
    """
    with open('error_log.txt', 'a+') as log_file:
        record = f'{str(index)}\t{youtubeId}\t{method}\t{error}\n'
        log_file.write(record)

In [29]:

# Build the 'youtube' v3 API resource object, passing the key returned by
# get_apikey() as the developer key.
youtube = build('youtube','v3', developerKey=get_apikey())


In [62]:


def retrieve_video_data(index: int, youtubeId: str):
    """Fetch metadata and top-level comments for one video and append a CSV row.

    The row appended to ``op_filename`` always has eleven columns: index,
    youtubeId, title, description, duration, viewCount, likeCount,
    dislikeCount, favoriteCount, commentCount and the fetched top-level
    comments (at most 100 are requested) joined with ' . '. When the API
    returns no item for ``youtubeId``, every column after the id holds the
    placeholder 'Null'.

    Every failure is recorded through ``log_error``. A ``KeyError`` is logged
    and swallowed, so no row is written for that video.

    Args:
        index: Row label of the video; stored as the resume checkpoint and
            written as the first CSV column.
        youtubeId: Id of the video to look up.

    Raises:
        HttpError: Re-raised unless the response content mentions
            'commentsDisabled' or 'not properly authorized to access video file'.
        Exception: Any other failure (except ``KeyError``) is logged and re-raised.
    """
    # Checkpoint before doing any work so that a crash resumes at this row.
    update_start_index(index)

    try:
        video_list_response = youtube.videos().list(
            part='snippet,statistics,contentDetails',
            id=youtubeId
        ).execute()

        items = video_list_response['items']

        if len(items) > 0:
            video = items[0]
            snippet = video['snippet']
            statistics = video.get('statistics', {})

            row = [
                index,
                youtubeId,
                snippet['title'],
                snippet['description'],
                video['contentDetails']['duration'],
            ]

            # Individual counters may be absent from the response (for example
            # dislike counts are no longer public). Use the same 'Null'
            # placeholder as the not-found row instead of losing the whole
            # video to a KeyError.
            row.extend(
                statistics.get(field, 'Null')
                for field in ('viewCount', 'likeCount', 'dislikeCount',
                              'favoriteCount', 'commentCount')
            )

            comment_list_response = youtube.commentThreads().list(
                part='snippet',
                videoId=youtubeId,
                maxResults=100
            ).execute()

            comment_list = []
            for comment in comment_list_response['items']:
                comment_text: str = comment['snippet']['topLevelComment']['snippet']['textOriginal']
                # Flatten each comment onto one line and drop commas.
                comment_text = comment_text.replace('\n', ' . ')
                comment_text = comment_text.replace('\r', ' ')
                comment_text = comment_text.replace(',', ' ')
                comment_list.append(comment_text)

            row.append(' . '.join(comment_list))

        else:
            # Nine placeholders keep this row the same width (11 columns) as a data row.
            row = [index, youtubeId] + ['Null'] * 9

        with open(op_filename, 'a+', newline='', encoding="utf-8") as f:
            writer(f).writerow(row)

    except HttpError as e:

        log_error(index, youtubeId, 'retrieve_video_data[1][HttpError]', str(e.content))
        # repr() converts bytes to string
        if 'commentsDisabled' in repr(e.content):
            print(f'[{index}] {youtubeId}: commentsDisabled')
        elif 'not properly authorized to access video file' in repr(e.content):
            print('No Access')
        else:
            # Unknown API failure: surface it and stop the run.
            print(e.content)
            raise

    except KeyError as k_err:
        # A required field was missing from the response; log it and move on.
        log_error(index, youtubeId, 'retrieve_video_data[KeyError]', str(k_err))

    except Exception as ex:
        print(ex)
        log_error(index, youtubeId, 'retrieve_video_data[Exception]', str(ex))
        raise
        


In [63]:
# Keep only the rows from the stored checkpoint position onward (all columns).
unprocessed_df = df.iloc[get_start_index():]

In [64]:
# Walk the remaining videos in order, echoing each row label and id before fetching it.
for row_label, video_id in unprocessed_df['youtubeId'].items():
    print(row_label, video_id)
    retrieve_video_data(row_label, video_id)

0 K26_sDKnvMU
1 3LPANjHlPxo
2 rEnOoWs3FuA
3 j9xml1CxgXI
4 ltwvKLnj1B4
5 2GfZl4kuVNI
6 twTksx_lWB4
7 -C-xXZyX2zU
8 SCOxEKkuWG4
9 lcOqUE0u1LM
10 UrC75wUKoFM
11 tVdn8JH91Dg
12 a6lGULmQdb0
13 dO2LWKpeyI8
14 JXxFESHwnX0
15 EJXDMwGWhoA
16 Ns17RQr1yK8
17 Rieq_TR7cV0
18 DfqPjRMsRP0
19 qPPUmzK5pPc
20 yNLaTtpovys
21 lsmXhM4yfU0
22 OOTTvrb4JZI
23 gHl-UHu2-lM
24 UMlYWZgCIgo
25 RAYuASqrs94
26 RQLVzTtt2Ws
27 lYSHAyODiGs
28 toH1vzAmDBI
29 mQf3Ngg2cks
30 gA-5nLQCmW8
31 15s4Y9ffW_o
32 3_wez2B83ic
33 tVxeoUtVF0o
34 ccq3f9-lQyM
35 pg-GMqPHIPQ
36 qpQ-CnutDy0
37 6PKq7YSpO6c
38 XuHEa7pWAQY
39 zEY1dPRUrVU
40 OXc0-EME0C8
41 eETnzbLwTs8
42 CvsAuHYBgEQ
43 JHIfHL5UgFs
44 Y1uPXA1ceNo
45 K5_2VgJUSBA
46 SpKbZ_3zlb0
47 DSf_pVG8QZM
48 J5mszOh_Qis
49 oiXdPolca5w
50 2BBKx2Hji_0
51 -qz_zB2Pc2g
52 3Fd6ZlgOFqs
53 K4MltltKq4k
54 jG9pUsmjAO8
55 moJMVGICPc4
56 ThH8WocPRM0
57 0or7hSz-7gc
58 mRX2-vM4eYQ
59 yoIBXVm_B9Y
60 jY56N6irNCk
61 JYoWGIrXiz0
62 JAAhQwcJ20U
63 Psql1jhaIiA
64 9J6qOSUJXy0
65 MubmJ2jwvOM
66 z7oSfRVevzU
67 Qd

In [24]:
# NOTE(review): looks like a manual check of log_error — running it appends a dummy
# record (1, asdf, cmt, unknown) to error_log.txt. Consider removing before sharing.
log_error(1, 'asdf', 'cmt', 'unknown')