In [271]:
################### YouTube API Data Extraction with Python ###################
############################# Author: Tyler Blair #############################

# This script will pull data from YouTube using their APIs. To do this,
# you will have to set up API credentials with Google, which can be easily
# done at console.developers.google.com

# Additionally, this script pulls the YouTube API key from your system's
# environmental variables. If you are unfamiliar with how to do this, I
# have included the steps in the README.md file on my GitHub (tblair7)

import os

import requests
import json
import datetime
from datetime import datetime, time, timedelta, tzinfo
import pandas as pd
import numpy as np
import re


import google.oauth2.credentials

import google_auth_oauthlib.flow
from googleapiclient.discovery import build
from googleapiclient.errors import HttpError
from google_auth_oauthlib.flow import InstalledAppFlow

##### you'll need to have already set your API key as an environmental ######
##### variable before this point. If you haven't/don't want to do so   ######
##### you can simply set it explicitly here:
# api_key = ['your key']
api_key = os.environ.get('YT_API_KEY')

################### only parameters you should need to set ###################
# documentation of parameters you are able to use for playlistItems
# https://developers.google.com/youtube/v3/docs/playlistItems#properties

playlistId = 'LLC6-c2lD7fSObuYk478qtRQ' # ID of the whatever api type you're utilizing
playlistIdentifier = '50test' # identifier for saving purposes
maxResults = 0 # 0-250, though I've set 0 to mean no maximum so I can use it for my playlist

api_params_playlist = 'snippet, contentDetails' # e.g., 'id, contentDetails, statistics' as a string
api_params_videos = "id, contentDetails, statistics, snippet" # parameters I wish to retrieve from my playlist in the end

# these are the column headers that are selected from the playlistItems df
params_playlist = 'videoId', 'videoPublishedAt', 'publishedAt', 'title'
params_playlist_rename = 'id', 'dateUploaded', 'dateFound', 'title'

# these are the column headers that are selected from the videos dataframe
params_videos = 'id','channelId','viewCount','likeCount','dislikeCount', 'duration'
params_videos_rename = 'id', 'channelID','views', 'likes', 'dislikes', 'duration_secs'


################################## Constants #################################


# url_playlist = "https://www.googleapis.com/youtube/v3/playlistItems"
# url_videos = "https://www.googleapis.com/youtube/v3/videos"

times = ['H','M','S']
s_conv = [3600,60,1]
data_playlist = pd.DataFrame([])

################################## Functions ##################################

def gen_params_playlist(playlistId, api_key, api_params_playlist):
    playlist_parameters = {"playlistId": playlistId,
                           "key": api_key,
                           "part": api_params_playlist}
                           #"pageToken": 'CAQAA'}
    return playlist_parameters

def gen_params_playlist_token(playlistId, api_key, api_params_playlist,token):
    playlist_parameters = {"playlistId": playlistId,
                           "key": api_key,
                           "part": api_params_playlist,
                           "pageToken": token}
    return playlist_parameters
            
def gen_params_videos(ID, api_key, api_params_videos):
    videos_parameters = {"id": ID,
                         "key": api_key,
                         "part": api_params_videos}
    return videos_parameters
        
#     if token == False:
#         None
#     else:
#         parameters["nextpagetoken"] = token
    return parameters

#     if api_type == 'playlistItems':
#         url = 'https://www.googleapis.com/youtube/v3/playlistItems'
#     elif api_type == 'videos':
#         url = 'https://www.googleapis.com/youtube/v3/videos'

def pull_playlist_data(playlist_parameters):
    url = "https://www.googleapis.com/youtube/v3/playlistItems"
    page = requests.get(url = url,
                        params = playlist_parameters)
    playlist_results = json.loads(page.text)
    df = pd.io.json.json_normalize(playlist_results['items'])
    df.columns = df.columns.map(lambda x: x.split('.')[-1])
    df = df.loc[:, df.columns.isin(list(params_playlist))]
    df_playlist = df.T.drop_duplicates(keep='first').T
    return df_playlist, playlist_results # playlist_results, page.text)

# def pull_more_playlist_data(playlist_parameters):
#     url = "https://www.googleapis.com/youtube/v3/playlistItems"
#     page = requests.get(url = url,
#                         params = playlist_parameters)
#     playlist_results = json.loads(page.text)
#     df = pd.io.json.json_normalize(playlist_results['items'])
#     df.columns = df.columns.map(lambda x: x.split('.')[-1])
#     df = df.loc[:, df.columns.isin(list(params_playlist))]
#     df_playlist = df.T.drop_duplicates(keep='first').T
#     return df_playlist, playlist_results

def pull_videos_data(videos_parameters):
    url = "https://www.googleapis.com/youtube/v3/videos"
    page = requests.get(url = url,
                        params = videos_parameters)
    videos_results = json.loads(page.text)
    df_videos = pd.io.json.json_normalize(videos_results['items'])
    df_videos.columns = df_videos.columns.map(lambda x: x.split('.')[-1])
    return df_videos # j_results)


def song_length(duration,times,s_conv):
    song_time = 0
    
    for i in range(len(times)):
        my_regex = r'(\d.?' + times[i] + ')'
        pattern = re.search(my_regex, duration)
        if pattern:
            span_start = pattern.span(1)[0]
            span_end = pattern.span(1)[1]
            value = duration[span_start:span_end-1]
            #print(c[span_start:span_end])
            song_time = song_time + int(value)*s_conv[i]        
        else:
            None       
    return song_time

def date_parse(date):
    date = date[0:10] + '-' + date[11:19]
    date_conv = datetime.strptime(date, '%Y-%m-%d-%H:%M:%S')# T %H%M%S.%f Z')
    return(date_conv)


def time_diff_days(time1, time2):
    timeDelta = (time1 - time2)
    a = re.search(r'\d*? ', str(timeDelta))
    days_span_start = a.span(0)[0]
    days_span_end = a.span(0)[1]
    days = int(str(timeDelta)[days_span_start:days_span_end])
    return(days)

############################### Playlist API Calls ###############################

# if j_results:
#     j_results['nextPageToken'] = False
    




# pulls the data from YT and puts it in a usable format
# page = requests.get(url = url,
#                     params = parameters) # pulls the data
# j_results = json.loads(page.text) # make somewhat readable
# df = pd.io.json.json_normalize(j_results['items']) # formatted table, lots of redundant info
# df.columns = df.columns.map(lambda x: x.split('.')[-1])
api_type = 'playlistItems'

playlist_parameters = gen_params_playlist(playlistId, api_key, api_params_playlist)

if maxResults == 0:
    print('No maximum number of results returned')
else:
    playlist_parameters.update(dict(maxResults = maxResults))

[df_playlist, playlist_results] = pull_playlist_data(playlist_parameters)
data_playlist = data_playlist.append(df_playlist)

while playlist_results['nextPageToken']:
    playlist_parameters = gen_params_playlist_token(playlistId, api_key, api_params_playlist,playlist_results['nextPageToken'])
    [df_playlist, playlist_results] = pull_playlist_data(playlist_parameters)
    data_playlist = data_playlist.append(df_playlist)



# # truncates the data based on the params_playlist input from the beginning
# data_playlist = df.loc[:, df.columns.isin(list(params_playlist))]
# data_playlist = data_playlist.T.drop_duplicates(keep='first').T # drop_duplicates works on rows, so transpose, select row, transpose back
data_playlist.columns = list(params_playlist_rename) # assigns column names

#length = np.arange(data_playlist.shape[0])
length = len(data_playlist)

for i in range(length):
    data_playlist.dateUploaded.iloc[i] = date_parse(data_playlist.dateUploaded.iloc[i])
    data_playlist.dateFound.iloc[i] = date_parse(data_playlist.dateFound.iloc[i])
    data_playlist.discoveryTime = (data_playlist.dateFound.iloc[i] - data_playlist.dateUploaded.iloc[i])

#################  Videos API requests and data manipulation #################


data_videos_full = pd.DataFrame([])

for i in range(length):
    parameters_vids = gen_params_videos(data_playlist.id[i], api_key, api_params_videos)
    df = pull_videos_data(parameters_vids)
    data_videos = df.loc[:, df.columns.isin(list(params_videos))]
    data_videos = data_videos.T.drop_duplicates(keep='first').T
    #data_videos.duration = song_length(data_videos.duration,times,s_conv)
    data_videos_full = data_videos_full.append(data_videos)
    data_videos_full.duration.iloc[i] = song_length(data_videos_full.duration.iloc[i],times,s_conv)

data_videos_full = data_videos_full[['id','channelId','viewCount','likeCount','dislikeCount','duration']]
data_videos_full.columns = list(params_videos_rename)

full_data = pd.merge(data_playlist,data_videos_full, on='id', how='outer')
full_data = full_data[['id','title','channelID','views','likes','dislikes','duration_secs','dateUploaded','dateFound']]
full_data = full_data.reindex(columns = np.append(full_data.columns.values,'discoveryTime_days'))

days = np.zeros(length,dtype=int)

for i in range(length):
    days[i] = time_diff_days(data_playlist.dateFound.iloc[i], data_playlist.dateUploaded.iloc[i])

    
full_data.discoveryTime_days = days

print(full_data)

# saves the data_playlist structure as a .csv with a name dictated by the playlistIdentifier variable and the time
# time = datetime.now().strftime('_%Y_%m_%d')
# name = playlistIdentifier + time
# f = open('%s.csv' % name, 'w')
# full_data.to_csv(f.name)




No maximum number of results returned


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=True'.


  sort=sort)


KeyError: 'nextPageToken'

In [272]:
print(data_playlist)

                 publishedAt  \
0   2018-07-13T16:00:00.000Z   
1   2018-07-09T00:00:00.000Z   
2   2018-07-08T08:00:00.000Z   
3   2018-07-08T00:00:00.000Z   
4   2018-07-07T16:00:00.000Z   
0   2018-07-07T16:00:00.000Z   
1   2018-07-06T08:00:00.000Z   
2   2018-07-05T16:00:00.000Z   
3   2018-07-04T16:00:00.000Z   
4   2018-07-03T00:00:00.000Z   
0   2018-07-02T00:00:00.000Z   
1   2018-07-02T00:00:00.000Z   
2   2018-07-02T00:00:00.000Z   
3   2018-07-01T16:00:00.000Z   
4   2018-07-01T16:00:00.000Z   
0   2018-07-01T16:00:00.000Z   
1   2018-07-01T16:00:00.000Z   
2   2018-07-01T00:00:00.000Z   
3   2018-07-01T00:00:00.000Z   
4   2018-07-01T00:00:00.000Z   
0   2018-07-01T00:00:00.000Z   
1   2018-06-30T00:00:00.000Z   
2   2018-06-30T00:00:00.000Z   
3   2018-06-30T00:00:00.000Z   
4   2018-06-29T16:00:00.000Z   
0   2018-06-29T08:00:00.000Z   
1   2018-06-29T08:00:00.000Z   
2   2018-06-27T00:00:00.000Z   
3   2018-06-27T00:00:00.000Z   
4   2018-06-27T00:00:00.000Z   
..      

In [270]:
playlist_results

{'kind': 'youtube#playlistItemListResponse',
 'etag': '"XI7nbFXulYBIpL0ayR_gDh3eu1k/H3Hec_JhN5xUbly3GSW_t2_ARA8"',
 'nextPageToken': 'CAUQAA',
 'pageInfo': {'totalResults': 5000, 'resultsPerPage': 5},
 'items': [{'kind': 'youtube#playlistItem',
   'etag': '"XI7nbFXulYBIpL0ayR_gDh3eu1k/vUteCaiIwetxnHcaylhyQIWbB-U"',
   'id': 'TExDNi1jMmxEN2ZTT2J1WWs0NzhxdFJRLnc2RHVrcllLNXdR',
   'snippet': {'publishedAt': '2018-07-13T16:00:00.000Z',
    'channelId': 'UCC6-c2lD7fSObuYk478qtRQ',
    'title': 'ALMA - Chasing Highs',
    'description': 'New single available now https://lnk.to/chasinghighs\n \nDirected by Thomas Trail\n \nKeep up with ALMA:\nhttps://instagram.com/cyberalma/\nsnapchat : @cyberalma\nhttps://facebook.com/CYBERALMA\nhttps://twitter.com/almacyber\nwww.youtube.com/user/cyberalmaVEVO\n \nMusic video by ALMA performing Chasing Highs. (C) 2017 PME Records, under exclusive license to Polydor/Island, a division of Universal Music GmbH\nWatch Vevo dscvr: Artists to Watch 2018 - https://

The block below this does ....

In [267]:
A = pd.DataFrame([1, 2, 3, 4])
B = pd.DataFrame([2, 2, 2, 2])
A = A.append(B)
print(A)

   0
0  1
1  2
2  3
3  4
0  2
1  2
2  2
3  2


In [259]:
type(A)

numpy.ndarray

In [212]:
print(a)

helloplaylistItems
