In [1]:
import os
import pandas as pd
pd.options.display.max_columns = None
import google_auth_oauthlib.flow
import googleapiclient.discovery
import googleapiclient.errors
from utils import searchVideosByListOfIds, fillCategoryIds
from datetime import datetime, timedelta
import sys, traceback

api_key = os.environ.get('YT_API')
# api_key = "XYZ"

# df = pd.read_csv("../../our_data/Etap4/GB_US-rdy-to-learn-properly-V2.csv")
# df = pd.read_csv("../../our_data/Etap3/dfGB_merged.csv")
# df = pd.read_csv("../../our_data/Etap1/GB_US.csv")

todownload = {0:"US",1:"GB"}
download = todownload[1] #HERE CHOSE WHICH ONE TO DOWNLOAD
NUMBER_OF_VIDEOS = 10

start_datetime = "2017-11-14T23:59:59Z"
end_datetime = "2018-06-14T23:59:59Z"


COLUMNS =['video_id', 'trending_date', 'title', 'channel_title', 'category_id',
       'publish_time', 'tags', 'views', 'likes', 'dislikes', 'comment_count',
       'thumbnail_link', 'comments_disabled', 'ratings_disabled',
       'video_error_or_removed', 'description ']

## CREATING LIST OF CHANNEL_TITLES NAMES (RUN IT IF YOU NEED A NEW ONE)

In [2]:
# GBdf = pd.read_csv("./../../data/youtube_data/GB_videos_5p.csv", sep=';', encoding='utf_16')
# USdf = pd.read_csv("./../../data/youtube_data/US_videos_5p.csv", sep=';')
# GB_names = pd.DataFrame(set(GBdf.channel_title))
# US_names = pd.DataFrame(set(USdf.channel_title))
# GB_names.to_csv("../../our_data/Etap5/GB_to_download.csv", index=False)
# US_names.to_csv("../../our_data/Etap5/US_to_download.csv", index=False)

### Functions communicating YT API

In [71]:
def searchChannelIdByName(channel_name):
    # Disable OAuthlib's HTTPS verification when running locally.
    # *DO NOT* leave this option enabled in production.
    os.environ["OAUTHLIB_INSECURE_TRANSPORT"] = "1"

    youtube = googleapiclient.discovery.build(
        "youtube", "v3", developerKey=api_key)

#     request = youtube.search().list(part="snippet", type="channel", q=channel_name)
    
    request = youtube.channels().list(
        part="id",
        forUsername=channel_name
    )
    
    response = request.execute()
#     print(response)
    if response['pageInfo']['totalResults']==0:
        print("NO channel ID for: ", channel_name)
        return -1
    return response['items'][0]['id']


def getVideosOfChannelBetweenDates(channel_name, start_datetime, end_datetime):

    # Disable OAuthlib's HTTPS verification when running locally.

    youtube = googleapiclient.discovery.build(
        "youtube", "v3", developerKey=api_key)
    cID = searchChannelIdByName(channel_name)
    if cID == -1:
        return -1
    request = youtube.search().list(
        part=["snippet"],
        channelId =cID,
        
#         type='video',
        publishedBefore=end_datetime,
        publishedAfter=start_datetime,
#         pageToken="CDIQAA",
        maxResults=NUMBER_OF_VIDEOS

    )
    
    return request.execute()

def getStatisticsForVideoID(videoID):
    youtube = googleapiclient.discovery.build(
        "youtube", "v3", developerKey=api_key)
    
    request = youtube.videos().list(
        part = 'statistics',
        maxResults = 1,
        id = videoID
    )
    return request.execute()

def getNewVideosIDsForChannel(channel_name, start_datetime, end_datetime):
    response_videos = getVideosOfChannelBetweenDates(channel_name, start_datetime, end_datetime)
    newDf = pd.DataFrame(columns=COLUMNS)
    if response_videos == -1:
        return newDf
    if response_videos['items'] == []:
        print("NO MOVIES")
        return newDf
    
    for vid in response_videos['items']:
        vid_id = vid['id']['videoId']
        data = {
            'video_id' : vid_id,
            'title' : vid['snippet']['title'],
            'channel_title' : channel_name,
            'publish_time' : vid['snippet']['publishedAt'],
            'description ' : vid['snippet']['description']
        } 
        newDf = newDf.append(pd.Series(data=data), ignore_index=True)
    return newDf

## Pobieranie filmów z tych samych kanałów

In [72]:
GB_names_to_download =  pd.read_csv("../../our_data/Etap5/GB_to_download_same_channel.csv")
US_names_to_download =  pd.read_csv("../../our_data/Etap5/US_to_download-same_channel.csv")

names_to_delete = []
names = pd.DataFrame()

if download == 'GB':
    names = GB_names_to_download
elif download == "US":
    names = US_names_to_download


newDataDf = pd.DataFrame(columns=COLUMNS)
    
for i,row in names.iterrows():
    name = row[0]
    print(name)
    try:
        tempDF = getNewVideosIDsForChannel(name, start_datetime, end_datetime)
        newDataDf = pd.concat([newDataDf, tempDF], axis=0)
    except Exception as e:
        exc_type, exc_value, exc_traceback = sys.exc_info()
        if exc_type == googleapiclient.errors.HttpError:
            print("YT API LIMIT ACHIEVED")
            break
        print("ERROR: ", e)
    names_to_delete.append(name)
    if i == 10:
        print("FAST ESCAPE")
        break
        
# removing names
new_names = set(names['0'])
new_names = new_names.difference(set(names_to_delete))


if download == 'GB':
    GB_names = pd.DataFrame(new_names)
    GB_names.to_csv("../../our_data/Etap5/GB_to_download_same_channel.csv", index=False)
    
    if os.path.isfile("../../our_data/Etap5/GB_new_data_same_channel.csv"):
        oldDataDF = pd.read_csv("../../our_data/Etap5/GB_new_data_same_channel.csv")
        newDataDf = pd.concat([newDataDf,oldDataDF], axis=0)
    newDataDf.to_csv("../../our_data/Etap5/GB_new_data_same_channel.csv", index=False)
    
elif download == "US":
    US_names = pd.DataFrame(new_names)
    US_names.to_csv("../../our_data/Etap5/US_to_download_same_channel.csv", index=False)
    
    if os.path.isfile("../../our_data/Etap5/US_new_data_same_channel.csv"):
        oldDataDF = pd.read_csv("../../our_data/Etap5/US_new_data_same_channel.csv")
        newDataDf = pd.concat([newDataDf,oldDataDF], axis=0)
        
    newDataDf.to_csv("../../our_data/Etap5/US_new_data_same_channel.csv", index=False)
    

Trixie Mattel
NO channel ID for:  Trixie Mattel
MILO
NO MOVIES
AxwellIngrossoVEVO
ERROR:  'videoId'
Anna Akana
NO channel ID for:  Anna Akana
Vogue
HOLLAND
NO channel ID for:  HOLLAND
Bravo
NO MOVIES
Asian Boss
NO channel ID for:  Asian Boss
LastWeekTonight
PaleWavesVEVO
NO channel ID for:  PaleWavesVEVO
Voxis Productions
NO channel ID for:  Voxis Productions
FAST ESCAPE


### relatedToVideoId

In [2]:
# fiusxyygqGk
def searchRelatedVideosForVideoID(video_id, maxResults=50):
    youtube = googleapiclient.discovery.build(
        "youtube", "v3", developerKey=api_key)

    # range of time probably does not work
    start_datetime = "2017-11-14T23:59:59Z"
    end_datetime = "2018-06-14T23:59:59Z"
    # Bartka daty
    minDate="2006-07-23T08:24:11+00:00"
    maxDate="2018-06-14T01:31:53+00:00"

    request = youtube.search().list(
        part="snippet",
        relatedToVideoId=video_id,
        type='video',
        publishedBefore=maxDate,
        publishedAfter=minDate,
        maxResults=maxResults
    )

    response = request.execute()
    return response

# res = searchRelatedVideosForVideoID("fiusxyygqGk", maxResults=50)

In [3]:
import json
import os.path
from googleapiclient.errors import HttpError
# list_of_ids_path = "../../our_data/Etap5/related_list_of_ids.csv"
list_of_ids_path = "../../our_data/Etap5/lista_od_Bartka.json"

if os.path.isfile(list_of_ids_path):
    print("Lista istnieje")
    with open(list_of_ids_path) as file:
        list_of_ids = json.loads(file.read())
else:
    print("Lista NIE istnieje")
    df = pd.read_csv('../../our_data/Etap4/GB_US-rdy-to-learn-properly-V2.csv')
    list_of_ids = df.video_id.to_list()

new_videos = pd.DataFrame(columns=COLUMNS + ["relatedTo"])
for i, video_id in enumerate(list_of_ids):
    print(f"Iteracja {i} video_id: {video_id}", end='\n')
    try:
        response = searchRelatedVideosForVideoID(video_id, maxResults=50)
    except HttpError as err:
        print("ERROR: ", err)
        if err.resp.status in [404]:
            list_of_ids.remove(video_id)
            with open(list_of_ids_path, 'w') as file:
                file.write(json.dumps(list_of_ids))
            continue
        elif err.resp.status in [403]:
            print("YT API LIMIT")
            break
                                                 
                                             
    for vid in response['items']:
        if 'snippet' not in vid:
            continue
        data = {
            'video_id' : vid['id']['videoId'],
            'relatedTo': video_id,
            'title' : vid['snippet']['title'],
            'channel_title' : vid['snippet']['channelTitle'],
            'publish_time' : vid['snippet']['publishedAt'],
            'description ' : vid['snippet']['description']
        } 
        new_videos = new_videos.append(pd.Series(data=data), ignore_index=True)
                                             
    list_of_ids.remove(video_id)
    
#     if i == 2:
# #         print(f"A kończymy sobie tutaj po {i} iteracjach.")
#         break

# Zapisywanie pozostałych video_id.
with open(list_of_ids_path, 'w') as file:
    file.write(json.dumps(list_of_ids))
    
# save_path = "../../our_data/Etap5/relatedToVideoId.csv"
save_path = "../../our_data/Etap5/relatedToVideoIdBartka.csv"
if os.path.isfile(save_path):
    new_videos.to_csv(save_path, mode='a', header=False, index=False)
else:
    new_videos.to_csv(save_path, index=False)

Lista istnieje
Iteracja 0 video_id: uBYzYji4rmE
Iteracja 1 video_id: rDl3gAJpm_4
Iteracja 2 video_id: 8nWw8q3zEpw
Iteracja 3 video_id: JtIY1Naxg0M
Iteracja 4 video_id: LftOOM6qlJ4
Iteracja 5 video_id: 5w1GdX44DHc
Iteracja 6 video_id: hHaUQh6Gx-Q
Iteracja 7 video_id: -kxPDAiuijo
Iteracja 8 video_id: Vm-avsAjrIo
Iteracja 9 video_id: 36DAv_m8Mlw
Iteracja 10 video_id: _P3l2rwAaOs
Iteracja 11 video_id: cZ2KJtiNZ0U
ERROR:  <HttpError 404 when requesting https://youtube.googleapis.com/youtube/v3/search?part=snippet&relatedToVideoId=cZ2KJtiNZ0U&type=video&publishedBefore=2018-06-14T01%3A31%3A53%2B00%3A00&publishedAfter=2006-07-23T08%3A24%3A11%2B00%3A00&maxResults=50&key=AIzaSyCAFQvS-KUz0GXF96gVqCkYcecGsFYcKsU&alt=json returned "Requested entity was not found.". Details: "Requested entity was not found.">
Iteracja 12 video_id: Zz_JEN74evU
Iteracja 13 video_id: ExJmET8boVw
Iteracja 14 video_id: ek1ePFp-nBI
Iteracja 15 video_id: ZWqClKiz4_I
Iteracja 16 video_id: arY6lepNdzU
Iteracja 17 video_id: 

KeyboardInterrupt: 