## Working with the YouTube API to collect data.


### Setup.


Import libraries.


In [8]:
import requests 
import pandas as pd 
import time 
import os 
import re
import json

Prepare the API key and the playlist's ID.


In [2]:
# The API key is private: it is tied to a Google account and is read from an environment
# variable on the local machine, so it never appears in the notebook itself.
# To run this cell, create your own key at "https://console.cloud.google.com/" and expose it
# as LEARNING_DS_YOUTUBE_API; if the variable is not set, this line raises KeyError.
API_KEY = os.environ["LEARNING_DS_YOUTUBE_API"]
# ID of the YouTube playlist whose videos are collected below.
PLAYLIST_ID = "PLiaWrX4zmrTmAVd0zQuLlXsTNcC62j5rN"

### Make API calls.


A function that takes a YouTube video's ID as a parameter and returns information about that video.

In [3]:
def get_video_information(video_id):
    """Fetch the publish date, short title and engagement counts of one video.

    Queries the YouTube Data API v3 ``videos`` endpoint with
    ``part=snippet,statistics`` for the given video ID.

    Parameters
    ----------
    video_id : str
        ID of the video to look up.

    Returns
    -------
    tuple
        ``(published_at, title, view_count, like_count, comment_count)``.

        * ``published_at`` is the date part of ``publishedAt`` (text before ``"T"``).
        * ``title`` is the text of the title before the first ``:`` or ``|``.
        * The three counts are returned exactly as the API reports them.

        When the API returns no item for the ID (for example a video that is
        listed in a playlist but is not publicly available), all five values
        are ``None``. A single count is ``None`` when the API leaves it out of
        the ``statistics`` object (for example when likes are hidden or
        comments are disabled).

    Raises
    ------
    requests.HTTPError
        If the API responds with an error status (invalid key, exceeded quota, ...).
    """
    # Let requests build and escape the query string. ``maxResults`` is not sent
    # because the endpoint does not support it together with ``id``.
    response = requests.get(
        "https://www.googleapis.com/youtube/v3/videos",
        params={
            "id": video_id,
            "key": API_KEY,
            "part": "snippet,statistics",
        },
    )
    # Fail loudly on API errors instead of hitting a KeyError further down.
    response.raise_for_status()
    video_detail = response.json()

    # No item means the video could not be found: report every field as missing.
    items = video_detail.get("items") or []
    if not items:
        return None, None, None, None, None

    snippet = items[0]["snippet"]
    statistics = items[0].get("statistics", {})

    # Keep only the date of the ISO timestamp and the leading part of the title.
    published_at = snippet["publishedAt"].split("T")[0]
    title = re.split("[:|]", snippet["title"])[0]

    # Individual counters may be absent, so read them with .get().
    view_count = statistics.get("viewCount")
    like_count = statistics.get("likeCount")
    comment_count = statistics.get("commentCount")

    return published_at, title, view_count, like_count, comment_count

A function for getting the list of videos in a playlist. It takes a list as a parameter and appends the videos to that list.


In [4]:
def get_videos(list):
    """Collect information about every video of the playlist ``PLAYLIST_ID``.

    Walks through all result pages of the YouTube Data API v3 ``playlistItems``
    endpoint and, for each playlist entry, calls ``get_video_information`` to
    fetch the video's details.

    Parameters
    ----------
    list : list
        List that receives one dict per video, appended in playlist order.
        The name shadows the ``list`` builtin; it is kept so existing callers
        keep working.

    Returns
    -------
    list
        The same list object that was passed in, now holding one dict per video
        with the keys ``Publish_time``, ``Video_title``, ``View``, ``Like``,
        ``Comment`` and ``Video_ID``.

    Raises
    ------
    requests.HTTPError
        If the API responds with an error status (invalid key, exceeded quota, ...).
    """
    collected = list

    endpoint = "https://www.googleapis.com/youtube/v3/playlistItems"
    query = {
        "playlistId": PLAYLIST_ID,
        "key": API_KEY,
        "part": "snippet",
        # 50 is the largest page size the endpoint documents.
        "maxResults": 50,
    }

    # The first request carries no pageToken, which yields the first page.
    while True:
        # requests.get returns only once the whole response has been received,
        # so no artificial delay is needed before reading it.
        response = requests.get(endpoint, params=query)
        # Fail loudly on API errors instead of hitting a KeyError on "items".
        response.raise_for_status()
        page = response.json()

        for entry in page.get("items", []):
            video_id = entry["snippet"]["resourceId"]["videoId"]

            # The helper already returns the publish date and title in their
            # shortened, easier to read form.
            published_at, title, view_count, like_count, comment_count = get_video_information(video_id)
            collected.append(
                {
                    "Publish_time": published_at,
                    "Video_title": title,
                    "View": view_count,
                    "Like": like_count,
                    "Comment": comment_count,
                    "Video_ID": video_id,
                }
            )

        # A missing (or empty) nextPageToken means the last page was reached.
        next_page_token = page.get("nextPageToken")
        if not next_page_token:
            break
        query["pageToken"] = next_page_token

    # Return the list that now contains the videos' information.
    return collected

Call the function to retrieve the videos' information.

In [5]:
# Start from an empty list; get_videos fills it and hands the same list back.
videos = get_videos([])

### Convert the data into human-readable form.

#### Save data in a pandas dataframe.

In [6]:
# Build the table from the collected records and replace every missing value
# with the label "Private video".
video_df = pd.DataFrame(videos).fillna("Private video")

# Lift the row limit so that displaying the frame shows all of its rows.
pd.set_option("display.max_rows", None)

In [7]:
# Show the table as the cell output (the row limit was lifted above, so no rows are elided).
video_df

Unnamed: 0,Publish_time,Video_title,View,Like,Comment,Video_ID
0,2022-04-06,BURNLEY - EVERTON,360543,3170,241,WkgdzuZWj-Y
1,2022-04-04,CRYSTAL PALACE - ARSENAL,1123638,10038,1038,k3XWU03Hb3U
2,2022-04-03,TOTTENHAM - NEWCASTLE,1623256,14590,643,Gl3NROFXXXI
3,2022-04-03,WEST HAM - EVERTON,426732,2883,161,PTOVe1sQwkY
4,2022-04-02,MANCHESTER UNITED - LEICESTER CITY,2293241,21328,1868,2bw9JQGOXf4
5,2022-04-02,BRIGHTON - NORWICH,165588,900,32,6vlF4i2O5uU
6,2022-04-02,WOLVERHAMPTON - ASTON VILLA,514217,3117,198,H8z-cH8UMjI
7,2022-04-02,BURNLEY - MAN CITY,515370,6128,339,dF4WxS8foR0
8,2022-04-02,LIVERPOOL - WATFORD,799287,6855,321,m40Y6ANFFro
9,2022-03-20,TOTTENHAM - WEST HAM,1551906,14467,645,zIdE46dzfE4


Save data as JSON.

In [10]:
def save_data_json(title, data):
    """Write ``data`` as JSON to the file at path ``title``.

    The file is created or overwritten, encoded as UTF-8. Non-ASCII characters
    are written as they are (not escaped) and the output is indented by two spaces.
    """
    with open(title, mode="w", encoding="utf-8") as handle:
        json.dump(data, handle, indent=2, ensure_ascii=False)

def load_data_json(title):
    """Read the UTF-8 encoded JSON file at path ``title`` and return the parsed content."""
    with open(title, mode="r", encoding="utf-8") as handle:
        parsed = json.load(handle)
    return parsed

In [11]:
# Save the raw list of video records (not the DataFrame) to a JSON file in the working directory.
save_data_json("epl_21_22_highlights.json", videos)

Save data as CSV.

In [12]:
# Save the table to a CSV file in the working directory.
# NOTE(review): index=False is not passed, so the row index is written as an unnamed first
# column; reading the file back will show it as "Unnamed: 0" unless index_col=0 is used.
video_df.to_csv("epl_21_22_highlights.csv")

In [None]:
# Scratch test values.
# Each row is separated by a comma; without the comma after the second row the third
# bracket is parsed as a subscript of the second list and the cell fails with TypeError.
test_array = [
    [1, 2, 3],
    [3, 2, 12],
    [3, 21, 31],
]

test_string = (
    "asdfasdfa sdfasdfasdf asdfasdfa sdfa asdfuaopiuenaklsjd"
    + "asdffffffffffffffffffffffffffffffffffffffffff"
)