# YouTube API to AWS Pipeline

## Libraries

In [None]:
import requests
import pandas as pd
import time

## Fetching YouTube video data

### Authentication

In [None]:
# Placeholders: replace both values with real ones before running the notebook.
API_KEY = "ENTER KEY"  # key sent with every YouTube Data API request below
CHANNEL_ID = "ENTER CHANNEL ID"  # channel whose uploads get_videos() searches for

### Fetching video metrics

In [None]:
def get_video_metrics(video_id):
    """Fetch the public statistics of a single YouTube video.

    Queries the YouTube Data API ``videos`` endpoint with ``part=statistics``
    for ``video_id`` and reads the counters of the first returned item.

    Args:
        video_id: ID of the video to look up.

    Returns:
        A ``(view_count, like_count, favorite_count, comment_count)`` tuple.
        Values are passed through exactly as they appear in the JSON
        response; a counter that the response omits is reported as ``"0"``.

    Raises:
        KeyError: If the response has no ``"items"`` key or the item has no
            ``"statistics"`` entry (for example an API error payload).
        IndexError: If ``"items"`` is empty, i.e. no video matched.
    """
    # Let requests build and URL-encode the query string instead of
    # concatenating it by hand.
    response_video_stats = requests.get(
        "https://www.googleapis.com/youtube/v3/videos",
        params={"id": video_id, "part": "statistics", "key": API_KEY},
    ).json()

    statistics = response_video_stats["items"][0]["statistics"]

    # Individual counters are not guaranteed to be present (presumably e.g.
    # when likes are hidden or comments are disabled - confirm against the
    # API docs). Default to "0" so one such video does not abort the whole
    # fetch with a KeyError; "0" keeps the same string form as the counters
    # that are present.
    view_count = statistics.get("viewCount", "0")
    like_count = statistics.get("likeCount", "0")
    favorite_count = statistics.get("favoriteCount", "0")
    comment_count = statistics.get("commentCount", "0")

    return view_count, like_count, favorite_count, comment_count

### Fetching initial videos

In [None]:
def get_videos(df):
    """Append every video of ``CHANNEL_ID`` to ``df`` together with its metrics.

    Walks through all result pages of the YouTube Data API ``search``
    endpoint (newest first), keeps only results of kind ``youtube#video``
    and looks up each video's statistics with ``get_video_metrics()``.

    Args:
        df: DataFrame to extend. It is not modified in place.

    Returns:
        A DataFrame holding the rows of ``df`` followed by one row per video
        with the keys ``video_id``, ``video_title``, ``upload_date``
        (``YYYY-MM-DD`` part of ``publishedAt``), ``view_count``,
        ``like_count``, ``favorite_count`` and ``comment_count``. The index
        is renumbered. If no video is found, ``df`` itself is returned.

    Raises:
        KeyError: If a response carries no ``"items"`` key (for example an
            API error payload).
    """
    search_url = "https://www.googleapis.com/youtube/v3/search"
    params = {
        "key": API_KEY,
        "channelId": CHANNEL_ID,
        "part": "snippet,id",
        "order": "date",
        # The search endpoint documents 50 as the largest accepted page size;
        # further results are reached through pageToken below.
        "maxResults": 50,
    }

    rows = []
    while True:
        response = requests.get(search_url, params=params).json()
        time.sleep(1)  # pause after each search request, as the original did

        for video in response["items"]:
            # Search results can also be channels or playlists; skip those.
            if video["id"]["kind"] != "youtube#video":
                continue

            video_id = video["id"]["videoId"]
            title = video["snippet"]["title"]
            # publishedAt looks like "2023-01-31T12:00:00Z"; keep the date part.
            upload_date = str(video["snippet"]["publishedAt"]).split("T")[0]

            view_count, like_count, favorite_count, comment_count = get_video_metrics(video_id)

            rows.append({
                "video_id": video_id,
                "video_title": title,
                "upload_date": upload_date,
                "view_count": view_count,
                "like_count": like_count,
                "favorite_count": favorite_count,
                "comment_count": comment_count,
            })

        # Follow the pagination cursor until the API stops handing one out.
        next_page_token = response.get("nextPageToken")
        if not next_page_token:
            break
        params["pageToken"] = next_page_token

    if not rows:
        return df

    # DataFrame.append was removed in pandas 2.0; collecting the rows first
    # and concatenating once also avoids copying the frame for every video.
    return pd.concat([df, pd.DataFrame(rows)], ignore_index=True)

### Build our dataframe

In [None]:
# Empty frame with one column per field that get_videos() records for a video.
df = pd.DataFrame(columns=["video_id","video_title","upload_date","view_count","like_count","favorite_count","comment_count"])

### Fetching video data

In [None]:
# Fill the frame from the YouTube API (performs network requests).
df = get_videos(df)
# Show the resulting column names as the cell output.
df.columns

## Pipeline to MySQL on AWS

In [None]:
import mysql.connector

In [None]:
# Display the fetched video data as the cell output before loading it into the database.
df

### Functions

#### connect_to_db()

In [None]:
def connect_to_db(host, user, password, database=None):
    """Open a MySQL connection with ``mysql.connector``.

    Args:
        host: Host name of the MySQL server.
        user: User name to log in with.
        password: Password of ``user``.
        database: Name of the database to use. When omitted, the
            module-level variable ``database`` is used, which is what this
            function always did before the parameter existed.

    Returns:
        The connection object returned by ``mysql.connector.connect``.

    Raises:
        NameError: If ``database`` is not passed and no module-level
            ``database`` variable is defined.
    """
    if database is None:
        # Backward compatibility: earlier callers passed only three arguments
        # and relied on a global named `database`. The parameter shadows that
        # global, so look it up explicitly.
        try:
            database = globals()["database"]
        except KeyError:
            raise NameError("name 'database' is not defined") from None

    return mysql.connector.connect(
        host=host,
        user=user,
        password=password,
        database=database,
    )

#### create_table()

In [None]:
def create_table(mycursor):
    """Create the ``videos`` table unless it already exists.

    The table is keyed by ``video_id`` and holds the title, the upload date
    and the four integer counters of a video.

    Args:
        mycursor: Database cursor on which the statement is executed.
    """
    videos_ddl = """
    CREATE TABLE IF NOT EXISTS videos (
    video_id VARCHAR(255) PRIMARY KEY,
    video_title VARCHAR(255) NOT NULL,
    upload_date DATE NOT NULL DEFAULT (CURRENT_DATE),
    view_count INTEGER NOT NULL,
    like_count INTEGER NOT NULL,
    favorite_count INTEGER NOT NULL,
    comment_count INTEGER NOT NULL)
    """
    mycursor.execute(videos_ddl)

#### check_if_video_exists()

In [None]:
def check_if_video_exists(mycursor, video_id):
    """Tell whether the ``videos`` table already has a row for ``video_id``.

    Args:
        mycursor: Database cursor used to run the lookup.
        video_id: ID of the video to search for.

    Returns:
        ``True`` if the lookup yields a row, ``False`` otherwise.
    """
    # Query parameters are passed as a sequence, hence the one-element tuple.
    lookup_params = (video_id,)
    mycursor.execute("""SELECT video_id FROM videos WHERE video_id = %s""", lookup_params)
    matching_row = mycursor.fetchone()
    return matching_row is not None

#### update_row()

In [None]:
def update_row(mycursor, video_title, upload_date, view_count, like_count, favorite_count, comment_count, video_id):
    """Overwrite the stored fields of the ``videos`` row matching ``video_id``.

    Args:
        mycursor: Database cursor used to run the statement.
        video_title: New title.
        upload_date: New upload date.
        view_count: New view counter.
        like_count: New like counter.
        favorite_count: New favorite counter.
        comment_count: New comment counter.
        video_id: Key of the row to update.
    """
    # Order must follow the placeholders: the six SET values, then the key.
    new_values = (
        video_title,
        upload_date,
        view_count,
        like_count,
        favorite_count,
        comment_count,
        video_id,
    )
    update_statement = """
    UPDATE videos
        SET video_title = %s,
            upload_date = %s,
            view_count = %s,
            like_count = %s,
            favorite_count = %s,
            comment_count = %s
        WHERE video_id = %s;
    """
    mycursor.execute(update_statement, new_values)

#### insert_row()

In [None]:
def insert_row(mycursor, video_id, video_title, upload_date, view_count, like_count, favorite_count, comment_count):
    """Insert one video as a new row of the ``videos`` table.

    Args:
        mycursor: Database cursor used to run the statement.
        video_id: Key of the new row.
        video_title: Title of the video.
        upload_date: Upload date of the video.
        view_count: View counter.
        like_count: Like counter.
        favorite_count: Favorite counter.
        comment_count: Comment counter.
    """
    mycursor.execute(
        """INSERT INTO videos (video_id, video_title, upload_date, view_count, like_count, favorite_count, comment_count) VALUES(%s, %s, %s, %s, %s, %s, %s)""",
        (video_id, video_title, upload_date, view_count, like_count, favorite_count, comment_count),
    )

#### append_from_df_to_db()

In [None]:
def append_from_df_to_db(mycursor, df):
    """Insert every row of ``df`` through ``insert_row()``.

    Args:
        mycursor: Database cursor handed on to ``insert_row()``.
        df: DataFrame with the columns listed in ``field_order`` below.
    """
    # Same order as the positional parameters of insert_row().
    field_order = (
        "video_id",
        "video_title",
        "upload_date",
        "view_count",
        "like_count",
        "favorite_count",
        "comment_count",
    )
    for _, record in df.iterrows():
        insert_row(mycursor, *(record[field] for field in field_order))

#### update_db()

In [None]:
def update_db(mycursor, df):
    """Update videos already stored in the database and return the new ones.

    For every row of ``df``, ``check_if_video_exists()`` decides whether the
    video is already stored. Stored videos are refreshed with
    ``update_row()``; the others are collected for the caller to insert.

    Args:
        mycursor: Database cursor handed on to the helper functions.
        df: DataFrame with the columns ``video_id``, ``video_title``,
            ``upload_date``, ``view_count``, ``like_count``,
            ``favorite_count`` and ``comment_count``.

    Returns:
        A DataFrame with the rows of ``df`` that were not found in the
        database, keeping their index labels and the columns of ``df``.
        It is empty when every video already exists.
    """
    new_row_positions = []

    for position, (_, row) in enumerate(df.iterrows()):
        if check_if_video_exists(mycursor, row["video_id"]):
            update_row(mycursor, row["video_title"], row["upload_date"], row["view_count"], row["like_count"], row["favorite_count"], row["comment_count"], row["video_id"])
        else:
            new_row_positions.append(position)

    # DataFrame.append was removed in pandas 2.0 (and copied the frame on
    # every call); select the new rows once by position instead. Positional
    # selection also works when the index contains duplicate labels.
    return df.iloc[new_row_positions]

### Execution

In [None]:
# Connection settings - all four values are placeholders to replace before running.
host = "host"
user = "user"
password = "pw"
# Read by connect_to_db() as a module-level variable, not passed as an argument.
# NOTE(review): "pw" looks copied from the password placeholder - set the real database name.
database = "pw"

In [None]:
# Open the connection and the cursor used by all the cells below.
mydb = connect_to_db(host, user, password) # Make a connection to database
mycursor = mydb.cursor() # Create the cursor (navigator)

In [None]:
create_table(mycursor) # Create the videos table (if not already existing)

# Show the table's column definitions as the cell output.
mycursor.execute("DESCRIBE videos")
mycursor.fetchall()

In [None]:
# Refresh the videos already stored in the table; keep the not-yet-stored ones for insertion.
new_vid_df = update_db(mycursor, df)

In [None]:
# Insert the videos that update_db() did not find in the table.
append_from_df_to_db(mycursor, new_vid_df)

In [None]:
# Commit the updates and inserts executed through the cursor above.
mydb.commit()