# Building a Data Pipeline in Python

The goal of this project is to load in data from a YouTube channel API and extract useful data in a dataframe format, then upload that to an AWS database. 

In [1]:
import requests 
import time
import pandas as pd
from dotenv import load_dotenv
import os
import psycopg2 as ps 
import html

In [2]:
def configure():
    load_dotenv() #securely loading in my credentials from .env

For this project I will be looking at the popular science channel, Kurzgesagt. In order to find the channel ID, we obtain it from the source code on the YouTube channel's homepage. We also need the base url from which we will form the root of our api, this can be found in the documentation: https://developers.google.com/youtube/v3/docs/search/list

In [3]:
#key and ID, you will want to replace the API key with your own
configure()
API_KEY = os.getenv("API_KEY")
CHANNEL_ID = "UCsXVk37bltHxD1rDPwtNM8Q"

## 1. Initial Exploration

First, I will craft an API from the base URL and the parameters found in the documentation.

In [99]:
url = f"https://www.googleapis.com/youtube/v3/search?key={API_KEY}&channelId={CHANNEL_ID}&part=snippet,id&order=date&maxResults=2000"

video_info = requests.get(url).json()

video_info

{'kind': 'youtube#searchListResponse',
 'etag': 'XlOhiDG6Wfs4prkkDSeARcXxJ2M',
 'nextPageToken': 'CDIQAA',
 'regionCode': 'US',
 'pageInfo': {'totalResults': 206, 'resultsPerPage': 50},
 'items': [{'kind': 'youtube#searchResult',
   'etag': 'Fmk-JUAYW9KB-NAJIx6mh6rRURI',
   'id': {'kind': 'youtube#video', 'videoId': 'LEENEFaVUzU'},
   'snippet': {'publishedAt': '2022-06-28T14:00:23Z',
    'channelId': 'UCsXVk37bltHxD1rDPwtNM8Q',
    'title': 'The Last Human – A Glimpse Into The Far Future',
    'description': 'Because of the potential size of the future, the most important thing about our actions today might be their impact on future ...',
    'thumbnails': {'default': {'url': 'https://i.ytimg.com/vi/LEENEFaVUzU/default.jpg',
      'width': 120,
      'height': 90},
     'medium': {'url': 'https://i.ytimg.com/vi/LEENEFaVUzU/mqdefault.jpg',
      'width': 320,
      'height': 180},
     'high': {'url': 'https://i.ytimg.com/vi/LEENEFaVUzU/hqdefault.jpg',
      'width': 480,
      'height

We will have a few options to pick from for our statistics, including like count, view count, comment count, and favorite count. Favorite count is always zero, so we will leave it out. 

In [88]:
video_df = pd.DataFrame(columns = [ 'vid_id', 'vid_title', 'upload_date', 'view_count', 
                                    'like_count', 'comment_count'])

for vid in video_info['items']:
    if vid['id']['kind'] == 'youtube#video':
        vid_id = vid['id']['videoId']
        vid_title = vid['snippet']['title']
        upload_date = vid['snippet']['publishedAt']
        upload_time = str(upload_date).split("T")[1]
        upload_date = str(upload_date).split("T")[0]
        
        #obtaining stats using video id
        
        vid_url = "https://www.googleapis.com/youtube/v3/videos?key="+API_KEY+"&part=statistics&id="+vid_id
        video_info_vid = requests.get(vid_url).json()
        
        view_count = video_info_vid['items'][0]['statistics']['viewCount']
        like_count = video_info_vid['items'][0]['statistics']['likeCount']
        comment_count = video_info_vid['items'][0]['statistics']['commentCount']
        d = {'vid_id':[vid_id], 'vid_title':[vid_title], 'upload_date':[upload_date], 
             'view_count':[view_count], 'like_count':[like_count], 'comment_count':[comment_count]}
        video_df = pd.concat([video_df, pd.DataFrame(data = d)])

In [8]:
video_df

Unnamed: 0,vid_id,vid_title,upload_date,view_count,like_count,comment_count
0,LEENEFaVUzU,The Last Human – A Glimpse Into The Far Future,2022-06-28,3769614,282757,13645
0,75d_29QWELk,Change Your Life – One Tiny Step at a Time,2022-06-07,4696114,347370,10069
0,Pj-h6MEgE7I,You Are Not Where You Think You Are,2022-05-17,5838981,324156,13867
0,7OPg-ksxZ4Y,The Most Horrible Parasite: Brain Eating Amoeba,2022-05-03,5232541,310466,15826
0,LxgMdjyw8uw,We WILL Fix Climate Change!,2022-04-05,7942534,546696,38305
0,KRvv0QdruMQ,Are There Lost Alien Civilizations in Our Past?,2022-03-01,9101914,388520,16461
0,lheapd7bgLA,What Happens if the Moon Crashes into Earth?,2022-02-08,11964453,443325,25101
0,xAUJYP8tnRE,Why We Should NOT Look For Aliens - The Dark F...,2021-12-14,10938294,550337,28300
0,XFqn3uy238E,...And We&#39;ll Do it Again,2021-12-07,9796992,626482,24853
0,F1Hq8eVOMHs,Is Meat Really that Bad?,2021-11-30,6591421,368904,43080


## 2. Cleaning and Optimizing Code

This has only collected videos from a single page, we want to loop through all page tokens. Also, it would be better to collect this loop into a function that obtains this same data.

In [238]:
def get_youtube_data(API_KEY, CHANNEL_ID):
    page = ""
    vid_df = pd.DataFrame(columns=["vid_id","vid_title","upload_time","upload_date","view_count","like_count","comment_count"]) 
    
    while True:
        url = "https://www.googleapis.com/youtube/v3/search?key="+API_KEY+"&channelId="+CHANNEL_ID+"&order=date&maxResults=2000&part=snippet,id&"+page

        video_info = requests.get(url).json()
        time.sleep(1) #waits for one second
        for video in video_info['items']:
            if video['id']['kind'] == "youtube#video":
                vid_id = video['id']['videoId']
                vid_title = video['snippet']['title']
                upload_date = video['snippet']['publishedAt']
                upload_time = str(upload_date).split("T")[1]
                upload_time = upload_time.replace("Z","")
                upload_date = str(upload_date).split("T")[0]
                
                #making a separate api call to pull the video stats
                url_vid_stats = "https://www.googleapis.com/youtube/v3/videos?id="+vid_id+"&part=statistics&key="+API_KEY
                vid_stats = requests.get(url_vid_stats).json()
                
                view_count = vid_stats['items'][0]['statistics']['viewCount']
                like_count = vid_stats['items'][0]['statistics']['likeCount']
                comment_count = vid_stats['items'][0]['statistics']['commentCount']
                
                #concatenating into the dataframe
                d = {'vid_id':[vid_id], 'vid_title':[vid_title], 'upload_time':[upload_time],
                     'upload_date':[upload_date], 'view_count':[view_count], 
                     'like_count':[like_count], 'comment_count':[comment_count]}
                vid_df = pd.concat([vid_df, pd.DataFrame(data = d)], ignore_index = True)
                
                
        try:
            if video_info['nextPageToken'] != None: 
                page = "pageToken=" + video_info['nextPageToken'] # causes loop to end when we reach final page

        except:
            break
        
        #translating html codes in title names to their corresponding symbols
    for i in range(len(vid_df)):
        vid_df.vid_title[i] = html.unescape(vid_df.vid_title[i])
    vid_df['upload_date'] = pd.to_datetime(vid_df['upload_date'])

    return vid_df

In [239]:
video_df = get_youtube_data(API_KEY, CHANNEL_ID)

Now, instead of a series of for loops, we have a single function which allows for us to pull this data from any channel that we have the channel ID for. 

Next steps will be to perform sentiment analysis on titles, relate that to view counts, and then export that to AWS

In [240]:
pd.set_option("display.max_rows", None)

video_df

Unnamed: 0,vid_id,vid_title,upload_time,upload_date,view_count,like_count,comment_count
0,LEENEFaVUzU,The Last Human – A Glimpse Into The Far Future,14:00:23,2022-06-28,4411765,317218,15150
1,75d_29QWELk,Change Your Life – One Tiny Step at a Time,14:00:05,2022-06-07,4767383,351058,10132
2,Pj-h6MEgE7I,You Are Not Where You Think You Are,13:59:44,2022-05-17,5875201,325451,13891
3,7OPg-ksxZ4Y,The Most Horrible Parasite: Brain Eating Amoeba,13:59:29,2022-05-03,5268533,311403,15890
4,LxgMdjyw8uw,We WILL Fix Climate Change!,13:59:18,2022-04-05,7980593,547695,38413
5,KRvv0QdruMQ,Are There Lost Alien Civilizations in Our Past?,14:59:23,2022-03-01,9124257,389041,16470
6,lheapd7bgLA,What Happens if the Moon Crashes into Earth?,14:59:49,2022-02-08,11996164,443848,25121
7,xAUJYP8tnRE,Why We Should NOT Look For Aliens - The Dark F...,15:00:03,2021-12-14,10979992,551197,28327
8,XFqn3uy238E,...And We'll Do it Again,14:59:44,2021-12-07,9808528,626804,24855
9,F1Hq8eVOMHs,Is Meat Really that Bad?,15:01:34,2021-11-30,6603308,369199,43117


Timestamp('2022-06-07 00:00:00')

## 3. Performing NLP Sentiment Analysis

Will be built out soon


## 4. Porting to AWS

Next I will export this pandas dataframe to AWS, first loading in credentials from the .env file on my system, then connecting to the database.

In [12]:
configure()
ENDPOINT=os.getenv("ENDPOINT")
PORT=os.getenv("PORT")
DB_NAME=os.getenv("DB_NAME")
USERNAME=os.getenv("USERNAME")
PASSWORD=os.getenv("PASSWORD")

In [115]:
def db_connect(host, database, user, password, port):
    try:
        connection = ps.connect(host=host, database=database, user=user, password=password, port=port)

    except ps.OperationalError as e:
        raise e
    else:
        print('Connected!')
        return connection

In [123]:
connection = db_connect(host = ENDPOINT,database = DB_NAME,
                        user = USERNAME,password = PASSWORD,
                        port = PORT)

Connected!


In [286]:
def initialize_vid_table(curs):
    sql_create_df = ("""CREATE TABLE IF NOT EXISTS video_data (
                vid_id VARCHAR(255) PRIMARY KEY,
                vid_title VARCHAR(255) NOT NULL,
                upload_time VARCHAR(255) NOT NULL,
                upload_date VARCHAR(255) NOT NULL,
                view_count INTEGER NOT NULL,
                like_count INTEGER NOT NULL,
                comment_count INTEGER NOT NULL
            )""")
    curs.execute(sql_create_df)
    connection.commit()
#would ideally import upload time and date as time and date objects, 
#but was causing errors

In [287]:
curs = connection.cursor()
connection.commit()

Now we create a table on the AWS database with SQL which has the same column names as our pandas dataframe. After running the following line, I will check the backend in my database management software by running "SELECT * FROM video_data" (it worked). 

In [288]:
initialize_vid_table(curs)

Now to write the code that will add a video to the SQL table if they are not there currently, or update them if they are already present in the table. One thought is that if I write the loop so that it inserts the new videos as it finds them, it will also needlessly update those new videos, adding to the amount of time for the code to execute. This is negligible with only a few new videos, but if we were pulling from a large number of channels and updating many videos it could add to processing time substantially. Therefore I will store the new rows in a separate dataframe, and insert them all at once at the end.

In [267]:
# rough outline of what I want to code:

# for row in video_df:
#     if row in sql_table:
#         update(row)
#     else:
#         df = pd.concat(df,row)
# insert(df)
        
def vid_in_table(curs, vid_id):
    sql_query = ("""SELECT vid_id FROM video_data WHERE vid_id = %s""")
    curs.execute(sql_query,(vid_id,))
    
    return curs.fetchone() is not None
    
def update_vid(curs, vid_id, vid_title, view_count, like_count, comment_count):
    sql_query = ("""UPDATE video_data
                    SET vid_title = %s,
                        view_count = %s,
                        like_count = %s,
                        comment_count = %s,
                    WHERE vid_id = %s;""")
    update_vars = (vid_title, view_count, like_count, comment_count) #tuple of vars I want to update
    curs.execute(sql_query, update_vars)
    
def insert_vids(curs, vid_id, vid_title, upload_time, upload_date, view_count, like_count, comment_count):
    sql_query = ("""INSERT INTO video_data (
                        vid_id, vid_title, upload_time,
                        upload_date, view_count, like_count, 
                        comment_count)
                    VALUES(%s,%s,%s,%s,%s,%s,%s);""")
    insert_vars = (vid_id, vid_title, upload_time, upload_date, view_count, like_count, comment_count)
    curs.execute(sql_query, insert_vars)
        

Now that we have the functions we need in order to build our loop, I will pull it all together into one final function which will port the dataframe into the database. 

In [306]:
def df_to_db(curs, video_df):
    insert_df = pd.DataFrame(columns=["vid_id","vid_title","upload_time",
                                      "upload_date","view_count","like_count",
                                      "comment_count"]) 
    for i,vid in video_df.iterrows():
        if vid_in_table(curs, vid["vid_id"]):
            update_vid(curs, vid["vid_id"], vid["vid_title"], vid["view_count"],
                       vid["like_count"], vid["comment_count"])
        else:
            temp = pd.DataFrame(data = {"vid_id":[vid["vid_id"]],"vid_title":[vid["vid_title"]],
                                        "upload_time":[vid["upload_time"]],"upload_date":[vid["upload_date"]],
                                        "view_count":[vid["view_count"]],"like_count":[vid["like_count"]],
                                        "comment_count":[vid["comment_count"]]})
            insert_df = pd.concat([insert_df, temp], ignore_index = True)
    
    for i,vid in insert_df.iterrows():
        insert_vids(curs, vid["vid_id"], vid["vid_title"], vid["upload_time"],
                    vid["upload_date"], vid["view_count"],
                    vid["like_count"], vid["comment_count"])
    connection.commit()
        
        

And with that, I have a function (df_to_db) which will take our pandas dataframe, go line by line through it, and either update values (if the video is already in the database) or add a new video (if it is not yet in the database).

In [307]:
df_to_db(curs, video_df)

And we're done!