In [7]:
import requests 
import time
import pandas as pd
from dotenv import load_dotenv
import os
import psycopg2 as ps 
import html
from flair.models import TextClassifier
from flair.data import Sentence

In [8]:
def configure():
    """Call ``load_dotenv()`` so later ``os.getenv`` lookups can see values from a ``.env`` file."""
    load_dotenv() #securely loading in my credentials from .env

In [9]:
#key and ID
configure()  # must run before the os.getenv lookup below
API_KEY = os.getenv("API_KEY")  # None when API_KEY is not present in the environment
CHANNEL_ID = "UCsXVk37bltHxD1rDPwtNM8Q" #ID for any YouTube channel (currently Kurzgesagt)

In [10]:
# Pulling channel data from YouTube API
def _fetch_video_stats(http, API_KEY, video_ids):
    """Return ``{video_id: statistics_dict}`` for ``video_ids`` using one videos.list call."""
    if not video_ids:
        return {}
    response = http.get(
        "https://www.googleapis.com/youtube/v3/videos",
        params={"id": ",".join(video_ids), "part": "statistics", "key": API_KEY},
    )
    response.raise_for_status()
    return {item["id"]: item.get("statistics", {})
            for item in response.json().get("items", [])}


def get_youtube_data(API_KEY, CHANNEL_ID, session=None, pause_seconds=1):
    """Collect every video of a channel, with its statistics, into a DataFrame.

    Args:
        API_KEY: YouTube Data API key.
        CHANNEL_ID: id of the channel whose uploads are listed.
        session: optional object exposing ``get(url, params=...)`` (for example a
            ``requests.Session``); the ``requests`` module is used when omitted.
        pause_seconds: delay after each search request, kept as a simple throttle.

    Returns:
        DataFrame with columns vid_id, vid_title, upload_time, upload_date,
        view_count, like_count, comment_count. Counts stay as the strings the
        API returns; a count missing from the response is reported as "0".
        ``upload_date`` is converted with ``pd.to_datetime``.

    Raises:
        Whatever ``raise_for_status()`` raises on a non-success HTTP response.
    """
    http = requests if session is None else session
    columns = ["vid_id", "vid_title", "upload_time", "upload_date",
               "view_count", "like_count", "comment_count"]
    records = []
    page_token = None

    while True:
        # search.list caps maxResults at 50; further results arrive through pageToken
        params = {"key": API_KEY, "channelId": CHANNEL_ID, "order": "date",
                  "maxResults": 50, "part": "snippet,id"}
        if page_token:
            params["pageToken"] = page_token

        response = http.get("https://www.googleapis.com/youtube/v3/search", params=params)
        response.raise_for_status()  # fail loudly on API errors instead of a later KeyError
        video_info = response.json()
        if pause_seconds:
            time.sleep(pause_seconds)

        videos = [item for item in video_info.get("items", [])
                  if item["id"]["kind"] == "youtube#video"]
        # one statistics request per page instead of one per video
        stats_by_id = _fetch_video_stats(http, API_KEY, [v["id"]["videoId"] for v in videos])

        for video in videos:
            vid_id = video["id"]["videoId"]
            # publishedAt looks like "2023-01-02T03:04:05Z"
            upload_date, _, upload_time = str(video["snippet"]["publishedAt"]).partition("T")
            stats = stats_by_id.get(vid_id, {})
            records.append({
                "vid_id": vid_id,
                # translate html codes in the title to their corresponding symbols
                "vid_title": html.unescape(video["snippet"]["title"]),
                "upload_time": upload_time.replace("Z", ""),
                "upload_date": upload_date,
                "view_count": stats.get("viewCount", "0"),
                # these keys can be absent from the statistics payload
                "like_count": stats.get("likeCount", "0"),
                "comment_count": stats.get("commentCount", "0"),
            })

        # the last page carries no nextPageToken
        page_token = video_info.get("nextPageToken")
        if not page_token:
            break

    # build the frame once; concatenating inside the loop is quadratic
    vid_df = pd.DataFrame(records, columns=columns)
    vid_df["upload_date"] = pd.to_datetime(vid_df["upload_date"])

    return vid_df

In [128]:
# Fetch the channel's videos and their statistics into a DataFrame
video_df = get_youtube_data(API_KEY, CHANNEL_ID)

In [121]:
#Adds output of title sentiment analysis as a new column
def add_sentiment(video_df, model=None, threshold=.75):
    """Add a ``title_sentiment`` column to ``video_df`` in place.

    Each title is classified once. The highest-scoring label is kept when its
    score is above ``threshold``; otherwise, or when the classifier yields no
    label, the title is marked "NEUTRAL". Exactly one value is produced per row,
    so the new column always matches the frame's length.

    Args:
        video_df: DataFrame with a ``vid_title`` column.
        model: object with ``predict(sentence)``. When omitted, a notebook-level
            ``classifier`` is used if one exists, else a flair model is loaded.
        threshold: scores at or below this value are reported as "NEUTRAL".

    Returns:
        None; ``video_df`` is modified in place.
    """
    if model is None:
        try:
            model = classifier  # reuse a classifier defined elsewhere in the notebook
        except NameError:
            # NOTE(review): no cell defines `classifier`; assuming flair's stock
            # English sentiment model was intended — confirm the model name.
            model = TextClassifier.load("en-sentiment")

    title_sentiment = []
    for raw_title in video_df["vid_title"]:
        title = Sentence(raw_title)
        model.predict(title)
        labels = list(title.labels)
        if not labels:
            title_sentiment.append("NEUTRAL")
            continue
        best = max(labels, key=lambda label: label.score)
        title_sentiment.append("NEUTRAL" if best.score <= threshold else best.value)

    video_df["title_sentiment"] = title_sentiment
        

In [129]:
# Adds the title_sentiment column to video_df in place
add_sentiment(video_df)

In [131]:
#loading in AWS credentials
configure()  # same call as in the API-key cell; run again before the lookups below
# Each os.getenv below yields None when the variable is not present in the environment.
ENDPOINT=os.getenv("ENDPOINT")
PORT=os.getenv("PORT")
DB_NAME=os.getenv("DB_NAME")
# NOTE(review): USERNAME is also a standard OS variable on Windows, and load_dotenv()
# presumably leaves already-set variables untouched — confirm this resolves to the DB user.
USERNAME=os.getenv("USERNAME")
PASSWORD=os.getenv("PASSWORD")

In [132]:
def db_connect(host, database, user, password, port):
    """Open a psycopg2 connection and return it.

    Prints 'Connected!' once the connection has been established. Any error
    raised by ``ps.connect`` (including ``ps.OperationalError``) propagates to
    the caller unchanged.
    """
    settings = dict(host=host, database=database, user=user, password=password, port=port)
    conn = ps.connect(**settings)
    print('Connected!')
    return conn

In [133]:
# Open the database connection using the values read from the environment
connection = db_connect(host = ENDPOINT,database = DB_NAME,
                        user = USERNAME,password = PASSWORD,
                        port = PORT)

Connected!


In [134]:
#creates a table on the database if there is not one already
def initialize_vid_table(curs):
    """Create the ``video_data`` table if it does not exist, then commit.

    Args:
        curs: an open database cursor; the commit is issued on the connection
            that owns this cursor (``curs.connection``).
    """
    sql_create_df = ("""CREATE TABLE IF NOT EXISTS video_data (
                vid_id VARCHAR(255) PRIMARY KEY,
                vid_title VARCHAR(255) NOT NULL,
                upload_time VARCHAR(255) NOT NULL,
                upload_date VARCHAR(255) NOT NULL,
                view_count INTEGER NOT NULL,
                like_count INTEGER NOT NULL,
                comment_count INTEGER NOT NULL,
                title_sentiment VARCHAR(255) NOT NULL
            )""")
    curs.execute(sql_create_df)
    # commit on the cursor's own connection rather than a notebook global, so the
    # function works with whichever cursor it is handed
    curs.connection.commit()

In [135]:
# Cursor shared by the table helpers in the following cells
curs = connection.cursor()
# NOTE(review): no statement is executed on `connection` in the visible cells before
# this commit, so it looks redundant — confirm before removing.
connection.commit()

initialize_vid_table(curs)

In [137]:
#determines if a video is in the database table
def vid_in_table(curs, vid_id):
    """Return True when ``video_data`` already holds a row with this ``vid_id``."""
    curs.execute("""SELECT vid_id FROM video_data WHERE vid_id = %s""", (vid_id,))
    matching_row = curs.fetchone()
    if matching_row is None:
        return False
    return True

#updates video information
def update_vid(curs, vid_id, vid_title, view_count, like_count, comment_count, title_sentiment):
    """Overwrite the mutable fields of the ``video_data`` row identified by ``vid_id``.

    Updates vid_title, view_count, like_count, comment_count and title_sentiment.
    The statement is executed on ``curs``; committing is left to the caller.
    """
    # no comma after the last SET item: a trailing comma before WHERE is a syntax error
    sql_query = ("""UPDATE video_data
                    SET vid_title = %s,
                        view_count = %s,
                        like_count = %s,
                        comment_count = %s,
                        title_sentiment = %s
                    WHERE vid_id = %s;""")
    # one value per placeholder, in placeholder order (vid_id feeds the WHERE clause)
    update_vars = (vid_title, view_count, like_count, comment_count, title_sentiment, vid_id)
    curs.execute(sql_query, update_vars)

#inserts new videos
def insert_vids(curs, vid_id, vid_title, upload_time, upload_date, view_count, like_count, comment_count, title_sentiment):
    """Insert one row into ``video_data`` through ``curs``; the caller commits."""
    # values listed in the same order as the column list in the statement
    row_values = (
        vid_id, vid_title, upload_time, upload_date,
        view_count, like_count, comment_count, title_sentiment,
    )
    insert_statement = ("""INSERT INTO video_data (
                        vid_id, vid_title, upload_time,
                        upload_date, view_count, like_count, 
                        comment_count, title_sentiment)
                    VALUES(%s,%s,%s,%s,%s,%s,%s,%s);""")
    curs.execute(insert_statement, row_values)
        

In [138]:
#adds video data to database or updates if already present
def df_to_db(curs, video_df):
    """Write every row of ``video_df`` to ``video_data``, updating rows that already exist.

    For each row, ``vid_in_table`` decides between ``update_vid`` and
    ``insert_vids``. All statements run in one transaction on the cursor's own
    connection: it is committed once at the end, or rolled back if any
    statement fails, so the connection stays usable afterwards.

    Args:
        curs: open database cursor.
        video_df: DataFrame with columns vid_id, vid_title, upload_time,
            upload_date, view_count, like_count, comment_count, title_sentiment.

    Raises:
        Re-raises whatever the row helpers raise, after rolling back.
    """
    try:
        for _, vid in video_df.iterrows():
            if vid_in_table(curs, vid["vid_id"]):
                update_vid(curs, vid["vid_id"], vid["vid_title"], vid["view_count"],
                           vid["like_count"], vid["comment_count"], vid["title_sentiment"])
            else:
                # insert directly; no intermediate DataFrame is needed
                insert_vids(curs, vid["vid_id"], vid["vid_title"],
                            vid["upload_time"], vid["upload_date"],
                            vid["view_count"], vid["like_count"],
                            vid["comment_count"], vid["title_sentiment"])
    except Exception:
        # a failed statement leaves the transaction aborted until it is rolled back
        curs.connection.rollback()
        raise
    curs.connection.commit()
        

In [139]:
# Push each row of video_df through df_to_db (update when the id exists, insert otherwise)
df_to_db(curs, video_df) #commits youtube data to database