In [None]:
# define the ISO 8601 duration to seconds converter
def ISO8601toSeconds(inputStr):
    """Convert an ISO 8601 duration string to a whole number of seconds.

    Handles the week, day, hour, minute and second designators, for example
    "PT1H2M3S", "PT45S", "P1DT2H" or "P0D". Year and month designators are
    rejected because they do not map to a fixed number of seconds.

    Parameters
    ----------
    inputStr : str
        Duration of the form P[nW][nD][T[nH][nM][nS]] with integer values.

    Returns
    -------
    int
        Total duration in seconds.

    Raises
    ------
    ValueError
        If inputStr is not a supported ISO 8601 duration.
    """
    # imported here so the function does not depend on which cells have run
    import re

    # every component is optional, so date-only values such as "P0D" parse too
    match = re.fullmatch(
        r"P(?:(?P<weeks>[0-9]+)W)?(?:(?P<days>[0-9]+)D)?"
        r"(?:T(?:(?P<hours>[0-9]+)H)?(?:(?P<minutes>[0-9]+)M)?(?:(?P<seconds>[0-9]+)S)?)?",
        inputStr,
    )
    if match is None:
        raise ValueError("Unsupported ISO 8601 duration: {!r}".format(inputStr))

    secondsPerUnit = {
        "weeks": 7 * 24 * 60 * 60,
        "days": 24 * 60 * 60,
        "hours": 60 * 60,
        "minutes": 60,
        "seconds": 1,
    }

    # components that are absent from the string come back as None; skip them
    return sum(
        int(value) * secondsPerUnit[unit]
        for unit, value in match.groupdict().items()
        if value is not None
    )

In [None]:
#Import Dependencies
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import scipy.stats as st
import requests
import json
from config import gkey

In [None]:
# Load the raw Super Bowl ads CSV into a DataFrame
csv_path = "Resources/superbowl-ads.csv"
ads_df = pd.read_csv(filepath_or_buffer=csv_path)

# Preview the first five rows
ads_df.head(5)

In [None]:
# Keep only the columns used downstream; per the original note this leaves
# out the Super Bowl ad website column
ads_reduced_df = ads_df.loc[
    :,
    [
        "year",
        "brand",
        "youtube_url",
        "funny",
        "show_product_quickly",
        "patriotic",
        "celebrity",
        "danger",
        "animals",
        "use_sex",
    ],
]

# Preview the reduced frame
ads_reduced_df.head(5)

In [None]:
# Give the columns more readable names
ads_renamed_df = ads_reduced_df.rename(
    columns={
        "year": "Year",
        "brand": "Brand",
        "youtube_url": "YouTube URL",
        "funny": "Funny",
        "show_product_quickly": "Shows Product Quickly",
        "patriotic": "Patriotic",
        "celebrity": "Celebrity",
        "danger": "Danger",
        "animals": "Animals",
        "use_sex": "Use Sex",
    }
)

# Preview the renamed frame
ads_renamed_df.head(5)

In [None]:
# Drop every row that has a missing value in any column; the original note
# describes these as the rows that did not contain a YouTube URL
ads_clean_df = ads_renamed_df.dropna(axis="index", how="any")

# Preview the cleaned frame
ads_clean_df.head(5)

In [None]:
# define the base URL for API requests
base_url = "https://www.googleapis.com/youtube/v3/videos"

# seconds to wait on a single request before giving up, so that a stalled
# connection cannot hang the notebook indefinitely
request_timeout = 10

# declare the api response lists; each gets one entry per URL, in the same
# order as ads_clean_df, so they can later be attached as columns
viewCounts = []
durations = []

# iterate through the YouTube URLs
for videoURL in ads_clean_df["YouTube URL"]:

    # extract the video's ID: the text after the first "=", with any further
    # "&"-separated query parameters stripped off
    videoID = videoURL.split("=")[1].split("&")[0]

    # define the request parameters
    parameters = {
        "part": "statistics,contentDetails",
        "id": videoID,
        "key": gkey,
    }

    # perform the API request
    response = requests.get(base_url, params=parameters, timeout=request_timeout)

    # stop with a readable error on an HTTP failure instead of a KeyError
    # further down; the message is built by hand so that it leaves out the
    # request URL, which carries the API key
    if not response.ok:
        raise RuntimeError(
            "YouTube API request for video {} failed: HTTP {} {}".format(
                videoID, response.status_code, response.reason
            )
        )

    response_json = response.json()

    # check if the API response contains a video resource
    if len(response_json["items"]) > 0:
        video = response_json["items"][0]
        viewCounts.append(int(video["statistics"]["viewCount"]))
        durations.append(ISO8601toSeconds(video["contentDetails"]["duration"]))
    else:
        # placeholder for URLs the API returned no video for; these rows are
        # filtered out of the final DataFrame later
        viewCounts.append("empty")
        durations.append("empty")

In [None]:
# create the new DataFrame: the cleaned ad columns followed by the two
# lists collected from the API
final_df = ads_clean_df[
    [
        "Year",
        "Brand",
        "YouTube URL",
        "Funny",
        "Shows Product Quickly",
        "Patriotic",
        "Celebrity",
        "Danger",
        "Animals",
        "Use Sex",
    ]
].assign(
    **{
        "View Counts": viewCounts,
        "Duration (seconds)": durations,
    }
)

In [None]:
# remove rows that don't contain view counts/durations
final_df = final_df.loc[(final_df["View Counts"] != "empty") & (final_df["Duration (seconds)"] != "empty")]

# the "empty" placeholders can leave both columns with object dtype; now that
# only integers remain, store them as int64 so they behave as numeric columns
final_df = final_df.astype({"View Counts": "int64", "Duration (seconds)": "int64"})

# save the DataFrame to file
final_df.to_csv("Resources/Cleaned Data.csv")

# preview the DataFrame
final_df.head()