In [None]:
# import dependencies
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import scipy.stats as st

In [None]:
# Load the cleaned ad dataset and preview its first five rows
main_df = pd.read_csv(filepath_or_buffer="Resources/Cleaned Data.csv")
main_df.head(n=5)

In [None]:

# Read the two per-tag summary tables (file names suggest view totals and ad counts per tag)
ad_tag_view_counts_df = pd.read_csv(filepath_or_buffer="Resources/ad_tag_view_counts.csv")
ad_tag_counts_df = pd.read_csv(filepath_or_buffer="Resources/ad_tag_counts.csv")

# Display the view-count table
ad_tag_view_counts_df

# Bar Charts

In [None]:
# Bar chart of the number of ads per tag: one bar per column of
# ad_tag_counts_df, with heights taken from the table's first row
plt.figure(figsize=(15, 7))
plt.bar(
    x=ad_tag_counts_df.columns,
    height=ad_tag_counts_df.iloc[0],
    width=0.5,
    align="center",
    color="blue",
)

# One vertical tick label per tag so long names do not overlap
plt.xticks(list(ad_tag_counts_df.columns), rotation="vertical", fontsize=12)

plt.title("Total number of ads by type", fontsize=18)
plt.ylabel("Total number of ads", fontsize=12)
plt.xlabel("Type", fontsize=12)
plt.show()

In [None]:
# Bar chart of the view totals per tag: one bar per column of
# ad_tag_view_counts_df, with heights taken from the table's first row
plt.figure(figsize=(15, 7))
plt.bar(
    x=ad_tag_view_counts_df.columns,
    height=ad_tag_view_counts_df.iloc[0],
    width=0.5,
    align="center",
    color="blue",
)

# One vertical tick label per tag so long names do not overlap
plt.xticks(list(ad_tag_view_counts_df.columns), rotation="vertical", fontsize=12)

plt.title("Total number of views by ads type", fontsize=18)
plt.ylabel("Total number of views by ads type", fontsize=12)
plt.xlabel("Type", fontsize=12)
plt.show()

## Pie Charts 


In [None]:
# Number of ads whose "Funny" flag is True, and number whose flag is False
funny_ads = int(main_df["Funny"].eq(True).sum())
not_funny_ads = int(main_df["Funny"].eq(False).sum())

In [None]:
# Pie chart: share of ads tagged funny versus not funny

# Slice values, captions, and fill colours (same order: funny first)
sizes = [funny_ads, not_funny_ads]
labels = ["Funny", "Not Funny"]
colors = ["yellow", "lightskyblue"]

# Pull the "Funny" slice slightly out of the pie
explode = [0.1, 0]

plt.figure(figsize=(15, 6))
plt.pie(
    x=sizes,
    labels=labels,
    colors=colors,
    explode=explode,
    autopct="%1.1f%%",
    startangle=90,
    shadow=True,
)
plt.title("Number of Ads Tagged As Funny")
plt.show()




In [None]:
# Total view counts for ads flagged funny and for ads flagged not funny
funny_views = main_df.loc[main_df["Funny"].eq(True), "View Counts"].sum()
not_funny_views = main_df.loc[main_df["Funny"].eq(False), "View Counts"].sum()

# Pie chart: share of total views for funny versus not funny ads

# Slice values, captions, and fill colours (same order: funny first)
sizes = [funny_views, not_funny_views]
labels = ["Funny", "Not Funny"]
colors = ["green", "red"]

# Pull the "Funny" slice slightly out of the pie
explode = [0.1, 0]

plt.figure(figsize=(15, 6))
plt.pie(
    x=sizes,
    labels=labels,
    colors=colors,
    explode=explode,
    autopct="%1.1f%%",
    startangle=90,
    shadow=True,
)
plt.title("Funny and Not Funny Ads by View Count")
plt.show()






# Hypothesis Test

###### Hypothesis
Ads that contain the tag 'Funny' receive more views on YouTube than ads that do not.

###### Null Hypothesis
There is no statistically significant difference in view counts between ads that contain the 'Funny' tag and those that do not contain the 'Funny' tag.

###### Results
Because the calculated P-Value is effectively zero (it displays as 0.0000 when printed to four decimal places), we reject the null hypothesis.

In [None]:
# View counts of the ads flagged funny, and of the ads flagged not funny
funnySeries = main_df.loc[main_df["Funny"].eq(True), "View Counts"]
unfunnySeries = main_df.loc[main_df["Funny"].eq(False), "View Counts"]

In [None]:
# Observed: total views of funny ads and of not-funny ads.
# Expected: the combined total split evenly between the two groups.
obsSeries = pd.Series(name="observed", data=[sum(funnySeries), sum(unfunnySeries)])
expSeries = pd.Series(name="expected", data=[sum(obsSeries) / 2] * 2)

In [None]:
# Chi-squared test of the observed view totals against the even-split expectation;
# only the p-value of the result is kept
pValue = st.chisquare(f_obs=obsSeries, f_exp=expSeries).pvalue

# Report the p-value to four decimal places
print("P-Value = {:,.4f}".format(pValue))

### Batch Hypothesis Tests

In [None]:
# Ad tags to test; each must be a boolean-like column of main_df
categories = ["Funny", "Shows Product Quickly", "Celebrity", "Danger", "Animals", "Use Sex"]

# P-value accumulators, one entry appended per tag (same order as `categories`)
chi_square_pvalues = []
student_ttest_pvalues = []
mannwhitneyu_pvalues = []
kruskal_pvalues = []

# Summary-statistic accumulators for the tagged (True) and untagged (False) groups
maxTrue_list = []
minTrue_list = []
sumTrue_list = []
meanTrue_list = []
medianTrue_list = []
maxFalse_list = []
minFalse_list = []
sumFalse_list = []
meanFalse_list = []
medianFalse_list = []

# Run every test and collect every statistic once per tag
for category in categories:

    # Split the view counts by whether the ad carries the current tag
    whenTrue = main_df.loc[main_df[category].eq(True), "View Counts"]
    whenFalse = main_df.loc[main_df[category].eq(False), "View Counts"]

    # Chi-squared inputs: observed view totals versus an even split of the combined total
    observed = pd.Series(name="observed", data=[sum(whenTrue), sum(whenFalse)])
    expected = pd.Series(name="expected", data=[sum(observed) / 2] * 2)

    # Chi-squared test on the view totals
    chi_square_pvalues.append(st.chisquare(f_obs=observed, f_exp=expected).pvalue)

    # Independent two-sample t-test on the raw view counts
    student_ttest_pvalues.append(st.ttest_ind(whenFalse, whenTrue).pvalue)

    # One-sided Mann-Whitney U test (alternative: tagged views are stochastically less)
    # NOTE(review): the other tests here are two-sided and the notebook's stated
    # hypothesis is that tagged ads get MORE views; confirm "less" is the intended direction.
    mannwhitneyu_pvalues.append(
        st.mannwhitneyu(whenTrue, whenFalse, alternative="less").pvalue
    )

    # Kruskal-Wallis H test across the two groups
    kruskal_pvalues.append(st.kruskal(whenTrue, whenFalse).pvalue)

    # Summary statistics for each group, appended to that group's lists
    for views, maxima, minima, totals, means, medians in (
        (whenTrue, maxTrue_list, minTrue_list, sumTrue_list, meanTrue_list, medianTrue_list),
        (whenFalse, maxFalse_list, minFalse_list, sumFalse_list, meanFalse_list, medianFalse_list),
    ):
        maxima.append(max(views))
        minima.append(min(views))
        totals.append(sum(views))
        means.append(views.mean())
        medians.append(views.median())

In [None]:
# Rows of the final table: (row label, per-tag values, display format).
# P-values are shown to four decimals; view statistics as whole numbers.
stat_rows = [
    ("Chi Square", chi_square_pvalues, ",.4f"),
    ("Student T-Test", student_ttest_pvalues, ",.4f"),
    ("Mann-Whitney U", mannwhitneyu_pvalues, ",.4f"),
    ("Kruskal-Wallis", kruskal_pvalues, ",.4f"),
    ("Max (True)", maxTrue_list, ",.0f"),
    ("Max (False)", maxFalse_list, ",.0f"),
    ("Min (True)", minTrue_list, ",.0f"),
    ("Min (False)", minFalse_list, ",.0f"),
    ("Sum (True)", sumTrue_list, ",.0f"),
    ("Sum (False)", sumFalse_list, ",.0f"),
    ("Mean (True)", meanTrue_list, ",.0f"),
    ("Mean (False)", meanFalse_list, ",.0f"),
    ("Median (True)", medianTrue_list, ",.0f"),
    ("Median (False)", medianFalse_list, ",.0f"),
]

# Build one column per statistic (plus the tag names), then transpose so that
# statistics become rows and tags become columns
table_columns = {"Tag": categories}
for row_label, row_values, row_format in stat_rows:
    table_columns[row_label] = [format(member, row_format) for member in row_values]
pvalues_df = pd.DataFrame(table_columns).T

# Promote the "Tag" row to the column headers, then drop it from the body
pvalues_df.columns = pvalues_df.loc["Tag"]
pvalues_df = pvalues_df.drop(index="Tag")

# Show a caption followed by the table
print("P-Values & Statistics for View Counts")
pvalues_df

# Scatter Plot of Length of SuperBowl Ads Over the Past 20 Years

In [None]:
# Scatter of ad duration against the Year column for every ad
plt.scatter(x=main_df["Year"], y=main_df["Duration (seconds)"])

# Title and axis captions
plt.title("Length of SuperBowl Ads Over the Years")
plt.xlabel("Year")
plt.ylabel("Length of Ad (seconds)")

# Fixed duration range, and one rotated tick per year from 2000 through 2020
plt.ylim(bottom=20, top=220)
plt.xticks(ticks=np.arange(2000, 2021), rotation=90)

plt.show()

# Regression Line to See the Trends of Length of SuperBowl Ads

In [None]:
# Least-squares fit of duration on year over all ads
slope, intercept, rvalue, pvalue, stderr = st.linregress(
    x=main_df["Year"], y=main_df["Duration (seconds)"]
)

# Fitted duration for each ad's year
regress_value = main_df["Year"].mul(slope).add(intercept)

# Text of the fitted line, coefficients rounded to two decimals
line_eqn = f"y = {round(slope, 2)!s}x + {round(intercept, 2)!s}"

In [None]:
# Scatter of ad duration against year, with the fitted line drawn on top
plt.scatter(x=main_df["Year"], y=main_df["Duration (seconds)"])
plt.plot(main_df["Year"], regress_value, "r-")

# Print the line's equation on the plot at data coordinates (2000, 130)
plt.annotate(line_eqn, xy=(2000, 130), color="red", fontsize=15)

# Title and axis captions
plt.title("Length of SuperBowl Ads Over the Years")
plt.xlabel("Year")
plt.ylabel("Length of Ad (seconds)")

# Fixed duration range, and one rotated tick per year from 2000 through 2020
plt.ylim(bottom=20, top=220)
plt.xticks(ticks=np.arange(2000, 2021), rotation=90)

plt.show()

# Checking for Outliers

In [None]:
# Quartiles and inter-quartile range of the ad durations
quartiles = main_df["Duration (seconds)"].quantile(q=[0.25, 0.5, 0.75])
lower_quar = quartiles.loc[0.25]
upper_quar = quartiles.loc[0.75]
iqr = upper_quar - lower_quar

# Outlier fences: 1.5 * IQR beyond the lower and upper quartiles
lower_bound = lower_quar - 1.5 * iqr
upper_bound = upper_quar + 1.5 * iqr

# Report the quartiles and the fences
print(f"The lower quartile of duration of ads is: {lower_quar}.")
print(f"The upper quartile of duration of ads is: {upper_quar}.")
print(f"Values below {round(lower_bound,2)} could be outliers.")
print(f"Values above {round(upper_bound,2)} could be outliers.")

# Rows whose duration falls outside either fence
ad_durations = main_df["Duration (seconds)"]
outlier_lengths = main_df.loc[(ad_durations < lower_bound) | (ad_durations > upper_bound)]
outlier_lengths

# Scatter Plot and Regression line without Outliers

In [None]:
# Build a copy of the data without the duration outliers.
# The rows to remove come from `outlier_lengths` (the rows outside the 1.5 * IQR
# bounds computed in the outlier-check cell) instead of a hard-coded list of row
# labels, so the filter stays correct if the source data or its row order changes.
no_outliers = main_df.drop(index=outlier_lengths.index)
no_outliers.head()

In [None]:
# Scatter of ad duration against year, outlier rows excluded
plt.scatter(x=no_outliers["Year"], y=no_outliers["Duration (seconds)"])

# Title and axis captions
plt.title("Length of SuperBowl Ads Over the Years")
plt.xlabel("Year")
plt.ylabel("Length of Ad (seconds)")

# Fixed duration range, and one rotated tick per year from 2000 through 2020
plt.ylim(bottom=20, top=220)
plt.xticks(ticks=np.arange(2000, 2021), rotation=90)

plt.show()

In [None]:
# Least-squares fit of duration on year, using the frame without outliers
outlier_slope, outlier_intercept, outlier_rvalue, outlier_value, outlier_stderr = st.linregress(
    x=no_outliers["Year"], y=no_outliers["Duration (seconds)"]
)

# Fitted duration for each remaining ad's year
regress_value = no_outliers["Year"].mul(outlier_slope).add(outlier_intercept)

# Text of the fitted line, coefficients rounded to two decimals
line_eqn = f"y = {round(outlier_slope, 2)!s}x + {round(outlier_intercept, 2)!s}"

In [None]:
# Scatter of ad duration against year (outliers excluded), with the fitted line on top
plt.scatter(x=no_outliers["Year"], y=no_outliers["Duration (seconds)"])
plt.plot(no_outliers["Year"], regress_value, "r-")

# Print the line's equation on the plot at data coordinates (2000, 125)
plt.annotate(line_eqn, xy=(2000, 125), color="red", fontsize=15)

# Title and axis captions
plt.title("Length of SuperBowl Ads Over the Years")
plt.xlabel("Year")
plt.ylabel("Length of Ad (seconds)")

# Fixed duration range, and one rotated tick per year from 2000 through 2020
plt.ylim(bottom=20, top=220)
plt.xticks(ticks=np.arange(2000, 2021), rotation=90)

plt.show()

# Looking at the Data 10 Years at a Time

In [None]:
# Copy of the data ordered by year (sort_values returns a new frame; main_df is untouched)
by_years_df = main_df.sort_values(by="Year")
by_years_df.head(n=5)

In [None]:
# Bin edges and the label for each resulting interval.
# With include_lowest=True the intervals are [2000, 2010] and (2010, 2020].
bins = [2000, 2010, 2020]
group_names = ["2000-2010", "2011-2020"]

# Tag every ad with the decade group its year falls into
by_years_df["Year Grouping"] = pd.cut(
    x=by_years_df["Year"],
    bins=bins,
    labels=group_names,
    include_lowest=True,
)
by_years_df.head(n=5)

In [None]:
# Ads in the 2000-2010 group
years_to_2010 = by_years_df[by_years_df["Year Grouping"] == "2000-2010"]

# Scatter of ad duration against year for that group
plt.scatter(x=years_to_2010["Year"], y=years_to_2010["Duration (seconds)"])

# Title and axis captions
plt.title("Length of SuperBowl Ads From 2000 to 2010")
plt.xlabel("Year")
plt.ylabel("Length of Ad (seconds)")

# Fixed duration range, and one rotated tick per year from 2000 through 2010
plt.ylim(bottom=20, top=225)
plt.xticks(ticks=np.arange(2000, 2011), rotation=90)

plt.show()

In [None]:
# Least-squares fit of duration on year for the 2000-2010 group
slope_2010, intercept_2010, rvalue_2010, pvalue_2010, stderr_2010 = st.linregress(
    x=years_to_2010["Year"], y=years_to_2010["Duration (seconds)"]
)

# Fitted duration for each ad's year in that group
regress_value = years_to_2010["Year"].mul(slope_2010).add(intercept_2010)


In [None]:
# Text of the fitted line, coefficients rounded to two decimals
line_eqn = f"y = {round(slope_2010, 2)!s}x + {round(intercept_2010, 2)!s}"

# Scatter of the 2000-2010 ads with the fitted line drawn on top
plt.scatter(x=years_to_2010["Year"], y=years_to_2010["Duration (seconds)"])
plt.plot(years_to_2010["Year"], regress_value, "r-")

# Print the line's equation on the plot at data coordinates (2000, 125)
plt.annotate(line_eqn, xy=(2000, 125), color="red", fontsize=15)

# Title and axis captions
plt.title("Length of SuperBowl Ads From 2000 to 2010")
plt.xlabel("Year")
plt.ylabel("Length of Ad (seconds)")

# Fixed duration range, and one rotated tick per year from 2000 through 2010
plt.ylim(bottom=20, top=225)
plt.xticks(ticks=np.arange(2000, 2011), rotation=90)

plt.show()

In [None]:
# Ads in the 2011-2020 group
years_from_2011 = by_years_df[by_years_df["Year Grouping"] == "2011-2020"]

# Scatter of ad duration against year for that group
plt.scatter(x=years_from_2011["Year"], y=years_from_2011["Duration (seconds)"])

# Title and axis captions
plt.title("Length of SuperBowl Ads From 2011 to 2020")
plt.xlabel("Year")
plt.ylabel("Length of Ad (seconds)")

# Fixed duration range, and one rotated tick per year from 2011 through 2020
plt.ylim(bottom=20, top=225)
plt.xticks(ticks=np.arange(2011, 2021), rotation=90)

plt.show()

In [None]:
# Least-squares fit of duration on year for the 2011-2020 group
slope_2011, intercept_2011, rvalue_2011, pvalue_2011, stderr_2011 = st.linregress(
    x=years_from_2011["Year"], y=years_from_2011["Duration (seconds)"]
)

# Fitted duration for each ad's year in that group
regress_value = years_from_2011["Year"].mul(slope_2011).add(intercept_2011)

# Text of the fitted line, coefficients rounded to two decimals
line_eqn = f"y = {round(slope_2011, 2)!s}x + {round(intercept_2011, 2)!s}"

In [None]:
# Scatter of the 2011-2020 ads with the fitted line drawn on top
plt.scatter(x=years_from_2011["Year"], y=years_from_2011["Duration (seconds)"])
plt.plot(years_from_2011["Year"], regress_value, "r-")

# Print the line's equation on the plot at data coordinates (2011, 150)
plt.annotate(line_eqn, xy=(2011, 150), color="red", fontsize=15)

# Title and axis captions
plt.title("Length of SuperBowl Ads From 2011 to 2020")
plt.xlabel("Year")
plt.ylabel("Length of Ad (seconds)")

# Fixed duration range, and one rotated tick per year from 2011 through 2020
plt.ylim(bottom=20, top=225)
plt.xticks(ticks=np.arange(2011, 2021), rotation=90)

plt.show()