In [None]:
# import dependencies
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import scipy.stats as st

In [None]:
# Load the cleaned ad dataset and preview its first rows.
main_data_csv = "Resources/Cleaned Data.csv"
main_df = pd.read_csv(main_data_csv)
main_df.head(5)

In [None]:

# Load the two per-tag summary tables that the bar charts at the end of the
# notebook plot (view totals per tag and ad counts per tag).
view_counts_csv = "Resources/ad_tag_view_counts.csv"
tag_counts_csv = "Resources/ad_tag_counts.csv"
ad_tag_view_counts_df = pd.read_csv(view_counts_csv)
ad_tag_counts_df = pd.read_csv(tag_counts_csv)

# Render the view-count table as this cell's output.
ad_tag_view_counts_df

## Pie Charts 


In [None]:
# Count how many ads are tagged Funny versus not Funny.
#
# A notebook cell only renders the value of its final expression, so two bare
# len(...) calls would show the second count and silently discard the first.
# Both counts are gathered into one labelled Series so the cell displays them
# together.
funny_tag_counts = pd.Series(
    {
        "Funny": len(main_df.loc[main_df["Funny"] == True]),
        "Not Funny": len(main_df.loc[main_df["Funny"] == False]),
    },
    name="Number of Ads",
)
funny_tag_counts

# Hypothesis Test

###### Hypothesis
Ads tagged 'Funny' receive more views on YouTube than ads without that tag.

###### Null Hypothesis
There is no statistically significant difference in view counts between ads that contain the 'Funny' tag and those that do not contain the 'Funny' tag.

###### Results
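Because the calculated P-Value is effectively zero (it prints as 0.0000 at four decimal places), far below the conventional 0.05 significance level, we reject the null hypothesis.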

In [None]:
# Split the "View Counts" column into two groups by the "Funny" tag.
# Both comparisons are spelled out so that a row whose tag is neither True nor
# False ends up in neither group.
is_funny = main_df["Funny"] == True
is_not_funny = main_df["Funny"] == False

# Select rows and column in a single .loc lookup.
funnySeries = main_df.loc[is_funny, "View Counts"]
unfunnySeries = main_df.loc[is_not_funny, "View Counts"]

In [None]:
# Build the observed and expected inputs for the chi-square test.
# Observed: total views of the funny ads and of the non-funny ads.
funny_total_views = sum(funnySeries)
unfunny_total_views = sum(unfunnySeries)
obsSeries = pd.Series(data=[funny_total_views, unfunny_total_views], name="observed")

# Expected: the combined total split evenly between the two groups.
even_share = sum(obsSeries) / 2
expSeries = pd.Series(data=[even_share, even_share], name="expected")

In [None]:
# Run a chi-square goodness-of-fit test of the observed view totals against
# the even split and keep only the p-value.
# NOTE(review): chi-square expects frequency counts of categorical outcomes;
# the inputs here are summed view counts, and the two groups are not the same
# size -- confirm this is the intended test before relying on the result.
chi_square_result = st.chisquare(obsSeries, expSeries)
pValue = chi_square_result.pvalue

# Report the p-value to four decimal places.
print(f"P-Value = {pValue:,.4f}")

### Batch Hypothesis Tests

In [None]:
# Ad tags to evaluate; each one is used as a column name of main_df.
categories = ["Funny", "Shows Product Quickly", "Celebrity", "Danger", "Animals", "Use Sex"]

# One p-value per tag for each statistical test.
chi_square_pvalues, student_ttest_pvalues = [], []
mannwhitneyu_pvalues, kruskal_pvalues = [], []

# One summary statistic per tag for ads that carry the tag (True) ...
maxTrue_list, minTrue_list, sumTrue_list = [], [], []
meanTrue_list, medianTrue_list = [], []
# ... and for ads that do not (False).
maxFalse_list, minFalse_list, sumFalse_list = [], [], []
meanFalse_list, medianFalse_list = [], []

for category in categories:

    # View counts of the ads with and without the current tag.
    has_tag = main_df[category] == True
    lacks_tag = main_df[category] == False
    whenTrue = main_df.loc[has_tag, "View Counts"]
    whenFalse = main_df.loc[lacks_tag, "View Counts"]

    # Chi-square inputs: the observed view totals versus an even split of the
    # combined total.
    total_true = sum(whenTrue)
    total_false = sum(whenFalse)
    observed = pd.Series(data=[total_true, total_false], name="observed")
    even_share = sum(observed) / 2
    expected = pd.Series(data=[even_share, even_share], name="expected")

    # Collect the p-value of each test for this tag.
    chi_square_pvalues.append(st.chisquare(observed, expected).pvalue)
    student_ttest_pvalues.append(st.ttest_ind(whenFalse, whenTrue).pvalue)
    # NOTE(review): alternative="less" asks whether tagged ads tend to have
    # FEWER views than untagged ads; the notebook's stated hypothesis is that
    # tagged ads get more views -- confirm the intended direction.
    mannwhitneyu_pvalues.append(
        st.mannwhitneyu(whenTrue, whenFalse, alternative="less").pvalue
    )
    kruskal_pvalues.append(st.kruskal(whenTrue, whenFalse).pvalue)

    # Record the same five summary statistics for both groups.
    stat_targets = (
        (whenTrue, total_true,
         maxTrue_list, minTrue_list, sumTrue_list, meanTrue_list, medianTrue_list),
        (whenFalse, total_false,
         maxFalse_list, minFalse_list, sumFalse_list, meanFalse_list, medianFalse_list),
    )
    for views, views_total, max_out, min_out, sum_out, mean_out, median_out in stat_targets:
        max_out.append(max(views))
        min_out.append(min(views))
        sum_out.append(views_total)
        mean_out.append(views.mean())
        median_out.append(views.median())

In [None]:
# Assemble the per-tag p-values and summary statistics into one display table
# with a row per metric and a column per tag.

# Format specs: p-values keep four decimals, view statistics are shown as
# whole numbers with thousands separators.
pvalue_spec = ",.4f"
whole_number_spec = ",.0f"

# (row label, per-tag values, format spec), in display order.
summary_rows = [
    ("Chi Square", chi_square_pvalues, pvalue_spec),
    ("Student T-Test", student_ttest_pvalues, pvalue_spec),
    ("Mann-Whitney U", mannwhitneyu_pvalues, pvalue_spec),
    ("Kruskal-Wallis", kruskal_pvalues, pvalue_spec),
    ("Max (True)", maxTrue_list, whole_number_spec),
    ("Max (False)", maxFalse_list, whole_number_spec),
    ("Min (True)", minTrue_list, whole_number_spec),
    ("Min (False)", minFalse_list, whole_number_spec),
    ("Sum (True)", sumTrue_list, whole_number_spec),
    ("Sum (False)", sumFalse_list, whole_number_spec),
    ("Mean (True)", meanTrue_list, whole_number_spec),
    ("Mean (False)", meanFalse_list, whole_number_spec),
    ("Median (True)", medianTrue_list, whole_number_spec),
    ("Median (False)", medianFalse_list, whole_number_spec),
]

# Pre-format every value as text so the table displays exactly as specified.
summary_columns = {"Tag": categories}
for row_label, row_values, format_spec in summary_rows:
    summary_columns[row_label] = [format(value, format_spec) for value in row_values]

# Index by tag, then transpose: the tags become the column headers and each
# metric becomes a row.
pvalues_df = pd.DataFrame(summary_columns).set_index("Tag").T

# display the DataFrame
print("P-Values & Statistics for View Counts")
pvalues_df

In [None]:
# Pie chart: share of ads tagged Funny versus not Funny, by number of ads.

plt.figure(figsize=(15, 6))

# Labels for each slice of the pie chart
labels = ["Funny", "Not Funny"]

# Number of funny and not-funny ads, counted from main_df rather than typed in
# by hand so the chart cannot drift out of step with the data.
sizes = [
    len(main_df.loc[main_df["Funny"] == True]),
    len(main_df.loc[main_df["Funny"] == False]),
]

# Colors for each slice of the pie chart
colors = ["yellow", "lightskyblue"]

# Pull the "Funny" slice slightly away from the rest
explode = [0.1, 0]

# Draw the pie chart with percentage labels
plt.pie(sizes, explode=explode, labels=labels, colors=colors,
        autopct="%1.1f%%", shadow=True, startangle=90)
plt.title("Funny vs Not Funny ads by number of ads")

# plt.show must be called; a bare `plt.show` only references the function and
# makes the cell print its repr.
plt.show()




In [None]:
# Pie chart: share of total views for ads tagged Funny versus not Funny.

plt.figure(figsize=(15, 6))

# Labels for each slice of the pie chart
labels = ["Funny", "Not Funny"]

# Total view counts of funny and not-funny ads, summed from main_df rather
# than typed in by hand so the chart cannot drift out of step with the data.
sizes = [
    main_df.loc[main_df["Funny"] == True, "View Counts"].sum(),
    main_df.loc[main_df["Funny"] == False, "View Counts"].sum(),
]

# Colors for each slice of the pie chart
colors = ["green", "red"]

# Pull the "Funny" slice slightly away from the rest
explode = [0.1, 0]

# Draw the pie chart with percentage labels
plt.pie(sizes, explode=explode, labels=labels, colors=colors,
        autopct="%1.1f%%", shadow=True, startangle=90)
plt.title("Funny vs Not Funny ads by total view count")

# plt.show must be called; a bare `plt.show` only references the function and
# makes the cell print its repr.
plt.show()






In [None]:
# Bar chart of the total number of ads for each tag.
# The tag names are the column labels of ad_tag_counts_df and the bar heights
# come from its first row.
count_tag_names = ad_tag_counts_df.keys()
ads_per_tag = ad_tag_counts_df.iloc[0, :]

plt.figure(figsize=(15, 7))
plt.bar(count_tag_names, ads_per_tag, color="blue", align="center", width=0.5)

# One vertical tick label per tag so long tag names stay readable.
plt.xticks(list(count_tag_names), rotation="vertical", fontsize=12)

plt.title("Total number of ads by type", fontsize=18)
plt.xlabel("Type", fontsize=12)
plt.ylabel("Total number of ads", fontsize=12)
plt.show()

In [None]:
# Bar chart of the total number of views for each tag.
# The tag names are the column labels of ad_tag_view_counts_df and the bar
# heights come from its first row.
view_tag_names = ad_tag_view_counts_df.keys()
views_per_tag = ad_tag_view_counts_df.iloc[0, :]

plt.figure(figsize=(15, 7))
plt.bar(view_tag_names, views_per_tag, color="blue", align="center", width=0.5)

# One vertical tick label per tag so long tag names stay readable.
plt.xticks(list(view_tag_names), rotation="vertical", fontsize=12)

plt.title("Total number of views by ads type", fontsize=18)
plt.xlabel("Type", fontsize=12)
plt.ylabel("Total number of views by ads type", fontsize=12)
plt.show()