-
Notifications
You must be signed in to change notification settings - Fork 0
/
YouTube Upload_Hour Exploration
180 lines (116 loc) · 5.44 KB
/
YouTube Upload_Hour Exploration
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
%matplotlib inline
#-- Locations of the per-country trending-video CSV exports --#
us_csv_path = '/Users/michaelsegaline/Desktop/WGU/New Program/D214 Capstone/USvideos_raw.csv'
gb_csv_path = '/Users/michaelsegaline/Desktop/YouTube Datasets/GBvideos.csv'
de_csv_path = '/Users/michaelsegaline/Desktop/YouTube Datasets/DEvideos.csv'
in_csv_path = '/Users/michaelsegaline/Desktop/YouTube Datasets/INvideos.csv'
fr_csv_path = '/Users/michaelsegaline/Desktop/YouTube Datasets/FRvideos.csv'

#-- Loading one DataFrame per country --#
df_US = pd.read_csv(us_csv_path)
df_GB = pd.read_csv(gb_csv_path)
df_DE = pd.read_csv(de_csv_path)
df_IN = pd.read_csv(in_csv_path)
df_FR = pd.read_csv(fr_csv_path)
#-- Exploring and cleaning the US dataset --#
#-- Preview of the first rows --#
df_US.head()

#-- Column data types --#
df_US.dtypes

#-- Row/column count before any null values are removed --#
df_US.shape

#-- Removing every row that contains at least one null value --#
df_US = df_US.dropna(axis=0, how='any')

#-- Row/column count after the drop (author's note: 98.6% full, 1.4% sparse) --#
df_US.shape

#-- Descriptive statistics; figures recorded by the author from this output: --#
#--   mean 'views' per trending video:         2,370,839 --#
#--   mean 'likes' per trending video:            74,616 --#
#--   mean 'dislikes' per trending video:          3,508 --#
#--   mean 'comment_count' per trending video:    34,316 --#
df_US.describe()
#-- Univariate exploration --#
#-- Histogram (with KDE overlay) of how many videos were uploaded in each --#
#-- 'Upload_hour' on a 24-hour scale. --#
#-- Author's note: all hours are on 'Zulu' time (Greenwich, England). --#
sns.histplot(data=df_US, x='Upload_hour', kde=True)

#-- NOTE(review): Series.plot() with no arguments draws the values as a line --#
#-- against the row index, not a histogram; confirm that is the intended view. --#

#-- 'views' per row --#
df_US['views'].plot()

#-- 'likes' per row --#
df_US['likes'].plot()

#-- 'comment_count' per row --#
df_US['comment_count'].plot()

#-- 'dislikes' per row --#
df_US['dislikes'].plot()
#-- Bivariate exploration: each engagement metric as a line over 'Upload_hour' --#
#-- Keyword arguments shared by every hourly line plot below. --#
hourly_line_kwargs = dict(data=df_US, x="Upload_hour", kind="line")

#-- 'views' against 'Upload_hour'. Author's note: the shape is the opposite --#
#-- of the 'Upload_hour' upload-frequency histogram. --#
sns.relplot(y="views", **hourly_line_kwargs)

#-- 'likes' against 'Upload_hour'. --#
#-- NOTE(review): `sizes` normally only matters when a `size` variable is --#
#-- mapped; none is given here, so confirm whether it can be dropped. --#
sns.relplot(y="likes", sizes=(15, 200), **hourly_line_kwargs)

#-- 'comment_count' against 'Upload_hour'. --#
sns.relplot(y="comment_count", **hourly_line_kwargs)

#-- 'dislikes' against 'Upload_hour'. --#
sns.relplot(y="dislikes", **hourly_line_kwargs)
#-- Three variables in one figure: 'views' vs 'likes', coloured by 'Upload_hour'. --#
#-- jointplot() draws the joint relationship of two variables and adds marginal --#
#-- axes showing the univariate distribution of each one separately. --#
#-- Author's analogy: the video is a rubber ball; dropping it at a given hour --#
#-- determines how high it bounces. --#
sns.jointplot(x="views", y="likes", hue="Upload_hour", data=df_US)
#-- Creating the KPI for exploration --#
#-- "Engagement Per View" (EPV) = 'views' / ('likes' + 'dislikes' + 'comment_count') --#
#-- As written this is views per engagement event, so a HIGHER value means --#
#-- FEWER interactions per view. --#
#-- Rows with zero total engagement would divide by zero and produce inf/NaN, --#
#-- which distorts the per-hour mean line below; the zero denominator is --#
#-- replaced with NaN so those rows are treated as missing instead. --#
EPV = df_US.views / (df_US.likes + df_US.dislikes + df_US.comment_count).replace(0, np.nan)

#-- Plotting EPV per row --#
#-- Author's note: outliers excluded, this distribution appears to have one mode. --#
EPV.plot()

#-- Stacked density plot of EPV, split by 'Upload_hour' --#
#-- Author's note: the density is skewed towards the zero hour. --#
sns.kdeplot(data=df_US, x=EPV, hue='Upload_hour', multiple="stack")

#-- EPV as a line over 'Upload_hour' --#
sns.relplot(data=df_US, x="Upload_hour", y=EPV, kind="line")
#-- Conducting Shapiro-Wilk tests for normality --#
#-- Shapiro-Wilk test to gauge the normality of EPV --#
#-- Only finite values are tested: an inf or NaN ratio (zero engagement) --#
#-- would otherwise make the statistic and p-value meaningless. --#
#-- NOTE(review): SciPy documents that the p-value may be inaccurate for --#
#-- samples larger than 5000 rows; check the sample size before relying on it. --#
x6 = EPV[np.isfinite(EPV)]
shapiro_test = stats.shapiro(x6)
shapiro_test
shapiro_test.statistic
shapiro_test.pvalue
#-- Shapiro-Wilk normality test on 'views' --#
x1 = df_US['views']
shapiro_test = stats.shapiro(x1)
shapiro_test
shapiro_test.statistic
shapiro_test.pvalue

#-- Shapiro-Wilk normality test on 'comment_count' --#
x2 = df_US['comment_count']
shapiro_test = stats.shapiro(x2)
shapiro_test
shapiro_test.statistic
shapiro_test.pvalue

#-- Shapiro-Wilk normality test on 'likes' --#
x4 = df_US['likes']
shapiro_test = stats.shapiro(x4)
shapiro_test
shapiro_test.statistic
shapiro_test.pvalue
#-- Now cleaning and exploring the remaining countries' datasets --#
#-- Dropping all rows with missing values. --#
#-- DataFrame.dropna() returns a new frame and leaves the original untouched, --#
#-- so the result must be assigned back for the drop to take effect. --#
df_GB = df_GB.dropna()
df_DE = df_DE.dropna()
df_IN = df_IN.dropna()
df_FR = df_FR.dropna()
#-- Creating the per-country "Engagement Per View" ratio: --#
#-- 'views' / ('likes' + 'dislikes' + 'comment_count') --#
#-- A zero engagement total is replaced with NaN so those rows become missing --#
#-- values instead of inf, which would distort the per-hour averages. --#
EPV_GB = df_GB.views / (df_GB.likes + df_GB.dislikes + df_GB.comment_count).replace(0, np.nan)
EPV_DE = df_DE.views / (df_DE.likes + df_DE.dislikes + df_DE.comment_count).replace(0, np.nan)
EPV_IN = df_IN.views / (df_IN.likes + df_IN.dislikes + df_IN.comment_count).replace(0, np.nan)
EPV_FR = df_FR.views / (df_FR.likes + df_FR.dislikes + df_FR.comment_count).replace(0, np.nan)
#-- Prime time for Great Britain --#
#-- EPV as a line over 'Upload_Hour' --#
#-- NOTE(review): the US frame is plotted with 'Upload_hour' (lower-case h); --#
#-- confirm the column name used in each country's CSV. --#
sns.relplot(x="Upload_Hour", y=EPV_GB, kind="line", data=df_GB)

#-- Prime time for Germany (the 'DE' dataset) --#
#-- EPV as a line over 'Upload_Hour' --#
sns.relplot(x="Upload_Hour", y=EPV_DE, kind="line", data=df_DE)
#-- Prime time for India --#
#-- EPV as a line over 'Upload_Hour'. --#
#-- The y-values must be India's own ratio (EPV_IN); plotting EPV_FR here --#
#-- would pair French ratios with Indian upload hours. --#
sns.relplot(data=df_IN, x="Upload_Hour", y=EPV_IN, kind="line")

#-- Prime time for France --#
#-- EPV as a line over 'Upload_Hour'. --#
sns.relplot(data=df_FR, x="Upload_Hour", y=EPV_FR, kind="line")