/
google-analytics.R
639 lines (504 loc) · 46.7 KB
/
google-analytics.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
#NOTE: Ctrl-F "YOURPATH" to find all paths you need to change to your own.
### Digital Marketing Intelligence - Assignment 1 R-script
#Group 3-10:
# Carasatorre Parra, Teresa
# Dreemer, George
# Hadjipieri, Katerina
# Oikonomou, Asimina
rm(list = ls()) #clean workspace (removes objects only; attached packages and options stay as they are)
# Preliminary work: packages, data and checks ----
## Install & Load libraries (librarian) ----
# Install librarian only when it is missing, so that re-running the script does
# not re-download the package (or fail when offline) on every execution.
if (!requireNamespace("librarian", quietly = TRUE)) {
  install.packages("librarian")
}
library(librarian)
# Attach every package the script relies on in one call.
librarian::shelf(dplyr,readxl,bannerCommenter,lubridate,ggplot2,car,nortest,janitor, quiet = TRUE)
## Load datasets ----
# Both workbooks are selected interactively through a file dialog.
financial.df <- read_xlsx(file.choose()) #Dataset 1 for Financial related questions
behavior.df <- read_xlsx(file.choose()) #Dataset 2 for Customer/Behavior related questions
### In Financial df: Creating a new date variable (date object) and a week variable ----
# Useful for time plots. Date_new is Date parsed as YYYYMMDD, Week_iso is its
# ISO week number; both are placed directly after the original Date column.
financial.df <- mutate(
  financial.df,
  Date_new = as.Date(as.character(Date), format = "%Y%m%d"),
  Week_iso = isoweek(Date_new),
  .after = Date
)
### In Behavior df: Creating a new date variable (date object) and a week variable ----
# Same derivation as for the financial data.
behavior.df <- mutate(
  behavior.df,
  Date_new = as.Date(as.character(Date), format = "%Y%m%d"),
  Week_iso = isoweek(Date_new),
  .after = Date
)
# Store AnalysisWeekNr as a number
behavior.df[["AnalysisWeekNr"]] <- as.numeric(behavior.df[["AnalysisWeekNr"]])
## Data checks ----
### Financial df ----
#### General ----
summary(financial.df); str(financial.df)
# if relevant / if not delete
# Quantity Revenue
# Min. : 4.0 Min. : 41.12
# 1st Qu.: 76.0 1st Qu.: 1131.01
# Median : 255.5 Median : 4509.11
# Mean : 475.5 Mean : 8551.17
# 3rd Qu.: 639.2 3rd Qu.:10229.77
# Max. :4822.0 Max. :97176.81
#### NA check ----
# Checked column by column: apply() would first coerce the whole data frame to
# a single-type matrix, whereas vapply() inspects each column in its own type.
vapply(financial.df, anyNA, logical(1))
# Recorded output (it does not list Date_new / Week_iso, so it was presumably
# captured before those columns were added):
# Fiscal Year Platform Day Category Level1 Category Level2
# FALSE FALSE FALSE FALSE FALSE
# Date Detail Views Quantity Revenue Period
# FALSE FALSE FALSE FALSE FALSE
# Launched
# FALSE
#### Logic check ----
##### Nominal variables check ----
# First, we check if all nominal variables make sense,
# by looking if the unique values of these variables match our expectations.
# Columns are selected by name: Date_new and Week_iso are inserted after Date
# earlier in the script, which shifts every later column position, so
# positional indices would no longer pick the intended columns.
fin_nominal_cols <- c("Fiscal Year", "Platform", "Category Level1", "Category Level2", "Period", "Launched")
lapply(financial.df[fin_nominal_cols], unique) #note: only the nominal variables are included
# $`Fiscal Year`
# [1] "FY2021" "FY2022"
#
# $Platform
# [1] "App" "Web"
#
# $`Category Level1`
# [1] "BEAUTY"
#
# $`Category Level2`
# [1] "Beauty Accessories" "Care" "Fragances" "Makeup"
#
# $Period
# [1] "Before" "Intro" "After"
#
# $Launched
# [1] "No" "Yes"
##### Numeric variables check ----
# Next, we check if there are any values with a negative value or 0.
# Each column is compared in its own type (no character coercion).
fin_numeric_cols <- c("Day", "Date", "Detail Views", "Quantity", "Revenue")
vapply(financial.df[fin_numeric_cols], function(x) sum(x <= 0, na.rm = TRUE), integer(1)) #note: only the non-nominal variables are included
# Day Date Detail Views Quantity Revenue
# 0 0 0 0 0
#### Outlier checks ----
boxplot(financial.df$Revenue, xlab = "Revenue")
# Values flagged as outliers by the boxplot statistics, then their row positions
outRev <- boxplot.stats(financial.df$Revenue)$out
outrowRev <- which(financial.df$Revenue %in% c(outRev))
outrowRev
# Revenue Outliers are at rows:
# [1] 220 259 260 261 262 263 264 265 266 267 268 424 425 427 428 429 430 431 432 433 434 435
# [23] 453 549 550 553 554 555 556 557 558 559 560 561 575 589 590 591 592 593 594 595 596 597
# [45] 598 599 600 601 602 603 604 605
financial.df[outrowRev,] # to see the rows
### Behavior df ----
#### General ----
summary(behavior.df); str(behavior.df)
# if relevant / if not delete
# Users Weekly Users
# Min. : 344.0 Min. : 858
# 1st Qu.: 473.2 1st Qu.: 1072
# Median :2164.0 Median : 6152
# Mean :2725.1 Mean : 7277
# 3rd Qu.:4799.0 3rd Qu.:12987
# Max. :8580.0 Max. :16412
#
# Sessions Transactions Revenue Quantity
# Min. : 456 Min. : 13.0 Min. : 1481 Min. : 27.0
# 1st Qu.: 731 1st Qu.: 43.0 1st Qu.: 5261 1st Qu.: 131.0
# Median : 3034 Median : 151.0 Median : 19257 Median : 469.0
# Mean : 4795 Mean : 294.2 Mean : 33504 Mean : 884.7
# 3rd Qu.: 8393 3rd Qu.: 520.0 3rd Qu.: 56625 3rd Qu.:1544.5
# Max. :17024 Max. :1110.0 Max. :120479 Max. :3431.0
#
# Beauty Revenue Beauty Quantity
# Min. : 0.00 Min. : 0.00
# 1st Qu.: 87.53 1st Qu.: 7.00
# Median : 504.90 Median : 36.00
# Mean :1176.31 Mean : 73.04
# 3rd Qu.:1828.63 3rd Qu.:102.25
# Max. :8384.11 Max. :588.00
#### NA check ----
# Checked column by column: apply() would first coerce the whole data frame to
# a single-type matrix, whereas vapply() inspects each column in its own type.
vapply(behavior.df, anyNA, logical(1))
# Recorded output (it does not list Date_new / Week_iso, so it was presumably
# captured before those columns were added):
# AnalysisWeekNr Date Device Category Users Weekly Users
# FALSE FALSE FALSE FALSE FALSE
# Sessions Transactions Revenue Quantity Category Level1
# FALSE FALSE FALSE FALSE FALSE
# Beauty Revenue Beauty Quantity
# FALSE FALSE
#### Logic check ----
##### Nominal variables check ----
# First, we check if all nominal variables make sense,
# by looking if the unique values of these variables match our expectations.
# Columns are selected by name: Date_new and Week_iso are inserted after Date
# earlier in the script, which shifts every later column position, so
# positional indices would no longer pick the intended columns.
beh_nominal_cols <- c("AnalysisWeekNr", "Device Category", "Category Level1")
lapply(behavior.df[beh_nominal_cols], unique) #note: only the nominal variables are included
# Recorded output (week numbers appear as padded strings here; they print as
# plain numbers once AnalysisWeekNr is numeric and no matrix coercion happens):
# $AnalysisWeekNr
# [1] " 1" " 2" " 3" " 4" " 5" " 6" " 7" " 8" " 9" "10" "11" "12" "13"
#
# $`Device Category`
# [1] "mobile" "tablet"
#
# $`Category Level1`
# [1] "BEAUTY"
##### Numeric variables check ----
# Next, we check if there are any values with a negative value or 0
beh_numeric_cols <- c("Date", "Users", "Weekly Users", "Sessions", "Transactions",
                      "Revenue", "Quantity", "Beauty Revenue", "Beauty Quantity")
vapply(behavior.df[beh_numeric_cols], function(x) sum(x <= 0, na.rm = TRUE), integer(1)) #note: only the non-nominal variables are included
# Date Users Weekly Users Sessions Transactions
# 0 0 0 0 0
# Revenue Quantity Beauty Revenue Beauty Quantity
# 0 0 3 3
# it seems there are cases, let's clarify:
lapply(behavior.df[beh_numeric_cols], function(x) which(x == 0))
# $`Beauty Revenue`
# [1] 117 126 151
#
# $`Beauty Quantity`
# [1] 117 126 151
# 3 observations where Beauty Revenue and Beauty Quantity are 0: row 117, 126 and 151.
# The rows are derived from the data instead of being typed in, so they stay
# correct if the row order of the input changes.
zero_beauty_rows <- which(behavior.df$`Beauty Revenue` == 0 & behavior.df$`Beauty Quantity` == 0)
nonbeauty_observations.df <- as.data.frame(behavior.df[zero_beauty_rows,]) # just saving them here for now
nonbeauty_observations.df
# We can see that these observations do have revenue and have somehow been labeled BEAUTY but have no beauty spending.
# AnalysisWeekNr Date Device Category Users Weekly Users Sessions Transactions Revenue
# 1 4 20221126 tablet 489 1036 776 51 5676.25
# 2 5 20221205 tablet 353 902 534 13 1481.38
# 3 9 20221230 tablet 463 1274 743 41 5488.04
# Quantity Category Level1 Beauty Revenue Beauty Quantity
# 1 118 BEAUTY 0 0
# 2 27 BEAUTY 0 0
# 3 107 BEAUTY 0 0
#### Outlier checks ----
boxplot(behavior.df$Revenue, xlab = "Revenue")
# Values flagged as outliers by the boxplot statistics, then their row positions
outRev2 <- boxplot.stats(behavior.df$Revenue)$out
outrowRev2 <- which(behavior.df$Revenue %in% c(outRev2))
outrowRev2
# Revenue Outliers are at rows: - (no outliers)
behavior.df[outrowRev2,] # to see the rows (no outliers)
# A. Financial Questions ----
## 1) How much extra revenue did the magic mirror deliver? (Revenue) ----
# Here we want to look if the total revenue significantly differs between non-launch and launch observations in each of the periods
# Furthermore we want to see exactly how much extra revenue was generated in the launched observations compared to non-launched
# Row masks reused below so that each filter condition is written only once.
fin_is_makeup <- financial.df$`Category Level2` == "Makeup"
fin_is_launched <- financial.df$Launched == "Yes"
fin_is_not_launched <- financial.df$Launched == "No"
fin_is_intro <- financial.df$Period == "Intro"
fin_is_after <- financial.df$Period == "After"
### Extra revenue in "Intro" period of Launch vs. Non-launch observations ----
extrarevenue_intro <- sum(financial.df$Revenue[fin_is_makeup & fin_is_intro & fin_is_launched]) -
  sum(financial.df$Revenue[fin_is_makeup & fin_is_intro & fin_is_not_launched])
# €105,490.60 - €62,023.42 = €43,467.18
# Based on these totals, launch revenue in the Intro period was about 1.70x the
# non-launch revenue for the same time frame.
# NOTE(review): the original note quoted "2.05x smaller", which does not match the totals above - confirm.
### Extra revenue in "After" period of Launch vs. Non-launch observations ----
extrarevenue_after <- sum(financial.df$Revenue[fin_is_makeup & fin_is_after & fin_is_launched]) -
  sum(financial.df$Revenue[fin_is_makeup & fin_is_after & fin_is_not_launched])
# €204,983.70 - €93,415.77 = €111,567.90
# Based on these totals, launch revenue in the After period was about 2.19x the
# non-launch revenue for the same time frame.
# NOTE(review): the original note here was a copy of the Intro note ("Intro period", "2.05x") - confirm.
### Is this a significant difference? ----
# Wilcoxon signed rank exact test
# The two samples are passed explicitly instead of through a formula with
# paired = TRUE, which recent R versions reject for the formula interface.
# x = "No" and y = "Yes" mirrors the factor-level order the formula call used.
# Pairing is by position: the i-th "No" row is paired with the i-th "Yes" row.
# NOTE(review): confirm the row order really aligns matching observations.
revenue_test.df <- financial.df[financial.df$Period != "Before" & fin_is_makeup,]
revenue_no_launch <- revenue_test.df$Revenue[revenue_test.df$Launched == "No"]
revenue_launch <- revenue_test.df$Revenue[revenue_test.df$Launched == "Yes"]
stopifnot(length(revenue_no_launch) == length(revenue_launch)) # a paired test needs equally sized samples
wilcox.test(revenue_no_launch, revenue_launch, paired = TRUE)
# V = 80, p-value = 1.969e-09
# Results: Significant difference between the distributions of the two groups for Revenue (NoLaunch vs. Launch in the "Intro" or "After" period)
### Compute total extra revenue generated ----
total_extrarevenue <- extrarevenue_intro + extrarevenue_after
total_extrarevenue
# €155,035.10
### Create a boxplot to show the distribution ----
# Note: Exclude the Before period, as that is only present in the Non-launched and skews the scales
# Note: unlike the totals and the test above, the plot is restricted to the App platform.
ggplot(financial.df[financial.df$Platform == "App" & financial.df$Period != "Before" & financial.df$'Category Level2' == "Makeup",], aes(x = Launched, y = Revenue, fill = Launched)) +
  geom_boxplot() +
  ggtitle('Revenue in App: Pre- and Post-launch \n Category: Makeup') +
  theme(plot.title = element_text(color="black", size=14, face="bold", hjust = 0.5), axis.text = element_text(size = 15)) +
  labs(x = "Launched",
       y = "Revenue (\u20ac)")
## 2) How many extra items were sold? (Quantity) ----
# Here we apply the same logic as to the revenue, but we use the "Quantity" variable
# Row masks reused below so that each filter condition is written only once.
fin_is_makeup <- financial.df$`Category Level2` == "Makeup"
fin_is_launched <- financial.df$Launched == "Yes"
fin_is_not_launched <- financial.df$Launched == "No"
fin_is_intro <- financial.df$Period == "Intro"
fin_is_after <- financial.df$Period == "After"
### Extra items in "Intro" period of Launch vs. Non-launch observations ----
extraitems_intro <- sum(financial.df$Quantity[fin_is_makeup & fin_is_intro & fin_is_launched]) -
  sum(financial.df$Quantity[fin_is_makeup & fin_is_intro & fin_is_not_launched])
# 9458 items - 6065 items = 3393 extra items sold in the intro period of when the app was launched vs. non-launched
### Extra items in "After" period of Launch vs. Non-launch observations ----
extraitems_after <- sum(financial.df$Quantity[fin_is_makeup & fin_is_after & fin_is_launched]) -
  sum(financial.df$Quantity[fin_is_makeup & fin_is_after & fin_is_not_launched])
# 17819 items - 7294 items = 10525 extra items sold in the after period of when the app was launched vs. non-launched
### Is this a significant difference? ----
#Before testing we need to check the assumptions of Normality on Quantity
# Normality: Lilliefors (Kolmogorov-Smirnov) normality test
# Note: this is run on Quantity for the whole data set, not only on the Makeup
# rows used in the Wilcoxon test below.
lillie.test(financial.df$Quantity)
#D = 0.21797, p-value < 2.2e-16
# Conclusion: the data doesn't come from a normal distribution therefore a Wilcoxon Rank Test is better
# Wilcoxon signed rank exact test
# The two samples are passed explicitly instead of through a formula with
# paired = TRUE, which recent R versions reject for the formula interface.
# x = "No" and y = "Yes" mirrors the factor-level order the formula call used.
# Pairing is by position: the i-th "No" row is paired with the i-th "Yes" row.
# NOTE(review): confirm the row order really aligns matching observations.
quantity_test.df <- financial.df[financial.df$Period != "Before" & fin_is_makeup,]
quantity_no_launch <- quantity_test.df$Quantity[quantity_test.df$Launched == "No"]
quantity_launch <- quantity_test.df$Quantity[quantity_test.df$Launched == "Yes"]
stopifnot(length(quantity_no_launch) == length(quantity_launch)) # a paired test needs equally sized samples
wilcox.test(quantity_no_launch, quantity_launch, paired = TRUE)
# V = 107, p-value = 6.984e-09
# Results: Significant difference between the distributions of the two groups when it comes to Quantity (NoLaunch vs. Launch in the "Intro" or "After" periods)
### Compute total extra items sold ----
total_extraitems <- extraitems_intro + extraitems_after
total_extraitems
# 13918 extra items
### Create a boxplot to show the distribution ----
# Note: Exclude the Before period, as that is only present in the Non-launched and skews the scales
# Note: unlike the totals and the test above, the plot is restricted to the App platform.
ggplot(financial.df[financial.df$Platform == "App" & financial.df$Period != "Before" & financial.df$'Category Level2' == "Makeup",], aes(x = Launched, y = Quantity, fill = Launched)) +
  geom_boxplot() +
  ggtitle('Quantity in App: Pre- and Post-launch \n Category: Makeup') +
  theme(plot.title = element_text(color="black", size=14, face="bold", hjust = 0.5), axis.text = element_text(size = 15)) +
  labs(x = "Launched",
       y = "Quantity")
## 3) What is the strangest thing you see in the data and can you maybe find/give an explanation? ----
# IDEA 1:
# The logic checks on this data set did not reveal anything odd (the oddity found
# in database 2 would be a nice candidate, IF WE CAN USE IT).
# One strange point: Fiscal Year only takes the values FY2021 and FY2022, although
# there are clearly observations dated in 2023.
# Possible explanation: fiscal year 2022 ends on January 17 2023, which is the last
# 2023 observation in our data set.
# IDEA 2:
# There is no "Before" period for the Launched phase, so the Before period cannot be compared.
# IDEA 3 - Best?:
# There are outliers in the Revenue.
outrowRev
# # Revenue Outliers are at rows:
# # [1] 220 259 260 261 262 263 264 265 266 267 268 424 425 427 428 429 430 431 432 433 434 435
# # [23] 453 549 550 553 554 555 556 557 558 559 560 561 575 589 590 591 592 593 594 595 596 597
# # [45] 598 599 600 601 602 603 604 605
financial.df[outrowRev, "Date_new"] # dates of the outlier rows
# Visualize outliers: one revenue boxplot per platform
ggplot(data = financial.df, mapping = aes(x = Platform, y = Revenue, fill = Platform)) +
  geom_boxplot() +
  ggtitle('Revenue Outliers in App & Web') +
  labs(
    x = "Platform",
    y = "Daily Beauty Revenue (\u20ac)",
    color = "Legend"
  ) +
  theme(
    plot.title = element_text(color = "black", size = 14, face = "bold", hjust = 0.5),
    axis.text = element_text(size = 15)
  )
# B. Behavior Questions ----
# Launch date: Date_new of the first financial row that is both launched and in
# the "Intro" period. It marks the before/after launch split in the time plots.
launch_date <- with(financial.df, Date_new[Launched == 'Yes' & Period == 'Intro'])[1]
# Launch week: AnalysisWeekNr of the first behavior row dated on the launch date.
# Intended for Question 4, where the X-axis is AnalysisWeekNr (not date).
launch_week <- with(behavior.df, AnalysisWeekNr[Date_new == launch_date])[1]
## 4) Did the magic mirror succeed in its goal to stimulate more repeat purchases for beauty products. Please explain your findings. ----
#We change the format of AnalysisWeekNr as numeric
behavior.df$AnalysisWeekNr <- as.numeric(behavior.df$AnalysisWeekNr)
# Per analysis week, we take the mean of Weekly Users and of Beauty Quantity.
# The week is the grouping variable (right-hand side of the formula).
# aggregate() returns the grouping column first and the aggregated columns
# after it, so the result is ordered AnalysisWeekNr, WeeklyUsers, BeautyQuantity
# and the names assigned below match the contents of each column.
# NOTE(review): both measures are averaged over the rows of a week (days and
# devices); confirm that a mean, not a sum, is wanted for Beauty Quantity.
repeat_purchases_Beauty <- aggregate(
  cbind(WeeklyUsers = `Weekly Users`, BeautyQuantity = `Beauty Quantity`) ~ AnalysisWeekNr,
  data = behavior.df,
  FUN = mean
)
names(repeat_purchases_Beauty) <- c('AnalysisWeekNr','WeeklyUsers','BeautyQuantity')
#We make a bar plot to illustrate how many customers came back and shop from the beauty category (yes it was successful)
# create the plot: one bar per analysis week, bar height = mean weekly users,
# fill = mean beauty quantity
# NOTE(review): the x axis shows AnalysisWeekNr, not the calendar week, although
# the axis label says "Week of Year" - confirm the label.
ggplot(repeat_purchases_Beauty, aes(x = AnalysisWeekNr, y = WeeklyUsers, fill = BeautyQuantity)) +
  geom_bar(stat = "identity", position = "dodge") +
  labs(x = "Week of Year", y = "Number of unique users") +
  scale_x_continuous(breaks = 1:52, labels = 1:52) + theme_bw()
## 5) Did it increase the frequency of app use for the visitors that interacted with the magic mirror? What might be possible drawbacks you are missing by using this dataset? ----
unique(behavior.df$AnalysisWeekNr) #there are 13 weeks
# NOTE: In the weekly aggregates below, tablet and mobile users are added together, because the app is both on mobile and tablet by definition.
# Helper: total of `values` within each week, returned once per input row.
#   values: numeric vector to add up
#   week:   week identifier for each element of `values`
# Returns a plain numeric vector as long as `values`; rows whose week is NA get NA.
# Works for any number of weeks, not only weeks 1 to 13.
weekly_total <- function(values, week) {
  totals <- tapply(values, week, sum)
  # Look each row's week up by name; as.vector() drops the array shape and names
  as.vector(totals[as.character(week)])
}
#TrueWeeklyUsers/TotalWeeklyUsers: a variable that aggregates mobile & tablet users and signifies the total users (not unique users)
behavior.df$TrueWeeklyUsers <- weekly_total(behavior.df$Users, behavior.df$AnalysisWeekNr)
#WeeklySessions: total sessions (mobile & tablet) in the row's analysis week
behavior.df$WeeklySessions <- weekly_total(behavior.df$Sessions, behavior.df$AnalysisWeekNr)
#WeeklySessionsPerUser: weekly sessions divided by weekly users.
# Both inputs are constant within a week, so the ratio is constant within a week as well.
behavior.df$WeeklySessionsPerUser <- behavior.df$WeeklySessions / behavior.df$TrueWeeklyUsers
#### PLOT of Weekly Sessions per User over time ----
# Named colour palette shared by the plots in question 5 and 6
colors_plots <- c(
  "Weekly Sessions per User" = "dark green",
  "Weekly Beauty Revenue per User" = "purple",
  "Weekly Non-Beauty Revenue per User" = "orange",
  "Weekly Total Revenue per User" = "blue",
  "Launch of Magic Mirror" = "red"
)
# Launched flag: 1 when the date is on or after 12-20-2022, 0 when it is earlier.
# That date was chosen because it is the first day of the "Intro" period in the Financial dataset.
behavior.df$Launched <- as.numeric(behavior.df$Date_new >= as.Date("20221220", format = "%Y%m%d"))
# Line of weekly sessions per user over time, with a dashed vertical line at the launch date
ggplot(behavior.df, aes(x = Date_new)) +
  geom_line(aes(y = WeeklySessionsPerUser, color = "Weekly Sessions per User"), size = 2) +
  ggtitle('Weekly Sessions per User Over Time \n Mobile & Tablet Combined') +
  geom_vline(aes(xintercept = launch_date, color = "Launch of Magic Mirror"), linetype = "dashed", size = 1.5) +
  scale_color_manual(values = colors_plots) +
  labs(
    x = "Time",
    y = "Weekly Sessions/User",
    color = "Legend"
  ) +
  theme(
    plot.title = element_text(color = "black", size = 14, face = "bold", hjust = 0.5),
    axis.text = element_text(size = 15)
  )
# Now we check whether the difference is statistically significant, using the
# newly created Launched variable.

# Normality: Lilliefors (Kolmogorov-Smirnov) normality test
# The null hypothesis (H0) for the test is that the data comes from a normal distribution.
# The alternative hypothesis (H1) is that the data doesn't come from a normal distribution.
lillie.test(behavior.df$WeeklySessionsPerUser)
# D = 0.19082, p-value < 2.2e-16
# Conclusion: the data doesn't come from a normal distribution, therefore a Wilcoxon test is the better choice

# Wilcoxon signed rank exact test -
# NOTE: Launched (84) and Non-launched (98) have a different number of observations,
# so we remove the first 7 days (based on the date), which removes 7 days from mobile and 7 from tablet.
# This reduces the total number of "No" observations by 14.
#
# The two groups are passed as explicit x / y vectors instead of
# `formula + paired = TRUE`: since R 4.4.0 the formula method of wilcox.test()
# rejects the `paired` argument. split() on factor(Launched) reproduces what
# the formula method used to do: x = first level (Launched == 0),
# y = second level (Launched == 1), paired by position within each group.
# NOTE(review): the pairing is purely positional (row order within each group);
# confirm that this is the intended matching of pre- and post-launch rows.
with(
  behavior.df[behavior.df$Date_new > as.Date("20221107", format = "%Y%m%d"), ],
  {
    by_launch <- split(WeeklySessionsPerUser, factor(Launched))
    stopifnot(length(by_launch) == 2L)
    wilcox.test(by_launch[[1]], by_launch[[2]], paired = TRUE)
  }
)
# V = 1883, p-value = 0.6626
# Results: No significant difference between the distributions of WeeklySessionsPerUser in the Launch vs. Non-launch
#### Relocations of the new variables to keep the df tidy
# Each derived column is moved directly after the column it belongs with:
# TrueWeeklyUsers after "Weekly Users", WeeklySessions after "Sessions" and
# WeeklySessionsPerUser after WeeklySessions.
behavior.df <- behavior.df %>%
  relocate(TrueWeeklyUsers, .after = "Weekly Users") %>%
  relocate(WeeklySessions, .after = "Sessions") %>%
  relocate(WeeklySessionsPerUser, .after = WeeklySessions)
## 6) Did it increase the value of the customers that used the magic mirror? ----
# In the next three code sections, we calculate the weekly beauty-only revenue, non-beauty revenue (total - beauty) and total revenue.
# We distinguish between these three to see not only the effects of the app on beauty revenue per user (value),
# but also the spillover effects illustrated through non-beauty revenue and total revenue per user.

### BEAUTY VALUE ONLY ----
# WeeklyBeautyRevenue: for every row, the sum of `Beauty Revenue` over all rows
# that share the row's AnalysisWeekNr. Rows whose week number is not in 1..13
# get NA, as before.
# ave() computes the per-week sum and returns it aligned to the original rows,
# which replaces the hand-written 13-level ifelse() chain.
behavior.df$WeeklyBeautyRevenue <- ifelse(
  behavior.df$AnalysisWeekNr %in% 1:13,
  ave(behavior.df$`Beauty Revenue`, behavior.df$AnalysisWeekNr, FUN = sum),
  NA
)

# WeeklyBeautyRevenuePerUser: weekly beauty revenue divided by the weekly users
# of the same row. WeeklyBeautyRevenue is already NA outside weeks 1..13, so
# the NA carries over without a separate check.
# The previous version handed week-subset vectors to ifelse(), which recycles
# them by position rather than by row; that only gives the right value when
# TrueWeeklyUsers is constant within a week. Dividing row by row is correct in
# either case and identical when that assumption holds.
behavior.df$WeeklyBeautyRevenuePerUser <-
  behavior.df$WeeklyBeautyRevenue / behavior.df$TrueWeeklyUsers
#### PLOT ----
# Line of weekly beauty revenue per user over time, with a dashed vertical
# marker at launch_date (launch_date is not defined in this section).
ggplot(behavior.df, aes(x = Date_new)) +
  geom_line(
    aes(y = WeeklyBeautyRevenuePerUser, color = "Weekly Beauty Revenue per User"),
    size = 2
  ) +
  geom_vline(
    aes(xintercept = launch_date, color = "Launch of Magic Mirror"),
    linetype = "dashed",
    size = 1.5
  ) +
  scale_color_manual(values = colors_plots) +
  labs(
    title = "Weekly Beauty Revenue per User Over Time \n Mobile & Tablet Combined",
    x = "Time",
    y = "Weekly Beauty Revenue/User (\u20ac)",
    color = "Legend"
  ) +
  theme(
    plot.title = element_text(color = "black", size = 14, face = "bold", hjust = 0.5),
    axis.text = element_text(size = 15)
  )

#### Relocations of the new variables to keep the df tidy
# Move each derived column directly after the column it was derived from.
behavior.df <- behavior.df %>%
  relocate(WeeklyBeautyRevenue, .after = "Beauty Revenue") %>%
  relocate(WeeklyBeautyRevenuePerUser, .after = WeeklyBeautyRevenue)
### NON-BEAUTY VALUE ONLY (TOTAL - BEAUTY) ----
# WeeklyNonBeautyRevenue: for every row, (weekly sum of Revenue) minus
# (weekly sum of `Beauty Revenue`), where "weekly" means all rows that share
# the row's AnalysisWeekNr. Rows whose week number is not in 1..13 get NA, as
# before.
# ave() returns the per-week sums aligned to the original rows, which replaces
# the hand-written 13-level ifelse() chain.
behavior.df$WeeklyNonBeautyRevenue <- ifelse(
  behavior.df$AnalysisWeekNr %in% 1:13,
  ave(behavior.df$Revenue, behavior.df$AnalysisWeekNr, FUN = sum) -
    ave(behavior.df$`Beauty Revenue`, behavior.df$AnalysisWeekNr, FUN = sum),
  NA
)

# WeeklyNonBeautyRevenuePerUser: weekly non-beauty revenue divided by the
# weekly users of the same row. WeeklyNonBeautyRevenue is already NA outside
# weeks 1..13, so the NA carries over without a separate check.
# The previous version handed week-subset vectors to ifelse(), which recycles
# them by position rather than by row; that only gives the right value when
# TrueWeeklyUsers is constant within a week. Dividing row by row is correct in
# either case and identical when that assumption holds.
behavior.df$WeeklyNonBeautyRevenuePerUser <-
  behavior.df$WeeklyNonBeautyRevenue / behavior.df$TrueWeeklyUsers
#### PLOT ----
# Line of weekly non-beauty revenue per user over time, with a dashed vertical
# marker at launch_date (launch_date is not defined in this section).
ggplot(behavior.df, aes(x = Date_new)) +
  geom_line(
    aes(y = WeeklyNonBeautyRevenuePerUser, color = "Weekly Non-Beauty Revenue per User"),
    size = 2
  ) +
  geom_vline(
    aes(xintercept = launch_date, color = "Launch of Magic Mirror"),
    linetype = "dashed",
    size = 1.5
  ) +
  scale_color_manual(values = colors_plots) +
  labs(
    title = "Weekly Non-Beauty Revenue per User Over Time \n Mobile & Tablet Combined",
    x = "Time",
    y = "Weekly Non-Beauty Revenue/User (\u20ac)",
    color = "Legend"
  ) +
  theme(
    plot.title = element_text(color = "black", size = 14, face = "bold", hjust = 0.5),
    axis.text = element_text(size = 15)
  )

#### Relocations of the new variables to keep the df tidy
# Place the non-beauty columns right behind the beauty per-user column.
behavior.df <- behavior.df %>%
  relocate(WeeklyNonBeautyRevenue, .after = WeeklyBeautyRevenuePerUser) %>%
  relocate(WeeklyNonBeautyRevenuePerUser, .after = WeeklyNonBeautyRevenue)
### Is it significant? ----
# Here we test both WeeklyNonBeautyRevenuePerUser and WeeklyBeautyRevenuePerUser.

# Normality: Lilliefors (Kolmogorov-Smirnov) normality test
# The null hypothesis (H0) for the test is that the data comes from a normal distribution.
# The alternative hypothesis (H1) is that the data doesn't come from a normal distribution.
lillie.test(behavior.df$WeeklyNonBeautyRevenuePerUser)
# D = 0.1734, p-value = 1.378e-14
# Conclusion: the data doesn't come from a normal distribution, therefore a Wilcoxon test is the better choice
lillie.test(behavior.df$WeeklyBeautyRevenuePerUser)
# D = 0.14481, p-value = 5.669e-10
# Conclusion: the data doesn't come from a normal distribution, therefore a Wilcoxon test is the better choice

# Wilcoxon signed rank exact test -
# NOTE: Launched (84) and Non-launched (98) have a different number of observations,
# so we remove the first 7 days (based on the date), which removes 7 days from mobile and 7 from tablet.
# This reduces the total number of "No" observations by 14.
#
# The two groups are passed as explicit x / y vectors instead of
# `formula + paired = TRUE`: since R 4.4.0 the formula method of wilcox.test()
# rejects the `paired` argument. split() on factor(Launched) reproduces what
# the formula method used to do: x = first level (Launched == 0),
# y = second level (Launched == 1), paired by position within each group.
# NOTE(review): the pairing is purely positional (row order within each group);
# confirm that this is the intended matching of pre- and post-launch rows.
with(
  behavior.df[behavior.df$Date_new > as.Date("20221107", format = "%Y%m%d"), ],
  {
    by_launch <- split(WeeklyNonBeautyRevenuePerUser, factor(Launched))
    stopifnot(length(by_launch) == 2L)
    wilcox.test(by_launch[[1]], by_launch[[2]], paired = TRUE)
  }
)
# V = 3570, p-value = 1.398e-15
# Results: Significant difference between the distributions of WeeklyNonBeautyRevenuePerUser in the Launch vs. Non-launch
with(
  behavior.df[behavior.df$Date_new > as.Date("20221107", format = "%Y%m%d"), ],
  {
    by_launch <- split(WeeklyBeautyRevenuePerUser, factor(Launched))
    stopifnot(length(by_launch) == 2L)
    wilcox.test(by_launch[[1]], by_launch[[2]], paired = TRUE)
  }
)
# V = 1792, p-value = 0.9768
# Results: No significant difference between the distributions of WeeklyBeautyRevenuePerUser in the Launch vs. Non-launch

# Mean of WeeklyBeautyRevenuePerUser before and after launch (same date window as the tests above)
mean(behavior.df$WeeklyBeautyRevenuePerUser[
  behavior.df$Date_new > as.Date("20221107", format = "%Y%m%d") & behavior.df$Launched == 0
])
mean(behavior.df$WeeklyBeautyRevenuePerUser[
  behavior.df$Date_new > as.Date("20221107", format = "%Y%m%d") & behavior.df$Launched == 1
])
# Non-coding questions
## 7) Using all information and answers so far, what could you do to elaborate on the magic mirror and integrate app and web around this feature? An important question, because we know that multi device and multi platform customers are higher value customers. ----
### App vs. Web on Revenue: Where do people convert? ----
# Boxplot of Revenue per Platform for rows with Launched == "Yes", excluding
# the "Before" period.
ggplot(
  financial.df[financial.df$Launched == "Yes" & financial.df$Period != "Before", ],
  aes(x = Platform, y = Revenue, fill = Platform)
) +
  geom_boxplot() +
  labs(
    title = "Daily Beauty Revenue: Which platform converts? \n App vs. Web - Launched: Yes",
    x = "Platform",
    y = "Daily Beauty Revenue (\u20ac)",
    color = "Legend"
  ) +
  theme(
    plot.title = element_text(color = "black", size = 14, face = "bold", hjust = 0.5),
    axis.text = element_text(size = 15)
  )

# Same boxplot for rows with Launched == "No".
ggplot(
  financial.df[financial.df$Launched == "No", ],
  aes(x = Platform, y = Revenue, fill = Platform)
) +
  geom_boxplot() +
  labs(
    title = "Daily Beauty Revenue: Which platform converts? \n App vs. Web - Launched: No",
    x = "Platform",
    y = "Daily Beauty Revenue (\u20ac)",
    color = "Legend"
  ) +
  theme(
    plot.title = element_text(color = "black", size = 14, face = "bold", hjust = 0.5),
    axis.text = element_text(size = 15)
  )
# Normality: Lilliefors (Kolmogorov-Smirnov) normality test
# The null hypothesis (H0) for the test is that the data comes from a normal distribution.
# The alternative hypothesis (H1) is that the data doesn't come from a normal distribution.
lillie.test(financial.df$Revenue)
# D = 0.2599, p-value < 2.2e-16
# Conclusion: the data doesn't come from a normal distribution, therefore a Wilcoxon test is the better choice

# Wilcoxon signed rank exact test
# The two platforms are passed as explicit x / y vectors instead of
# `formula + paired = TRUE`: since R 4.4.0 the formula method of wilcox.test()
# rejects the `paired` argument. split() on factor(Platform) reproduces what
# the formula method used to do: x = first factor level, y = second factor
# level, paired by position within each group.
# NOTE(review): the pairing is purely positional; confirm that the rows of both
# platforms are in the same (date) order.
with(
  financial.df[financial.df$Period != "Before", ],
  {
    by_platform <- split(Revenue, factor(Platform))
    stopifnot(length(by_platform) == 2L)
    wilcox.test(by_platform[[1]], by_platform[[2]], paired = TRUE)
  }
)
# V = 168, p-value < 2.2e-16
# Results: Significant difference between the distributions of Revenue in the App vs. Web
### App vs. Web on Browsing: Where do people browse? ----
# Copy `Detail Views` into a syntactic column name so it can be used directly
# inside aes() and model formulas.
financial.df[["DetailViews"]] <- financial.df[["Detail Views"]]

# Boxplot of DetailViews per Platform for rows with Launched == "Yes",
# excluding the "Before" period.
ggplot(
  financial.df[financial.df$Launched == "Yes" & financial.df$Period != "Before", ],
  aes(x = Platform, y = DetailViews, fill = Platform)
) +
  geom_boxplot() +
  labs(
    title = "Daily Product Page Views: Where do people browse? \n App vs. Web - Launched: Yes",
    x = "Platform",
    y = "Daily Product Page Views",
    color = "Legend"
  ) +
  theme(
    plot.title = element_text(color = "black", size = 14, face = "bold", hjust = 0.5),
    axis.text = element_text(size = 15)
  )

# Same boxplot for rows with Launched == "No".
ggplot(
  financial.df[financial.df$Launched == "No", ],
  aes(x = Platform, y = DetailViews, fill = Platform)
) +
  geom_boxplot() +
  labs(
    title = "Daily Product Page Views: Where do people browse? \n App vs. Web - Launched: No",
    x = "Platform",
    y = "Daily Product Page Views",
    color = "Legend"
  ) +
  theme(
    plot.title = element_text(color = "black", size = 14, face = "bold", hjust = 0.5),
    axis.text = element_text(size = 15)
  )
# Normality: Lilliefors (Kolmogorov-Smirnov) normality test
# The null hypothesis (H0) for the test is that the data comes from a normal distribution.
# The alternative hypothesis (H1) is that the data doesn't come from a normal distribution.
lillie.test(financial.df$`Detail Views`)
# D = 0.16304, p-value < 2.2e-16
# Conclusion: the data doesn't come from a normal distribution, therefore a Wilcoxon test is the better choice

# Copy `Detail Views` into a syntactic column name. This repeats the assignment
# made for the boxplots, so the test below can be run on its own.
financial.df$DetailViews <- financial.df$'Detail Views'

# Wilcoxon signed rank exact test
# The two platforms are passed as explicit x / y vectors instead of
# `formula + paired = TRUE`: since R 4.4.0 the formula method of wilcox.test()
# rejects the `paired` argument. split() on factor(Platform) reproduces what
# the formula method used to do: x = first factor level, y = second factor
# level, paired by position within each group.
# NOTE(review): the pairing is purely positional; confirm that the rows of both
# platforms are in the same (date) order.
with(
  financial.df[financial.df$Period != "Before", ],
  {
    by_platform <- split(DetailViews, factor(Platform))
    stopifnot(length(by_platform) == 2L)
    wilcox.test(by_platform[[1]], by_platform[[2]], paired = TRUE)
  }
)
# V = 1958.5, p-value < 2.2e-16
# Results: Significant difference between the distribution of the Detail page views generated in the App vs. Web
## 8) Summarize the limitations (and potential solutions) on the last slide. Be clear on why you think these are limitations.
# non-coding question