## Setup

### import packages

In [7]:
library(tidyr)
library(gridExtra)
library(dplyr)
library(datasets)
library(ggplot2)
library(Ecdat)
library(car)
library(multcomp)
library(gmodels)



### import datasets

In [59]:

WinningNumbers <- read.csv("Lottery_Mega_Millions_Winning_Numbers__Beginning_2002_Wrangled.csv")
Wins <- read.csv("AllWinners.csv")
View(Wins)

X,Draw.Date,Amount,cash.prize,Location,State,Gender,Win,Jackpot
0,2002-05-17,2.33e+07,,"Chatham, Ill.",IL,Mixed,Y,Y
1,2002-05-24,2.33e+07,,"Chicago, Ill.",IL,F,Y,Y
2,2002-07-16,2.33e+07,,"Cliffside Park, N.J.",NJ,M,Y,Y
3,2002-08-27,2.33e+07,,"New York City, N.Y.",NY,Mixed,Y,Y
4,2002-09-06,2.33e+07,,"Kentwood, Mich.",MI,M,Y,Y
5,2002-09-27,2.33e+07,,"Mount Prospect, Ill.",IL,Mixed,Y,Y
6,2002-11-08,2.33e+07,,"Hoquiam, Wash.",WA,Mixed,Y,Y
7,2002-11-19,2.33e+07,,"New York City, N.Y.",NY,Mixed,Y,Y
8,2002-12-24,2.33e+07,,Unclaimed in N.Y.,NY,Unk,Y,Y
9,2003-02-11,2.33e+07,,"Brooklyn, N.Y.",NY,M,Y,Y


In [28]:
Jackpots<- filter (Wins, Jackpot == "Y")


In [30]:
JackpotByGender <- aggregate(JackpotAmount~Gender, Wins, mean)
JackpotByGender

Gender,JackpotAmount
F,93696552
M,107261728
Mixed,106522059
Unk,502384444


In [27]:
NonJackpots<- filter (Wins, Jackpot == "N")

In [50]:

NonJackpotByGender <- aggregate(NonJackpotAmount~Gender, Wins, mean)
NonJackpotByGender

Gender,NonJackpotAmount
F,1345265
M,9750284
M,1000000
Mixed,1149327
Unk,48750000


## Analysis

## 1. In the Mega Millions, what are the optimal numbers to select in order to achieve a return on investment (ROI)?

## 2. Does gender influence Prize Amount? 

### Jackpots

In [49]:
Jackpots$Gender <- as.factor(Jackpots$Gender)

#### Testing Assumptions



##### 1. Normality


In [None]:
plotNormalHistogram(Jackpots$JackpotAmount)
#  positive skew

In [None]:
Jackpots$JackpotAmountSQRT <- sqrt(Jackpots$JackpotAmount)

In [None]:
plotNormalHistogram(Jackpots$JackpotAmountSQRT)
#  positive skew

In [None]:
Jackpots$JackpotAmountLOG <-log(Jackpots$JackpotAmount)

In [None]:
plotNormalHistogram(Jackpots$JackpotAmountLOG)
# normal


##### 2. Homogeneity of Variance


In [None]:

bartlett.test(JackpotAmountLOG~Gender, data=Jackpots)


	Bartlett test of homogeneity of variances

data:  JackpotAmountLOG by Gender
Bartlett's K-squared = 8.4638, df = 3, p-value = 0.03734


this test is not significant, the assumption is met

##### 4. Sample size


this assumption is met - need 20 per IV or CV and I have 1, so need at least 20 and there are 223 jackpot winners!



#### Running the Analysis


In [None]:
JackpotsANOVA <- aov(Jackpots$JackpotAmountLOG~Jackpots$Gender)
summary(JackpotsANOVA)

                 Df Sum Sq Mean Sq F value Pr(>F)
Jackpots$Gender   3   7.11   2.369   2.026  0.111
Residuals       219 256.14   1.170               

The ANOVA is not significant, indicating there is no difference in Jackpot size based on Gender. 

In [None]:
pairwise.t.test(Jackpots$JackpotAmountLOG, Jackpots$Gender, p.adjust="none")


	Pairwise comparisons using t tests with pooled SD 

data:  Jackpots$JackpotAmountLOG and Jackpots$Gender 

      F     M     Mixed
M     0.560 -     -    
Mixed 0.436 0.776 -    
Unk   0.032 0.038 0.078

P value adjustment method: none 

In [None]:
pairwise.t.test(Jackpots$JackpotAmountLOG, Jackpots$Gender, p.adjust="bonferroni")


	Pairwise comparisons using t tests with pooled SD 

data:  Jackpots$JackpotAmountLOG and Jackpots$Gender 

      F    M    Mixed
M     1.00 -    -    
Mixed 1.00 1.00 -    
Unk   0.19 0.23 0.47 

P value adjustment method: bonferroni 

Pairwise testing indicate that 

### Non-Jackpots

In [48]:
NonJackpots$Gender <- as.factor(NonJackpots$Gender)

#### Testing Assumptions



##### 1. Normality


In [None]:
plotNormalHistogram(NonJackpots$NonJackpotAmount)
#  positive skew


In [None]:

NonJackpots$NonJackpotAmountSQRT <- sqrt(NonJackpots$NonJackpotAmount)
plotNormalHistogram(NonJackpots$NonJackpotAmountSQRT)
#positive skew


In [52]:

NonJackpots$NonJackpotAmountLOG <-log(NonJackpots$NonJackpotAmount)


In [None]:

plotNormalHistogram(NonJackpots$NonJackpotAmountLOG)

##### 2. Homogeneity of Variance


In [53]:

bartlett.test(NonJackpotAmountLOG~Gender, data=NonJackpots)


	Bartlett test of homogeneity of variances

data:  NonJackpotAmountLOG by Gender
Bartlett's K-squared = Inf, df = 4, p-value < 2.2e-16


this test is  significant, the assumption is violated

In [54]:
fligner.test(NonJackpotAmountLOG~Gender, data=NonJackpots)


	Fligner-Killeen test of homogeneity of variances

data:  NonJackpotAmountLOG by Gender
Fligner-Killeen:med chi-squared = 12.597, df = 4, p-value = 0.01342


This test is not significant the assumption is met

##### 4. Sample size


this assumption is met - need 20 per IV or CV and I have 1, so need at least 20 and there are 274 non-jackpot winners!



#### Running the Analysis


In [55]:
NonJackpotsANOVA <- aov(NonJackpots$NonJackpotAmountLOG~NonJackpots$Gender)
summary(NonJackpotsANOVA)

                    Df Sum Sq Mean Sq F value Pr(>F)  
NonJackpots$Gender   4   15.5   3.874   3.256 0.0125 *
Residuals          269  320.1   1.190                 
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

The ANOVA is significant, indicating there is  Non-Jackpot prize do vary based on Gender. 

In [56]:
pairwise.t.test(NonJackpots$NonJackpotAmountLOG, NonJackpots$Gender, p.adjust="none")


	Pairwise comparisons using t tests with pooled SD 

data:  NonJackpots$NonJackpotAmountLOG and NonJackpots$Gender 

      F      M      M      Mixed 
M     0.1639 -      -      -     
M     0.8767 0.6143 -      -     
Mixed 0.8080 0.1649 0.9419 -     
Unk   0.0012 0.0108 0.1597 0.0017

P value adjustment method: none 

In [58]:
pairwise.t.test(NonJackpots$NonJackpotAmountLOG, NonJackpots$Gender, p.adjust="bonferroni")


	Pairwise comparisons using t tests with pooled SD 

data:  NonJackpots$NonJackpotAmountLOG and NonJackpots$Gender 

      F     M     M     Mixed
M     1.000 -     -     -    
M     1.000 1.000 -     -    
Mixed 1.000 1.000 1.000 -    
Unk   0.012 0.108 1.000 0.017

P value adjustment method: bonferroni 

Pairwise testing indicate that 

## 3. In NYS, how much does the lottery give back to society? Do certain counties benefit more?

### Testing Assumptions



#### 1. Normality



In [None]:

plotNormalHistogram(cellPhone$Night.Mins)
#  normal   




#### 2. Homogeneity of Variance


In [None]:

bartlett.test(avgPriceSQRT ~ region, data=avocados1)

# Bartlett test of homogeneity of variances

# data:  avgPriceSQRT by region
# Bartlett's K-squared = 70.75, df = 2, p-value = 4.333e-16

fligner.test(avgPriceSQRT ~ region, data=avocados1)

# Fligner-Killeen test of homogeneity of variances
# 
# data:  avgPriceSQRT by region
# Fligner-Killeen:med chi-squared = 48.139, df = 2, p-value = 3.522e-11


# Does not meet the assumption for homogeneity of variance



#### 3. Sample Size


n = 1014, minium requirement of 20 cases is met



### Running the Analysis


In [None]:

# Do the Test, with unequal variance
ANOVA1 <- lm(avgPriceSQRT ~ region, data=avocados1)
Anova(ANOVA1, Type="II", white.adjust=TRUE)

# Analysis of Deviance Table (Type II tests)
# 
# Response: avgPriceSQRT
#              Df      F      Pr(>F)    
# region       2     258.85   < 2.2e-16 ***
#   Residuals 1011                     
# ---
#   Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1




#### Post Hocs Analysis


In [None]:
pairwise.t.test(avocados1$avgPriceSQRT, avocados1$region, p.adjust="bonferroni", pool.sd = FALSE)

# Pairwise comparisons using t tests with non-pooled SD 
# 
# data:  avocados1$avgPriceSQRT and avocados1$region 
# 
#            Albany   Houston
#   Houston  < 2e-16   -      
#   Seattle  8.1e-06   < 2e-16
# 
# P value adjustment method: bonferroni 



##### Find means and draw conclusions


In [None]:

avocadosMeans <- avocados1 %>% group_by(region) %>% summarize(Mean = mean(AveragePrice))
# A tibble: 3 x 2
#     region   Mean
#      <chr>   <dbl>
#   1 Albany   1.56
#   2 Houston  1.05
#   3 Seattle  1.44

# There is no significant difference in average price between the regions.


### Testing Assumptions



#### 1. Normality


In [None]:
head(EdAid)

In [None]:

plotNormalHistogram(EdAid$Amount.of.Aid)

#### 2. Homogeneity of Variance


In [None]:
Homogeneity_RegrSlp = lm(Night.Mins~vMail.Plan, data=cellPhone)
anova(Homogeneity_RegrSlp)

# Analysis of Variance Table
# 
# Response: Night.Mins
#               Df    Sum Sq Mean Sq  F value  Pr(>F)
# vMail.Plan    1      488   487.73   0.1909   0.6622
# Residuals    4615 11791809 2555.10  

# This assumption is met.


#### 3. Homogeneity of Regression Slopes


In [None]:

Homogeneity_RegrSlp = lm(Night.Mins~vMail.Plan, data=cellPhone)
anova(Homogeneity_RegrSlp)

# Analysis of Variance Table
# 
# Response: Night.Mins
#               Df    Sum Sq Mean Sq  F value  Pr(>F)
# vMail.Plan    1      488   487.73   0.1909   0.6622
# Residuals    4615 11791809 2555.10  

# This assumption is met. 



#### 4. Sample size


 is met - need 20 per IV or CV and I have 2, so need at least 40 and there are 4617 cases!


### Running the Analysis

In [None]:
ANCOVA = lm(Night.Mins~vMail.Plan + International.Plan*vMail.Plan, data=cellPhone)
anova(ANCOVA)

# Analysis of Variance Table
# 
# Response: Night.Mins
# Df   Sum Sq Mean Sq F value  Pr(>F)  
# vMail.Plan                       1      488   487.7  0.1909 0.66216  
# International.Plan               1     7979  7978.6  3.1235 0.07724 .
# vMail.Plan:International.Plan    1      435   435.0  0.1703 0.67989  
# Residuals                     4613 11783396  2554.4                  
# ---
#   Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

 
# the presence or absence of an international phone plan (International.Plan)
# does not influence the use of nighttime minutes (Night.Mins), even holding
# whether or not the client has a voicemail plan (vMail.Plan) constant.