# 1. Frequency and contingency tables

In [1]:
# Load vcd; the Arthritis data set and assocstats() used below come from it
library(vcd)
# Preview the first rows: ID, Treatment, Sex, Age and the ordered factor Improved
head(Arthritis)

Loading required package: grid



Unnamed: 0_level_0,ID,Treatment,Sex,Age,Improved
Unnamed: 0_level_1,<int>,<fct>,<fct>,<int>,<ord>
1,57,Treated,Male,27,Some
2,46,Treated,Male,29,
3,77,Treated,Male,30,
4,17,Treated,Male,32,Marked
5,36,Treated,Male,46,Marked
6,23,Treated,Male,58,Marked


## ONE-WAY TABLES

In [2]:
# One-way frequency table: number of patients at each level of Improved.
# The argument name supplies the "Improved" label shown above the counts.
mytable <- table(Improved = Arthritis$Improved)
mytable

Improved
  None   Some Marked 
    42     14     28 

In [3]:
# Express the one-way counts as proportions of the grand total
prop.table(mytable)

Improved
     None      Some    Marked 
0.5000000 0.1666667 0.3333333 

In [4]:
# Same proportions, rescaled to percentages
100 * prop.table(mytable)

Improved
    None     Some   Marked 
50.00000 16.66667 33.33333 

## TWO-WAY TABLES

In [5]:
# Two-way contingency table: Treatment in rows, Improved in columns
mytable <- xtabs(~ Treatment + Improved, data=Arthritis)
mytable

         Improved
Treatment None Some Marked
  Placebo   29    7      7
  Treated   13    7     21

In [6]:
margin.table(mytable, 1) # Row sums: marginal counts for Treatment (dimension 1)

Treatment
Placebo Treated 
     43      41 

In [7]:
prop.table(mytable, 1) # Row proportions: each Treatment row sums to 1

         Improved
Treatment      None      Some    Marked
  Placebo 0.6744186 0.1627907 0.1627907
  Treated 0.3170732 0.1707317 0.5121951

In [8]:
margin.table(mytable, 2) # Column sums: marginal counts for Improved (dimension 2)

Improved
  None   Some Marked 
    42     14     28 

In [9]:
prop.table(mytable, 2) # Column proportions: each Improved column sums to 1

         Improved
Treatment      None      Some    Marked
  Placebo 0.6904762 0.5000000 0.2500000
  Treated 0.3095238 0.5000000 0.7500000

In [10]:
# Cell proportions: every cell as a share of the grand total
prop.table(mytable)

         Improved
Treatment       None       Some     Marked
  Placebo 0.34523810 0.08333333 0.08333333
  Treated 0.15476190 0.08333333 0.25000000

In [11]:
# Append a Sum row and a Sum column (row, column and grand totals) to the counts
addmargins(mytable)

Unnamed: 0,None,Some,Marked,Sum
Placebo,29,7,7,43
Treated,13,7,21,41
Sum,42,14,28,84


In [12]:
# Cell proportions with row, column and grand totals appended
addmargins(prop.table(mytable))

Unnamed: 0,None,Some,Marked,Sum
Placebo,0.3452381,0.08333333,0.08333333,0.5119048
Treated,0.1547619,0.08333333,0.25,0.4880952
Sum,0.5,0.16666667,0.33333333,1.0


In [13]:
# Row proportions plus a Sum column only (margin 2), showing each row totals 1
addmargins(prop.table(mytable,1),2)

Unnamed: 0,None,Some,Marked,Sum
Placebo,0.6744186,0.1627907,0.1627907,1
Treated,0.3170732,0.1707317,0.5121951,1


In [14]:
# Column proportions plus a Sum row only (margin 1), showing each column totals 1
addmargins(prop.table(mytable,2),1)

Unnamed: 0,None,Some,Marked
Placebo,0.6904762,0.5,0.25
Treated,0.3095238,0.5,0.75
Sum,1.0,1.0,1.0


## Tests of independence

In [15]:
# CHI-SQUARE TEST OF INDEPENDENCE
# Display the Treatment x Improved counts before testing them

mytable

         Improved
Treatment None Some Marked
  Placebo   29    7      7
  Treated   13    7     21

In [16]:
# Pearson chi-square test of independence on the current two-way table
chisq.test(mytable)


	Pearson's Chi-squared test

data:  mytable
X-squared = 13.055, df = 2, p-value = 0.001463


In [17]:
# Rebuild mytable as Improved (rows) by Sex (columns)
mytable <- xtabs( ~Improved + Sex, data=Arthritis)
mytable

        Sex
Improved Female Male
  None       25   17
  Some       12    2
  Marked     22    6

In [18]:
# Chi-square test of independence for Improved by Sex.
# The recorded run warns that the chi-squared approximation may be incorrect;
# presumably some expected cell counts are small (see the Some/Male cell).
chisq.test(mytable)

“Chi-squared approximation may be incorrect”



	Pearson's Chi-squared test

data:  mytable
X-squared = 4.8407, df = 2, p-value = 0.08889


In [19]:
# FISHER'S EXACT TEST
# Restore mytable to the Treatment x Improved counts used for the exact test

mytable <- xtabs(~Treatment+Improved, data = Arthritis)
mytable

         Improved
Treatment None Some Marked
  Placebo   29    7      7
  Treated   13    7     21

In [20]:
# Fisher's exact test of independence between the rows and columns of mytable
fisher.test(mytable)


	Fisher's Exact Test for Count Data

data:  mytable
p-value = 0.001393
alternative hypothesis: two.sided


In [21]:
# COCHRAN-MANTEL-HAENSZEL TEST
# Three-way table: Treatment x Improved, with one layer per level of Sex

mytable <- xtabs(~Treatment+Improved+Sex,data = Arthritis)
mytable

, , Sex = Female

         Improved
Treatment None Some Marked
  Placebo   19    7      6
  Treated    6    5     16

, , Sex = Male

         Improved
Treatment None Some Marked
  Placebo   10    0      1
  Treated    7    2      5


In [22]:
# Tests whether the first two dimensions (Treatment, Improved) are
# conditionally independent within each stratum of the third (Sex)
mantelhaen.test(mytable)


	Cochran-Mantel-Haenszel test

data:  mytable
Cochran-Mantel-Haenszel M^2 = 14.632, df = 2, p-value = 0.0006647


## Measures of association

In [23]:
# Rebuild the two-way Treatment x Improved table, then report association
# measures: phi coefficient, contingency coefficient and Cramer's V
mytable <- xtabs(~Treatment+Improved, data = Arthritis)
assocstats(mytable)

                    X^2 df  P(> X^2)
Likelihood Ratio 13.530  2 0.0011536
Pearson          13.055  2 0.0014626

Phi-Coefficient   : NA 
Contingency Coeff.: 0.367 
Cramer's V        : 0.394 

# 2. Correlations

## Types of correlations
### PEARSON, SPEARMAN, AND KENDALL CORRELATIONS
The Pearson product-moment correlation assesses the degree of linear relationship between two quantitative variables. Spearman’s rank-order correlation coefficient assesses the degree of relationship between two rank-ordered variables. Kendall’s tau is also a nonparametric measure of rank correlation.

In [24]:
# Covariances and correlations
# Keep the first six columns of the built-in state.x77 matrix, picked by name
states <- state.x77[, c("Population", "Income", "Illiteracy",
                        "Life Exp", "Murder", "HS Grad")]
head(states)

Unnamed: 0,Population,Income,Illiteracy,Life Exp,Murder,HS Grad
Alabama,3615,3624,2.1,69.05,15.1,41.3
Alaska,365,6315,1.5,69.31,11.3,66.7
Arizona,2212,4530,1.8,70.55,7.8,58.1
Arkansas,2110,3378,1.9,70.66,10.1,39.9
California,21198,5114,1.1,71.71,10.3,62.6
Colorado,2541,4884,0.7,72.06,6.8,63.9


In [25]:
# Variance-covariance matrix of the six state variables
cov(states)

Unnamed: 0,Population,Income,Illiteracy,Life Exp,Murder,HS Grad
Population,19931683.7588,571229.7796,292.8679592,-407.8424612,5663.523714,-3551.509551
Income,571229.7796,377573.3061,-163.7020408,280.6631837,-521.894286,3076.76898
Illiteracy,292.868,-163.702,0.3715306,-0.4815122,1.581776,-3.235469
Life Exp,-407.8425,280.6632,-0.4815122,1.8020204,-3.86948,6.312685
Murder,5663.5237,-521.8943,1.5817755,-3.8694804,13.627465,-14.549616
HS Grad,-3551.5096,3076.769,-3.2354694,6.3126849,-14.549616,65.237894


In [26]:
# Correlation matrix; cor() defaults to the Pearson coefficient
cor(states)

Unnamed: 0,Population,Income,Illiteracy,Life Exp,Murder,HS Grad
Population,1.0,0.2082276,0.1076224,-0.06805195,0.3436428,-0.09848975
Income,0.20822756,1.0,-0.4370752,0.34025534,-0.2300776,0.61993232
Illiteracy,0.10762237,-0.4370752,1.0,-0.58847793,0.7029752,-0.65718861
Life Exp,-0.06805195,0.3402553,-0.5884779,1.0,-0.7808458,0.5822162
Murder,0.34364275,-0.2300776,0.7029752,-0.78084575,1.0,-0.48797102
HS Grad,-0.09848975,0.6199323,-0.6571886,0.5822162,-0.487971,1.0


In [27]:
# Same matrix using Spearman rank-order correlations
cor(states, method = "spearman")

Unnamed: 0,Population,Income,Illiteracy,Life Exp,Murder,HS Grad
Population,1.0,0.1246098,0.3130496,-0.1040171,0.3457401,-0.3833649
Income,0.1246098,1.0,-0.3145948,0.324105,-0.2174623,0.5104809
Illiteracy,0.3130496,-0.3145948,1.0,-0.5553735,0.6723592,-0.6545396
Life Exp,-0.1040171,0.324105,-0.5553735,1.0,-0.7802406,0.523941
Murder,0.3457401,-0.2174623,0.6723592,-0.7802406,1.0,-0.436733
HS Grad,-0.3833649,0.5104809,-0.6545396,0.523941,-0.436733,1.0


In [28]:
# Non-square correlation matrix: each column of x (rows of the result)
# against each column of y (columns of the result)
x <- states[,c("Population","Income","Illiteracy")]
y <- states[,c("Life Exp","Murder")]
cor(x,y)

Unnamed: 0,Life Exp,Murder
Population,-0.06805195,0.3436428
Income,0.34025534,-0.2300776
Illiteracy,-0.58847793,0.7029752


### PARTIAL CORRELATIONS

In [29]:
# ggm supplies pcor() for partial correlations
library(ggm)
# List the column names so the position indices passed to pcor() can be read off
colnames(states)

In [30]:
# Partial correlation between columns 1 (Population) and 5 (Murder),
# controlling for columns 2 (Income), 3 (Illiteracy) and 6 (HS Grad).
# In pcor() the first two indices are the pair of interest and the remaining
# indices form the conditioning set; the second argument is the covariance matrix.
pcor(c(1,5,2,3,6), cov(states))

### Testing correlations for significance

In [31]:
# Testing a correlation coefficient for significance
# Column 3 of states is Illiteracy and column 5 is Murder

cor.test(x=states[,3],y = states[,5])


	Pearson's product-moment correlation

data:  states[, 3] and states[, 5]
t = 6.8479, df = 48, p-value = 1.258e-08
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
 0.5279280 0.8207295
sample estimates:
      cor 
0.7029752 


In [32]:
# psych::corr.test() reports the correlation matrix, the sample size and a
# matrix of p-values (entries above the diagonal are adjusted for multiple tests)
library(psych)
# NOTE(review): use = "complete" presumably requests complete-case (listwise)
# handling of missing values - confirm against the psych documentation
corr.test(states, use = "complete")

Call:corr.test(x = states, use = "complete")
Correlation matrix 
           Population Income Illiteracy Life Exp Murder HS Grad
Population       1.00   0.21       0.11    -0.07   0.34   -0.10
Income           0.21   1.00      -0.44     0.34  -0.23    0.62
Illiteracy       0.11  -0.44       1.00    -0.59   0.70   -0.66
Life Exp        -0.07   0.34      -0.59     1.00  -0.78    0.58
Murder           0.34  -0.23       0.70    -0.78   1.00   -0.49
HS Grad         -0.10   0.62      -0.66     0.58  -0.49    1.00
Sample Size 
[1] 50
Probability values (Entries above the diagonal are adjusted for multiple tests.) 
           Population Income Illiteracy Life Exp Murder HS Grad
Population       0.00   0.59       1.00      1.0   0.10       1
Income           0.15   0.00       0.01      0.1   0.54       0
Illiteracy       0.46   0.00       0.00      0.0   0.00       0
Life Exp         0.64   0.02       0.00      0.0   0.00       0
Murder           0.01   0.11       0.00      0.0   0.00       0
H

# 3. T-tests

## Independent t-test

In [33]:
# MASS supplies the UScrime data set
library(MASS)
# Two-sample t-test comparing Prob between the two levels of So (0 and 1);
# the recorded output shows R ran the Welch (unequal-variance) version
t.test(Prob ~ So, data = UScrime)


	Welch Two Sample t-test

data:  Prob by So
t = -3.8954, df = 24.925, p-value = 0.0006506
alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
95 percent confidence interval:
 -0.03852569 -0.01187439
sample estimates:
mean in group 0 mean in group 1 
     0.03851265      0.06371269 


In [34]:
# Mean and standard deviation of UScrime columns U1 and U2, one column per
# variable; the named template fixes the row labels and the result type
vapply(
  UScrime[c("U1", "U2")],
  function(column) c(mean = mean(column), sd = sd(column)),
  FUN.VALUE = c(mean = 0, sd = 0)
)

Unnamed: 0,U1,U2
mean,95.46809,33.97872
sd,18.02878,8.44545


## Dependent t-test

In [35]:
# Paired t-test: U1 and U2 are measured on the same rows of UScrime, so the
# test is run on the per-row differences
t.test(UScrime$U1,UScrime$U2,paired = TRUE)


	Paired t-test

data:  UScrime$U1 and UScrime$U2
t = 32.407, df = 46, p-value < 2.2e-16
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
 57.67003 65.30870
sample estimates:
mean of the differences 
               61.48936 


# 4. Nonparametric tests of group differences

## Comparing two groups

In [36]:
# Median of Prob within each level of So
with(UScrime, by(Prob, So, median))

So: 0
[1] 0.038201
------------------------------------------------------------ 
So: 1
[1] 0.055552

In [37]:
# Wilcoxon rank sum test: nonparametric comparison of Prob between the So groups
wilcox.test(Prob ~ So, data = UScrime)


	Wilcoxon rank sum exact test

data:  Prob by So
W = 81, p-value = 8.488e-05
alternative hypothesis: true location shift is not equal to 0


In [38]:
# Median of each of the columns U1 and U2
sapply(UScrime[c("U1","U2")], median)

In [39]:
# Paired (signed rank) Wilcoxon test of U1 against U2; the recorded run warns
# that an exact p-value cannot be computed because the data contain ties
with(UScrime, wilcox.test(U1,U2,paired = TRUE))

“cannot compute exact p-value with ties”



	Wilcoxon signed rank test with continuity correction

data:  U1 and U2
V = 1128, p-value = 2.464e-09
alternative hypothesis: true location shift is not equal to 0


## Comparing more than two groups

In [40]:
# Rebuild `states` as a data frame: the region factor followed by every
# state.x77 column. This replaces the six-column matrix used earlier.
states <- data.frame(
  state.region = state.region,
  state.x77
)

In [41]:
# Kruskal-Wallis rank sum test: does Illiteracy differ across the regions?
kruskal.test(Illiteracy~state.region, data = states)


	Kruskal-Wallis rank sum test

data:  Illiteracy by state.region
Kruskal-Wallis chi-squared = 22.672, df = 3, p-value = 4.726e-05


In [42]:
# NOTE(review): this downloads and executes R code over plain HTTP on every
# run; consider keeping a reviewed local copy of wmc.txt and sourcing that.
source("http://www.statmethods.net/RiA/wmc.txt")
# Recreate the region + state.x77 data frame so this cell can run on its own
states <- data.frame(state.region, state.x77)
# Pairwise Wilcoxon rank sum comparisons of Illiteracy between regions, with
# p-values adjusted by the Holm method
wmc(Illiteracy~state.region, data = states, method = "holm")

“cannot xtfrm data frames”


Descriptive Statistics

           West North Central Northeast    South
n      13.00000      12.00000   9.00000 16.00000
median  0.60000       0.70000   1.10000  1.75000
mad     0.14826       0.14826   0.29652  0.59304

Multiple Comparisons (Wilcoxon Rank Sum Tests)
Probability Adjustment = holm

        Group.1       Group.2    W            p    
1          West North Central 88.0 8.665618e-01    
2          West     Northeast 46.5 8.665618e-01    
3          West         South 39.0 1.788186e-02   *
4 North Central     Northeast 20.5 5.359707e-02   .
5 North Central         South  2.0 8.051509e-05 ***
6     Northeast         South 18.0 1.187644e-02   *
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
