<a href="https://colab.research.google.com/github/DepartmentOfStatisticsPUE/cda-2022/blob/main/notebooks/cda_5_cattab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Installation of required modules, packages etc.

In [None]:
!pip install pingouin 

In [10]:
import numpy as np
import scipy.stats as st
import pingouin as pg
import pandas as pd

In [44]:
# load the rpy2 IPython extension; the R cells in this notebook start with its %%R cell magic
%load_ext rpy2.ipython

In [None]:
%%R
## install vcd only when it is missing, so re-running this cell does not reinstall it
if (!requireNamespace("vcd", quietly = TRUE)) install.packages("vcd")

In [46]:
%%R
## attach vcd for the assocstats() call used in the contingency-table section
library(vcd)

R[write to console]: Loading required package: grid



In [None]:
%%bash
## stop at the first failing command instead of continuing with a broken download
set -e
## download and unpack the Julia 1.7.2 Linux binaries into the current directory
wget -q https://julialang-s3.julialang.org/bin/linux/x64/1.7/julia-1.7.2-linux-x86_64.tar.gz
## no -v: listing every extracted file only floods the cell output
tar -xzf julia-1.7.2-linux-x86_64.tar.gz
## python's module
pip install -q julia

In [91]:
import julia
julia.install(julia = "/content/julia-1.7.2/bin/julia")
from julia import Julia
jl = Julia(runtime="/content/julia-1.7.2/bin/julia",compiled_modules=False)
%load_ext julia.magic


Precompiling PyCall...
Precompiling PyCall... DONE
PyCall is installed and built successfully.

PyCall is setup for non-default Julia runtime (executable) `/content/julia-1.7.2/bin/julia`.
To use this Julia runtime, PyJulia has to be initialized first by
    from julia import Julia
    Julia(runtime='/content/julia-1.7.2/bin/julia')


Initializing Julia interpreter. This may take some time...




In [None]:
%%julia
using Pkg
# install every Julia package the notebook loads, one Pkg.add call per package
for pkg in ["Distributions", "HypothesisTests", "StatsBase", "DataFrames"]
    Pkg.add(pkg)
end

In [94]:
%%julia
using Random
using Distributions
using HypothesisTests
using StatsBase
using DataFrames

# 1. Contingency tables -- $\chi^2$ test and Cramér's V measure of association

### R solution

In [68]:
%%R
## observed counts: rows = sex (F, M), columns = party (demo, ind, rep)
## byrow = TRUE (not T, which is an ordinary variable that can be reassigned)
data <- matrix(data = c(762, 327, 468,
                        484, 239, 477),
               nrow = 2, ncol = 3, byrow = TRUE,
               dimnames = list(c("F", "M"), c("demo", "ind", "rep")))
data

  demo ind rep
F  762 327 468
M  484 239 477


In [50]:
%%R
## Pearson chi-squared test of independence on the 2x3 table of counts
chisq.test(data)


	Pearson's Chi-squared test

data:  data
X-squared = 30.07, df = 2, p-value = 2.954e-07



In [51]:
%%R
## vcd::assocstats: chi-squared statistics plus association measures (the output includes Cramer's V)
assocstats(data)

                    X^2 df   P(> X^2)
Likelihood Ratio 30.017  2 3.0336e-07
Pearson          30.070  2 2.9536e-07

Phi-Coefficient   : NA 
Contingency Coeff.: 0.104 
Cramer's V        : 0.104 


The same test using `summary()` on a `table` / `xtabs` object:

In [54]:
%%R
## coerce the matrix to a table; summary() on a table reports the independence test
data |> as.table() |> summary()

Number of cases in table: 2757 
Number of factors: 2 
Test for independence of all factors:
	Chisq = 30.07, df = 2, p-value = 2.954e-07


In [89]:
%%R
## work on a copy so the `data` matrix used by the other cells is not overwritten
data_df <- as.data.frame(data)
data_df$sex <- rownames(data_df)
## wide -> long: the three party columns are stacked into `counts`;
## reshape() adds its default index column `time` (1, 2, 3 = position of the party column)
data_long <- reshape(data_df, 
                     direction = "long",
                     varying = list(1:3),
                     v.names = "counts")

## cross-tabulate the counts by sex and party index, then run the independence test
xtabs(counts ~ sex + time, data_long) |> summary()

Call: xtabs(formula = counts ~ sex + time, data = data_long)
Number of cases in table: 2757 
Number of factors: 2 
Test for independence of all factors:
	Chisq = 30.07, df = 2, p-value = 2.954e-07


### Python solution

In [2]:
# observed counts: rows = sex (F, M), columns = party (demo, ind, rep)
data = np.array([
    [762, 327, 468],
    [484, 239, 477],
])
data

array([[762, 327, 468],
       [484, 239, 477]])

In [3]:
# result order: chi-squared statistic, p-value, degrees of freedom, expected frequencies
st.chi2_contingency(data)

(30.070149095754672,
 2.9535891832117595e-07,
 2,
 array([[703.67138194, 319.64526659, 533.68335147],
        [542.32861806, 246.35473341, 411.31664853]]))

Pingouin requires a `pd.DataFrame` in long format (one row per observation), so the table of counts is expanded before running the test.

In [43]:
# label the 2x3 array of counts so it can be reshaped by name
data_df = pd.DataFrame(data, index=['F', 'M'], columns=['demo', 'ind', 'rep'])

# long format: one row per (sex, party) cell together with its count
data_long = (
    data_df
    .rename_axis(index='sex', columns='party')
    .stack()
    .rename('counts')
    .reset_index()
)

# repeat every (sex, party) row `counts` times, giving one row per observation
data_long = data_long.loc[data_long.index.repeat(data_long['counts'])]

pg.chi2_independence(data_long, x='sex', y='party')

(party        demo         ind         rep
 sex                                      
 F      703.671382  319.645267  533.683351
 M      542.328618  246.354733  411.316649, party  demo  ind  rep
 sex                  
 F       762  327  468
 M       484  239  477,                  test    lambda       chi2  dof          pval    cramer  \
 0             pearson  1.000000  30.070149  2.0  2.953589e-07  0.104436   
 1        cressie-read  0.666667  30.043391  2.0  2.993371e-07  0.104389   
 2      log-likelihood  0.000000  30.016693  2.0  3.033598e-07  0.104343   
 3       freeman-tukey -0.500000  30.020001  2.0  3.028584e-07  0.104349   
 4  mod-log-likelihood -1.000000  30.043220  2.0  2.993627e-07  0.104389   
 5              neyman -2.000000  30.149263  2.0  2.839035e-07  0.104573   
 
       power  
 0  0.999239  
 1  0.999233  
 2  0.999227  
 3  0.999227  
 4  0.999233  
 5  0.999258  )

### Julia solution

In [102]:
%%julia
# observed counts: rows = sex (F, M), columns = party (demo, ind, rep)
data = [
    762 327 468
    484 239 477
]
# chi-squared test on the contingency table; `res` is reused for Cramer's V
res = ChisqTest(data)

<PyCall.jlwrap Pearson's Chi-square Test
-------------------------
Population details:
    parameter of interest:   Multinomial Probabilities
    value under h_0:         [0.255231, 0.19671, 0.11594, 0.0893561, 0.193574, 0.14919]
    point estimate:          [0.276387, 0.175553, 0.118607, 0.0866884, 0.16975, 0.173014]
    95% confidence interval: [(0.2545, 0.2994), (0.1573, 0.1955), (0.1033, 0.1358), (0.07357, 0.1019), (0.1517, 0.1894), (0.1548, 0.1928)]

Test summary:
    outcome with 95% confidence: reject h_0
    one-sided p-value:           <1e-06

Details:
    Sample size:        2757
    statistic:          30.070149095754687
    degrees of freedom: 2
    residuals:          [2.19886, -2.50467, 0.41137, -0.468583, -2.84324, 3.23867]
    std. residuals:     [4.50205, -4.50205, 0.699452, -0.699452, -5.31595, 5.31595]
>

In [108]:
%%julia
## Cramer's V = sqrt(chi^2 / (n * (min(rows, cols) - 1)))
## the min(...) - 1 factor equals 1 for this 2x3 table, but is needed for larger tables
sqrt(res.stat / (sum(data) * (minimum(size(data)) - 1)))

0.1044358023564678

# 2. Capture-recapture


### R solution

In [109]:
%%R

## sizes entering the estimator below
n_1  <- 39862
n_2  <- 1819
n_11 <- 374

## estimated population size
N <- n_1 * n_2 / n_11

## variance of the estimate and its standard error
varN <- n_1^2 * n_2 * (n_2 - n_11) / n_11^3
seN  <- sqrt(varN)

## symmetric normal-approximation interval: N -/+ z * se
z95 <- qnorm(1 - 0.05/2)
cat("95% CI :", N + c(-1, 1) * z95 * seN)

95% CI : 176361.7 211386.8

In [119]:
%%R

set.seed(123)
## cell probabilities: n_11, the remainders of n_1 and n_2, and the rest of the population
p <- c(n_11,
       n_1 - n_11,
       n_2 - n_11,
       round(N) - (n_1 + n_2 - n_11)) / round(N)
## 500 multinomial bootstrap replicates, one per column
b_samples <- rmultinom(n = 500, size = round(N), prob = p)
## re-apply the estimator n_1 * n_2 / n_11 to every replicate
N_b <- colSums(b_samples[1:2, ]) * colSums(b_samples[c(1, 3), ]) / b_samples[1, ]
seN_b <- sd(N_b)

## percentile confidence interval
cat("95% CI :", quantile(N_b, c(0.025, 0.975)))

95% CI : 177125.8 212115.5

### Python solution


In [118]:
## sizes entering the estimator below
n_1, n_2, n_11 = 39862, 1819, 374

## estimated population size
N = n_1 * n_2 / n_11

## variance of the estimate and its standard error
varN = n_1**2 * n_2 * (n_2 - n_11) / n_11**3
seN = np.sqrt(varN)

## symmetric normal-approximation interval: N -/+ z * se
z95 = st.norm.ppf(1 - 0.05/2)

(N - z95 * seN, N + z95 * seN)

(176361.6962197761, 211386.84923476938)

In [134]:
np.random.seed(1)
## calculate p vector: n_11, the remainders of n_1 and n_2, and the rest of the population
p = np.array([n_11, n_1 - n_11, n_2 - n_11, round(N) - (n_1 + n_2 - n_11)]) / round(N)
## 500 bootstrap samples (one replicate per row)
b_samples = st.multinomial(n = round(N), p = p).rvs(500)
## estimate N_b and standard error
N_b = (b_samples[:, 0] + b_samples[:, 1]) * (b_samples[:, 0] + b_samples[:, 2]) / b_samples[:, 0]
## ddof=1 gives the sample standard deviation, matching R's sd() in the R solution
seN_b = np.std(N_b, ddof=1)
## confidence interval
np.quantile(N_b, [0.025, 0.975])

array([176652.21276063, 212446.08990597])

### Julia solution

In [135]:
%%julia

## sizes entering the estimator below
n_1  = 39862
n_2  = 1819
n_11 = 374

## estimated population size
N = n_1 * n_2 / n_11

## variance of the estimate and its standard error
varN = n_1^2 * n_2 * (n_2 - n_11) / n_11^3
seN = sqrt(varN)

## symmetric normal-approximation interval: N -/+ z * se
z95 = quantile(Normal(), 1 - 0.05/2)
N .+ [-1, 1] .* (z95 * seN)

array([176361.69621978, 211386.84923477])

In [137]:
%%julia

Random.seed!(123)
## rounded population size: used both as the multinomial sample size and as the
## denominator of p, as in the R and Python solutions
N_int = round(Int, N)
## cell probabilities: n_11, the remainders of n_1 and n_2, and the rest of the population
p = [n_11, n_1 - n_11, n_2 - n_11, N_int - (n_1 + n_2 - n_11)] ./ N_int

## 500 bootstrap samples (one replicate per column)
d = rand(Multinomial(N_int, p), 500)
## re-apply the estimator n_1 * n_2 / n_11 to every replicate
N_b = @. (d[1,:] + d[2,:]) * (d[1,:] + d[3,:]) / d[1,:]
## percentile confidence interval
quantile(N_b, [0.025, 0.975])


array([179716.59743744, 212716.56941803])