<a href="https://colab.research.google.com/github/DepartmentOfStatisticsPUE/cda-2022/blob/main/notebooks/cda_2_pseudorandom.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Setup environment

### Python libraries

In [1]:
import scipy.stats as st
import numpy as np
import pandas as pd

## Setup R via Python

In [2]:
%load_ext rpy2.ipython

## Setup Julia via Python

In [3]:
%%bash
wget https://julialang-s3.julialang.org/bin/linux/x64/1.7/julia-1.7.2-linux-x86_64.tar.gz
tar zxvf julia-1.7.2-linux-x86_64.tar.gz
## pythons module
pip install julia

julia-1.7.2/
julia-1.7.2/LICENSE.md
julia-1.7.2/lib/
julia-1.7.2/lib/libjulia.so.1.7
julia-1.7.2/lib/julia/
julia-1.7.2/lib/julia/libgit2.so
julia-1.7.2/lib/julia/libcamd.so.2
julia-1.7.2/lib/julia/libgomp.so.1
julia-1.7.2/lib/julia/libgmp.so
julia-1.7.2/lib/julia/libgit2.so.1.1.0
julia-1.7.2/lib/julia/libamd.so
julia-1.7.2/lib/julia/libpcre2-8.so.0
julia-1.7.2/lib/julia/libumfpack.so
julia-1.7.2/lib/julia/libatomic.so.1
julia-1.7.2/lib/julia/libmpfr.so
julia-1.7.2/lib/julia/libnghttp2.so
julia-1.7.2/lib/julia/libsuitesparseconfig.so.5
julia-1.7.2/lib/julia/libumfpack.so.5.7.9
julia-1.7.2/lib/julia/libgmpxx.so.4.6.1
julia-1.7.2/lib/julia/libcamd.so.2.4.6
julia-1.7.2/lib/julia/libmbedx509.so
julia-1.7.2/lib/julia/libcolamd.so.2
julia-1.7.2/lib/julia/libssh2.so.1
julia-1.7.2/lib/julia/libz.so.1
julia-1.7.2/lib/julia/libklu.so
julia-1.7.2/lib/julia/libmbedtls.so
julia-1.7.2/lib/julia/libsuitesparseconfig.so.5.10.1
julia-1.7.2/lib/julia/libquadmath.so.0
julia-1.7.2/lib/julia/libcurl.so.4
j

--2022-03-08 20:07:17--  https://julialang-s3.julialang.org/bin/linux/x64/1.7/julia-1.7.2-linux-x86_64.tar.gz
Resolving julialang-s3.julialang.org (julialang-s3.julialang.org)... 151.101.2.49, 151.101.66.49, 151.101.130.49, ...
Connecting to julialang-s3.julialang.org (julialang-s3.julialang.org)|151.101.2.49|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 123295596 (118M) [application/x-tar]
Saving to: ‘julia-1.7.2-linux-x86_64.tar.gz.1’

     0K .......... .......... .......... .......... ..........  0% 4.31M 27s
    50K .......... .......... .......... .......... ..........  0% 5.58M 24s
   100K .......... .......... .......... .......... ..........  0% 21.4M 18s
   150K .......... .......... .......... .......... ..........  0% 21.6M 15s
   200K .......... .......... .......... .......... ..........  0% 8.69M 15s
   250K .......... .......... .......... .......... ..........  0% 34.7M 13s
   300K .......... .......... .......... .......... ..........  0% 4

Install python's julia module and setup Julia

In [4]:
import julia
julia.install(julia = "/content/julia-1.7.2/bin/julia")
from julia import Julia
jl = Julia(runtime="/content/julia-1.7.2/bin/julia",compiled_modules=False)
%load_ext julia.magic

Initializing Julia interpreter. This may take some time...




Install relevant Julia packages and load them

In [None]:
%%julia
import Pkg; Pkg.add("Distributions")
import Pkg; Pkg.add("DataFrames")
using Distributions
using DataFrames
using Random

# Exercies

Generate

$$
\begin{cases}
X_1 & \sim N(4, 1.5)  \\
X_2 & \sim Exp(1) \\
\epsilon & \sim N(0,1) 
\end{cases}
$$

We will calculate the following variable

$$
Y = 2 + 2.5*X_1 - 3*X_2 + \epsilon.
$$

Then, let's assume that probability of success is generated from the following function (it is called logit or logistic function)

$$
p = \frac{\exp(0.5 + 0.5*X_1 + 1*X_2)}{1 + \exp(0.5 + 0.5*X_1 + 1*X_2)}
$$

or

$$
\text{logit}(p) = 0.5 + 0.5*X_1 + 1*X_2
$$

where $X_1$ and $X_2$ are the same as above and then we generate variable $Y_2$ from Bernoulli distribution with $p$ as a parameter

$$
Y_2 \sim \text{Bern}(p)
$$




## Solution in R

In [6]:
%%R
set.seed(1)
N <- 100000
x1 <- rnorm(N, 4,  1.5) ## random variable from normal dist
x2 <- rexp(N, 1) ## random variable from exp dist
epsilon <- rnorm(N) ## random variable from normal dist
y1 <- 2 + 2.5*x1 - 3*x2 + epsilon
p <- exp(0.5 + 0.5*x1 + x2) / (1 + exp(0.5 + 0.5*x1 + x2))
y2 <- rbinom(n = length(p), size = 1, prob = p)
ex1 <- data.frame(x1,x2,y1,p, y2)
head(ex1)

        x1        x2        y1         p y2
1 3.060319 0.5713136  8.068432 0.9309563  1
2 4.275465 0.0508544 13.117475 0.9363498  1
3 2.746557 0.3050659  7.945739 0.8982879  1
4 6.392921 0.5102495 16.429954 0.9853233  1
5 4.494262 0.9660488 11.839728 0.9761814  1
6 2.769297 0.5253947  9.202053 0.9175900  1


## Solution in Python

In [7]:
np.random.seed(1)
N = 100000
x1 = st.norm(4,1.5).rvs(N)
x2 = st.expon(1).rvs(N)
epsilon = st.norm(0,1).rvs(N)
y1 = 2 + 2.5*x1 - 3*x2 + epsilon
p  = np.exp(0.5 + 0.5*x1 + 1*x2) / (1 + np.exp(0.5 + 0.5*x1 + 1*x2))
y2 = st.bernoulli(p).rvs(N)
ex1 = pd.DataFrame({"x1": x1, "x2": x2, "y1":y1,"p":p, "y2":y2})
ex1.head()

Unnamed: 0,x1,x2,y1,p,y2
0,6.436518,1.762151,14.325169,0.99585,1
1,3.082365,1.471667,4.882759,0.971051,1
2,3.207742,2.251618,2.569307,0.987327,1
3,2.390547,1.034158,5.507306,0.938741,1
4,5.298111,1.896112,8.712747,0.993601,1


## Solution in Julia

In [8]:
%%julia
Random.seed!(1)
N = 100_000
x1 = rand(Normal(4,1.5), N)
x2 = rand(Exponential(1), N)
epsilon = randn(N)
y1 = @. 2 + 2.5*x1 - 3*x2 + epsilon
p  = @. exp(0.5 + 0.5*x1 + 1*x2) / (1 + exp(0.5 + 0.5*x1 + 1*x2))
y2 = rand.(Bernoulli.(p), 1)
y2 = vcat(y2...)
ex1 = DataFrame(x1=x1,x2=x2,y1=y1,p=p,y2=y2)
first(ex1,5)

<PyCall.jlwrap 5×5 DataFrame
 Row │ x1       x2        y1        p         y2
     │ Float64  Float64   Float64   Float64   Bool
─────┼──────────────────────────────────────────────
   1 │ 3.89413  2.04521    4.73037  0.988929  false
   2 │ 4.79722  0.413854  12.2767   0.964854   true
   3 │ 2.78972  0.181897   7.77235  0.888624   true
   4 │ 7.68549  0.697491  18.6629   0.993569   true
   5 │ 5.74731  0.807885  14.9342   0.984955   true>