<a href="https://colab.research.google.com/github/Duckkapon/Portfolio/blob/main/Missing_Data_101.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Missing Data 101: companion notebook for the article series on missing data

## Part 1: Understanding the Missing Data

https://pakkapontontiwich.wordpress.com/2023/12/04/missing-data-101-part-1-understanding-the-missing-data/

In [None]:
#############################
##### 1. Import library #####
#############################
import numpy as np
import pandas as pd
import random

######################################################
##### 2. Generating sample data with 3 variables #####
######################################################
# Fixed seed so that "Restart Kernel -> Run All" rebuilds exactly the same
# population, and therefore the same printed summaries, on every run.
SEED = 42

# Number of rows in the synthetic population.
nPop = 100000
# Population mean of each of the three variables.
mu = np.array([1, 2, 3])
# Every variable has variance 3.5 and every pair has covariance 2.7,
# i.e. a pairwise correlation of 2.7 / 3.5 (about 0.77).
cov = np.array([
                [3.5, 2.7, 2.7],
                [2.7, 3.5, 2.7],
                [2.7, 2.7, 3.5]
                ])
rng = np.random.default_rng(SEED)
pop = rng.multivariate_normal(mu, cov, size = nPop)
# Columns keep the default integer labels 0, 1, 2 and rows the default
# RangeIndex, so a row position is also a valid index label.
dfPop = pd.DataFrame(pop)

# Sanity check: the empirical mean and covariance should be close to
# `mu` and `cov` above.
print(dfPop.shape)
print(dfPop.mean())
print(dfPop.cov())

(100000, 3)
0    1.002781
1    2.007399
2    3.001975
dtype: float64
          0         1         2
0  3.513308  2.707447  2.702038
1  2.707447  3.507702  2.701849
2  2.702038  2.701849  3.502347


In [None]:
# Row positions of the population. dfPop keeps the default RangeIndex, so a
# position doubles as an index label.
idxPop = np.arange(nPop)

# Seeded generator local to this cell: the drawn sample and the missingness
# pattern are identical on every "Restart Kernel -> Run All".
rngMiss = np.random.default_rng(2023)

#######################
##### 3. Sampling #####
#######################
# Simple random sample of 1000 distinct population rows.
idxSam = rngMiss.choice(idxPop, size = 1000, replace = False)

######################################
##### 4. Simulate Missing Values #####
######################################
popVar0 = dfPop[0]
popVar1 = dfPop[1]
popVar2 = dfPop[2]
# Cut-off used by the MAR rule: two standard deviations of variable 0,
# measured from zero (the mean is not added).
popVar0_2SD = 2 * np.std(popVar0)
# Cut-off used by the MNAR rule: 90th percentile of variable 2, so that about
# 10% of the population is affected.
popVar2_P90 = popVar2.quantile(0.9)
# Population positions whose variable-2 value is hidden. Under MNAR the
# missingness must depend on the unobserved value itself; drawing these
# positions uniformly at random would be MCAR instead.
MNARPos = idxPop[(popVar2 > popVar2_P90).to_numpy()]

# True (complete) values of the sampled rows, in sampling order.
samRows = dfPop.iloc[idxSam]
samVal0 = samRows[0].to_numpy()
samVal1 = samRows[1].to_numpy()
samVal2 = samRows[2].to_numpy()

# missing completely at random: every row has the same 5% chance
isMCAR = rngMiss.random(len(idxSam)) < 0.05
# missing at random: variable 1 is hidden when variable 0 is large
isMAR = samVal0 > popVar0_2SD
# missing not at random: variable 2 is hidden when variable 2 itself is large
isMNAR = samVal2 > popVar2_P90

sampVar0 = np.where(isMCAR, np.nan, samVal0).tolist()
sampVar1 = np.where(isMAR, np.nan, samVal1).tolist()
sampVar2 = np.where(isMNAR, np.nan, samVal2).tolist()

# One row per sampled unit; columns 0, 1, 2 carry MCAR, MAR and MNAR gaps.
dfMissEx =  pd.DataFrame(list(zip(sampVar0, sampVar1, sampVar2)))

In [None]:
##########################################
##### 5. Display correlation matrix ######
##########################################
# Baseline: the same sampled rows before any value was removed.
print(f"correlation No Missing Value : \n{dfPop.iloc[idxSam].corr()}\n")

# DataFrame.corr() skips NaN pair by pair, so each coefficient is computed
# from the rows where both columns are observed.
print(f"correlation Missing Value : \n{dfMissEx.corr()}\n")

# Listwise deletion: only rows with no missing value at all are kept.
print(f"correlation Drop Missing Value : \n{dfMissEx.dropna(axis=0).corr()}\n")


correlation No Missing Value : 
          0         1         2
0  1.000000  0.766122  0.754565
1  0.766122  1.000000  0.759776
2  0.754565  0.759776  1.000000

correlation Missing Value : col 1 คือ missing at random, col 2 คือ missing not at random
          0         1         2
0  1.000000  0.738580  0.780748
1  0.738580  1.000000  0.727593
2  0.780748  0.727593  1.000000

correlation Drop Missing Value : 
          0         1         2
0  1.000000  0.733959  0.738369
1  0.733959  1.000000  0.725394
2  0.738369  0.725394  1.000000



In [None]:
# Shape, column means and covariance matrix of the sample with simulated
# gaps, printed one after another.
for sampleSummary in (dfMissEx.shape, dfMissEx.mean(), dfMissEx.cov()):
    print(sampleSummary)

(1000, 3)
0    0.955806
1    1.719180
2    2.987532
dtype: float64
          0         1         2
0  3.781158  2.236013  2.998325
1  2.236013  3.089500  2.294771
2  2.998325  2.294771  3.856279


## Part 2 Imputation Methods