In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the simulated dataset from CSV into a DataFrame; later cells build their tables from it.
data = pd.read_csv("simulated_data.csv")

In [3]:
# Row count of the dataset; used as the denominator of the unconditional tables.
TOTAL_DATA_COUNT = data.shape[0]
TOTAL_DATA_COUNT

1000

In [4]:
# Preview the first five rows to check the column names and category encodings.
data.head(n=5)

Unnamed: 0,Age,Risk Factors,COVID19 Status,Cough,Loss of Taste or Smell,Tested Result
0,2,0,2,1,0,1
1,2,0,1,0,1,1
2,1,0,1,0,1,0
3,2,0,0,0,1,2
4,2,0,3,0,0,1


### Local Probability table for "Age"

Age does not depend on any of the other variables. Hence its table is an unconditional (marginal) distribution that can be calculated from the Age column alone.

Count the number of times each age category occurred and divide by the total number of samples to get the probabilities.

In [5]:
# Unconditional table for Age: rows per Age category divided by the total row count.
age_df = data[['Age']]
age_counts = age_df.groupby('Age').size()  # Series indexed by Age category
age_dist = age_counts / TOTAL_DATA_COUNT
age_dist

Age
0    0.095
1    0.320
2    0.585
dtype: float64

### Local Probability table for "Risk Factors"

Risk Factors does not depend on any of the other variables. Hence its table is an unconditional (marginal) distribution that can be calculated from the Risk Factors column alone.

Count the number of times each Risk Factors category occurred and divide by the total number of samples to get the probabilities.

In [6]:
# Unconditional table for Risk Factors: rows per category divided by the total row count.
risk_df = data[['Risk Factors']]
risk_counts = risk_df.groupby('Risk Factors').size()  # Series indexed by Risk Factors category
risk_dist = risk_counts / TOTAL_DATA_COUNT
risk_dist

Risk Factors
0    0.621
1    0.265
2    0.114
dtype: float64

### Local Probability table for "Covid19 Status"

Covid19 Status depends on both Age and Risk Factors. Hence its probabilities are conditional and must be calculated separately for each combination of categories of its parent variables (Age, Risk Factors).

Group samples by the (Age, Risk Factors) and count Covid19 status within each group. Then divide the counts by the total samples in each group to get the probabilities.

In [7]:
# Conditional table for COVID19 Status given (Age, Risk Factors).
cov19_df = data[['Age', 'Risk Factors', 'COVID19 Status']]

# Count the rows of every observed (status, age, risk) triple, then move (Age, Risk Factors)
# to the columns; combinations that never occur are counted as 0.
temp = (
    cov19_df.groupby(['COVID19 Status', 'Age', 'Risk Factors'])
    .size()
    .unstack(['Age', 'Risk Factors'], fill_value=0)
    .sort_index(axis=1)
)

# Each column is one (Age, Risk Factors) group: divide by its total so the column sums to 1,
# then transpose so every row is the distribution for one parent configuration.
group_totals = temp.sum(axis=0)
covid19_dist = (temp / group_totals).transpose()
covid19_dist

Unnamed: 0_level_0,COVID19 Status,0,1,2,3
Age,Risk Factors,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,0,0.215385,0.553846,0.123077,0.107692
0,1,0.047619,0.333333,0.047619,0.571429
0,2,0.333333,0.222222,0.222222,0.222222
1,0,0.331658,0.065327,0.38191,0.221106
1,1,0.493976,0.313253,0.084337,0.108434
1,2,0.289474,0.263158,0.368421,0.078947
2,0,0.207283,0.187675,0.40056,0.204482
2,1,0.440994,0.248447,0.086957,0.223602
2,2,0.149254,0.208955,0.253731,0.38806


### Local Probability table for "Loss of Taste or Smell"

Loss of Taste or Smell depends on Covid19 Status. Hence its probabilities are conditional and must be calculated separately for each category of its parent variable (Covid19 Status).

In [8]:
# Conditional table for Loss of Taste or Smell given COVID19 Status.
taste_df = data[['Loss of Taste or Smell', 'COVID19 Status']]

# Rows: Loss of Taste or Smell category, columns: COVID19 Status; missing pairs count as 0.
temp = (
    taste_df.groupby(['Loss of Taste or Smell', 'COVID19 Status'])
    .size()
    .unstack('COVID19 Status', fill_value=0)
    .sort_index(axis=1)
)

# Normalise every COVID19 Status column to sum to 1, then put the status on the rows.
status_totals = temp.sum(axis=0)
taste_dist = (temp / status_totals).transpose()
taste_dist

Loss of Taste or Smell,0,1
COVID19 Status,Unnamed: 1_level_1,Unnamed: 2_level_1
0,0.233677,0.766323
1,0.12093,0.87907
2,0.521277,0.478723
3,0.306604,0.693396


### Local Probability table for "Cough"

Cough depends on Covid19 Status. Hence its probabilities are conditional and must be calculated separately for each category of its parent variable (Covid19 Status).

In [9]:
# Conditional table for Cough given COVID19 Status.
cough_df = data[['Cough', 'COVID19 Status']]

# Rows: Cough category, columns: COVID19 Status; missing pairs count as 0.
temp = (
    cough_df.groupby(['Cough', 'COVID19 Status'])
    .size()
    .unstack('COVID19 Status', fill_value=0)
    .sort_index(axis=1)
)

# Normalise every COVID19 Status column to sum to 1, then put the status on the rows.
status_totals = temp.sum(axis=0)
cough_dist = (temp / status_totals).transpose()
cough_dist

Cough,0,1
COVID19 Status,Unnamed: 1_level_1,Unnamed: 2_level_1
0,0.804124,0.195876
1,0.437209,0.562791
2,0.492908,0.507092
3,0.688679,0.311321


### Local Probability table for "Tested Result"

Tested Result depends on Covid19 Status. Hence its probabilities are conditional and must be calculated separately for each category of its parent variable (Covid19 Status).

In [10]:
# Conditional table for Tested Result given COVID19 Status.
testresult_df = data[['Tested Result', 'COVID19 Status']]

# Rows: Tested Result category, columns: COVID19 Status; missing pairs count as 0.
temp = (
    testresult_df.groupby(['Tested Result', 'COVID19 Status'])
    .size()
    .unstack('COVID19 Status', fill_value=0)
    .sort_index(axis=1)
)

# Normalise every COVID19 Status column to sum to 1, then put the status on the rows.
status_totals = temp.sum(axis=0)
testresult_dist = (temp / status_totals).transpose()
testresult_dist

Tested Result,0,1,2
COVID19 Status,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.312715,0.19244,0.494845
1,0.293023,0.427907,0.27907
2,0.095745,0.514184,0.390071
3,0.09434,0.792453,0.113208


Stochastic Sampling with Likelihood Weighting

P(Risk Factors | Loss of Taste or Smell = 1, Cough = 0)

The variable names below denote the following factors:

- L = Loss of taste or smell
- C = Cough
- R = Risk Factors
- CovStatus = Covid19 Status
- A = Age
- T = Tested Results

In [11]:
# How many weighted samples the likelihood-weighting loop draws.
NUM_SAMPLES_TO_GENERATE = 100_000

# Seeded generator shared by every sampling call, so a fresh run reproduces the same draws.
rng = np.random.default_rng(seed=19951115)

In [12]:
def _draw_category(dist):
    """Draw one category label from `dist`, a Series of probabilities indexed by category.

    Uses the shared `rng`, so the order of calls determines the random stream.
    """
    return dist.sample(n=1, weights=dist, random_state=rng).index[0]


# Evidence variables are fixed to their observed values instead of being sampled.
L = 1  # Evidence: Loss of Taste or Smell = 1
C = 0  # Evidence: Cough = 0

generated_samples = []
sample_weights = []

for _ in range(NUM_SAMPLES_TO_GENERATE):
    # Root variables are drawn from their unconditional tables.
    A = _draw_category(age_dist)
    R = _draw_category(risk_dist)

    # COVID19 Status is drawn from the row of its table selected by the sampled parents (A, R).
    CovStatus = _draw_category(covid19_dist.loc[(A, R)])

    # The sample weight is the likelihood of the evidence given the sampled status:
    # P(Cough = C | CovStatus) * P(Loss of Taste or Smell = L | CovStatus).
    cough_likelihood = cough_dist.loc[CovStatus].loc[C]
    taste_likelihood = taste_dist.loc[CovStatus].loc[L]
    weight = cough_likelihood * taste_likelihood

    # Tested Result is not evidence, so it is sampled given the sampled status.
    T = _draw_category(testresult_dist.loc[CovStatus])

    generated_samples.append((A, R, CovStatus, L, C, T))
    sample_weights.append(weight)


In [13]:
# Estimate P(Risk Factors | evidence) from the weighted samples: sum the sample weights
# per sampled Risk Factors category and normalise by the total weight.

# Position 1 of every sample tuple holds the sampled Risk Factors category (an integer label).
sampled_risk = np.fromiter(
    (sample[1] for sample in generated_samples),
    dtype=np.intp,
    count=len(generated_samples),
)
weights_arr = np.asarray(sample_weights, dtype=float)

# One accumulator slot per Risk Factors category, sized from the risk table instead of a
# hard-coded length. Despite its historical name this is a numpy array, not a dict.
risk_factors_dict = np.bincount(sampled_risk, weights=weights_arr, minlength=len(risk_dist))

total_weight = weights_arr.sum()
if total_weight <= 0:
    # Happens with no samples, or when every sample contradicts the evidence; the
    # normalised distribution is undefined in that case.
    raise ValueError("Total sample weight is zero; cannot normalise the distribution.")

Asked_probability = risk_factors_dict / total_weight

In [14]:
Asked_probability

array([0.58760363, 0.30105133, 0.11134504])