In [None]:
from sklearn.datasets import fetch_openml
import pandas as pd
import os

# Download OpenML dataset 40945; as_frame=True asks for pandas objects.
data = fetch_openml(
    data_id=40945,
    as_frame=True,
)

In [None]:
# Pull the feature frame and the target column out of the fetched dataset.
X = data.data
y = data.target

# The target labels are looked up as the strings '1' / '0'; translate each to an int.
survived2int = {'1': 1, '0': 0}
y = y.apply(lambda label: survived2int[label])

# Re-attach the target to the features so the cells below can filter a single frame.
df = pd.concat([X, y], axis=1)
df.head()

In [None]:
# P(A | B) = P(A and B) / P(B)
# P(class 1 | survived) = P(class 1 and survived) / P(survived)
n_total = len(df)
is_survivor = df['survived'] == 1
is_first_class = df['pclass'] == 1

p_survived = len(df[is_survivor]) / n_total
count_ab = len(df[is_first_class & is_survivor])
p_ab = count_ab / n_total
cond = p_ab / p_survived
print(f'P(class 1 | survived) = {cond}')

In [None]:
# Look at it from the other side:
# P(survived | class 1) = P(survived and class 1) / P(class 1)
#
# The joint probability is computed here rather than reusing `p_ab` from
# another cell: `p_ab` is reassigned by several cells in this notebook, so
# reusing it would make this result depend on which cell ran last.
p_class1 = len(df[df['pclass'] == 1]) / len(df)
p_ab = len(df[(df['pclass'] == 1) & (df['survived'] == 1)]) / len(df)
cond = p_ab / p_class1
print(f'P(survived | class 1) = {cond}')

In [None]:
# P(survived | class 3) = P(survived and class 3) / P(class 3)
in_third_class = df['pclass'] == 3
p_class3 = len(df[in_third_class]) / len(df)
count_ab = len(df[in_third_class & (df['survived'] == 1)])
p_ab = count_ab / len(df)
cond = p_ab / p_class3
print(f'P(survived | class 3) = {cond}')

In [None]:
# P(survived | woman) = P(survived and woman) / P(woman)
is_woman = df['sex'] == 'female'
women_who_survived = df[is_woman & (df['survived'] == 1)]
p_woman = len(df[is_woman]) / len(df)
p_ab = len(women_who_survived) / len(df)
cond = p_ab / p_woman
print(f'P(survived | woman) = {cond}')

In [None]:
# P(survived | man) = P(survived and man) / P(man)
is_man = df['sex'] == 'male'
men_who_survived = df[is_man & (df['survived'] == 1)]
p_man = len(df[is_man]) / len(df)
p_ab = len(men_who_survived) / len(df)
cond_alive_man = p_ab / p_man
print(f'P(survived | man) = {cond_alive_man}')

For a fixed condition $B$, the conditional probabilities of all possible outcomes sum to 1. For instance, adding up $P(X | man)$ over every outcome $X$ (survived or died) gives 1.

In [None]:
# P(died | man) = P(died and man) / P(man)
# `p_man` and `cond_alive_man` come from the P(survived | man) cell.
men_who_died = df[(df['sex'] == 'male') & (df['survived'] == 0)]
p_ab = len(men_who_died) / len(df)
cond_dead_man = p_ab / p_man
print(f'P(died | man) = {cond_dead_man}')
# Survived and died are the only outcomes filtered on, so the two should total 1.
print(f'Total prob of something happening given you are a man: {cond_dead_man + cond_alive_man}')

Knowing that, you can also compute the remaining probability directly: $P(died|man) = 1 - P(survived|man)$

In [None]:
# Complement rule: P(died | man) = 1 - P(survived | man)
p_died_given_man = 1 - cond_alive_man
p_died_given_man

Conditional probability can also be conditioned on more than one event. For instance, you might want to know the probability that someone survived given that they are a woman and in first class. Applying the definition twice gives $P(A|B,C)=\frac{P(A\cap B|C)}{P(B|C)}=\frac{P(A\cap B\cap C)}{P(C)}\cdot \frac{P(C)}{P(B\cap C)}=\frac{P(A\cap B\cap C)}{P(B\cap C)}$

In [None]:
# P(survived | woman, first class)
#   = P(survived and woman and first class) / P(woman and first class)
is_first_class_woman = (df['pclass'] == 1) & (df['sex'] == 'female')

count_woman_firstclass = len(df[is_first_class_woman])
count_woman_alive_firstclass = len(df[is_first_class_woman & (df['survived'] == 1)])

p_bc = count_woman_firstclass / len(df)
p_abc = count_woman_alive_firstclass / len(df)
cond = p_abc / p_bc
print(f'P(survived | woman, first class) = {cond}')

We can directly compare the probability of an outcome under different conditions. You might wonder how class affects the distribution. We can define a function that holds some conditions fixed while we vary one, and then compare the results.

In [None]:
def get_cond_probs(cond_a, cond_b, cond_cs, df):
  """Compute P(A | B, C) for each condition C in ``cond_cs``.

  Each probability is the number of rows of ``df`` satisfying A, B and C
  divided by the number of rows satisfying B and C. This equals
  P(A and B and C) / P(B and C), because the common 1 / len(df) factor
  cancels out.

  Args:
    cond_a: Boolean mask over the rows of ``df`` for the outcome A.
    cond_b: Boolean mask over the rows of ``df`` for the condition held fixed.
    cond_cs: Iterable of boolean masks, one per value of the varied condition C.
    df: DataFrame the masks select rows from.

  Returns:
    A list of floats, one per entry of ``cond_cs`` and in the same order.
    An entry is NaN when no row satisfies both B and C, because the
    conditional probability is undefined in that case.
  """
  probs = []
  for cond_c in cond_cs:
    # B and C is shared by the numerator and the denominator; build it once.
    bc_mask = cond_b & cond_c
    n_bc = len(df[bc_mask])
    if n_bc == 0:
      # Nothing to condition on: report NaN instead of raising ZeroDivisionError.
      probs.append(float('nan'))
      continue
    n_abc = len(df[cond_a & bc_mask])
    probs.append(n_abc / n_bc)
  return probs

In [None]:
# Hold "woman" fixed (B) and sweep the passenger class (C) to see how
# P(survived | woman, class) changes from class to class.
cond_a = df['survived'] == 1
cond_b = df['sex'] == 'female'
cond_cs = [df['pclass'] == class_number for class_number in (1, 2, 3)]

probs = get_cond_probs(cond_a, cond_b, cond_cs, df)
probs

Similarly, we can hold class fixed at first class and vary the sex between female and male.

In [None]:
# Hold "first class" fixed (B) and switch the sex (C): female first, then male.
cond_a = df['survived'] == 1
cond_b = df['pclass'] == 1
cond_cs = [df['sex'] == sex_label for sex_label in ('female', 'male')]
get_cond_probs(cond_a, cond_b, cond_cs, df)

Now you can also iterate over the B conditions (here, each passenger class) to gain even more insight.

In [None]:
# P(survived | pclass, sex) for every class / sex combination.
# The masks are built here from the same label list that is printed, rather
# than relying on `cond_a` / `cond_cs` left over from an earlier cell, so the
# printed labels always match the masks that produced the numbers.
cond_a = df['survived'] == 1
sex = ['female', 'male']
cond_cs = [df['sex'] == s for s in sex]

for pclass in range(1, 3 + 1):
  cond_b = df['pclass'] == pclass
  probs = get_cond_probs(cond_a, cond_b, cond_cs, df)
  for s, prob in zip(sex, probs):
    print(f'P(survived|pclass={pclass},sex={s}) = {prob}')

I like to think of conditional probability as a way to index into data

![conditional prob](https://www.mathbootcamps.com/wp-content/uploads/two-way-table-conditional-probability-example2.png)

In the above example you can directly index into the "given" section and compute 8/45. You can also use the full definition of conditional probability to get the same result: $P(\geq 4 \mid FT) = \frac{P(\geq 4\cap FT)}{P(FT)}=\frac{\frac{8}{58}}{\frac{45}{58}}=\frac{8}{45}$