# Discrete Probabilities

## Data

In [8]:
import pandas as pd

from labm8.py import bazelutil

# Load the baseball dataset from the repository data tree; each row is one
# day, keyed by the "Day" column.
df = pd.read_csv(
    bazelutil.DataPath("phd/datasets/baseball.csv"),
    index_col="Day",
)

# Bare last expression so the notebook renders the frame as a rich table.
df

Unnamed: 0_level_0,Outlook,Temperature,Humidity,Wind,Baseball
Day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
D1,Sunny,Hot,High,Weak,No
D2,Sunny,Hot,High,Strong,No
D3,Overcast,Hot,High,Weak,Yes
D4,Rain,Mild,High,Weak,Yes
D5,Rain,Cool,Normal,Weak,Yes
D6,Rain,Cool,Normal,Strong,No
D7,Overcast,Cool,Normal,Strong,Yes
D8,Sunny,Mild,High,Weak,No
D9,Sunny,Cool,Normal,Weak,Yes
D10,Rain,Mild,Normal,Weak,Yes


## Marginal Probabilities

In [50]:
# Marginal probability of a value = its relative frequency within its column.
# value_counts(normalize=True) looks only at the column of interest, so a
# missing value in an unrelated column cannot skew the result (the previous
# groupby().count().T.mean() averaged the non-null counts of every column).
outlook_probs = df["Outlook"].value_counts(normalize=True)

pSunny = outlook_probs["Sunny"]
pOvercast = outlook_probs["Overcast"]
pRain = outlook_probs["Rain"]

# The three outlooks must cover every day. Compare with a tolerance because a
# sum of binary floating-point fractions is not guaranteed to be exactly 1.
assert abs(pSunny + pOvercast + pRain - 1) < 1e-9

wind_probs = df["Wind"].value_counts(normalize=True)

pWindy = wind_probs["Strong"]
pCalm = wind_probs["Weak"]

# Likewise, "Strong" and "Weak" must be the only wind values present.
assert abs(pWindy + pCalm - 1) < 1e-9

print(f"""\
P(Sunny)        = {pSunny:.3%}
P(Overcast)     = {pOvercast:.3%}
P(Rain)         = {pRain:.3%}

P(Windy)        = {pWindy:.3%}
P(Calm)         = {pCalm:.3%}\
""")

P(Sunny)        = 35.714%
P(Overcast)     = 28.571%
P(Rain)         = 35.714%

P(Windy)        = 42.857%
P(Calm)         = 57.143%


## Conditional Probabilities

In [57]:
# Conditional probability P(Sunny | Calm): restrict the sample space to calm
# (weak-wind) days, then take the fraction of those days that are sunny.
calm_days = df[df["Wind"] == "Weak"]
sunny_calm_days = calm_days[calm_days["Outlook"] == "Sunny"]
pSunnyGivenCalm = len(sunny_calm_days) / len(calm_days)

# Backward-compatible alias: this value was originally stored under a name
# that says "joint" although it is a conditional probability. Keep the old
# name bound so any cell that still reads it continues to work.
pSunnyJointCalm = pSunnyGivenCalm

print(f"""\
P(Sunny | Calm) = {pSunnyGivenCalm:.3%}\
""")

P(Sunny | Calm) = 37.500%


## Joint Probabilities

In [61]:
# Joint probability P(Sunny ∩ Calm), measured directly from the data: the
# share of *all* days that are both sunny and calm (weak wind).
# (Previously this was set to pWindy * pCalm, which is neither the joint
# probability of Sunny and Calm nor used anywhere.)
sunny_and_calm_days = df[(df["Wind"] == "Weak") & (df["Outlook"] == "Sunny")]
pSunnyAndCalm = len(sunny_and_calm_days) / len(df)

# Recover the conditional probability from the joint and the marginal via
# P(A | B) = P(A ∩ B) / P(B). pCalm comes from the marginal-probabilities cell.
print(f"""\
P(Sunny ∩ Calm) = {pSunnyAndCalm:.3%}

P(Sunny | Calm) = P(Sunny ∩ Calm) / P(Calm) = {pSunnyAndCalm / pCalm:.3%}
""")

P(Sunny ∩ Calm) = 21.429%

P(Sunny | Calm) = P(Sunny ∩ Calmn) / P(Calmn) = 37.500%

