In [1]:
import numpy as np
import pandas as pd

np.random.seed(42)

## Exercise 1
- probability(rolling doubles on rolling 2 dice)
- p(e) 
- p(event)
- p(rolling doubles on 2 dice)

In [2]:
rolls = np.random.choice([1, 2, 3, 4, 5, 6], size=(2, 100_000))
rolls

array([[4, 5, 3, ..., 4, 1, 4],
       [5, 5, 2, ..., 4, 4, 6]])

In [3]:
rolls[0].size

100000

In [4]:
# In this approach, the array values at each index are the trial, the event of rolling two dice.
die1 = rolls[0]
die2 = rolls[1]

In [5]:
print(die1[1])
print(die2[1])

5
5


In [6]:
# vectorization here is that we don't have to worry about the index
(die1 == die2).mean()

0.16813

In [7]:
# a Pandas assisted approach to the same problem
rolls = rolls.T # .T means transpose
df = pd.DataFrame(rolls)
df.head(3)

Unnamed: 0,0,1
0,4,5
1,5,5
2,3,2


In [8]:
df["doubles"] = df[0] == df[1]
df.doubles.mean()

0.16813

## Exercise 2

### If you flip 8 coins...
- What is the probability of getting exactly 3 heads? 
- What is the probability of getting more than 3 heads?
- trial is flipping 8 coins

In [9]:
# If you flip 8 coins
# What is the probability of getting exactly 3 heads? 
# What is the probability of getting more than 3 heads?

# Flip 8 coins
n_coins = 8

# With this many simulations
n_simulations = 100_000

# Heads is 1 and Tails is 0.
# flips = np.random.choice([0, 1], n_simulations * n_coins).reshape(n_simulations, n_coins)
flips = np.random.choice([0, 1], size=(n_simulations * n_coins))

In [10]:
flips

array([1, 0, 1, ..., 0, 1, 0])

In [11]:
# Peek at the first row
flips[0]

1

In [12]:
# .sum(axis=1) sums up each row
results = flips.sum(axis=1)
results

AxisError: axis 1 is out of bounds for array of dimension 1

In [None]:
# What is the probability of getting exactly 3 heads? 
(results == 3).mean()

In [None]:
(results == 8).mean()

In [None]:
# What is the probability of getting more than 3 heads?
(results > 3).mean()

## Exercise 3

There are approximitely 3 web development cohorts for every 1 data science cohort at Codeup. 

Assuming that Codeup randomly selects an alumni to put on a billboard, what are the odds that the two billboards I drive past both have data science students on them?


In [None]:
outcomes = np.random.choice(["Web Dev", "Data Science"], size=(100_000, 2), p=[.75, .25])

In [None]:
df = pd.DataFrame(outcomes)
df.columns = ["first_billboard", "second_billboard"]

# both column holds the result of the & operation
# comparisons that return a series of boolean can be combined with &, | for "or"
df["both"] = (df.first_billboard == "Data Science") & (df.second_billboard == "Data Science")
df.head(5)

In [None]:
both = df.both.mean()
f"The probabiility of both being Data Science billboards is {both}"

In [None]:
df.both.value_counts(normalize=True)
# DS, DS,
# DS, WD
# WD, DS
# WD, WD

## Exercise 4
Codeup students buy, on average, 3 poptart packages (+- 1.5) a day from the snack vending machine. 

If on monday the machine is restocked with 17 poptart packages, how likely is it that I will be able to buy some poptarts on Friday afternoon?


In [None]:
# Round the poptart values b/c we can't buy fractions of poptarts from a vending machine
poptarts = np.round(np.random.normal(3, 1.5, size=(100_000, 5)))

In [None]:
print(f"Standard Deviation is {poptarts.std()}")
print(f"Mean is {poptarts.mean()}")

In [None]:
poptarts.min()

In [None]:
# First week's poptart consumption. 
# Each row simulates a week
poptarts[0]

In [None]:
# .sum(axis=1) sums up each row which means the total of poptarts purchased that week
weekly_demand = poptarts.sum(axis=1)
weekly_demand

In [None]:
# how many weeks have less demand than 17 poptarts
(weekly_demand < 17).mean()

## Exercise 5

#### Compare Heights

- Men have an average height of 178 cm and standard deviation of 8cm.
- Women have a mean of 170, sd = 6cm.
- If a man and woman are chosen at random, P(woman taller than man)?


In [None]:
trials = 100_000
m_heights = np.random.normal(178, 8, trials)
f_heights = np.random.normal(170, 6, trials)

In [None]:
f_heights

In [None]:
(m_heights < f_heights)

In [None]:
# p(f_heights > m_heights)
(m_heights < f_heights).mean()

In [None]:
# p(m_heights == f_heights)
f_heights == m_heights

## Exercise 6

When installing anaconda on a student's computer, there's a 1 in 250 chance that the download is corrupted and the installation fails

- What are the odds that after having 50 students download anaconda, no one has an installation issue? 
- 100 students?
- What is the probability that we observe an installation issue within the first 150 students that download anaconda?
- How likely is it that 450 students all download anaconda without an issue?




In [None]:
trials = 10_000

In [None]:
students_per_trial = 50

# 0 is a fail, 1 is success
installs = np.random.choice([0, 1], size=(trials, students_per_trial), p=([1/250, 249/250]))
df = pd.DataFrame(installs)
df.head()

In [None]:
# integer location
# .iloc[rows, columns]
# rows 1 and 2, column 0 through 4
df.iloc[0:2, 0:4]

In [None]:
df["all_good"] = df.sum(axis=1) == students_per_trial
df.head()

In [None]:
df.all_good.mean()

In [None]:
students_per_trial = 100

installs = np.random.choice([0, 1], size=(trials, students_per_trial), p=([1/250, 249/250]))
df = pd.DataFrame(installs)
df["all_good"] = df.sum(axis=1) == students_per_trial
df.all_good.mean()

In [None]:
# What is the probability that we observe an installation issue
# within the first 150 students that download anaconda?
students_per_trial = 150

installs = np.random.choice([0, 1], size=(trials, students_per_trial), p=([1/250, 249/250]))
df = pd.DataFrame(installs)
df["failures"] = df.sum(axis=1) < students_per_trial

f"Average rate of any failure on installs in 150 is {df.failures.mean()}"

In [None]:
students_per_trial = 450

installs = np.random.choice([0, 1], size=(trials, students_per_trial), p=([1/250, 249/250]))
df = pd.DataFrame(installs)
df["all_good"] = df.sum(axis=1) == students_per_trial
df.all_good.mean()

## Exercise 7

- There's a 70% chance on any given day that there will be at least one food truck at Travis Park. 
- However, you haven't seen a food truck there in 3 days. 
- How unlikely is this?



In [None]:
# 1 is a truck, 0 is no truck
trucks = np.random.choice([1, 0], p=[.7, .3], size=(100_000, 3))
df = pd.DataFrame(trucks)
df.columns = ["day_1", "day_2", "day_3"]
df

In [None]:
# Theoretical probability of seeing no food trucks across 3 days
.3 * .3 * .3

In [None]:
df["appearances"] = df.day_1 + df.day_2 + df.day_3
(df.appearances == 0).mean()

In [None]:
# How likely is it that a food truck will show up sometime this week?

trucks = np.random.choice([1, 0], p=[.7, .3], size=(100_000, 7))
df = pd.DataFrame(trucks)
df["appearances"] = df.sum(axis=1) # sum values across the rows
df.head()

In [None]:
(df.appearances > 0).mean()

## If 23 people are in the same room, what are the odds that two of them share a birthday? What if it's 20 people? 40?

In [None]:
# calculate the odds of two people share a birthday
n_simulations = 100_000
n_people = 23
birthdays = np.random.choice(range(365), size=(n_simulations, n_people))
df = pd.DataFrame(birthdays)

# Get the number of unique values per row
df["n_unique"] = df.nunique(axis=1)

df.head(3)

In [None]:
# For exactly 2 people sharing a birthday in a room of 23 people, the number of n_unique values shoudl be 23-2
(df.n_unique == 21).mean()

In [None]:
# calculate the odds of two people share a birthday in a room of 20 people
n_simulations = 100_000
n_people = 20
birthdays = np.random.choice(range(365), size=(n_simulations, n_people))
df = pd.DataFrame(birthdays)

# Get the number of unique values per row
df["n_unique"] = df.nunique(axis=1)

(df.n_unique == n_people - 2).mean()

In [None]:
# calculate the odds of two people share a birthday in a room of 20 people
n_simulations = 100_000
n_people = 40
birthdays = np.random.choice(range(365), size=(n_simulations, n_people))
df = pd.DataFrame(birthdays)

# Get the number of unique values per row
df["n_unique"] = df.nunique(axis=1)

(df.n_unique == n_people - 2).mean()

In [None]:
# In a room of 30 people
# What are the odds that at least 2 people share a birthday

n_simulations = 10_000
n_people = 30
birthdays = np.random.choice(range(365), size=(n_simulations, n_people))
df = pd.DataFrame(birthdays)

# Get the number of unique values per row
df["n_unique"] = df.nunique(axis=1)

(df.n_unique <= n_people - 2).mean()

In [None]:
# In a room of 40 people
# What are the odds that at least 2 people share a birthdau

n_simulations = 10_000
n_people = 40
birthdays = np.random.choice(range(365), size=(n_simulations, n_people))
df = pd.DataFrame(birthdays)

# Get the number of unique values per row
df["n_unique"] = df.nunique(axis=1)

(df.n_unique <= n_people - 2).mean()