In [38]:
import numpy as np
import pandas as pd

# Seed NumPy's legacy global RNG so the np.random.* draws in this notebook are
# repeatable when the cells are run in order on a fresh kernel.
np.random.seed(42)

## Exercise 1
- probability(rolling doubles on rolling 2 dice)
- p(e) 
- p(event)
- p(rolling doubles on 2 dice)

In [46]:
# Simulate 100,000 throws of a pair of dice.
# Shape is (2, 100_000): row 0 is the first die, row 1 the second.
die_faces = [1, 2, 3, 4, 5, 6]
rolls = np.random.choice(die_faces, size=(2, 100_000))
rolls

array([[4, 6, 1, ..., 3, 1, 5],
       [2, 1, 2, ..., 5, 4, 1]])

In [47]:
# Number of entries in the first row, i.e. how many throws were simulated per die
rolls[0].size

100000

In [48]:
# Split the 2 x N array into one vector per die. Position i of both vectors
# together forms trial i: a single throw of the pair of dice.
die1, die2 = rolls[0], rolls[1]

In [49]:
# Show the two faces that make up the trial at index 1, one per line
print(die1[1], die2[1], sep="\n")

6
1


In [50]:
# The comparison is vectorized (no explicit index loop). The mean of the
# resulting booleans is the share of trials that came up doubles.
np.mean(die1 == die2)

0.16591

In [51]:
# a Pandas assisted approach to the same problem
# .T means transpose: one row per trial, one column per die.
# The transpose is passed straight to the DataFrame instead of being assigned
# back to `rolls`, so `rolls` keeps its (2, N) layout and re-running this cell
# (or the earlier `rolls[0]` / `rolls[1]` cells) gives the same result.
df = pd.DataFrame(rolls.T)
df.head(3)

Unnamed: 0,0,1
0,4,2
1,6,1
2,1,2


In [52]:
# Flag each trial (row) where both dice show the same face, then average the flags
df["doubles"] = df[0].eq(df[1])
df["doubles"].mean()

0.16591

## Exercise 2

### If you flip 8 coins...
- What is the probability of getting exactly 3 heads? 
- What is the probability of getting more than 3 heads?
- trial is flipping 8 coins

In [53]:
# If you flip 8 coins
# What is the probability of getting exactly 3 heads?
# What is the probability of getting more than 3 heads?

# Flip 8 coins
n_coins = 8

# With this many simulations
n_simulations = 100_000

# Heads is 1 and Tails is 0.
# Shape (n_simulations, n_coins): each row is one trial of 8 flips. The array
# has to be 2-D so that flips[0] is a whole trial and flips.sum(axis=1) can
# count heads per trial; a flat array of n_simulations * n_coins values has no axis 1.
flips = np.random.choice([0, 1], size=(n_simulations, n_coins))

In [54]:
# Display the simulated flips (NumPy abbreviates large arrays with "...")
flips

array([[1, 0, 1, ..., 0, 0, 1],
       [0, 1, 0, ..., 1, 1, 0],
       [1, 0, 1, ..., 1, 0, 0],
       ...,
       [1, 0, 0, ..., 1, 0, 0],
       [0, 0, 0, ..., 0, 1, 1],
       [0, 1, 0, ..., 0, 1, 1]])

In [55]:
# Peek at the first row: the flips that make up the first trial
flips[0]

array([1, 0, 1, 0, 0, 0, 0, 1])

In [56]:
# Summing along axis 1 collapses each row to its number of heads (heads are coded 1)
results = np.sum(flips, axis=1)
results

array([3, 5, 5, ..., 3, 3, 4])

In [58]:
# What is the probability of getting exactly 3 heads?
# -> share of trials whose head count equals 3
np.mean(results == 3)

0.21746

In [60]:
# Share of trials where the head count is 8, i.e. every one of the 8 coins came up heads
(results == 8).mean()

0.0039

In [61]:
# What is the probability of getting more than 3 heads?
# -> share of trials with 4 or more heads
np.mean(results > 3)

0.63667

## Exercise 3

There are approximately 3 web development cohorts for every 1 data science cohort at Codeup. 

Assuming that Codeup randomly selects an alumnus to put on a billboard, what are the odds that the two billboards I drive past both have data science students on them?


In [65]:
# Each row is one drive past two billboards. A billboard shows a Data Science
# alum with probability 0.25 (1 Data Science cohort for every 3 Web Dev cohorts).
cohorts = ["Web Dev", "Data Science"]
cohort_probs = [.75, .25]
outcomes = np.random.choice(cohorts, size=(100_000, 2), p=cohort_probs)

In [67]:
# One row per simulated drive, one named column per billboard
df = pd.DataFrame(outcomes, columns=["first_billboard", "second_billboard"])

# `both` is True only when each billboard shows a Data Science alum.
# Boolean Series combine element-wise with & ("and") and | ("or").
first_is_ds = df["first_billboard"] == "Data Science"
second_is_ds = df["second_billboard"] == "Data Science"
df["both"] = first_is_ds & second_is_ds
df.head(5)

Unnamed: 0,first_billboard,second_billboard,both
0,Web Dev,Web Dev,False
1,Web Dev,Web Dev,False
2,Web Dev,Web Dev,False
3,Web Dev,Web Dev,False
4,Web Dev,Data Science,False


In [68]:
# Share of simulated drives where both billboards showed a Data Science alum
both = df["both"].mean()
f"The probabiility of both being Data Science billboards is {both}"

'The probabiility of both being Data Science billboards is 0.06317'

In [69]:
# normalize=True returns the proportion of rows per value instead of raw counts
df.both.value_counts(normalize=True)
# The four possible (first, second) billboard pairs; only "DS, DS" makes `both` True:
# DS, DS,
# DS, WD
# WD, DS
# WD, WD

False    0.93683
True     0.06317
Name: both, dtype: float64

## Exercise 4
Codeup students buy, on average, 3 poptart packages (+- 1.5) a day from the snack vending machine. 

If on Monday the machine is restocked with 17 poptart packages, how likely is it that I will be able to buy some poptarts on Friday afternoon?


In [78]:
# Draw daily demand for 100,000 simulated weeks of 5 days each from a normal
# distribution (mean 3, sd 1.5), then round to whole packages because the
# vending machine cannot sell fractions of a poptart package.
# NOTE(review): the normal distribution is unbounded, so some rounded draws can be negative.
daily_draws = np.random.normal(loc=3, scale=1.5, size=(100_000, 5))
poptarts = np.round(daily_draws)

In [79]:
# Sanity check: the rounded draws should stay close to the requested sd (1.5) and mean (3)
overall_sd, overall_mean = poptarts.std(), poptarts.mean()
print(f"Standard Deviation is {overall_sd}")
print(f"Mean is {overall_mean}")

Standard Deviation is 1.5275069880023464
Mean is 3.002098


In [80]:
# Smallest simulated daily value; a negative result here means the normal model
# produced some impossible (below-zero) purchase counts
poptarts.min()

-4.0

In [81]:
# First week's poptart consumption.
# Each row simulates a week; each of its 5 values is one day's purchases
poptarts[0]

array([2., 4., 2., 5., 2.])

In [82]:
# Add up the 5 daily values in each row to get the total packages bought in that week
weekly_demand = np.sum(poptarts, axis=1)
weekly_demand

array([15., 12., 15., ..., 21., 13., 13.])

In [83]:
# Share of simulated weeks whose total demand stays below the 17 packages stocked,
# i.e. weeks in which at least one package is still left
np.mean(weekly_demand < 17)

0.6686

## Exercise 5

#### Compare Heights

- Men have an average height of 178 cm and standard deviation of 8cm.
- Women have a mean of 170, sd = 6cm.
- If a man and woman are chosen at random, P(woman taller than man)?


In [84]:
# One simulated man and one simulated woman per trial, heights in cm
trials = 100_000
m_heights = np.random.normal(loc=178, scale=8, size=trials)
f_heights = np.random.normal(loc=170, scale=6, size=trials)

In [86]:
# Inspect the simulated women's heights
f_heights

array([169.95369631, 167.74097689, 167.12177522, ..., 178.08216366,
       180.94570101, 179.00590344])

In [89]:
# Element-wise comparison: True in the trials where the woman is taller than the man
(m_heights < f_heights)

array([False, False, False, ..., False,  True, False])

In [91]:
# p(f_heights > m_heights): share of trials in which the woman is the taller of the pair
np.mean(f_heights > m_heights)

0.21063

In [94]:
# p(m_heights == f_heights)
# NOTE: this displays the element-wise comparison itself, not a probability;
# append .mean() to get the proportion of exact ties.
f_heights == m_heights

array([False, False, False, ..., False, False, False])

## Exercise 6

When installing anaconda on a student's computer, there's a 1 in 250 chance that the download is corrupted and the installation fails.

- What are the odds that after having 50 students download anaconda, no one has an installation issue? 
- 100 students?
- What is the probability that we observe an installation issue within the first 150 students that download anaconda?
- How likely is it that 450 students all download anaconda without an issue?




In [95]:
# Number of simulated trials (rows) used by the install simulations that follow
trials = 10_000

In [96]:
students_per_trial = 50

# Encode every install as 1 (success) or 0 (failure); a failure has probability 1/250.
# Each row is one trial of `students_per_trial` installs.
install_codes = [0, 1]
install_probs = [1/250, 249/250]
installs = np.random.choice(install_codes, size=(trials, students_per_trial), p=install_probs)
df = pd.DataFrame(installs)
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,40,41,42,43,44,45,46,47,48,49
0,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
1,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
2,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
3,0,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
4,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1


In [104]:
# integer location
# .iloc[rows, columns]
# rows 0 and 1, columns 0 through 3 (the end of each slice is exclusive)
df.iloc[0:2, 0:4]

Unnamed: 0,0,1,2,3
0,1,1,1,1
1,1,1,1,1


In [106]:
# Flag the trials in which every install succeeded (the 0/1 outcomes sum to the row length).
# Only the first `students_per_trial` columns are summed. Summing the whole frame would
# also count the boolean `all_good` column once it exists, so re-running this cell
# would mark every trial False.
df["all_good"] = df.iloc[:, :students_per_trial].sum(axis=1) == students_per_trial
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,41,42,43,44,45,46,47,48,49,all_good
0,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,True
1,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,True
2,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,True
3,0,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,False
4,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,True


In [107]:
# Proportion of trials in which all 50 installs succeeded
df.all_good.mean()

0.8213

In [109]:
# Same experiment with 100 students per trial
students_per_trial = 100

installs = np.random.choice([0, 1], size=(trials, students_per_trial), p=([1/250, 249/250]))
df = pd.DataFrame(installs)
# A trial is "all good" only when every one of its installs is a 1 (success)
df["all_good"] = df.eq(1).all(axis=1)
df["all_good"].mean()

0.6674

In [121]:
# What is the probability that we observe an installation issue
# within the first 150 students that download anaconda?
students_per_trial = 150

installs = np.random.choice([0, 1], size=(trials, students_per_trial), p=([1/250, 249/250]))
df = pd.DataFrame(installs)
# A trial contains a failure as soon as any one of its installs is 0
df["failures"] = df.eq(0).any(axis=1)

f"Average rate of any failure on installs in 150 is {df.failures.mean()}"

'Average rate of any failure on installs in 150 is 0.4477'

In [122]:
# How likely is it that 450 students all download anaconda without an issue?
students_per_trial = 450

installs = np.random.choice([0, 1], size=(trials, students_per_trial), p=([1/250, 249/250]))
# True for the trials (rows) in which every install is a 1 (success)
every_install_ok = (installs == 1).all(axis=1)
df = pd.DataFrame(installs)
df["all_good"] = every_install_ok
df["all_good"].mean()

0.1644

## Exercise 7

- There's a 70% chance on any given day that there will be at least one food truck at Travis Park. 
- However, you haven't seen a food truck there in 3 days. 
- How unlikely is this?



In [124]:
# 1 is a truck, 0 is no truck. Each row is one simulated run of three days,
# with a 70% chance of a truck on any given day.
trucks = np.random.choice([1, 0], p=[.7, .3], size=(100_000, 3))
df = pd.DataFrame(trucks, columns=["day_1", "day_2", "day_3"])
df

Unnamed: 0,day_1,day_2,day_3
0,1,1,1
1,0,0,0
2,1,0,1
3,1,1,0
4,1,1,1
...,...,...,...
99995,1,1,0
99996,0,0,1
99997,1,1,1
99998,1,1,1


In [127]:
# Theoretical probability of seeing no food trucks across 3 days:
# each day has a 1 - 0.7 = 0.3 chance of no truck, and the days are treated as independent
.3 * .3 * .3

0.027

In [126]:
# Count the days (out of three) on which a truck showed up,
# then take the share of trials where that count is zero
day_cols = ["day_1", "day_2", "day_3"]
df["appearances"] = df[day_cols].sum(axis=1)
df["appearances"].eq(0).mean()

0.02691

In [128]:
# How likely is it that a food truck will show up sometime this week?

# One row per simulated 7-day week; `appearances` is the row total,
# i.e. the number of days in that week with a truck
trucks = np.random.choice([1, 0], p=[.7, .3], size=(100_000, 7))
df = pd.DataFrame(trucks).assign(appearances=lambda week: week.sum(axis=1))
df.head()

Unnamed: 0,0,1,2,3,4,5,6,appearances
0,1,1,1,1,0,1,1,6
1,1,1,1,0,1,0,1,5
2,0,0,1,1,1,1,1,5
3,1,0,1,1,1,1,1,6
4,0,1,1,0,1,1,1,5


In [129]:
# Share of simulated weeks with at least one truck appearance
(df.appearances > 0).mean()

0.99981

## If 23 people are in the same room, what are the odds that two of them share a birthday? What if it's 20 people? 40?

In [36]:
# Simulate rooms of people to estimate the odds that two of them share a birthday.
# Each row is one room; each value is a day of the year coded 0-364.
n_simulations = 10_000
n_people = 23
days_of_year = np.arange(365)
birthdays = np.random.choice(days_of_year, size=(n_simulations, n_people))
df = pd.DataFrame(birthdays)
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,13,14,15,16,17,18,19,20,21,22
0,307,277,131,252,175,249,337,114,345,59,...,129,220,334,209,139,12,354,314,312,300
1,151,312,205,153,360,201,220,357,210,21,...,30,61,50,135,0,17,222,16,297,288
2,18,85,79,336,168,80,96,76,235,215,...,103,152,213,54,310,60,75,287,217,198
3,358,340,268,143,215,354,288,219,63,274,...,47,167,176,270,330,246,46,62,346,111
4,344,91,157,173,75,222,304,62,189,353,...,242,232,140,117,173,337,208,236,285,21
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,354,191,202,40,136,155,317,6,359,320,...,330,128,111,150,44,180,123,363,71,85
9996,182,181,29,80,362,156,294,240,323,216,...,59,123,303,346,93,75,0,239,309,232
9997,5,360,80,90,104,1,224,49,306,202,...,271,347,153,127,171,253,103,254,62,31
9998,69,111,272,93,75,211,323,151,264,115,...,158,69,195,97,198,173,202,20,246,35


In [37]:
# Get the number of unique values per row (distinct birthdays in each simulated room).
# Only the first `n_people` columns are counted, so the result does not change if this
# cell is re-run after `n_unique` has been added to the frame (otherwise the new column
# would be included in its own count).
df["n_unique"] = df.iloc[:, :n_people].nunique(axis=1)
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,14,15,16,17,18,19,20,21,22,n_unique
0,307,277,131,252,175,249,337,114,345,59,...,220,334,209,139,12,354,314,312,300,23
1,151,312,205,153,360,201,220,357,210,21,...,61,50,135,0,17,222,16,297,288,23
2,18,85,79,336,168,80,96,76,235,215,...,152,213,54,310,60,75,287,217,198,23
3,358,340,268,143,215,354,288,219,63,274,...,167,176,270,330,246,46,62,346,111,23
4,344,91,157,173,75,222,304,62,189,353,...,232,140,117,173,337,208,236,285,21,22
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,354,191,202,40,136,155,317,6,359,320,...,128,111,150,44,180,123,363,71,85,23
9996,182,181,29,80,362,156,294,240,323,216,...,123,303,346,93,75,0,239,309,232,23
9997,5,360,80,90,104,1,224,49,306,202,...,347,153,127,171,253,103,254,62,31,23
9998,69,111,272,93,75,211,323,151,264,115,...,69,195,97,198,173,202,20,246,35,22
