In [2]:
import pandas as pd
import numpy as np

1. How likely is it that you roll doubles when rolling two dice?

In [3]:
# Simulation grid: one row per trial, one column per die
n_dice = 2
n_trials = 100_000

# Draw every roll in a single call, already shaped (trials, dice)
rolls = np.random.choice([1, 2, 3, 4, 5, 6], size=(n_trials, n_dice))
rolls

array([[4, 5],
       [3, 1],
       [3, 3],
       ...,
       [3, 3],
       [4, 5],
       [3, 4]])

In [4]:
# Fraction of trials in which the first and second die show the same face
np.mean(rolls[:, 0] == rolls[:, 1])

0.16785

In [5]:
# pandas solution
# Use the same 100,000 trials as the rest of the notebook.
# (Beware: the literal 100_00 evaluates to 10,000, not 100,000.)
n_rolls = 100_000
faces = [1, 2, 3, 4, 5, 6]

# Build both dice columns in one constructor call instead of growing an empty frame;
# die1 is drawn before die2, one independent draw per row.
rolls = pd.DataFrame({
    'die1': np.random.choice(faces, size=n_rolls),
    'die2': np.random.choice(faces, size=n_rolls),
})
rolls.head()

Unnamed: 0,die1,die2
0,3,1
1,5,5
2,1,3
3,2,1
4,5,2


In [6]:
# Flag the rows where both dice match, then take the share of flagged rows
rolls['is_pair'] = rolls['die1'].eq(rolls['die2'])
rolls['is_pair'].mean()

0.1708

In [7]:
# numpy solution
# Two independent arrays of 100,000 rolls each, matching the trial count used elsewhere.
# (Beware: the literal 100_00 evaluates to 10,000, not 100,000.)
a = np.random.choice([1, 2, 3, 4, 5, 6], size=100_000)
b = np.random.choice([1, 2, 3, 4, 5, 6], size=100_000)

# Element-wise comparison gives a boolean array; its mean is the estimated P(doubles)
(a == b).mean()

0.1668

2. If you flip 8 coins, what is the probability of getting exactly 3 heads? What is the probability of getting more than 3 heads?

In [8]:
# Flips are encoded as integers: 1 = heads, 0 = tails
n_coins = ncols = 8          # one column per coin
n_trials = nrows = 100_000   # one row per simulated experiment

# Draw the whole (trials, coins) grid in one call
flips = np.random.choice([1, 0], size=(nrows, ncols))
flips

array([[1, 1, 1, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 1, 1],
       [0, 1, 1, ..., 0, 1, 1],
       ...,
       [0, 0, 1, ..., 1, 1, 1],
       [1, 0, 0, ..., 0, 1, 1],
       [1, 0, 1, ..., 1, 0, 1]])

In [9]:
# Summing across the columns (axis=1) gives the number of heads in each trial
number_of_heads = np.sum(flips, axis=1)
number_of_heads

array([4, 4, 6, ..., 4, 3, 5])

In [10]:
# Share of trials with exactly 3 heads
np.mean(number_of_heads == 3)

0.21801

In [11]:
# Share of trials with more than 3 heads
np.mean(number_of_heads > 3)

0.63793

3. There are approximately 3 web development cohorts for every 1 data science cohort at Codeup. Assuming that Codeup randomly selects an alum to put on a billboard, what are the odds that the two billboards I drive past both have data science students on them?

In [12]:
n_billboard = 2
n_trials = 100_000

# 3 web development cohorts for every 1 data science cohort -> 75% / 25%
programs = ['webdev', 'ds']
weights = [.75, .25]

# One row per drive, one column per billboard
cohort = np.random.choice(programs, size=(n_trials, n_billboard), p=weights)
cohort

array([['webdev', 'webdev'],
       ['webdev', 'webdev'],
       ['webdev', 'webdev'],
       ...,
       ['webdev', 'webdev'],
       ['webdev', 'ds'],
       ['ds', 'webdev']], dtype='<U6')

In [13]:
# Wrap the simulated billboards in a frame with descriptive column names
df = pd.DataFrame(cohort, columns=['first_billboard', 'second_billboard'])
df.head()

Unnamed: 0,first_billboard,second_billboard
0,webdev,webdev
1,webdev,webdev
2,webdev,webdev
3,webdev,webdev
4,webdev,webdev


In [14]:
# A drive counts only when both billboards show a data science student
billboards = df[['first_billboard', 'second_billboard']]
df['both_ds'] = billboards.eq('ds').all(axis=1)
df['both_ds'].mean()

0.06279

In [15]:
# Analytical check: two independent 25% events
0.25 ** 2

0.0625

4. Codeup students buy, on average, 3 poptart packages (± 1.5) a day from the snack vending machine. If on Monday the machine is restocked with 17 poptart packages, how likely is it that I will be able to buy some poptarts on Friday afternoon?

In [16]:
# Daily demand parameters (packages per day)
st_dev = 1.5
mean = 3

# 100,000 simulated weeks x 5 days; demand is rounded to whole packages
poptarts = np.random.normal(loc=mean, scale=st_dev, size=(100_000, 5)).round()
poptarts.mean()

3.001486

In [17]:
# Peek at the first simulated week (one value per day)
poptarts[0, :]

array([1., 5., 0., 3., 3.])

In [18]:
# Each row is one simulated week; summing across axis=1 gives that week's total demand
weekly_demand = np.sum(poptarts, axis=1)

In [19]:
# When weekly demand is below 17, the machine (stocked with 17 packages)
# still has poptarts left, so the share of such weeks is the estimated
# probability of being able to buy some on Friday.
np.mean(weekly_demand < 17)

0.66983

5. Compare Heights

    - Men have an average height of 178 cm and standard deviation of 8cm.
    - Women have a mean of 170, sd = 6cm.
    - If a man and woman are chosen at random, P(woman taller than man)?

In [20]:
# Distribution parameters (cm)
men_avg, m_sd = 178, 8
women_avg, w_sd = 170, 6

# Draw the men first, then the women; element i of each array forms one random pair
men = np.random.normal(loc=men_avg, scale=m_sd, size=100000)
women = np.random.normal(loc=women_avg, scale=w_sd, size=100000)

# Share of pairs in which the woman is taller than the man
np.mean(women > men)

0.20993

6. When installing anaconda on a student's computer, there's a 1 in 250 chance that the download is corrupted and the installation fails. What are the odds that after having 50 students download anaconda, no one has an installation issue? 100 students?

    - What is the probability that we observe an installation issue within the first 150 students that download anaconda?

    - How likely is it that 450 students all download anaconda without an issue?

In [21]:
# 50 students: 1 = successful install, 0 = corrupted download
n_students = 50    # columns: one per student
n_trials = 100000  # rows: one per simulated group
chance = [249 / 250, 1 / 250]

# Draw the whole (trials, students) grid in one call
download = np.random.choice([1, 0], size=(n_trials, n_students), p=chance)
download

array([[1, 1, 1, ..., 1, 1, 1],
       [1, 1, 1, ..., 1, 1, 1],
       [1, 1, 1, ..., 1, 1, 1],
       ...,
       [1, 1, 1, ..., 1, 1, 1],
       [1, 1, 1, ..., 1, 1, 1],
       [1, 1, 1, ..., 1, 1, 1]])

In [22]:
# A row made up entirely of 1s means all 50 students installed without an issue
download.all(axis=1).mean()

0.81831

In [23]:
# 100 students: 1 = successful install, 0 = corrupted download
n_students = 100   # columns: one per student
n_trials = 100000  # rows: one per simulated group
chance = [249 / 250, 1 / 250]

download = np.random.choice([1, 0], size=(n_trials, n_students), p=chance)

# A row made up entirely of 1s means nobody in that group had an issue
download.all(axis=1).mean()

0.67399

In [24]:
# 150 students: probability that at least one installation fails
n_students = 150   # columns: one per student
n_trials = 100000  # rows: one per simulated group
chance = [249 / 250, 1 / 250]

download = np.random.choice([1, 0], size=(n_trials, n_students), p=chance)

# Complement of "every student succeeded" (row is all 1s)
1 - download.all(axis=1).mean()

0.45216999999999996

In [25]:
# 450 students: probability that everyone installs without an issue
n_students = 450   # columns: one per student
n_trials = 100000  # rows: one per simulated group
chance = [249 / 250, 1 / 250]

download = np.random.choice([1, 0], size=(n_trials, n_students), p=chance)

# A row made up entirely of 1s means all 450 students succeeded
download.all(axis=1).mean()

0.16641

7. There's a 70% chance on any given day that there will be at least one food truck at Travis Park. However, you haven't seen a food truck there in 3 days. How unlikely is this?

    - How likely is it that a food truck will show up sometime this week?

In [26]:
# 70% chance of at least one truck on any given day
n_days = 3          # columns: one per day
n_trials = 1000000  # rows: one per simulated 3-day stretch
chance = [.7, .3]

food_truck = np.random.choice(['truck', 'none'], size=(n_trials, n_days), p=chance)
food_truck

array([['truck', 'truck', 'none'],
       ['none', 'truck', 'none'],
       ['truck', 'truck', 'truck'],
       ...,
       ['truck', 'truck', 'none'],
       ['none', 'truck', 'truck'],
       ['truck', 'none', 'none']], dtype='<U5')

In [27]:
# Share of simulated 3-day stretches in which every day is 'none'
(food_truck == 'none').all(axis=1).mean()

0.027124

In [28]:
# Same simulation with pandas: 1 = truck showed up, 0 = no truck
trucks = np.random.choice([1, 0], size=(100_000, 3), p=[.7, .3])
df = pd.DataFrame(trucks, columns=['day_1', 'day_2', 'day_3'])
df.head()

Unnamed: 0,day_1,day_2,day_3
0,1,1,1
1,1,1,1
2,1,1,1
3,1,1,1
4,1,1,1


In [29]:
# Number of days (out of three) on which a truck appeared
df['appear'] = df[['day_1', 'day_2', 'day_3']].sum(axis=1)
df.head()

Unnamed: 0,day_1,day_2,day_3,appear
0,1,1,1,3
1,1,1,1,3
2,1,1,1,3
3,1,1,1,3
4,1,1,1,3


In [30]:
# Share of simulated stretches with zero truck appearances
df['appear'].eq(0).mean()

0.02696

In [31]:
# How likely is it that a food truck will show up sometime this week?
# Simulate 7-day weeks: 1 = truck showed up, 0 = no truck
trucks = np.random.choice([1, 0], size=(100_000, 7), p=[.7, .3])

# 'appear' holds the number of days in the week with a truck
df = pd.DataFrame(trucks).assign(appear=trucks.sum(axis=1))
df['appear'].gt(0).mean()

0.99983

8. If 23 people are in the same room, what are the odds that two of them share a birthday? What if it's 20 people? 40?

In [40]:
# 23 people
n_people = 23      # columns: one per person in the room
n_trials = 100000  # rows: one per simulated room

# Birthdays are days of the year, 1 through 365
days_of_year = np.arange(1, 366)
birthday = np.random.choice(days_of_year, size=(n_trials, n_people))
df = pd.DataFrame(birthday)
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,13,14,15,16,17,18,19,20,21,22
0,245,24,12,52,117,208,259,50,312,51,...,123,303,109,331,226,267,144,146,61,329
1,338,102,345,2,274,340,140,218,319,31,...,124,167,102,347,59,166,312,254,357,77
2,205,132,329,361,323,179,18,343,7,238,...,359,185,287,144,14,262,122,270,32,213
3,318,349,22,281,216,96,289,351,72,21,...,42,187,207,84,169,12,99,243,328,234
4,55,222,353,78,88,207,254,27,66,201,...,103,93,230,233,220,180,310,271,300,23


In [41]:
# Get the number of unique values per row (per observation).
# If the number of unique values == the number of people in the room,
# then everybody has a different birthday.
# Exclude any existing n_unique column so re-running this cell never counts it as a birthday.
days = df.drop(columns='n_unique', errors='ignore').to_numpy()

# Vectorised distinct count: after sorting a row, each position where neighbouring
# values differ starts a new distinct value (gives the same counts as a row-wise nunique).
df['n_unique'] = (np.diff(np.sort(days, axis=1), axis=1) != 0).sum(axis=1) + 1
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,14,15,16,17,18,19,20,21,22,n_unique
0,245,24,12,52,117,208,259,50,312,51,...,303,109,331,226,267,144,146,61,329,23
1,338,102,345,2,274,340,140,218,319,31,...,167,102,347,59,166,312,254,357,77,22
2,205,132,329,361,323,179,18,343,7,238,...,185,287,144,14,262,122,270,32,213,23
3,318,349,22,281,216,96,289,351,72,21,...,187,207,84,169,12,99,243,328,234,23
4,55,222,353,78,88,207,254,27,66,201,...,93,230,233,220,180,310,271,300,23,22


In [42]:
# n_unique == 23 means nobody shares a birthday, so any other count means a shared one
df['n_unique'].ne(23).mean()

0.50656

In [36]:
# 20 people
n_trials = 100000 # rows
n_people = 20 # columns

# Each row is one simulated room; each column is one person's birthday (day 1-365)
birthday = np.random.choice(range(1,366), size=(n_trials, n_people))
df = pd.DataFrame(birthday)

# Vectorised distinct count per room: after sorting a row, each position where
# neighbouring values differ starts a new distinct value. This gives the same counts
# as df.nunique(axis=1) without a per-row pass over 100,000 rows.
sorted_days = np.sort(birthday, axis=1)
df['n_unique'] = (np.diff(sorted_days, axis=1) != 0).sum(axis=1) + 1

# Fewer distinct birthdays than people means at least two people share one
(df.n_unique != n_people).mean()

0.41203

In [37]:
# 40 people
n_trials = 100000 # rows
n_people = 40 # columns

# Each row is one simulated room; each column is one person's birthday (day 1-365)
birthday = np.random.choice(range(1,366), size=(n_trials, n_people))
df = pd.DataFrame(birthday)

# Vectorised distinct count per room: after sorting a row, each position where
# neighbouring values differ starts a new distinct value. This gives the same counts
# as df.nunique(axis=1) without a per-row pass over 100,000 rows.
sorted_days = np.sort(birthday, axis=1)
df['n_unique'] = (np.diff(sorted_days, axis=1) != 0).sum(axis=1) + 1

# Fewer distinct birthdays than people means at least two people share one
(df.n_unique != n_people).mean()

0.89275