In [1]:
# Let's do our imports:
# matplotlib inline for notebook visualization display
%matplotlib inline
# numpy for matrix manipulation
import numpy as np
# pandas for dataframe manipulation
import pandas as pd
# curriculum example visualizations
import viz 
# and setting our random seed
np.random.seed(1349)

### Question #1:
## How likely is it that you roll doubles when rolling two dice?

Mentally visualize the situation:

Two dice, each have six sides.

The probability is evenly distributed amongst six sides for each die

In this way, you have an equal probability of getting any of the six sides of each die on a given roll.
Therefore, rolling on a specific number is a 1/6 probability.

There are 36 possible outcomes of dice rolls (6x6)

And there are 6 possible ways to get doubles (11,22,33,44,55,66)

6/36 = 1/6

making for a probability of roughly 0.167

<img src="http://www.stayorswitch.com/blog/wp-content/uploads/2014/06/Screen-Shot-2016-10-27-at-11.39.17-PM.png">

In [2]:
# Let's do it with a simulation in Python:

# Represent our data's possible outcomes:
outcomes = [1,2,3,4,5,6]
# Create the data!
n_rows = 1_000_000
n_cols = 2

In [3]:
# Let's get our rolls. We'll make a simulation of 1 million trials or simulated rolls for two dice
rolls = np.random.choice(outcomes, size=(n_rows, n_cols))

In [4]:
rolls[:5]

array([[3, 2],
       [5, 3],
       [4, 4],
       [6, 2],
       [1, 1]])

In [5]:
rolls.shape

(1000000, 2)

Using a sum isn't the best option here, since we are looking for two matching elements, or the number of unique elements.

In [8]:
len(np.unique(rolls[4]))

1

In [16]:
# Let's use a list comprehension: 
# a list of the length of the uniques for each instance for the full number of simulations by index,
# but only if the number of uniques is 1
dubs = [len(np.unique(rolls[n])) for n in range(0, n_rows-1) if len(np.unique(rolls[n])) ==1]

In [17]:
len(dubs)

166136

In [18]:
# The length of this is going to be the number of times we rolled doubles, and we can divide that by the total number of simulations:
calculated_prob = len(dubs) / len(rolls)

In [19]:
calculated_prob

0.166136

In [11]:

print(f'The probability that we will flip at least 3 heads over {n_cols} coins is {calculated_prob}')

The probability that we will flip at least 3 heads over 2 coins is 1.0


### Question #2:
### If you flip 8 coins, what is the probability of getting exactly 3 heads? What is the probability of getting more than 3 heads?

Mentally visualize the situation:
Eight coins, each have two sides.
The probability on a "fair" coin is distributed evenly among the two sides on a given flip
The probability of getting H or T is equal, 1/2
Order does not matter here; it does not matter *when* the heads come up in the mix
Situation = {3H, 5T}

<img src="https://i.ytimg.com/vi/qyd1bQlPW-8/hqdefault.jpg">

In [None]:
# number of ways that we could get three heads out of eight flips, 
# divided by number of possible flip outcomes of eight flips (2 * 2 * 2 * 2 * 2 * 2 * 2 * 2)

In [None]:
# 8C3/(2^8) = (8!/3!(8-3)!)/2^8 = 7/32 = ~0.219

In [None]:
# Does that look a little confusing? Let's do it with a simulation in Python!

In [20]:
# Let's make a million simulated flips of 8 trials, or independent coins.
n_rows = 1_000_000
n_cols = 8
heads = 1
tails = 0
flips = np.random.choice([heads, tails], size=(n_rows, n_cols))

In [21]:
flips

array([[1, 0, 0, ..., 1, 1, 1],
       [1, 1, 1, ..., 1, 0, 1],
       [0, 0, 0, ..., 1, 0, 1],
       ...,
       [0, 0, 0, ..., 0, 1, 0],
       [1, 1, 0, ..., 1, 0, 1],
       [1, 1, 0, ..., 0, 1, 0]])

In [23]:
# Since we assigned heads as a value of 1, the sum of any given row of 8 trials will be 3 if there were three heads!
numheads = flips.sum(axis=1)

In [25]:
calculated_prob = (numheads == 3).mean()

# And if we take the average number of times where that sum equaled 3:
print(f'The probability that we will flip exactly 3 heads over {n_cols} coins is {calculated_prob}')

The probability that we will flip exactly 3 heads over 8 coins is 0.218409


In [None]:
# Ta Da! We did the thing! Congratulations!! We subverted math with programming! Computer!

#### And for the second part? If the sum is equal or over to 3, we know that we rolled at least 3 heads, so:

In [26]:
calculated_prob = (numheads >= 3).mean()
print(f'The probability that we will flip at least 3 heads over {n_cols} coins is {calculated_prob}')

The probability that we will flip at least 3 heads over 8 coins is 0.856079


### Question #3:
## There are approximitely 3 web development cohorts for every 1 data science cohort at Codeup. Assuming that Codeup randomly selects an alumni to put on a billboard, what are the odds that the two billboards I drive past both have data science students on them?
#### Mentally visualize the situation:
3 Web Dev cohorts for every 1 Data Science cohort, which is a ratio of 3:1,

or think of it this way:

each sign is a biased coin flip, where we know we have a 1 out of 4 chance of getting a data science student

In [27]:
# theoretical probability:
(1/4) * (1/4)

0.0625

In [28]:
n_rows = 1_000_000
n_cols = 2
prob_ds = 0.25

In [29]:
data = np.random.random((n_rows, n_cols))

In [32]:
((data < prob_ds).sum(axis=1) == 2).mean()

0.062592

## Question 4:
###  Codeup students buy, on average, 3 poptart packages (+- 1.5) a day from the snack vending machine. If on monday the machine is restocked with 17 poptart packages, how likely is it that I will be able to buy some poptarts on Friday afternoon?

In [34]:
# average number of poptarts consumed:

pop_avg = 3

# deviation of potars: 1.5

pop_std = 1.5
n_cols = 5
n_rows = 1_000_000
simulated_consumed_potars = np.random.normal(pop_avg, pop_std, size=(n_rows, n_cols))

In [35]:
simulated_consumed_potars

array([[ 2.08937582,  0.57041803,  2.1655974 ,  5.36198635,  1.85548684],
       [ 1.87487985,  4.33936208,  3.65843688,  1.17990199,  2.64849652],
       [ 2.82825378,  4.23193019,  3.33124924, -0.10144635,  3.30713518],
       ...,
       [ 3.40889516, -0.00598547,  2.46620979,  2.54324558,  2.32343313],
       [ 1.90065327,  4.27512477,  3.0115505 ,  5.29584011,  3.72412469],
       [ 3.06469503,  5.27239628,  5.21644994,  2.54622125,  1.23641873]])

In [37]:
simulated_consumed_potars.sum(axis=1)

array([12.04286444, 13.70107733, 13.59712203, ..., 10.73579819,
       18.20729333, 17.33618123])

In [38]:
calculated_prob = (simulated_consumed_potars.sum(axis=1) <= 16).mean()
print(f'The probability that there will still be poptarts in the vending machine after {n_cols} days is {calculated_prob}')

The probability that there will still be poptarts in the vending machine after 5 days is 0.617463


## Question 5:

### Compare Heights: 
 - Men have an average height of 178 cm and standard deviation of 8cm. 

 - Women have a mean of 170, sd = 6cm. 

If a man and woman are chosen at random, P(woman taller than man)?

In [40]:
men_avg = 178
men_std = 8
wmn_avg = 170
wmn_std = 6

In [41]:
# Since we have an average and a standard deviation, let's use np.random.normal

In [42]:
s_men = np.random.normal(men_avg, men_std, 1_000_000)

In [45]:
s_men[:5]

array([169.06372258, 175.13212603, 178.98027681, 175.70583194,
       185.37511168])

In [46]:
s_wmn = np.random.normal(wmn_avg, wmn_std, 1_000_000)

array([ True, False, False, ..., False, False, False])

In [48]:
calculated_prob = (s_wmn > s_men).mean()
print(f'The probability that we will have a woman taller than a man presuming a normal distribution is {calculated_prob}')

The probability that we will have a woman taller than a man presuming a normal distribution is 0.211925


## Question 6:

### When installing anaconda on a student's computer, there's a 1 in 250 chance that the download is corrupted and the installation fails. What are the odds that after having 50 students download anaconda, no one has an installation issue? 100 students?

### What is the probability that we observe an installation issue within the first 150 students that download anaconda?

### How likely is it that 450 students all download anaconda without an issue?

In [49]:
nrows = 1_000_000

# n_cols in this case is going to be the number of students installing Anaconda.

n_cols = 50
conda_failure = 1
great_success = 0
prob_failure = (1/250)

In [50]:
prob_failure

0.004

In [51]:
installs = np.random.random((nrows, n_cols))

In [53]:
((installs < prob_failure).sum(axis=1) == 0).mean()

0.818334

In [54]:
nrows = 1_000_000

# n_cols in this case is going to be the number of students installing Anaconda.

n_cols = 100
conda_failure = 1
great_success = 0
prob_failure = (1/250)

In [55]:
prob_failure

0.004

In [56]:
installs = np.random.random((nrows, n_cols))

In [57]:
((installs < prob_failure).sum(axis=1) == 0).mean()

0.670379

In [62]:
nrows = 1_000_000

# n_cols in this case is going to be the number of students installing Anaconda.

n_cols = 450
conda_failure = 1
great_success = 0
prob_failure = (1/250)

In [63]:
prob_failure

0.004

In [64]:
installs = np.random.random((nrows, n_cols))

In [66]:
((installs < prob_failure).sum(axis=1) == 0).mean()

0.16446

In [58]:
nrows = 1_000_000

# n_cols in this case is going to be the number of students installing Anaconda.

n_cols = 150
conda_failure = 1
great_success = 0
prob_failure = (1/250)

In [59]:
prob_failure

0.004

In [60]:
installs = np.random.random((nrows, n_cols))

In [61]:
((installs < prob_failure).sum(axis=1) > 0).mean()

0.451436

In [None]:
# The one in 250 is going to come up with our probability bias for the two outcomes.  
# 1/250 = 0.004 probability that we will have an anaconda failure.



In [None]:
calculated_prob = 
print(f'The probability that we will have one or more failure over {n_cols} is {calculated_prob}')

## Question 7:
### There's a 70% chance on any given day that there will be at least one food truck at Travis Park. However, you haven't seen a food truck there in 3 days. How unlikely is this?

### How likely is it that a food truck will show up sometime this week?

In [None]:
# You havent been to Travis Park in like a year because we're in the middle of a pancetta and you're 
# attending codeup from inside your home, so its 0% likely, congratulations.

In [None]:
# Let's pretend its regular times for the sake of doing some statistics, though.
# We are still looking at these like independent events, so:
# There either will be or will not be a food truck, with a probability of 0.7 in favor of there being a food truck.
# 3 days of the week have passed, with two more left, assuming a regular business week.

In [67]:
n_rows = 1_000_000
n_cols = 3
food_truck = 1
no_truck = 0
truck_prob = 0.7

In [68]:
data = np.random.random((n_rows, n_cols))

In [69]:
truck_prob = 0.7

In [70]:
((data < truck_prob).sum(axis=1) == 0).mean()

0.026799

In [71]:
calculated_prob = ((data < truck_prob).sum(axis=1) == 0).mean()
print(f'The probability that we will not have seen a food truck over the course of {n_cols} days is {calculated_prob}')

The probability that we will not have seen a food truck over the course of 3 days is 0.026799


In [None]:
# The presence of a food truck is not dependent on whether or not one showed up on the previous day, 
# its independent.  Let's see what its like for the last two days

In [104]:
n_rows = 1_000_000
n_cols = 2
food_truck = 1
no_truck = 0
truck_prob = 0.7

In [105]:
lunch_days = np.random.random((n_rows, n_cols))

In [108]:
calculated_prob = ((lunch_days < truck_prob).sum(axis=1) > 0).mean() 

In [109]:
print(f'The probability that we have seen a food truck over the course of {n_cols} days is {calculated_prob}')

The probability that we have seen a food truck over the course of 2 days is 0.909307


## Question 8:
### If 23 people are in the same room, what are the odds that two of them share a birthday? What if it's 20 people? 40?

In [None]:
# 365 days in a year (typically)
# 23 people in the room
# we want an instance where both are the same number!

# Hey, this is exactly the same as our first problem with a few extra steps!

In [77]:
# Represent our data's possible outcomes, the number of days in a year
# People born on leap days don't actually exist, so we are going to exclude them here:

outcomes = range(0, 365)
# Create the data!
n_trials = 23
n_simulations = 1_000_000 

In [78]:
# Let's get our simulations. We'll make a simulation of 1 million classrooms of 23 students.
#
# outcomes: possible unique days of the year that a person could have.
# n_simulations: the number of simulated classroom trials
# n_trials: the number of student birthdays
#

In [79]:
classrooms = np.random.choice(outcomes, size=(n_simulations, n_trials))

##### Great, so we can say that when len(np.unique()) == 22 or less, we have a situation of doubles)

#### Let's make a list of every instance where this is the case in our array of simulated classes:

In [None]:
# Let's use a list comprehension: 
# a list of the length of the uniques for each instance for the full number of simulations by index, 
# but only if the number of uniques is less than the number of students in the class

In [86]:
list_of_twin_bdays = [len(np.unique(classrooms[n])) for n in range(0, n_simulations-1) if len(np.unique(classrooms[n])) < 23]

#### The length of this is going to be the number of times we had a class with shared birthdays, and we can divide that by the total number of simulations:

In [87]:
prop_twinsies = len(list_of_twin_bdays) / n_simulations
print(f'The probability that we will have one or more shared birthdays over {n_trials} students is {prop_twinsies}')

The probability that we will have one or more shared birthdays over 23 students is 0.507283


### 20?

In [88]:
# Represent our data's possible outcomes, the number of days in a year
# People born on leap days don't actually exist, so we are going to exclude them here:

outcomes = range(0, 365)
# Create the data!
n_trials = 20
n_simulations = 1_000_000 

In [89]:
# Let's get our simulations. We'll make a simulation of 1 million classrooms of 23 students.
#
# outcomes: possible unique days of the year that a person could have.
# n_simulations: the number of simulated classroom trials
# n_trials: the number of student birthdays
#

In [90]:
classrooms = np.random.choice(outcomes, size=(n_simulations, n_trials))

##### Great, so we can say that when len(np.unique()) == 22 or less, we have a situation of doubles)

#### Let's make a list of every instance where this is the case in our array of simulated classes:

In [91]:
# Let's use a list comprehension: 
# a list of the length of the uniques for each instance for the full number of simulations by index, 
# but only if the number of uniques is less than the number of students in the class

In [92]:
list_of_twin_bdays = [len(np.unique(classrooms[n])) for n in range(0, n_simulations-1) if len(np.unique(classrooms[n])) < 20]

#### The length of this is going to be the number of times we had a class with shared birthdays, and we can divide that by the total number of simulations:

In [93]:
prop_twinsies = len(list_of_twin_bdays) / n_simulations
print(f'The probability that we will have one or more shared birthdays over {n_trials} students is {prop_twinsies}')

The probability that we will have one or more shared birthdays over 20 students is 0.411631


### 40?

In [94]:
# Represent our data's possible outcomes, the number of days in a year
# People born on leap days don't actually exist, so we are going to exclude them here:

outcomes = range(0, 365)
# Create the data!
n_trials = 40
n_simulations = 1_000_000 

In [95]:
# Let's get our simulations. We'll make a simulation of 1 million classrooms of 23 students.
#
# outcomes: possible unique days of the year that a person could have.
# n_simulations: the number of simulated classroom trials
# n_trials: the number of student birthdays
#

In [96]:
classrooms = np.random.choice(outcomes, size=(n_simulations, n_trials))

##### Great, so we can say that when len(np.unique()) == 22 or less, we have a situation of doubles)

#### Let's make a list of every instance where this is the case in our array of simulated classes:

In [97]:
# Let's use a list comprehension: 
# a list of the length of the uniques for each instance for the full number of simulations by index, 
# but only if the number of uniques is less than the number of students in the class

In [98]:
list_of_twin_bdays = [len(np.unique(classrooms[n])) for n in range(0, n_simulations-1) if len(np.unique(classrooms[n])) < 40]

#### The length of this is going to be the number of times we had a class with shared birthdays, and we can divide that by the total number of simulations:

In [99]:
prop_twinsies = len(list_of_twin_bdays) / n_simulations
print(f'The probability that we will have one or more shared birthdays over {n_trials} students is {prop_twinsies}')

The probability that we will have one or more shared birthdays over 40 students is 0.891622
