In [1]:
from numpy import random
import numpy as np
def roll_the_dice(n_simulations = 1000):
    '''
    input: int
    output: float
    two unbiased, six sided, dice are thrown once and the sum of the 
    showing faces is observed (so if you rolled a 3 and a 1, you would 
    observe the sum, 4). use a simulation to find the estimated 
    probability that the total score is an even number or a number 
    greater than 7.  your function should return an estimated 
    probability, based on rolling the two dice n_simulations times.
    '''

    lst = []
    total = 0
    prob = 0
    
    for i in range(n_simulations):
        die1 = np.random.randint(1,7)
        die2 = np.random.randint(1,7)
        total = die1 + die2
        lst.append(total)
    
    for num in lst:
        if num % 2 ==0 or num > 7:
            prob += 1
    
    return prob/n_simulations

#     better method:
#     total = 0
#     num_repeats = 10000
#     for i in range(num_repeats):
#         die1 = random.randint(1, 6+1)
#         die2 = random.randint(1, 6+1)
#         score = die1 + die2
#         if score % 2 == 0 or score > 7:
#             total += 1
#     return float(total) / num_repeats

In [2]:
import scipy.stats as stats
def calculate_t_test(sample1, sample2, type_i_error_rate):
    '''
    input: numpy array, numpy array
    output: float, boolean
    you are asked to evaluate whether the two samples come from a 
    population with the same population mean.  return a tuple 
    containing the p-value for the pair of samples and true or false 
    depending if the p-value is considered significant at the provided 
    type i error rate (i.e. false positive rate, i.e. alpha).
    '''
    x, pvalue = stats.ttest_ind(sample1, sample2)
    return pvalue, pvalue > type_i_error_rate

calculate_t_test(sample1=stats.norm.rvs(loc=5,scale=10,size=500), 
                 sample2 = stats.norm.rvs(loc=8, scale=20, size=100), 
                 type_i_error_rate = 0.5)

(2.0822462377669572e-08, False)

In [3]:
import numpy as np
def add_column(arr, col):
    '''
    INPUT: 2 DIMENSIONAL NUMPY ARRAY, NUMPY ARRAY
    OUTPUT: 2 DIMENSIONAL NUMPY ARRAY
    Return a numpy array containing arr with col added as a final 
    column. You can assume that the number of rows in arr is the same 
    as the length of col.
    E.g.  np.array([[1, 2], [3, 4]]), np.array([5, 6))
              ->  np.array([[1, 2, 5], [3, 4, 6]])
    '''

    new_arr = np.insert(arr, len(arr), col, axis = 1)
    return arr, len(arr[0]), new_arr, len(new_arr[0])

add_column(np.array([[1, 2], [3, 4]]), np.array([5, 6]))

(array([[1, 2],
        [3, 4]]),
 2,
 array([[1, 2, 5],
        [3, 4, 6]]),
 3)

In [4]:
import numpy as np
def only_positive(arr):
    '''
    INPUT: 2 DIMENSIONAL NUMPY ARRAY
    OUTPUT: 2 DIMENSIONAL NUMPY ARRAY
    Return a numpy array containing only the rows from arr where all 
    the values in that row are positive.
    E.g.  np.array([[1, -1, 2], 
                    [3, 4, 2], 
                    [-8, 4, -4]])
              ->  np.array([[3, 4, 2]])
    Use numpy methods to do this, full credit will not be awarded for a 
    python for loop.
    '''
    return np.min(arr,1), arr[np.min(arr,1) > 0]

only_positive(np.array([[1, -1, 2], 
                    [3, 4, 2], 
                    [-8, 4, -4]]))

(array([-1,  2, -8]), array([[3, 4, 2]]))

In [5]:
def df_to_numpy(df, y_column):
    '''
    input: dataframe, string
    output: 2 dimensional numpy array, numpy array
    make the column named y_column into a numpy array (y) and make the 
    rest of the dataframe into a 2 dimensional numpy array (x). return 
    (x, y).
    e.g.
                a  b  c
        df = 0  1  3  5
             1  2  4  6
        y_column = 'c'
        output: np.array([[1, 3], [2, 4]]), np.array([5, 6])
    '''
    df2 = df.copy()
    x = df2.drop(y_column, axis=1)
    y = df[y_column]
    return x,y
    

In [6]:
def pandas_query(df):
    '''
    input: dataframe
    output: dataframe
    given a dataframe containing university data with these columns:
        name, address, website, type, size
    return the dataframe containing the average size for each 
    university type ordered by average size in ascending order.
    '''
    new_df = df.groupby('type')['size'].mean().sort_values(by = 'size', ascending = True)
    return new_df

### The bias of a coin is 0.6. What is the probability of flipping 8 or more heads in 10 flips?


In [7]:
cdf = stats.binom.cdf(7,10,0.6)
1-cdf

prob = stats.binom(10,0.6)
total = prob.pmf(8), prob.pmf(9), prob.pmf(10)
print(total, sum(total), '\n', 1-cdf)

(0.12093235199999994, 0.04031078400000004, 0.0060466176) 0.16728975359999998 
 0.16728975359999998


### A probability distribution P is dependent on two categorical values x and y. x can take on values T and F, while y can take on values of a, b, and c. The following joint distribution table describes the joint probability P(x,y). What is P(x=T | y=b) ?

        a        b        c
        
`T  |  0.2   |  0.1   |  0.2`

`F  |  0.05  |  0.15  |  0.3`

In [8]:
# conditional probability formula
# prob of A given B = prob A and B / prob B

0.1 / (0.1+0.15)

# this is conditional probability bc conditional probability is the probability of occurrence 
# of a certain event say A, based on the occurrence of some other event say B. expressed as:
# P(A|B) = P(A ⋂ B)/P(B)


# Bayes theorem derived from the conditional probability of events. This theorem includes 
# two conditional probabilities for the events say A and B expressed as:
# P(A|B) = P(B|A) * P(A)/P(B)

0.4

### Which distribution is most appropriate to use to represent the scenario? The side that faces up when you roll a fair, 20 sided die.


uniform discrete