# Boxplot Math

## Import Libraries

In [72]:
import statistics

import pandas as pd
from IPython.display import display, Math

Here we are going to cover some basic math concepts like:

- Max
- Min
- Mean
- Median
- quartiles

## Build our Sample Data

In [2]:
# lets say that this is our sample data
data = [90, 25, 14, 26, 14, 12, 5, 8, 2, 6, 4, 13, 75]

## Build our functions

### Min

In [3]:
def custom_min(data):
    """Return the smallest element of a list or tuple using a plain loop.

    Args:
        data: a non-empty list or tuple of mutually comparable values.

    Returns:
        The smallest element found in ``data``.

    Raises:
        TypeError: if ``data`` is not a list or tuple.
        ValueError: if ``data`` is empty.
    """
    # validate first, so the loop below can assume a non-empty sequence
    if not isinstance(data, (list, tuple)):
        raise TypeError("Input must be a list or tuple")
    if not len(data):
        raise ValueError("Cannot find the min of an empty sequence")

    # the first element is our starting guess; data[1:] is everything after it
    smallest = data[0]
    for candidate in data[1:]:
        if candidate < smallest:
            smallest = candidate
    return smallest


In [4]:
print(f"The custom min of the data is: {custom_min(data)}")

The custom min of the data is: 2


#### Explanation

In [5]:
# Let's break down the slicing: when we see data[1:], what is that doing?
# custom_min already used data[0] as the starting minimum, so its loop only needs
# the remaining elements: everything from index 1 through the end of the list.
# Each slice builds a new list; `data` itself is never modified.
# Note: custom_min slices only once. Repeating the slice below just shows what
# "drop the first element" looks like each time it is applied.
step1 = data[1:]  # this is equivalent to data[1:len(data)]
print(f"Step 1: {step1} - this is the data without the first element")
step2 = step1[1:]
print(f"Step 2: {step2} - now we are stepping through removing the first element again")
step3 = step2[1:]
print(f"Step 3: {step3} - and again...")
# etc...

Step 1: [25, 14, 26, 14, 12, 5, 8, 2, 6, 4, 13, 75] - this is the data without the first element
Step 2: [14, 26, 14, 12, 5, 8, 2, 6, 4, 13, 75] - now we are stepping through removing the first element again
Step 3: [26, 14, 12, 5, 8, 2, 6, 4, 13, 75] - and again...


In [6]:
# when we run a for loop over a list, `value` is bound to each element in turn,
# from the first element to the last
test_data = [1, 2, 3]
for value in test_data:
    print(f"value is : {value}")

value is : 1
value is : 2
value is : 3


#### Pandas version

Now that we understand the basic Python concept of min, let's look at how we are really going to use it: with pandas DataFrames.

In [7]:
# We are going to start with a dictionary: each key becomes a column name and
# each value (a list, array, or pandas Series) becomes that column's data.
# Every list needs to have the same shape/length.
# Each error correct is worth .25, so the amounts are derived from the counts
# rather than typed in by hand (the two columns can never drift apart).
ERROR_CORRECT_UNIT_AMOUNT = 0.25
error_correct_counts = [4, 7, 10, 5, 0, 2]
d = {
    'transaction_ids': [1, 2, 3, 4, 5, 6],
    'error_correct_counts': error_correct_counts,
    'error_correct_amounts': [
        count * ERROR_CORRECT_UNIT_AMOUNT for count in error_correct_counts
    ],
}

In [8]:
# create a dataframe
ds = pd.DataFrame(d)
ds.head()

Unnamed: 0,transaction_ids,error_correct_counts,error_correct_amounts
0,1,4,1.0
1,2,7,1.75
2,3,10,2.5
3,4,5,1.25
4,5,0,0.0


In [9]:
min_error_correct_count = ds['error_correct_counts'].min()
print(f"The minimum error correct count is: {min_error_correct_count}")

The minimum error correct count is: 0


In [10]:
min_error_correct_amount = ds['error_correct_amounts'].min()
print(f"The minimum error correct amount is: {min_error_correct_amount}")

The minimum error correct amount is: 0.0


In [11]:
# this will get the minimum for every column
min_values = ds.min()
print("Minimum values for each column:")
print(min_values)

Minimum values for each column:
transaction_ids          1.0
error_correct_counts     0.0
error_correct_amounts    0.0
dtype: float64


In [12]:
# get the min across columns for each row
ds.min(axis=1)

0    1.00
1    1.75
2    2.50
3    1.25
4    0.00
5    0.50
dtype: float64

### Max

#### Explanation

In [13]:
# this is how pandas creates a fancy wrapper over a smart loop
# each series gets passed in
def custom_max(series):
    """Return the largest value in a pandas Series using a plain loop.

    The element at position 0 seeds the running maximum; each later element
    (``series.iloc[1:]``) replaces it only when it is strictly greater.

    Args:
        series: a non-empty pandas Series of mutually comparable values.

    Returns:
        The largest value found in ``series``.
    """
    largest = series.iloc[0]
    for candidate in series.iloc[1:]:
        if candidate > largest:
            largest = candidate
    return largest

In [14]:
print(f"Custom max of error_correct_amounts: {custom_max(ds['error_correct_amounts'])}")

Custom max of error_correct_amounts: 2.5


In [15]:
# lets grab a single series from our dataset
series = ds['error_correct_amounts']
series

0    1.00
1    1.75
2    2.50
3    1.25
4    0.00
5    0.50
Name: error_correct_amounts, dtype: float64

In [16]:
print(type(series))

<class 'pandas.core.series.Series'>


Let's look at `iloc`, which selects elements by their integer position.

In [17]:
print(series.iloc[2]) # .iloc selects by integer position, so this prints the element at position 2 (the third one)

2.5


`series.iloc[1:]` works the same way as the `data[1:]` slice we used earlier: it returns everything except the first element.

In [18]:
series.iloc[1:] # we have removed the first element

1    1.75
2    2.50
3    1.25
4    0.00
5    0.50
Name: error_correct_amounts, dtype: float64

#### Pandas version

In [19]:
max_error_correct_count = ds['error_correct_counts'].max()
print(f"The maximum error correct count is: {max_error_correct_count}")

The maximum error correct count is: 10


In [20]:
max_error_correct_amount = ds['error_correct_amounts'].max()
print(f"The maximum error correct amount is: {max_error_correct_amount}")

The maximum error correct amount is: 2.5


You can see that these do match up: both maximums come from transaction id 3, which has 10 error corrects totalling 2.50.

### Mean

#### Explanation

In [30]:
# first lets break down a sum function
def custom_sum(series):
    """Add up every element of a pandas Series or list with a plain loop.

    Args:
        series: a non-empty pandas Series or list of numbers.

    Returns:
        The running total after every element has been added, starting from 0.

    Raises:
        TypeError: if ``series`` is neither a pandas Series nor a list.
        ValueError: if ``series`` is empty.
    """
    accepted_types = (pd.Series, list)
    if not isinstance(series, accepted_types):
        raise TypeError("Input must be a Pandas Series or a list")
    if not len(series):
        raise ValueError("Cannot sum an empty series or list")

    # start from zero and fold each element into the running total
    running_total = 0
    for item in series:
        running_total += item
    return running_total

In [31]:
# we just need to take the total of all elements and divide by the number of elements
length = len(data)
total = sum(data) # using the built-in sum function
print(f"The Length of our data is {length} and the total is {total}")

The Length of our data is 13 and the total is 294


In [32]:
# now lets test our custom sum function to see if it matches
c_total = custom_sum(data)
print(f"The custom sum of our data is: {c_total}")

The custom sum of our data is: 294


In [27]:
# to calculate the mean or average, we just divide the total by the length
mean = total / length
print(f"The mean of our data is: {mean}")

The mean of our data is: 22.615384615384617


In [28]:
# Python's standard-library statistics module (imported with the other imports
# at the top of the notebook) gives us the same mean without doing the division ourselves
mean_stat = statistics.mean(data)
print(f"The mean using the statistics module is: {mean_stat}")

The mean using the statistics module is: 22.615384615384617


#### Pandas

In [33]:
error_correct_count_mean = ds['error_correct_counts'].mean()
print(f"The mean error correct count is: {error_correct_count_mean}")

The mean error correct count is: 4.666666666666667


In [34]:
error_correct_amount_mean = ds['error_correct_amounts'].mean()
print(f"The mean error correct amount is: {error_correct_amount_mean}")

The mean error correct amount is: 1.1666666666666667


### Median

#### Explanation

The median is a little different. The basic principle of a median is simply finding the middle number in a series or list. The trick is that we have to sort the values first; otherwise "the middle" wouldn't mean anything. Let's take a look at an example using our original data.

In [35]:
print(data)

[90, 25, 14, 26, 14, 12, 5, 8, 2, 6, 4, 13, 75]


#### Sorting

In [38]:
# sort the data first; sorted() returns a new list and leaves `data` untouched
sorted_data = sorted(data)
print(sorted_data)

[2, 4, 5, 6, 8, 12, 13, 14, 14, 25, 26, 75, 90]


![median](images/median_1.png)

#### data has odd length

In [57]:
# Taking the length of our data modulo 2 tells us whether the length is even or odd:
# a remainder of 0 means even, anything else means odd.
sorted_test = [1, 2, 3, 4, 5]
n = len(sorted_test)
mid = n // 2  # floor division: divide by 2 and keep only the integer part
is_even = (n % 2 == 0)
if is_even:
    sorted_result = 'even'
else:
    sorted_result = 'odd'
# with an odd length there is exactly one middle element, and it sits at index `mid`
sorted_median = sorted_test[mid]
print(f"the list is {sorted_result} so we need to take the length of {n} and get the middle index of {mid} to get the median result of {sorted_median}")

the list is odd so we need to take the length of 5 and get the middle index of 2 to get the median result of 3


In [58]:
print(sorted_test[mid])

3


#### data has even length

In [82]:
# With an even length there is no single middle element: the two values on
# either side of the midpoint are averaged instead.
even_test = [1,2,3,4,5,6]
n = len(even_test)
is_even = n % 2 == 0
sorted_even_result = 'even' if is_even else 'odd'
even_mid = n // 2
# the two middle values sit at indexes even_mid - 1 and even_mid
lower_middle = even_test[even_mid - 1]
upper_middle = even_test[even_mid]
even_median = (lower_middle + upper_middle) / 2
# the message reports the actual middle values instead of hard-coding them
print(f"""The list is {sorted_even_result} so we need to take the length of {n} and the middle index of {even_mid}. 
      This actually gives us two numbers. We have the numbers {lower_middle} and {upper_middle}.
      To get the median we need to get the mean of these two numbers to get a result of {even_median}""")


The list is even so we need to take the length of 6 and the middle index of 3. 
      This actually gives us two numbers. We have the numbers 3 and 4.
      To get the median we need to get the mean of these two numbers to get a result of 3.5


In [83]:
# pull out the two middle values so the arithmetic can be shown step by step
first_number = even_test[even_mid-1]
second_number = even_test[even_mid]
test_result = (first_number + second_number) / 2
print(f"The first number is {first_number} and the second number is {second_number}.")

# render the calculation as a formula; a real f-string replaces the old mix of
# an empty f-prefix and %-formatting, and produces exactly the same text
display(Math(f'({first_number}+{second_number})/2={test_result}'))

The first number is 3 and the second number is 4.


<IPython.core.display.Math object>

In [84]:
# let's build a custom median function
def custom_median(data):
    """Return the median (middle value) of an iterable of numbers.

    The values are sorted into a new list first, so the input is never
    modified. With an odd number of values the middle element is returned
    as-is; with an even number, the mean of the two middle elements is returned.

    Args:
        data: a non-empty iterable of numbers (list, tuple, pandas Series, ...).

    Returns:
        The middle element for odd lengths, or the average of the two middle
        elements for even lengths.

    Raises:
        ValueError: if ``data`` is empty.
    """
    sorted_data = sorted(data)
    n = len(sorted_data)
    # without this guard an empty input would fall into the even branch and
    # fail with an unhelpful "list index out of range"
    if n == 0:
        raise ValueError("Cannot find the median of an empty sequence")

    mid = n // 2  # floor division gives the (upper) middle index

    if n % 2 == 0:  # even length: average the two values around the midpoint
        return (sorted_data[mid - 1] + sorted_data[mid]) / 2
    else:  # odd length: there is exactly one middle value
        return sorted_data[mid]

In [85]:
# now if we go back to our original dataset, we can get the median
median_value = custom_median(data)
print(f"The custom median of the data is: {median_value}")

The custom median of the data is: 13


#### Pandas

In [86]:
error_correct_count_median = ds['error_correct_counts'].median()
print(f"The median error correct count is: {error_correct_count_median}")

The median error correct count is: 4.5


In [87]:
error_correct_amount_median = ds['error_correct_amounts'].median()
print(f"The median error correct amount is: {error_correct_amount_median}")

The median error correct amount is: 1.125


In [91]:
ds

Unnamed: 0,transaction_ids,error_correct_counts,error_correct_amounts
0,1,4,1.0
1,2,7,1.75
2,3,10,2.5
3,4,5,1.25
4,5,0,0.0
5,6,2,0.5


In [94]:
# lets test that with our custom function
column_medians = ds.apply(custom_median, axis=0)
column_medians

transaction_ids          3.500
error_correct_counts     4.500
error_correct_amounts    1.125
dtype: float64

In [95]:
# if we want row medians
row_medians = ds.apply(custom_median, axis=1)
row_medians

0    1.0
1    2.0
2    3.0
3    4.0
4    0.0
5    2.0
dtype: float64

In [100]:
# Cross-check the custom row medians against pandas, one row at a time.
# Each inner list is one row of `ds`:
# (transaction id, error correct count, error correct amount).
row_values = [
    [1, 4, 1],
    [2, 7, 1.75],
    [3, 10, 2.50],
    [4, 5, 1.25],
    [5, 0, 0],
    [6, 2, 0.5],
]
for row in row_values:
    print(pd.Series(row).median())

1.0
2.0
3.0
4.0
0.0
2.0


### quantile

### describe