# Numpy

### Install libraries

Prefix a line with `!` to run it as a shell (terminal) command from a notebook cell.

In [24]:
!pip install numpy
!pip install pandas



### Import libraries

In [25]:
# Import numpy for statistical functions
import numpy as np
# Import pandas for easy CSV reading
import pandas as pd

### Read the CSV

In [27]:
# Load the students' performance dataset into a DataFrame
df = pd.read_csv('./datasets/StudentsPerformance.csv')
# Preview just the first five rows instead of the whole table
df.head(n=5)

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75


### Create an array from any column
- Getting math scores column as an array

In [29]:
# Copy the 'math score' column out of the DataFrame into a NumPy array
math_scores = np.array(df['math score'])
# Show only the first ten values
math_scores[0:10]

array([72, 69, 90, 47, 76, 71, 88, 40, 64, 38])

### Numpy Statistics

#### Mean

- Formula

${\displaystyle A={\frac {1}{n}}\sum _{i=1}^{n}a_{i}={\frac {a_{1}+a_{2}+\cdots +a_{n}}{n}}}$

In [34]:
# By hand: total of all scores divided by the number of scores
mean = np.sum(math_scores) / math_scores.size
print(mean)
# Same statistic from NumPy's built-in mean function
mean = np.mean(math_scores)
print(mean)

66.089
66.089


#### Median
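- Description: the middle value of the sorted data; when the number of values is even, the average of the two middle values.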
- Formula

  $m\left(x\right)
=\begin{cases}
  x_\frac{n+1}{2}                                    & n\text{ odd}\\
  \frac {1}{2}\left(x_{\frac{n}{2}} + x_{\frac{n}{2} + 1}\right) & n \text{ even}
\end{cases}$


In [71]:
def get_median(arr):
    """Return the median of ``arr``.

    The values are sorted, then the middle element is returned when the
    count is odd, or the average of the two middle elements when it is even.

    Args:
        arr: A sized iterable of numbers (e.g. a list or 1-D NumPy array).

    Returns:
        The middle value for odd-length input; the mean of the two middle
        values for even-length input.

    Raises:
        IndexError: If ``arr`` is empty.
    """
    n = len(arr)
    # Sort the argument itself, not the notebook-level `math_scores`.
    sorted_arr = sorted(arr)
    mid = n // 2
    if n % 2 == 0:
        return (sorted_arr[mid - 1] + sorted_arr[mid]) / 2
    return sorted_arr[mid]

In [72]:
# Using own formula
print(get_median(math_scores))
# Using built in median function
print(np.median(math_scores))

66.0
66.0


#### Variance
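- Description: the average of the squared deviations of the values from their mean (population variance, dividing by $N$).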
- Formula

  $\sigma^2 = \frac{\sum\limits_{i=1}^N (X_i -\mu)^2}{N}$



In [73]:
def get_variance(arr):
    """Return the population variance of ``arr``.

    Computes the mean of ``arr``, squares each value's deviation from that
    mean using NumPy vectorised arithmetic, and divides the sum of those
    squares by ``len(arr)``.

    Args:
        arr: Array of numbers (a NumPy array in this notebook).

    Returns:
        The sum of squared deviations from the mean divided by ``len(arr)``.
    """
    # Element-wise distance of every value from the mean
    centered = arr - np.mean(arr)
    # Divide by N (not N - 1): this is the population variance
    return np.sum(centered ** 2) / len(arr)

In [74]:
# Variance from the hand-written get_variance helper
print(get_variance(math_scores))
# Variance from NumPy's built-in var function, for comparison
print(np.var(math_scores))

229.68907899999996
229.68907899999996


#### Standard Deviation
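- Description: the square root of the variance; it measures the spread of the values in the same units as the data.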
- Formula

$\sigma = \sqrt{\frac{\sum\limits_{i=1}^N (X_i -\mu)^2}{N}}$


In [75]:
# Using just sqrt of variance
print(np.sqrt(np.var(math_scores)))
# Using built in numpy function
print(np.std(math_scores))

15.155496659628149
15.155496659628149
