In [2]:
import numpy as np
import pandas as pd
import statistics
from scipy import stats
import math

In [3]:
original_data = np.array(
    [
        150,
        151,
        152,
        152,
        153,
        154,
        155,
        155,
        155,
        155,
        156,
        156,
        156,
        157,
        158,
        158,
        160,
        160,
        160,
        160,
        160,
        161,
        161,
        161,
        161,
        162,
        163,
        163,
        164,
        164,
        164,
        165,
        166,
        167,
        168,
        168,
        169,
        170,
        172,
        173,
    ]
)

data = original_data

In [4]:
data.sum() / len(data), data.mean(), statistics.mean(data)

(160.375, 160.375, 160)

In [5]:
statistics.mode(data), stats.mode(data)

(160, ModeResult(mode=160, count=5))

In [6]:
odd_data = [150, 151, 152, 152, 153, 154, 155, 155, 155]

position = math.ceil(len(odd_data) / 2)

odd_data[position - 1]

153

In [7]:
position = len(data) // 2

(data[position - 1] + data[position]) / 2

160.0

### Easy way

In [8]:
np.median(odd_data), statistics.median(odd_data)

(153.0, 153)

In [9]:
np.median(data), statistics.median(data)

(160.0, 160.0)

### Weighted Arithmetic Mean

In [10]:
grades = np.array([9, 8, 7, 3])
weights = np.array([1, 2, 3, 4])

np.average(grades, weights=weights), (9 * 1 + 8 * 2 + 7 * 3 + 3 * 4) / (
    1 + 2 + 3 + 4
), (grades * weights).sum() / weights.sum()

(5.8, 5.8, 5.8)

In [11]:
data = {
    "lower": [150, 154, 158, 162, 166, 170],
    "upper": [154, 158, 162, 166, 170, 174],
    "fi": [5, 9, 11, 7, 5, 3],
}

dataset = pd.DataFrame(data)

dataset

Unnamed: 0,lower,upper,fi
0,150,154,5
1,154,158,9
2,158,162,11
3,162,166,7
4,166,170,5
5,170,174,3


In [12]:
dataset["xi"] = (dataset["lower"] + dataset["upper"]) / 2

dataset

Unnamed: 0,lower,upper,fi,xi
0,150,154,5,152.0
1,154,158,9,156.0
2,158,162,11,160.0
3,162,166,7,164.0
4,166,170,5,168.0
5,170,174,3,172.0


In [13]:
dataset["fi.xi"] = dataset["fi"] * dataset["xi"]

dataset

Unnamed: 0,lower,upper,fi,xi,fi.xi
0,150,154,5,152.0,760.0
1,154,158,9,156.0,1404.0
2,158,162,11,160.0,1760.0
3,162,166,7,164.0,1148.0
4,166,170,5,168.0,840.0
5,170,174,3,172.0,516.0


In [14]:
dataset["Fi"] = 0

dataset

Unnamed: 0,lower,upper,fi,xi,fi.xi,Fi
0,150,154,5,152.0,760.0,0
1,154,158,9,156.0,1404.0,0
2,158,162,11,160.0,1760.0,0
3,162,166,7,164.0,1148.0,0
4,166,170,5,168.0,840.0,0
5,170,174,3,172.0,516.0,0


In [15]:
counter = 0
sum_frequencies = []

for row in dataset.iterrows():
    counter += row[1]["fi"]
    sum_frequencies.append(counter)

dataset["Fi"] = sum_frequencies

dataset

Unnamed: 0,lower,upper,fi,xi,fi.xi,Fi
0,150,154,5,152.0,760.0,5.0
1,154,158,9,156.0,1404.0,14.0
2,158,162,11,160.0,1760.0,25.0
3,162,166,7,164.0,1148.0,32.0
4,166,170,5,168.0,840.0,37.0
5,170,174,3,172.0,516.0,40.0


#### easy way

In [16]:
dataset["Fi"] = dataset["fi"].cumsum()

dataset

Unnamed: 0,lower,upper,fi,xi,fi.xi,Fi
0,150,154,5,152.0,760.0,5
1,154,158,9,156.0,1404.0,14
2,158,162,11,160.0,1760.0,25
3,162,166,7,164.0,1148.0,32
4,166,170,5,168.0,840.0,37
5,170,174,3,172.0,516.0,40


In [17]:
dataset["fi"].sum(), dataset["fi.xi"].sum()

(40, 6428.0)

In [18]:
dataset["fi.xi"].sum() / dataset["fi"].sum()

160.7

In [19]:
dataset["fi"].max(), dataset["fi"].idxmax()

(11, 2)

In [20]:
dataset[dataset["fi"] == dataset["fi"].max()]

Unnamed: 0,lower,upper,fi,xi,fi.xi,Fi
2,158,162,11,160.0,1760.0,25


In [21]:
dataset[dataset["fi"] == dataset["fi"].max()]["xi"].values[0]

160.0

In [22]:
dataset

Unnamed: 0,lower,upper,fi,xi,fi.xi,Fi
0,150,154,5,152.0,760.0,5
1,154,158,9,156.0,1404.0,14
2,158,162,11,160.0,1760.0,25
3,162,166,7,164.0,1148.0,32
4,166,170,5,168.0,840.0,37
5,170,174,3,172.0,516.0,40


In [23]:
fi_2 = dataset["fi"].sum() / 2

fi_2

20.0

In [24]:
for row in dataset.iterrows():
    lower_limit = row[1]["lower"]
    class_frequency = row[1]["fi"]
    id_pass_frequency = row[0]

    if fi_2 <= row[1]["Fi"]:
        id_pass_frequency -= 1
        break

id_pass_frequency

1

In [25]:
Fi_past = dataset.iloc[id_pass_frequency]["Fi"]

Fi_past

14.0

In [26]:
median = lower_limit + ((fi_2 - Fi_past) / class_frequency) * 4

median

160.1818181818182

### Geometric, Harmonic, Quadratic Mean

In [27]:
from scipy.stats.mstats import gmean, hmean

In [28]:
data = original_data

gmean(data), hmean(data)

(160.26958390038905, 160.1647194799467)

In [29]:
def quadratic_mean(data):
    return math.sqrt((data**2).sum() / len(data))


quadratic_mean(data)

160.48091786876097

### Quartis

In [30]:
odd_data = [150, 151, 152, 152, 153, 154, 155, 155, 155]

np.median(odd_data)

153.0

In [31]:
median_position = len(odd_data) // 2

median_position

4

In [32]:
left = odd_data[0:median_position]
right = odd_data[median_position + 1 :]

left, right

([150, 151, 152, 152], [154, 155, 155, 155])

In [33]:
q1 = np.median(left)
q3 = np.median(right)

q1, q3

(151.5, 155.0)

### Easy way

In [34]:
np.quantile(odd_data, 0.25), np.quantile(odd_data, 0.50), np.quantile(odd_data, 0.75)

(152.0, 153.0, 155.0)

In [35]:
np.quantile(data, 0.25), np.quantile(data, 0.50), np.quantile(data, 0.75)

(155.75, 160.0, 164.0)

#### scipy

In [36]:
stats.scoreatpercentile(data, 25), stats.scoreatpercentile(
    data, 50
), stats.scoreatpercentile(data, 75)

(155.75, 160.0, 164.0)

#### pandas

In [37]:
pd.Series(data).quantile([0.25, 0.50, 0.75])

0.25    155.75
0.50    160.00
0.75    164.00
dtype: float64

In [38]:
def get_quartiles(dataset, q1=True):
    if q1:
        f1 = dataset["fi"].sum() / 4
    else:
        f1 = dataset["fi"].sum() * 3 / 4

    lower_limit, class_frequency, id_pass_frequency = None, None, None

    for row in dataset.iterrows():
        lower_limit = row[1]["lower"]
        class_frequency = row[1]["fi"]
        id_pass_frequency = row[0]

        if f1 <= row[1]["Fi"]:
            id_pass_frequency -= 1
            break

    Fi_past = dataset.iloc[id_pass_frequency]["Fi"]

    return lower_limit + ((f1 - Fi_past) / class_frequency) * 4

In [39]:
get_quartiles(dataset, q1=True), get_quartiles(dataset, q1=False)

(156.22222222222223, 164.85714285714286)

### Percentis


In [40]:
data = original_data

np.median(data)

160.0

In [41]:
np.quantile(data, 0.25), np.quantile(data, 0.50), np.quantile(data, 0.75)

(155.75, 160.0, 164.0)

In [42]:
np.percentile(data, 25), np.percentile(data, 50), np.percentile(data, 75)

(155.75, 160.0, 164.0)

In [43]:
np.percentile(data, 5), np.percentile(data, 10), np.percentile(data, 90)

(151.95, 152.9, 168.1)

In [44]:
stats.scoreatpercentile(data, 5), stats.scoreatpercentile(
    data, 10
), stats.scoreatpercentile(data, 90)

(151.95000000000002, 152.89999999999998, 168.1)

## Exercise

In [45]:
dataset_census = pd.read_csv("data/census.csv")

dataset_census.head(10)

Unnamed: 0,age,workclass,final-weight,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loos,hour-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
5,37,Private,284582,Masters,14,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States,<=50K
6,49,Private,160187,9th,5,Married-spouse-absent,Other-service,Not-in-family,Black,Female,0,0,16,Jamaica,<=50K
7,52,Self-emp-not-inc,209642,HS-grad,9,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,45,United-States,>50K
8,31,Private,45781,Masters,14,Never-married,Prof-specialty,Not-in-family,White,Female,14084,0,50,United-States,>50K
9,42,Private,159449,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,5178,0,40,United-States,>50K


In [46]:
age = dataset_census["age"]

age.mean(), age.median(), age.mode()

(38.58164675532078,
 37.0,
 0    36
 Name: age, dtype: int64)

In [47]:
gmean(age), hmean(age), quadratic_mean(age)

(36.210879158177256, 33.91874139089839, 40.9218664329987)

In [48]:
np.average(age, weights=dataset_census["age"])

43.40403516159512

In [49]:
data = original_data

data

array([150, 151, 152, 152, 153, 154, 155, 155, 155, 155, 156, 156, 156,
       157, 158, 158, 160, 160, 160, 160, 160, 161, 161, 161, 161, 162,
       163, 163, 164, 164, 164, 165, 166, 167, 168, 168, 169, 170, 172,
       173])

In [50]:
data.max() - data.min()

23

In [51]:
q1 = np.percentile(data, 25)
q3 = np.percentile(data, 75)

q1, q3

(155.75, 164.0)

In [52]:
DI = q3 - q1

DI

8.25

In [53]:
lower = q1 - 1.5 * DI
upper = q3 + 1.5 * DI

lower, upper

(143.375, 176.375)

### Variance

In [54]:
odd_data = [150, 151, 152, 152, 153, 154, 155, 155, 155]

In [56]:
np.mean(odd_data)

153.0

In [58]:
deviation = abs(odd_data - np.mean(odd_data))

deviation

array([3., 2., 1., 1., 0., 1., 2., 2., 2.])

In [None]:
deviation = deviation**2

deviation

array([9., 4., 1., 1., 0., 1., 4., 4., 4.])

In [60]:
sum_deviation = deviation.sum()

sum_deviation

28.0

In [63]:
v = sum_deviation / (len(odd_data))

v

3.111111111111111

In [65]:
def get_variance(data):
    return ((data - np.mean(data)) ** 2).sum() / len(data)

get_variance(odd_data)

3.111111111111111

In [66]:
np.var(odd_data)

3.111111111111111

In [69]:
stats.tvar(odd_data)

3.5