# Medidas de posição e dispersão
## Base de dados

In [1]:
import math
import pandas as pd
import numpy as np
import statistics as st

from scipy  import stats

In [2]:
data = np.array([160, 165, 167, 164, 160, 166, 160, 161, 150, 152,
                 173, 160, 155, 164, 168, 162, 161, 168, 163, 156,
                 155, 169, 151, 170, 164, 155, 152, 163, 160, 155,
                 157, 156, 158, 158, 161, 154, 161, 156, 172, 153])

## Média aritmética simples

In [3]:
data.sum() / len(data)

160.375

In [4]:
data.mean()

160.375

In [5]:
st.mean(data)

160

## Moda

In [6]:
st.mode(data)

160

In [7]:
stats_mode = stats.mode(data)
stats_mode

ModeResult(mode=array([160]), count=array([5]))

In [8]:
print(f'Moda = {stats_mode[0][0]}')
print(f'Ocorrências = {stats_mode[1][0]}')

Moda = 160
Ocorrências = 5


## Mediana

In [9]:
data_impar = [150,151,152,152,153,154,155,155,155]
data_impar

[150, 151, 152, 152, 153, 154, 155, 155, 155]

### Cálculo manual (ímpar)

In [10]:
pos = len(data_impar) // 2
pos

4

In [11]:
data_impar[pos]

153

### Cálculo manual (par)

In [12]:
pos = len(data) // 2
pos

20

In [13]:
data[pos - 1], data[pos]

(156, 155)

In [14]:
data.sort()

In [15]:
data[pos - 1], data[pos]

(160, 160)

In [16]:
mediana = (data[pos - 1] + data[pos]) / 2
mediana

160.0

### Bibliotecas

In [17]:
np.median(data_impar)

153.0

In [18]:
np.median(data)

160.0

In [19]:
st.median(data_impar)

153

In [20]:
st.median(data)

160.0

## Média aritmética ponderada

In [21]:
notas = np.array([9, 8, 7, 3])
pesos = np.array([1, 2, 3, 4])

In [22]:
((9 * 1) + (8 * 2) + (7 * 3) + (3 * 4)) / (1 + 2 + 3 + 4)

5.8

In [23]:
media_ponderada = (notas * pesos).sum() / pesos.sum()
media_ponderada

5.8

In [24]:
np.average(notas, weights = pesos)

5.8

## Média aritmética, moda e mediana com distribuição de frequência (dados agrupados)

In [25]:
data_stats = {'estatura_cm': ['150 |-- 154','154 |-- 158','158 |-- 162','162 |-- 166',
                       '166 |-- 170','170 |--| 174','Total'],
       'fi': [5, 9, 11, 7, 5, 3, 40]}

In [26]:
# Midpoint (xi) of every class, parsed from labels such as '150 |-- 154':
# the first three characters are the lower bound, the last three the upper bound.
# The trailing 'Total' label is skipped; its slot receives the sum of the midpoints.
xi = [
    int(int(rotulo[:3]) + (int(rotulo[-3:]) - int(rotulo[:3])) / 2)
    for rotulo in data_stats['estatura_cm'][:-1]
]
xi.append(sum(xi))
print(xi)

[152, 156, 160, 164, 168, 172, 972]


In [27]:
data_stats['xi'] = xi
data_stats

{'estatura_cm': ['150 |-- 154',
  '154 |-- 158',
  '158 |-- 162',
  '162 |-- 166',
  '166 |-- 170',
  '170 |--| 174',
  'Total'],
 'fi': [5, 9, 11, 7, 5, 3, 40],
 'xi': [152, 156, 160, 164, 168, 172, 972]}

In [28]:
fixi = np.array(data_stats['fi'][:-1]) * np.array(data_stats['xi'][:-1])
fixi = np.insert(fixi, len(fixi), fixi.sum()).astype(int)
print(fixi)

[ 760 1404 1760 1148  840  516 6428]


In [29]:
data_stats['fixi'] = list(fixi)
data_stats

{'estatura_cm': ['150 |-- 154',
  '154 |-- 158',
  '158 |-- 162',
  '162 |-- 166',
  '166 |-- 170',
  '170 |--| 174',
  'Total'],
 'fi': [5, 9, 11, 7, 5, 3, 40],
 'xi': [152, 156, 160, 164, 168, 172, 972],
 'fixi': [760, 1404, 1760, 1148, 840, 516, 6428]}

In [30]:
# Cumulative frequency (Fi): running total of the class frequencies,
# skipping the last entry of data_stats['fi'], which holds the 'Total' row.
j = 0
Fi = []
for val in data_stats['fi'][:-1]:
    j += val
    Fi.append(j)
    
# The entry for the 'Total' row is filled with the sum of the cumulative values.
# NOTE(review): a sum of cumulative frequencies is not a meaningful total —
# confirm whether this slot should hold the last cumulative value instead.
Fi.append(sum(Fi))
Fi

[5, 14, 25, 32, 37, 40, 153]

In [31]:
data_stats['Fi'] = Fi
data

array([150, 151, 152, 152, 153, 154, 155, 155, 155, 155, 156, 156, 156,
       157, 158, 158, 160, 160, 160, 160, 160, 161, 161, 161, 161, 162,
       163, 163, 164, 164, 164, 165, 166, 167, 168, 168, 169, 170, 172,
       173])

In [32]:
data_stats = {'inferior': [150, 154, 158, 162, 166, 170],
       'superior': [154, 158, 162, 166, 170, 174],
       'fi': [5, 9, 11, 7, 5, 3]}

In [33]:
df = pd.DataFrame(data_stats)
df

Unnamed: 0,inferior,superior,fi
0,150,154,5
1,154,158,9
2,158,162,11
3,162,166,7
4,166,170,5
5,170,174,3


In [34]:
df['xi'] = (df.superior + df.inferior) / 2
df

Unnamed: 0,inferior,superior,fi,xi
0,150,154,5,152.0
1,154,158,9,156.0
2,158,162,11,160.0
3,162,166,7,164.0
4,166,170,5,168.0
5,170,174,3,172.0


In [35]:
df['fi.xi'] = df.fi * df.xi
df

Unnamed: 0,inferior,superior,fi,xi,fi.xi
0,150,154,5,152.0,760.0
1,154,158,9,156.0,1404.0
2,158,162,11,160.0,1760.0
3,162,166,7,164.0,1148.0
4,166,170,5,168.0,840.0
5,170,174,3,172.0,516.0


In [36]:
# Cumulative frequency (Fi): running total of the class frequencies 'fi'.
# Computed with a vectorised cumsum instead of walking df.iterrows() and reading
# the row by integer position, which pandas deprecates for label-indexed Series.
# The cast to float keeps the dtype the row-wise loop used to produce
# (rows holding both int and float columns are upcast to float64).
Fi = df['fi'].cumsum().astype(float).tolist()
df['Fi'] = Fi
df

Unnamed: 0,inferior,superior,fi,xi,fi.xi,Fi
0,150,154,5,152.0,760.0,5.0
1,154,158,9,156.0,1404.0,14.0
2,158,162,11,160.0,1760.0,25.0
3,162,166,7,164.0,1148.0,32.0
4,166,170,5,168.0,840.0,37.0
5,170,174,3,172.0,516.0,40.0


### Média

In [37]:
df['fi'].sum(), df['fi.xi'].sum()

(40, 6428.0)

In [38]:
df['fi.xi'].sum() / df['fi'].sum()

160.7

### Moda

In [39]:
df['fi'].max()

11

In [40]:
df.query(f'fi == {df["fi"].max()}')

Unnamed: 0,inferior,superior,fi,xi,fi.xi,Fi
2,158,162,11,160.0,1760.0,25.0


In [41]:
df.query(f'fi == {df["fi"].max()}')['xi'].values[0]

160.0

### Mediana

In [42]:
df

Unnamed: 0,inferior,superior,fi,xi,fi.xi,Fi
0,150,154,5,152.0,760.0,5.0
1,154,158,9,156.0,1404.0,14.0
2,158,162,11,160.0,1760.0,25.0
3,162,166,7,164.0,1148.0,32.0
4,166,170,5,168.0,840.0,37.0
5,170,174,3,172.0,516.0,40.0


In [43]:
fi_2 = df['fi'].sum() / 2
fi_2

20.0

In [44]:
# Walk the classes in order until the cumulative frequency reaches fi_2.
# When the loop stops, lim_inferior and freq_class belong to that class and
# Fi_anterior holds the cumulative frequency of the class before it
# (as a one-element Series, because of the double-bracket iloc).
lim_inferior, freq_class, id_freq_anterior, Fi_anterior = 0, 0, 0, 0
for line in df.iterrows():  # each item is an (index, row Series) pair
    
    lim_inferior = line[1][0]  # column 0 of the row: 'inferior'
    freq_class = line[1][2]  # column 2 of the row: 'fi'
    id_freq_anterior = line[0]  # row index; decremented below once the class is found
    
    # Debug trace of the row being inspected.
    print(f'|##>line\n{line}\n')
    print(f'|##>lim_inf===>{line[1][0]}')
    print(f'|##>freq_class===>{line[1][2]}')
    print(f'|##>id_freq_ant===>{line[0]}\n')
    
    if line[1][5] >= fi_2:  # column 5 of the row: 'Fi'
        id_freq_anterior -= 1
        # NOTE(review): if the very first class already satisfies the test,
        # id_freq_anterior becomes -1 and iloc[[-1]] selects the last row — confirm.
        Fi_anterior = df.iloc[[id_freq_anterior]]['Fi']
        break

|##>line
(0, inferior    150.0
superior    154.0
fi            5.0
xi          152.0
fi.xi       760.0
Fi            5.0
Name: 0, dtype: float64)

|##>lim_inf===>150.0
|##>freq_class===>5.0
|##>id_freq_ant===>0

|##>line
(1, inferior     154.0
superior     158.0
fi             9.0
xi           156.0
fi.xi       1404.0
Fi            14.0
Name: 1, dtype: float64)

|##>lim_inf===>154.0
|##>freq_class===>9.0
|##>id_freq_ant===>1

|##>line
(2, inferior     158.0
superior     162.0
fi            11.0
xi           160.0
fi.xi       1760.0
Fi            25.0
Name: 2, dtype: float64)

|##>lim_inf===>158.0
|##>freq_class===>11.0
|##>id_freq_ant===>2



In [45]:
lim_inferior, freq_class, id_freq_anterior, Fi_anterior

(158.0,
 11.0,
 1,
 1    14.0
 Name: Fi, dtype: float64)

In [46]:
# Grouped-data median: lower limit of the median class plus the interpolated share
# of the class, (fi_2 - Fi_anterior) * width / freq_class.
# The class width is hard-coded as 4 (every class in df spans superior - inferior = 4).
# Fi_anterior is a one-element Series, hence the .values[0] to extract the scalar.
mediana = (lim_inferior + ((fi_2 - Fi_anterior) * 4) / freq_class).values[0]
mediana

160.1818181818182

### Função completa

In [47]:
def _grouped_quantile(df: pd.DataFrame, target: float) -> float:
    """Interpolate the value located at cumulative frequency *target*."""
    reached = (df['Fi'] >= target).to_numpy()
    if not reached.any():
        raise ValueError("column 'Fi' never reaches the requested position")

    # First class whose cumulative frequency covers the target position.
    pos = int(reached.argmax())
    row = df.iloc[pos]
    lim_inferior = row['inferior']
    freq_class = row['fi']
    # Width taken from the class itself instead of assuming a fixed value.
    amplitude = row['superior'] - lim_inferior
    # The first class has no predecessor, so nothing is accumulated before it
    # (indexing with pos - 1 == -1 would wrap around to the last class).
    Fi_anterior = df['Fi'].iloc[pos - 1] if pos > 0 else 0

    return float(lim_inferior + ((target - Fi_anterior) * amplitude) / freq_class)


def get_stats(df: pd.DataFrame, quartis: bool = False, q1: bool = False) -> tuple:
    """Compute position measures for a grouped frequency table.

    The table must provide the columns 'inferior' and 'superior' (class limits),
    'fi' (class frequency), 'xi' (class midpoint), 'fi.xi' (frequency times
    midpoint) and 'Fi' (cumulative frequency). Columns are read by label, so
    their order does not matter.

    Args:
        df: Grouped frequency table, one row per class, in ascending order.
        quartis: When True, return a single quartile instead of the
            (mean, mode, median) tuple.
        q1: Only used when ``quartis`` is True; selects the first quartile
            when True and the third quartile when False.

    Returns:
        ``(media, moda, mediana)`` when ``quartis`` is False, where the mode is
        the midpoint of the first class with the highest frequency. When
        ``quartis`` is True, a single float holding the requested quartile.

    Raises:
        ValueError: If the table is empty, the frequencies do not add up to a
            positive total, or 'Fi' never reaches the requested position.
    """
    total = df['fi'].sum()
    if df.empty or total <= 0:
        raise ValueError('frequency table is empty or has no observations')

    if quartis:
        fi_ = total / 4 if q1 else (3 * total) / 4
        return _grouped_quantile(df, fi_)

    media = df['fi.xi'].sum() / total
    # Midpoint of the first class holding the highest frequency.
    moda = df['xi'].iloc[int(df['fi'].to_numpy().argmax())]
    mediana = _grouped_quantile(df, total / 2)
    return media, moda, mediana

In [48]:
get_stats(df)

(160.7, 160.0, 160.1818181818182)

## Média geométrica, harmônica e quadrática
### Média geométrica

In [49]:
data

array([150, 151, 152, 152, 153, 154, 155, 155, 155, 155, 156, 156, 156,
       157, 158, 158, 160, 160, 160, 160, 160, 161, 161, 161, 161, 162,
       163, 163, 164, 164, 164, 165, 166, 167, 168, 168, 169, 170, 172,
       173])

In [50]:
np.prod(data.astype(float)) ** (1/len(data))

160.26958390038902

In [51]:
stats.mstats.gmean(data)

160.26958390038902

### Média harmônica

In [52]:
len(data) / sum(1/data)

160.16471947994663

In [53]:
stats.mstats.hmean(data)

160.16471947994674

### Média quadrática

In [54]:
def sqrt_mean(data):
    """Return the quadratic mean (root mean square) of the values in *data*.

    The squares of all values are summed, divided by the number of values,
    and the square root of that quotient is returned.
    """
    soma_quadrados = 0
    for valor in data:
        soma_quadrados = soma_quadrados + pow(valor, 2)
    media_quadrados = soma_quadrados / len(data)
    return media_quadrados ** 0.5

In [55]:
sqrt_mean(data)

160.48091786876097

## Quartis

In [56]:
data_impar

[150, 151, 152, 152, 153, 154, 155, 155, 155]

### Cálculo manual

In [57]:
np.median(data_impar)

153.0

In [58]:
pos_mediana = len(data_impar)//2
pos_mediana

4

In [59]:
left = data_impar[0:pos_mediana]
left

[150, 151, 152, 152]

In [60]:
np.median(left)

151.5

In [61]:
right = data_impar[pos_mediana + 1:]
right

[154, 155, 155, 155]

In [62]:
np.median(right)

155.0

### Bibliotecas

#### Numpy

In [63]:
np.quantile(data_impar, 0.5)

153.0

In [64]:
np.quantile(data_impar, 0.75)

155.0

In [65]:
np.quantile(data_impar, 0.25)

152.0

In [66]:
left2 = data_impar[0:pos_mediana + 1]
left2

[150, 151, 152, 152, 153]

In [67]:
np.median(left2)

152.0

In [68]:
np.quantile(data, 0.25), np.quantile(data, 0.5), np.quantile(data, 0.75)

(155.75, 160.0, 164.0)

#### SciPy

In [69]:
stats.scoreatpercentile(data, 25),stats.scoreatpercentile(data, 50), stats.scoreatpercentile(data, 75)

(155.75, 160.0, 164.0)

#### Pandas

In [70]:
df_estatura = pd.DataFrame(data)
df_estatura.head()

Unnamed: 0,0
0,150
1,151
2,152
3,152
4,153


In [71]:
df_estatura.quantile([0.25,0.5,0.75])

Unnamed: 0,0
0.25,155.75
0.5,160.0
0.75,164.0


In [72]:
df_estatura.describe()

Unnamed: 0,0
count,40.0
mean,160.375
std,5.903877
min,150.0
25%,155.75
50%,160.0
75%,164.0
max,173.0


## Quartis com distribuição de frequência (dados agrupados)

In [73]:
df

Unnamed: 0,inferior,superior,fi,xi,fi.xi,Fi
0,150,154,5,152.0,760.0,5.0
1,154,158,9,156.0,1404.0,14.0
2,158,162,11,160.0,1760.0,25.0
3,162,166,7,164.0,1148.0,32.0
4,166,170,5,168.0,840.0,37.0
5,170,174,3,172.0,516.0,40.0


In [74]:
get_stats(df, True, True), get_stats(df, True, False)

(156.22222222222223, 164.85714285714286)

In [75]:
sqrt_mean(df['inferior'])

160.14576693333692

In [76]:
from Models.Stats import Stats
cls_stats = Stats()

In [77]:
# Média, Moda, Mediana (dados agrupados)
cls_stats.get_stats(df)

(160.7, 160.0, 160.1818181818182)

In [78]:
# Quartis (25%, 50%, 75%) (dados agrupados)
cls_stats.get_stats(df, quartis = True)

(156.22222222222223, 160.1818181818182, 164.85714285714286)

In [79]:
cls_stats.sqrt_mean(df['inferior'])

160.14576693333692

In [80]:
cls_stats.sqrt_mean(df['superior'])

164.1422147610622

In [81]:
cls_stats.sqrt_mean(df['fi'])

7.187952884282608

In [82]:
cls_stats.sqrt_mean(data)

160.48091786876097

In [83]:
cls_stats.geom_mean(df['superior'])

163.85757125810557

In [84]:
cls_stats.geom_mean(data)

160.26958390038902

In [85]:
cls_stats.harm_mean(data)

160.16471947994663

## Percentis

In [86]:
np.median(data)

160.0

In [87]:
np.quantile(data, 0.5)

160.0

In [88]:
np.percentile(data, 50)

160.0

In [89]:
np.percentile(data, 5), np.percentile(data, 10), np.percentile(data, 90)

(151.95000000000002, 152.89999999999998, 168.1)

In [90]:
stats.scoreatpercentile(data, 5), stats.scoreatpercentile(data, 10), stats.scoreatpercentile(data, 90)

(151.95000000000002, 152.89999999999998, 168.1)

In [91]:
df_estatura.head()

Unnamed: 0,0
0,150
1,151
2,152
3,152
4,153


In [92]:
df_estatura.quantile([0.05,0.10,0.90])

Unnamed: 0,0
0.05,151.95
0.1,152.9
0.9,168.1
