<a href="https://colab.research.google.com/github/AbnerRidigolo/cadernos-calculos/blob/main/cap11_exemplos.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Capítulo 2 - Exemplos

## Módulos

In [None]:
import numpy as np
import pandas as pd

## Carregar dados

In [None]:
# Load the country panel; the file uses ';' as field separator, ',' as the
# decimal mark and is Latin-1 encoded.
df_dados_paises = pd.read_csv(
    'dados_1997_2011_paises_csv.csv',
    sep=';',
    decimal=',',
    encoding='latin1',
)

In [None]:
# Preview the first 30 rows of the loaded frame.
df_dados_paises.head(30)

Unnamed: 0,pais,ano,idh,corrupcao_indice,competitividade_indice,globalizacao_indice,pib,populacao
0,África do Sul,1997,0.6328,56,44.54,52.60298,148814200.0,43353.632
1,África do Sul,1998,0.6272,57,31.11,54.51911,134295600.0,43961.924
2,África do Sul,1999,0.6216,50,43.9,61.04379,133183600.0,44526.272
3,África do Sul,2000,0.616,52,51.52,62.47182,132877600.0,45064.098
4,África do Sul,2001,0.6126,50,45.12,62.04144,118479000.0,45576.215
5,África do Sul,2002,0.6092,50,40.97,62.15008,111100800.0,46076.953
6,África do Sul,2003,0.6058,48,47.67,61.81803,168219300.0,46566.626
7,África do Sul,2004,0.6024,48,53.79,63.23793,219092900.0,47032.866
8,África do Sul,2005,0.599,44,51.95,64.01835,247064300.0,47483.447
9,África do Sul,2006,0.601,46,52.01,64.97832,261007000.0,47925.669


In [None]:
# Inspect the dtype pandas inferred for each column after loading.
df_dados_paises.dtypes

pais                       object
ano                         int64
idh                       float64
corrupcao_indice            int64
competitividade_indice    float64
globalizacao_indice       float64
pib                       float64
populacao                 float64
dtype: object

- Contagem de observações por país

In [None]:
# Number of rows available for each country.
(
    df_dados_paises
    .groupby('pais')
    .agg(n=('pais', 'count'))
    .reset_index()
)

Unnamed: 0,pais,n
0,Alemanha,15
1,Austrália,15
2,Brasil,15
3,Bélgica,15
4,Canadá,15
5,China,15
6,Cingapura,15
7,Coréia,15
8,Espanha,15
9,Estados Unidos,15


- Tipos das variáveis

In [None]:
# Show each column's dtype (repeats the earlier dtypes inspection).
df_dados_paises.dtypes

pais                       object
ano                         int64
idh                       float64
corrupcao_indice            int64
competitividade_indice    float64
globalizacao_indice       float64
pib                       float64
populacao                 float64
dtype: object

## Métricas

### min, max, mean

In [None]:
# Minimum, maximum and mean of idh for each country.
(
    df_dados_paises
    .groupby('pais')
    .agg(
        min_idh=('idh', 'min'),
        max_idh=('idh', 'max'),
        media_idh=('idh', 'mean'),
    )
    .reset_index()
)

Unnamed: 0,pais,min_idh,max_idh,media_idh
0,Alemanha,0.8466,0.905,0.88288
1,Austrália,0.8958,0.929,0.914507
2,Brasil,0.6464,0.718,0.684653
3,Bélgica,0.8628,0.886,0.876107
4,Canadá,0.8736,0.908,0.89048
5,China,0.5598,0.687,0.62524
6,Cingapura,0.7404,0.866,0.821587
7,Coréia,0.8078,0.897,0.856907
8,Espanha,0.8162,0.878,0.852427
9,Estados Unidos,0.8886,0.913,0.900947


### Moda

In [None]:
# Mode of idh per country; mode() may return several values, so only the
# first one is kept.
(
    df_dados_paises
    .groupby('pais')['idh']
    .apply(lambda idh: idh.mode().iloc[0])
    .to_frame()
    .reset_index()
)

Unnamed: 0,pais,idh
0,Alemanha,0.8466
1,Austrália,0.8958
2,Brasil,0.6464
3,Bélgica,0.8628
4,Canadá,0.903
5,China,0.5598
6,Cingapura,0.7404
7,Coréia,0.8078
8,Espanha,0.8162
9,Estados Unidos,0.8886


### Mediana

In [None]:
# Median of idh for each country.
(
    df_dados_paises
    .groupby('pais')
    .agg(median_idh=('idh', 'median'))
    .reset_index()
)

Unnamed: 0,pais,median_idh
0,Alemanha,0.8888
1,Austrália,0.9156
2,Brasil,0.6866
3,Bélgica,0.8754
4,Canadá,0.8894
5,China,0.624
6,Cingapura,0.8282
7,Coréia,0.8588
8,Espanha,0.8534
9,Estados Unidos,0.901


### Percentis

- O numpy utiliza a interpolação linear como método default em `np.quantile`; a função `my_quantile` abaixo reproduz esse cálculo

In [None]:
'https://stackoverflow.com/questions/60467081/linear-interpolation-in-numpy-quantile'

def my_quantile(array, q):
    """Return the q-th quantile of ``array`` using linear interpolation.

    The fractional position ``(n - 1) * q`` is located in the sorted data
    and the result is interpolated linearly between the two neighbouring
    values, which is the scheme ``np.quantile`` applies by default.

    Args:
        array: Non-empty sequence of numbers; it does not need to be sorted.
        q: Quantile to compute, a number in the closed interval [0, 1].

    Returns:
        The element at the computed position when that position is a whole
        number, otherwise the linearly interpolated value.

    Raises:
        ValueError: If ``array`` is empty or ``q`` is outside [0, 1].
    """
    if not 0 <= q <= 1:
        raise ValueError("q must be in the interval [0, 1]")

    # Quantiles are defined on ordered data; sort a copy so the caller's
    # sequence is left untouched.
    values = sorted(array)
    n = len(values)
    if n == 0:
        raise ValueError("array must not be empty")

    position = (n - 1) * q
    # The position is a float whenever q is a float, so it has to be
    # truncated to an int before it can be used as an index.
    left = int(position)
    fraction = position - left
    if fraction == 0:  # position falls exactly on an element
        return values[left]

    lower, upper = values[left], values[left + 1]
    return lower + (upper - lower) * fraction

In [None]:
# Small, already-sorted sample used to compare np.quantile with my_quantile.
dados = [1, 1, 1, 2, 2, 3, 4, 4, 7, 8, 9]

In [None]:
# Reference value: 35th percentile of the sample as computed by numpy.
np.quantile(dados, 0.35)

2.0

In [None]:
# Same percentile computed with the hand-written implementation.
my_quantile(dados, 0.35)

2.0

- Exemplo pandas

In [None]:
# Selected percentiles of idh per country, in long format: one row per
# (country, percentile) pair.
(
    df_dados_paises
    .groupby('pais')['idh']
    .apply(lambda idh: idh.quantile([0.05, 0.25, 0.5, 0.75, 0.95]))
    .to_frame()
    .reset_index()
    # The unnamed quantile level comes out of reset_index as 'level_1'.
    .rename(columns={'level_1': 'percentil'})
)

Unnamed: 0,pais,percentil,idh
0,Alemanha,0.05,0.85066
1,Alemanha,0.25,0.86710
2,Alemanha,0.50,0.88880
3,Alemanha,0.75,0.90050
4,Alemanha,0.95,0.90360
...,...,...,...
90,Índia,0.05,0.44996
91,Índia,0.25,0.46530
92,Índia,0.50,0.49540
93,Índia,0.75,0.52500


- Ajustes adicionais

In [None]:
# 5th and 95th percentiles of idh per country, kept in long format so they
# can be pivoted afterwards.
percentis = (
    df_dados_paises
    .groupby('pais')['idh']
    .apply(lambda idh: idh.quantile([0.05, 0.95]))
    .to_frame()
    .reset_index()
    # The unnamed quantile level comes out of reset_index as 'level_1'.
    .rename(columns={'level_1': 'percentil'})
)

In [None]:
# Reshape to one row per country with one column per percentile, then give
# the percentile columns readable names.
(
    percentis
    .pivot(index='pais', columns='percentil', values='idh')
    .reset_index()
    .rename(columns={0.05:'p_5', 0.95:'p_95'})
)

percentil,pais,p_5,p_95
0,Alemanha,0.85066,0.9036
1,Austrália,0.89818,0.9276
2,Brasil,0.65074,0.7159
3,Bélgica,0.86588,0.8853
4,Canadá,0.87486,0.9073
5,China,0.56638,0.6835
6,Cingapura,0.75454,0.8646
7,Coréia,0.81298,0.8949
8,Espanha,0.82152,0.8766
9,Estados Unidos,0.89056,0.9095


### Desvio médio absoluto

In [None]:
# Mean absolute deviation of idh per country: mean(|x - mean(x)|).
# The 'mad' aggregation name depends on Series.mad, which was removed in
# pandas 2.0, so the statistic is computed explicitly to keep this cell
# working on both old and current pandas versions.
(
    df_dados_paises
    .groupby('pais')
    .agg(
        desvio_medio_abs_idh=pd.NamedAgg(
            column='idh',
            aggfunc=lambda idh: (idh - idh.mean()).abs().mean(),
        )
    )
    .reset_index()
)

Unnamed: 0,pais,desvio_medio_abs_idh
0,Alemanha,0.017301
1,Austrália,0.00874
2,Brasil,0.018983
3,Bélgica,0.004848
4,Canadá,0.010219
5,China,0.035243
6,Cingapura,0.029934
7,Coréia,0.025273
8,Espanha,0.015732
9,Estados Unidos,0.005124


### Variância e Desvio Padrão

In [None]:
# Variance and standard deviation of idh per country, using the pandas
# 'var' and 'std' aggregations.
(
    df_dados_paises
    .groupby('pais')
    .agg(
        variancia_idh=('idh', 'var'),
        dp_idh=('idh', 'std'),
    )
    .reset_index()
)

Unnamed: 0,pais,variancia_idh,dp_idh
0,Alemanha,0.000408,0.020187
1,Austrália,0.00011,0.010485
2,Brasil,0.000517,0.022727
3,Bélgica,4.1e-05,0.006401
4,Canadá,0.000142,0.011899
5,China,0.001758,0.041928
6,Cingapura,0.001443,0.037984
7,Coréia,0.000886,0.029773
8,Espanha,0.000372,0.019299
9,Estados Unidos,4.3e-05,0.006558
