# Análise Associação e Correlação

## Módulos

In [1]:
# instalar versão atualizada
! sudo pip install scipy==1.7.1

Collecting scipy==1.7.1
  Downloading scipy-1.7.1-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.whl (28.5 MB)
[K     |████████████████████████████████| 28.5 MB 1.3 MB/s 
Installing collected packages: scipy
  Attempting uninstall: scipy
    Found existing installation: scipy 1.4.1
    Uninstalling scipy-1.4.1:
      Successfully uninstalled scipy-1.4.1
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
albumentations 0.1.12 requires imgaug<0.2.7,>=0.2.5, but you have imgaug 0.2.9 which is incompatible.[0m
Successfully installed scipy-1.7.1


In [2]:
# instalar nova ferramenta para testes de hipoteses
! sudo pip install pingouin

Collecting pingouin
  Downloading pingouin-0.5.0.tar.gz (182 kB)
[K     |████████████████████████████████| 182 kB 5.1 MB/s 
Collecting statsmodels>=0.12.0
  Downloading statsmodels-0.13.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (9.8 MB)
[K     |████████████████████████████████| 9.8 MB 24.1 MB/s 
Collecting pandas_flavor>=0.2.0
  Downloading pandas_flavor-0.2.0-py2.py3-none-any.whl (6.6 kB)
Collecting outdated
  Downloading outdated-0.2.1-py3-none-any.whl (7.5 kB)
Collecting littleutils
  Downloading littleutils-0.2.2.tar.gz (6.6 kB)
Building wheels for collected packages: pingouin, littleutils
  Building wheel for pingouin (setup.py) ... [?25l[?25hdone
  Created wheel for pingouin: filename=pingouin-0.5.0-py3-none-any.whl size=193661 sha256=73389c8f1738cee98c221b133fee5f6a72fc38e820c91ea7e4ee053d9e1beecb
  Stored in directory: /root/.cache/pip/wheels/14/46/f9/cedd81d68d2515c24bbbd000d5b347e4fe092ccc4b568f7f70
  Building wheel for littleutils (setup.py) ... [?25l

In [3]:
# gerais
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [4]:
from scipy import stats as st
import pingouin as pg

## Dados

In [5]:
dados_tratamentos = pd.read_csv('dados_tratamentos.csv', sep = ';')
dados_tratamentos.head()

Unnamed: 0,id,tratamento,evolucao
0,1,tratado,melhorou
1,2,tratado,melhorou
2,3,nao-tratado,melhorou
3,4,tratado,melhorou
4,5,tratado,nao-melhorou


In [6]:
dados_nps = pd.read_csv('nps_example.csv', sep = ';')
dados_nps.head()

Unnamed: 0,id,response_status,how_long_listening,age,nps_score,gender
0,11706300,Complete,Less than 6 months,25-34,10.0,Female
1,11706302,Complete,1 year to less than 3 years,25-34,10.0,Female
2,11706307,Complete,6 months to less than a year,35-44,10.0,Female
3,11706312,Complete,Less than 6 months,35-44,10.0,Female
4,11706316,Complete,6 months to less than a year,25-34,10.0,Male


In [7]:
dados_bolsa = pd.read_csv('dados_bolsa.csv', sep = ';', decimal = ',')
dados_bolsa

Unnamed: 0,data,petr4,bbdc3,vale5,ambv4,itub4
0,2004-05-26,-0.002270,0.009524,0.013699,0.032668,0.009843
1,2004-05-27,0.026301,0.028396,0.028303,0.014060,0.021442
2,2004-05-28,-0.016484,-0.000092,-0.025261,0.019151,0.000000
3,2004-05-31,0.010986,-0.009174,0.011235,-0.001786,0.000000
4,2004-06-01,0.028142,0.023056,-0.004518,0.013629,0.012107
...,...,...,...,...,...,...
1773,2011-03-14,0.003905,-0.000407,-0.002139,-0.010953,0.004765
1774,2011-03-15,-0.008487,0.004473,-0.016724,-0.004873,0.011994
1775,2011-03-16,-0.009272,-0.017409,-0.025294,-0.020699,-0.023705
1776,2011-03-17,0.010799,-0.001236,0.027293,-0.001136,-0.005364


## Análises

### Análise de associação - Phi

- Para obter o *Phi*, precisamos obter a tabela cruzada

In [8]:
tabela = pd.crosstab(dados_tratamentos['tratamento'], 
                     dados_tratamentos['evolucao'])
tabela

evolucao,melhorou,nao-melhorou
tratamento,Unnamed: 1_level_1,Unnamed: 2_level_1
nao-tratado,26,29
tratado,35,15


- Ferramenta scipy.stats

In [9]:
# obter valor do qui quadrado
qui_quadrado = st.chi2_contingency(tabela)[0]
qui_quadrado

4.6625668947297125

In [10]:
# tamanho da amostra
n = np.sum(np.sum(tabela))
n

105

In [11]:
# obter o phi
Phi = np.sqrt(qui_quadrado / n)
Phi

0.21072588592155886

- Teste qui-quadrado

In [12]:
qui, p, gl, t_esp = st.chi2_contingency(tabela)

In [13]:
qui, p

(4.6625668947297125, 0.030827072412198585)

### Análise de associação - *V* de *Cramer*

- Filtrar respostas incompletas

In [14]:
dados_nps_filtrados = dados_nps[(dados_nps['response_status'] == 'Complete') & \
                                (dados_nps['nps_score'].notna()) & \
                                (dados_nps['gender'].notna())].reset_index()

- Criar grupos do NPS

In [15]:
def create_nps_groups(x):

  if x <= 5:
    return 'detratores'
  elif x >= 9:
    return 'promotores'
  else:
    return 'neutros'

In [16]:
dados_nps_filtrados['nps_groups'] = dados_nps_filtrados['nps_score'].apply(lambda x: create_nps_groups(x))

In [17]:
dados_nps_filtrados.groupby('nps_groups').size().to_frame('n')

Unnamed: 0_level_0,n
nps_groups,Unnamed: 1_level_1
detratores,25
neutros,171
promotores,2047


- V de Cramer: Vamos analisar a associação entre Sexo e os grupos do NPS

In [18]:
# obter a tabela cruzada
tabela = pd.crosstab(dados_nps_filtrados['gender'], dados_nps_filtrados['nps_groups'])
tabela

nps_groups,detratores,neutros,promotores
gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Female,12,105,1565
Male,13,66,482


- Método 1: utilizando a função do scipy "association"

In [19]:
vcramer_1 = st.contingency.association(tabela, method='cramer')
vcramer_1

0.1134646600588705

- Método 2: estimação direta pela formula

In [20]:
# obter valor do qui quadrado
qui_quadrado = st.chi2_contingency(tabela)[0]
qui_quadrado

28.87689583154292

In [21]:
# tamanho da amostra
n = np.sum(np.sum(tabela))
n

2243

In [22]:
# minimo entre r e c
minshape = min(tabela.shape)-1
minshape

1

In [23]:
vcramer_2 = np.sqrt(qui_quadrado / (n * minshape))
vcramer_2

0.1134646600588705

- Teste qui-quadrado

In [24]:
qui, p, gl, t_esp = st.chi2_contingency(tabela)

In [25]:
qui, p

(28.87689583154292, 5.363666207418583e-07)

### Análise de Correlação

- Verificar nulos

In [26]:
dados_bolsa[dados_bolsa.isnull()]

Unnamed: 0,data,petr4,bbdc3,vale5,ambv4,itub4
0,,,,,,
1,,,,,,
2,,,,,,
3,,,,,,
4,,,,,,
...,...,...,...,...,...,...
1773,,,,,,
1774,,,,,,
1775,,,,,,
1776,,,,,,


- Tratar os nulos

In [27]:
dados_bolsa_filtrados = dados_bolsa.dropna()

In [31]:
dados_bolsa_filtrados.agg(min_data = pd.NamedAgg('data', 'min'),
                          max_data = pd.NamedAgg('data', 'max'))

Unnamed: 0,data
min_data,2004-05-26
max_data,2011-03-18


#### Correlação de Pearson

In [None]:
# correlacao
pearson_correl = dados_bolsa_filtrados.corr()
pearson_correl

Unnamed: 0,petr4,bbdc3,vale5,ambv4,itub4
petr4,1.0,0.539247,0.724023,0.392074,0.593834
bbdc3,0.539247,1.0,0.592143,0.470529,0.778506
vale5,0.724023,0.592143,1.0,0.482919,0.642838
ambv4,0.392074,0.470529,0.482919,1.0,0.488886
itub4,0.593834,0.778506,0.642838,0.488886,1.0


In [None]:
# matriz ajustada
pearson_correl.style.background_gradient(cmap='coolwarm').set_precision(2)

Unnamed: 0,petr4,bbdc3,vale5,ambv4,itub4
petr4,1.0,0.54,0.72,0.39,0.59
bbdc3,0.54,1.0,0.59,0.47,0.78
vale5,0.72,0.59,1.0,0.48,0.64
ambv4,0.39,0.47,0.48,1.0,0.49
itub4,0.59,0.78,0.64,0.49,1.0


- Verificar se correlações são significantes

In [None]:
# colunas para correlacionar
correl_columns = ['petr4', 'bbdc3', 'vale5', 'ambv4', 'itub4']

In [None]:
# teste
pg.pairwise_corr(dados_bolsa_filtrados, 
                 columns = correl_columns, 
                 method = 'pearson')

Unnamed: 0,X,Y,method,alternative,n,r,CI95%,p-unc,BF10,power
0,petr4,bbdc3,pearson,two-sided,1604,0.539247,"[0.5, 0.57]",1.094616e-121,7.488e+117,1.0
1,petr4,vale5,pearson,two-sided,1604,0.724023,"[0.7, 0.75]",1.122582e-260,3.6539999999999997e+256,1.0
2,petr4,ambv4,pearson,two-sided,1604,0.392074,"[0.35, 0.43]",4.390573e-60,3.057e+56,1.0
3,petr4,itub4,pearson,two-sided,1604,0.593834,"[0.56, 0.62]",1.7930020000000002e-153,3.7909999999999997e+149,1.0
4,bbdc3,vale5,pearson,two-sided,1604,0.592143,"[0.56, 0.62]",2.14394e-152,3.189e+148,1.0
5,bbdc3,ambv4,pearson,two-sided,1604,0.470529,"[0.43, 0.51]",3.714265e-89,2.774e+85,1.0
6,bbdc3,itub4,pearson,two-sided,1604,0.778506,"[0.76, 0.8]",0.0,inf,1.0
7,vale5,ambv4,pearson,two-sided,1604,0.482919,"[0.44, 0.52]",1.7386e-94,5.687999999999999e+90,1.0
8,vale5,itub4,pearson,two-sided,1604,0.642838,"[0.61, 0.67]",1.0648299999999999e-187,5.347e+183,1.0
9,ambv4,itub4,pearson,two-sided,1604,0.488886,"[0.45, 0.53]",3.929767e-97,2.467e+93,1.0


#### Correlação de Spearman

In [None]:
# correlacao
spearman_correl = dados_bolsa_filtrados.corr(method='spearman')
spearman_correl

Unnamed: 0,petr4,bbdc3,vale5,ambv4,itub4
petr4,1.0,0.472143,0.644211,0.330002,0.531198
bbdc3,0.472143,1.0,0.505002,0.419052,0.758946
vale5,0.644211,0.505002,1.0,0.404373,0.574199
ambv4,0.330002,0.419052,0.404373,1.0,0.449118
itub4,0.531198,0.758946,0.574199,0.449118,1.0


In [None]:
# matriz ajustada
spearman_correl.style.background_gradient(cmap='coolwarm').set_precision(2)

Unnamed: 0,petr4,bbdc3,vale5,ambv4,itub4
petr4,1.0,0.47,0.64,0.33,0.53
bbdc3,0.47,1.0,0.51,0.42,0.76
vale5,0.64,0.51,1.0,0.4,0.57
ambv4,0.33,0.42,0.4,1.0,0.45
itub4,0.53,0.76,0.57,0.45,1.0


- Teste de significância

In [None]:
# colunas para correlacionar
correl_columns = ['petr4', 'bbdc3', 'vale5', 'ambv4', 'itub4']

In [None]:
# teste
pg.pairwise_corr(dados_bolsa_filtrados, 
                 columns = correl_columns, 
                 method = 'spearman')

Unnamed: 0,X,Y,method,alternative,n,r,CI95%,p-unc,power
0,petr4,bbdc3,spearman,two-sided,1604,0.472143,"[0.43, 0.51]",7.723804e-90,1.0
1,petr4,vale5,spearman,two-sided,1604,0.644211,"[0.61, 0.67]",9.494683000000001e-189,1.0
2,petr4,ambv4,spearman,two-sided,1604,0.330002,"[0.29, 0.37]",4.669599e-42,1.0
3,petr4,itub4,spearman,two-sided,1604,0.531198,"[0.5, 0.57]",1.762927e-117,1.0
4,bbdc3,vale5,spearman,two-sided,1604,0.505002,"[0.47, 0.54]",1.5135e-104,1.0
5,bbdc3,ambv4,spearman,two-sided,1604,0.419052,"[0.38, 0.46]",3.1665670000000004e-69,1.0
6,bbdc3,itub4,spearman,two-sided,1604,0.758946,"[0.74, 0.78]",8.700692e-301,1.0
7,vale5,ambv4,spearman,two-sided,1604,0.404373,"[0.36, 0.44]",3.7948e-64,1.0
8,vale5,itub4,spearman,two-sided,1604,0.574199,"[0.54, 0.61]",2.3954190000000001e-141,1.0
9,ambv4,itub4,spearman,two-sided,1604,0.449118,"[0.41, 0.49]",1.89686e-80,1.0


#### Correlação de Kendall

In [None]:
# correlacao
kendall_correl = dados_bolsa_filtrados.corr(method='kendall')
kendall_correl

Unnamed: 0,petr4,bbdc3,vale5,ambv4,itub4
petr4,1.0,0.332691,0.471033,0.231311,0.378755
bbdc3,0.332691,1.0,0.358557,0.294492,0.571953
vale5,0.471033,0.358557,1.0,0.282942,0.411663
ambv4,0.231311,0.294492,0.282942,1.0,0.318638
itub4,0.378755,0.571953,0.411663,0.318638,1.0


In [None]:
# matriz ajustada
kendall_correl.style.background_gradient(cmap='coolwarm').set_precision(2)

Unnamed: 0,petr4,bbdc3,vale5,ambv4,itub4
petr4,1.0,0.33,0.47,0.23,0.38
bbdc3,0.33,1.0,0.36,0.29,0.57
vale5,0.47,0.36,1.0,0.28,0.41
ambv4,0.23,0.29,0.28,1.0,0.32
itub4,0.38,0.57,0.41,0.32,1.0


- Teste de significância

In [None]:
# colunas para correlacionar
correl_columns = ['petr4', 'bbdc3', 'vale5', 'ambv4', 'itub4']

In [None]:
# teste
pg.pairwise_corr(dados_bolsa_filtrados, 
                 columns = correl_columns, 
                 method = 'kendall')

Unnamed: 0,X,Y,method,alternative,n,r,CI95%,p-unc,power
0,petr4,bbdc3,kendall,two-sided,1604,0.332691,"[0.29, 0.38]",1.358166e-88,1.0
1,petr4,vale5,kendall,two-sided,1604,0.471033,"[0.43, 0.51]",9.805061e-176,1.0
2,petr4,ambv4,kendall,two-sided,1604,0.231311,"[0.18, 0.28]",8.582459999999999e-44,1.0
3,petr4,itub4,kendall,two-sided,1604,0.378755,"[0.34, 0.42]",2.531465e-114,1.0
4,bbdc3,vale5,kendall,two-sided,1604,0.358557,"[0.32, 0.4]",1.3756149999999999e-102,1.0
5,bbdc3,ambv4,kendall,two-sided,1604,0.294492,"[0.25, 0.34]",8.231269e-70,1.0
6,bbdc3,itub4,kendall,two-sided,1604,0.571953,"[0.54, 0.6]",7.030155e-258,1.0
7,vale5,ambv4,kendall,two-sided,1604,0.282942,"[0.24, 0.33]",1.2462689999999999e-64,1.0
8,vale5,itub4,kendall,two-sided,1604,0.411663,"[0.37, 0.45]",1.1008199999999998e-134,1.0
9,ambv4,itub4,kendall,two-sided,1604,0.318638,"[0.27, 0.36]",1.852468e-81,1.0
