In [3]:
import pandas as pd
import numpy as np
import random

In [8]:
# Configuração da semente para reprodutibilidade
np.random.seed(42)

# Número de observações
n = 500

# Variáveis quantitativas
# Variáveis com relação linear
x1 = np.random.uniform(0, 100, n)
y1 = 2 * x1 + np.random.normal(0, 10, n)  # Correlação linear positiva

# Variáveis com relação não-linear
x2 = np.random.uniform(-10, 10, n)
y2 = np.sin(x2) + np.random.normal(0, 0.2, n)  # Correlação não-linear

# Variáveis sem relação
x3 = np.random.uniform(0, 50, n)
y3 = np.random.uniform(0, 50, n)

# Variáveis qualitativas
# Criando categorias dependentes e independentes
qual1 = np.random.choice(['A', 'B', 'C'], size=n, p=[0.3, 0.5, 0.2])  # Independente
qual2 = np.where(x1 > 50, 'High', 'Low')  # Dependente de x1
qual3 = np.random.choice(['Yes', 'No'], size=n, p=[0.6, 0.4])  # Independente
qual4 = np.where(qual1 == 'A', 'Type1', np.where(qual1 == 'B', 'Type2', 'Type3'))  # Dependente de qual1

# Criando relação entre qualitativa e quantitativa
quant_qual_relation = np.where(qual3 == 'Yes', x2 + 5, x2 - 5) # relacionado com uma qualitativa (qual3) e uma quanti (x2)

# Construindo o DataFrame
df = pd.DataFrame({
    'x1': x1,
    'y1': y1,
    'x2': x2,
    'y2': y2,
    'x3': x3,
    'y3': y3,
    'qual1': qual1,
    'qual2': qual2,
    'qual3': qual3,
    'qual4': qual4,
    'quant_qual_relation': quant_qual_relation
})

In [5]:
df.head()

Unnamed: 0,x1,y1,x2,y2,x3,y3,qual1,qual2,qual3,qual4,quant_qual_relation
0,37.454012,78.325584,-4.659435,1.037008,7.001205,42.689246,B,Low,No,Type2,-9.659435
1,95.071431,208.90457,7.5726,0.898849,13.840715,12.098857,A,High,Yes,Type1,12.5726
2,73.199394,155.903027,5.94852,-0.301744,48.576635,48.031349,B,High,Yes,Type2,10.94852
3,59.865848,113.96266,3.169037,-0.057935,16.56735,9.846285,C,High,Yes,Type3,8.169037
4,15.601864,22.219581,7.011635,0.807335,24.102053,47.571491,C,Low,Yes,Type3,12.011635


## Relacoes entre as quantitativas

### Correlacao Pearson - limitacao linear

In [6]:
df_quanti = df.select_dtypes(include=[np.number]) # selecionando apenas variaveis quantitativas

In [7]:
df_quanti.head()

Unnamed: 0,x1,y1,x2,y2,x3,y3,quant_qual_relation
0,37.454012,78.325584,-4.659435,1.037008,7.001205,42.689246,-9.659435
1,95.071431,208.90457,7.5726,0.898849,13.840715,12.098857,12.5726
2,73.199394,155.903027,5.94852,-0.301744,48.576635,48.031349,10.94852
3,59.865848,113.96266,3.169037,-0.057935,16.56735,9.846285,8.169037
4,15.601864,22.219581,7.011635,0.807335,24.102053,47.571491,12.011635


* Conforme disposto acima, X1 e Y1 possui relacao linear, logo, o indice de correlacao de pearson foi alto: 0,98
* Alem disso, X2 e a variavel quanti_quali tambem possui alto indice
* Porem, ele nao conseguiu capturar a relacao entre X2 e Y2

In [43]:
df_quanti.corr('pearson')

Unnamed: 0,x1,y1,x2,y2,x3,y3,quant_qual_relation
x1,1.0,0.986456,0.044031,0.028469,-0.03152,-0.058175,0.058179
y1,0.986456,1.0,0.043642,0.043508,-0.021401,-0.067457,0.062315
x2,0.044031,0.043642,1.0,0.17493,-0.023454,0.026076,0.781427
y2,0.028469,0.043508,0.17493,1.0,0.078015,0.050573,0.127415
x3,-0.03152,-0.021401,-0.023454,0.078015,1.0,-0.063086,-0.004032
y3,-0.058175,-0.067457,0.026076,0.050573,-0.063086,1.0,0.03983
quant_qual_relation,0.058179,0.062315,0.781427,0.127415,-0.004032,0.03983,1.0


### Mutual Information - mensura nao-linearidades

In [9]:
from sklearn.feature_selection import mutual_info_regression

In [10]:
# Mutual information é calculada em relação a uma variável-alvo; vamos calcular para cada variável como alvo.
quantitative_columns = df_quanti.columns
mutual_info_matrix = pd.DataFrame(index=quantitative_columns, columns=quantitative_columns)

for target in quantitative_columns:
    # Definindo a variável alvo
    target_data = df_quanti[target]
    # Definindo as demais variáveis como features
    feature_data = df_quanti.drop(columns=[target])
    # Calculando mutual information
    mutual_info = mutual_info_regression(feature_data, target_data, discrete_features=False)
    mutual_info_matrix.loc[target, feature_data.columns] = mutual_info

# Preenchendo a diagonal com NaN, pois não faz sentido calcular MI com a própria variável
np.fill_diagonal(mutual_info_matrix.values, np.nan)

* Semelhante ao coeficiente de pearson, o metodo de mutual information tambem identificou correlacao entre X1 e Y1
* Porem, alem disso, tambem identificou relacao entre X2 e Y2, que tem uma relacao não-linear

In [23]:
mutual_info_matrix

Unnamed: 0,x1,y1,x2,y2,x3,y3,quant_qual_relation
x1,,1.692966,0.0,0.0,0.0,0.026353,0.0
y1,1.692966,,0.0,0.0,0.050972,0.0,0.0
x2,0.0,0.0,,1.032575,0.0,0.037412,3.822553
y2,0.0,0.0,1.032575,,0.030195,0.019822,0.570177
x3,0.0,0.050972,0.0,0.030195,,0.037164,0.0
y3,0.026353,0.0,0.037412,0.019822,0.037164,,0.050291
quant_qual_relation,0.0,0.0,3.822553,0.570177,0.0,0.050291,


## Relacoes entre as qualitativas

### V de cramer

In [24]:
df_quali = df.select_dtypes(exclude=[np.number])

In [25]:
df_quali.head()

Unnamed: 0,qual1,qual2,qual3,qual4
0,B,Low,No,Type2
1,A,High,Yes,Type1
2,B,High,Yes,Type2
3,C,High,Yes,Type3
4,C,Low,Yes,Type3


In [53]:
from scipy.stats import chi2_contingency
import itertools

def cramers_v(confusion_matrix):
    chi2 = chi2_contingency(confusion_matrix)[0]
    n = confusion_matrix.sum().sum()  # Somando todos os valores na matriz
    r, k = confusion_matrix.shape
    return np.sqrt(chi2 / (n * (min(r, k) - 1)))

In [63]:
# Calculando o V de Cramer para todas as combinações de variáveis qualitativas
qualitative_columns = df_quali.columns
combinations = itertools.combinations(qualitative_columns, 2)

# Criando uma lista para armazenar os resultados
cramers_v_list = []

# Calculando V de Cramer para cada par de variáveis qualitativas
for var1, var2 in itertools.combinations(qualitative_columns, 2):
    # Criando a matriz de contingência
    contingency_table = pd.crosstab(df_quali[var1], df_quali[var2])
    # Calculando o V de Cramer
    value = cramers_v(contingency_table)
    # Armazenando o resultado em uma lista
    cramers_v_list.append({'Variable 1': var1, 'Variable 2': var2, 'Cramér\'s V': value})

# Convertendo a lista para um DataFrame
cramers_v_df = pd.DataFrame(cramers_v_list)

* Pelo v de cramer conseguimos identificar a relacao entre qual1 e qual4

In [64]:
cramers_v_df

Unnamed: 0,Variable 1,Variable 2,Cramér's V
0,qual1,qual2,0.062686
1,qual1,qual3,0.048071
2,qual1,qual4,1.0
3,qual2,qual3,0.03656
4,qual2,qual4,0.062686
5,qual3,qual4,0.048071


### Mutual Information

In [69]:
from sklearn.metrics import mutual_info_score

In [70]:
# Função para calcular a mutual information para variáveis qualitativas
def mutual_information_qualitative(var1, var2):
    return mutual_info_score(var1, var2)

In [71]:
# Inicializando a matriz para armazenar os valores de Mutual Information
mi_matrix = pd.DataFrame(np.zeros((len(qualitative_columns), len(qualitative_columns))),
                         index=qualitative_columns, columns=qualitative_columns)

# Preenchendo a matriz com os valores de Mutual Information
for var1, var2 in itertools.combinations(qualitative_columns, 2):
    mi_value = mutual_information_qualitative(df_quali[var1], df_quali[var2])
    mi_matrix.at[var1, var2] = mi_value
    mi_matrix.at[var2, var1] = mi_value

# Preenchendo a diagonal com NaN, já que Mutual Information com a própria variável não faz sentido
np.fill_diagonal(mi_matrix.values, np.nan)

* So que pelo mutual information tambem conseguimos capturar a relacao entre qual1 e qual4

In [73]:
mi_matrix

Unnamed: 0,qual1,qual2,qual3,qual4
qual1,,0.00197,0.001153,1.039746
qual2,0.00197,,0.000827,0.00197
qual3,0.001153,0.000827,,0.001153
qual4,1.039746,0.00197,0.001153,


## Relacao entre quali e quanti

### Categorizando as variaveis quanti e aplicando Mutual Information

In [76]:
# Aplicando a categorização das variáveis quantitativas usando qcut
quantitative_vars = df.select_dtypes(include=[np.number])
df_categ = df.copy()

In [90]:
df_categ['x1'] = pd.qcut(df['x1'], q=6)
df_categ['y1'] = pd.qcut(df['y1'], q=6)
df_categ['x2'] = pd.qcut(df['x2'], q=6)
df_categ['y2'] = pd.qcut(df['y2'], q=6)
df_categ['x3'] = pd.qcut(df['x3'], q=6)
df_categ['y3'] = pd.qcut(df['y3'], q=6)
df_categ['quant_qual_relation'] = pd.qcut(df['quant_qual_relation'], q=6)

In [89]:
df_categ

Unnamed: 0,x1,y1,x2,y2,x3,y3,qual1,qual2,qual3,qual4,quant_qual_relation
0,"(32.304, 51.316]","(65.041, 101.707]","(-6.716, -3.136]","(0.834, 1.471]","(0.00573, 7.03]","(41.606, 49.741]",B,Low,No,Type2,"(-14.886, -6.392]"
1,"(84.85, 99.296]","(166.735, 215.709]","(6.632, 9.988]","(0.834, 1.471]","(7.03, 15.987]","(9.096, 16.589]",A,High,Yes,Type1,"(9.629, 14.988]"
2,"(66.318, 84.85]","(136.488, 166.735]","(3.51, 6.632]","(-0.467, -0.0413]","(40.976, 49.891]","(41.606, 49.741]",B,High,Yes,Type2,"(9.629, 14.988]"
3,"(51.316, 66.318]","(101.707, 136.488]","(0.516, 3.51]","(-0.467, -0.0413]","(15.987, 23.652]","(9.096, 16.589]",C,High,Yes,Type3,"(5.286, 9.629]"
4,"(13.785, 32.304]","(-13.834, 29.01]","(6.632, 9.988]","(0.42, 0.834]","(23.652, 32.146]","(41.606, 49.741]",C,Low,Yes,Type3,"(9.629, 14.988]"
...,...,...,...,...,...,...,...,...,...,...,...
495,"(32.304, 51.316]","(65.041, 101.707]","(6.632, 9.988]","(0.42, 0.834]","(7.03, 15.987]","(33.695, 41.606]",C,Low,Yes,Type3,"(9.629, 14.988]"
496,"(51.316, 66.318]","(101.707, 136.488]","(0.516, 3.51]","(0.42, 0.834]","(0.00573, 7.03]","(16.589, 25.322]",B,High,Yes,Type2,"(5.286, 9.629]"
497,"(0.505, 13.785]","(-13.834, 29.01]","(6.632, 9.988]","(-0.0413, 0.42]","(40.976, 49.891]","(9.096, 16.589]",B,Low,No,Type2,"(1.158, 5.286]"
498,"(84.85, 99.296]","(166.735, 215.709]","(-3.136, 0.516]","(-0.0413, 0.42]","(40.976, 49.891]","(41.606, 49.741]",A,High,Yes,Type1,"(5.286, 9.629]"


In [91]:
all_columns = df_categ.columns
combinations = itertools.combinations(all_columns, 2)

# Inicializando a matriz para armazenar os valores de Mutual Information
mi_matrix = pd.DataFrame(np.zeros((len(all_columns), len(all_columns))),
                         index=all_columns, columns=all_columns)

# Preenchendo a matriz com os valores de Mutual Information
for var1, var2 in itertools.combinations(all_columns, 2):
    mi_value = mutual_information_qualitative(df_categ[var1], df_categ[var2])
    mi_matrix.at[var1, var2] = mi_value
    mi_matrix.at[var2, var1] = mi_value

# Preenchendo a diagonal com NaN, já que Mutual Information com a própria variável não faz sentido
np.fill_diagonal(mi_matrix.values, np.nan)

In [92]:
mi_matrix

Unnamed: 0,x1,y1,x2,y2,x3,y3,qual1,qual2,qual3,qual4,quant_qual_relation
x1,,1.242291,0.023746,0.024355,0.023624,0.029501,0.009057,0.635523,0.003745,0.009057,0.025506
y1,1.242291,,0.023496,0.021541,0.029294,0.029572,0.007156,0.570986,0.003576,0.007156,0.022541
x2,0.023746,0.023496,,0.387297,0.023852,0.017437,0.012742,0.01116,0.004166,0.012742,0.818156
y2,0.024355,0.021541,0.387297,,0.018398,0.019874,0.012111,0.003047,0.006019,0.012111,0.039403
x3,0.023624,0.029294,0.023852,0.018398,,0.029024,0.012763,0.009372,0.007281,0.012763,0.025498
y3,0.029501,0.029572,0.017437,0.019874,0.029024,,0.018046,0.010598,0.003993,0.018046,0.017028
qual1,0.009057,0.007156,0.012742,0.012111,0.012763,0.018046,,0.00197,0.001153,1.039746,0.008563
qual2,0.635523,0.570986,0.01116,0.003047,0.009372,0.010598,0.00197,,0.000827,0.00197,0.013992
qual3,0.003745,0.003576,0.004166,0.006019,0.007281,0.003993,0.001153,0.000827,,0.001153,0.329139
qual4,0.009057,0.007156,0.012742,0.012111,0.012763,0.018046,1.039746,0.00197,0.001153,,0.008563
