https://re-thought.com/exploring-correlation-in-python/

In [10]:
import pandas as pd
import numpy as np
import math
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import norm

# Load the feature table from the CSV export.
df = pd.read_csv('0615 old feature_std 3.0_spare 40_PQRST.csv')

# Separate the 'old' column (y) from all remaining columns (X).
y = df['old']
X = df.drop(columns=['old'])

# 皮爾森積動差相關係數
## （Pearson product-moment correlation coefficient）

In [11]:
# Pairwise Pearson correlation between the columns of df.
corrmat = df.corr(method="pearson")

# Flatten the square matrix into a Series indexed by (column, column) pairs,
# then order it ascending; NaN entries end up at the tail of the result.
pairwise_corr = corrmat.unstack()
sorted_mat = pairwise_corr.sort_values()

sorted_mat

ecg_R_intervals  ecg_hr_mean             -0.989087
ecg_hr_mean      ecg_R_intervals         -0.989087
ecg_Q_intervals  ecg_hr_mean             -0.987422
ecg_hr_mean      ecg_Q_intervals         -0.987422
                 ecg_Q_onset_intervals   -0.987187
                                            ...   
cardiovascular   old                           NaN
hbp              cardiovascular                NaN
diabetes         cardiovascular                NaN
recode_T         cardiovascular                NaN
old              cardiovascular                NaN
Length: 24649, dtype: float64

In [12]:
# Inspect the type of the flattened, sorted correlation result.
type(sorted_mat)

pandas.core.series.Series

# 斯皮爾曼等級相關係數
## （Spearman’s rank correlation coefficient）

In [13]:
# Approach 1: have pandas compute the Spearman rank-correlation matrix.
df_corr = df.corr(method='spearman')

# Save the matrix to CSV, writing the row labels as well.
df_corr.to_csv("0616 spearman corr.csv", index=True)

In [5]:
from scipy.stats import spearmanr

In [6]:
# Approach 2: SciPy's spearmanr on the whole frame. The call is unpacked into
# the correlation matrix and the matching matrix of p-values.
# Import the submodule explicitly: a bare `import scipy` only exposes
# `scipy.stats` on some SciPy versions, or when another cell already loaded it.
import scipy.stats

# NOTE(review): the recorded run printed warnings from `c /= stddev[...]` for
# this call, presumably triggered by a zero-variance column — confirm and
# consider excluding that column before correlating.
corr_spearman, spearman_pvalue = scipy.stats.spearmanr(df)

corr_spearman

  c /= stddev[:, None]
  c /= stddev[None, :]


array([[ 1.        ,  0.52962096,  0.7945782 , ..., -0.11846936,
        -0.07273914, -0.10028612],
       [ 0.52962096,  1.        ,  0.38780252, ..., -0.07686283,
        -0.12987358,  0.07058684],
       [ 0.7945782 ,  0.38780252,  1.        , ..., -0.12800726,
        -0.08511218, -0.08948813],
       ...,
       [-0.11846936, -0.07686283, -0.12800726, ...,  1.        ,
        -0.03942212,  0.08284436],
       [-0.07273914, -0.12987358, -0.08511218, ..., -0.03942212,
         1.        , -0.03157162],
       [-0.10028612,  0.07058684, -0.08948813, ...,  0.08284436,
        -0.03157162,  1.        ]])

In [7]:
# Show the p-value matrix that spearmanr returned alongside corr_spearman.
spearman_pvalue

array([[0.00000000e+000, 0.00000000e+000, 0.00000000e+000, ...,
        0.00000000e+000, 8.25958257e-213, 0.00000000e+000],
       [0.00000000e+000, 0.00000000e+000, 0.00000000e+000, ...,
        1.89458215e-237, 0.00000000e+000, 1.70999595e-200],
       [0.00000000e+000, 0.00000000e+000, 0.00000000e+000, ...,
        0.00000000e+000, 7.33350020e-291, 0.00000000e+000],
       ...,
       [0.00000000e+000, 1.89458215e-237, 0.00000000e+000, ...,
        0.00000000e+000, 9.17140406e-064, 1.25169366e-275],
       [8.25958257e-213, 0.00000000e+000, 7.33350020e-291, ...,
        9.17140406e-064, 0.00000000e+000, 1.58538607e-041],
       [0.00000000e+000, 1.70999595e-200, 0.00000000e+000, ...,
        1.25169366e-275, 1.58538607e-041, 0.00000000e+000]])

In [8]:
# Approach 3: call the directly imported spearmanr on X (df without the 'old'
# column) and unpack the coefficients and their p-values.
spearman_result = spearmanr(X)
rho, p = spearman_result

rho

array([[ 1.        ,  0.52962096,  0.7945782 , ..., -0.04732096,
        -0.11846936, -0.07273914],
       [ 0.52962096,  1.        ,  0.38780252, ...,  0.06671213,
        -0.07686283, -0.12987358],
       [ 0.7945782 ,  0.38780252,  1.        , ..., -0.06872137,
        -0.12800726, -0.08511218],
       ...,
       [-0.04732096,  0.06671213, -0.06872137, ...,  1.        ,
         0.13165912, -0.01832715],
       [-0.11846936, -0.07686283, -0.12800726, ...,  0.13165912,
         1.        , -0.03942212],
       [-0.07273914, -0.12987358, -0.08511218, ..., -0.01832715,
        -0.03942212,  1.        ]])

In [9]:
# Show the p-values that spearmanr(X) returned alongside rho.
p

array([[0.00000000e+000, 0.00000000e+000, 0.00000000e+000, ...,
        4.42466149e-091, 0.00000000e+000, 8.25958257e-213],
       [0.00000000e+000, 0.00000000e+000, 0.00000000e+000, ...,
        2.92284328e-179, 1.89458215e-237, 0.00000000e+000],
       [0.00000000e+000, 0.00000000e+000, 0.00000000e+000, ...,
        4.03819358e-190, 0.00000000e+000, 7.33350020e-291],
       ...,
       [4.42466149e-091, 2.92284328e-179, 4.03819358e-190, ...,
        0.00000000e+000, 0.00000000e+000, 4.69620522e-015],
       [0.00000000e+000, 1.89458215e-237, 0.00000000e+000, ...,
        0.00000000e+000, 0.00000000e+000, 9.17140406e-064],
       [8.25958257e-213, 0.00000000e+000, 7.33350020e-291, ...,
        4.69620522e-015, 9.17140406e-064, 0.00000000e+000]])

# 肯德爾等級相關係數  
## (Kendall correlation coefficients)  

肯德爾等級相關係數，主要是觀察兩個序數型（可排序）特徵之間的關係強度  
Kendall 的 tau 是衡量兩個排名之間對應關係的指標  
這是一種序數數據的相關性度量  
tau 接近 1 的值表示兩個排名高度一致，接近 -1 的值表示兩個排名高度不一致（順序相反）。

In [39]:
# Toy example: Kendall's tau between two short lists that contain ties.
from scipy import stats

x1 = [12, 2, 1, 12, 2]
x2 = [1, 4, 7, 1, 0]

# kendalltau's result unpacks into the coefficient and its p-value.
kendall_result = stats.kendalltau(x1, x2)
tau, p_value = kendall_result

tau

-0.4714045207910316

In [40]:
# Show the p-value from the kendalltau toy example.
p_value

0.2827454599327748

In [44]:
from scipy.stats import kendalltau

# Kendall's tau between the 'old' and 'diabetes' columns of df.
# Call the name imported above instead of `scipy.stats.kendalltau`, so this
# cell does not depend on an `import scipy` executed in some other cell.
tau, p_value = kendalltau(df['old'], df['diabetes'])

tau

0.08164899918520882

In [45]:
# Show the p-value of the Kendall tau test between df['old'] and df['diabetes'].
p_value

9.853184446753511e-275