# url: https://qiita.com/ShoheiKojima/items/9a94931d5f298cf9663c

# Pythonで相関係数を計算する[4パターン]

## PythonでPearsonの相関係数を計算する方法を、パターンごとにまとめてみた
    2つのリストを比較 -> pd.Series.corr()
    1つのDataFrameに含まれるデータの総当たり -> pd.DataFrame.corr()
    2つの対応のあるDataFrameで、対応しているデータ同士を比較 -> pd.DataFrame.corrwith()
    2つの対応のないDataFrameを総当たりで比較 -> scipyのcdist2つのリストを比較 -> pandasのcorr()を使用

# 2つのリストを比較

In [1]:
import pandas as pd
import numpy as np

# テスト用のリストを作る
l1=list(np.random.randint(0, 10, 10))
l2=list(np.random.randint(0, 10, 10))

# 作ったlist
print(l1)
[4, 6, 0, 8, 6, 2, 0, 3, 3, 5]
print(l2)
[4, 6, 3, 7, 8, 4, 6, 9, 0, 0]

# リストをps.Seriesに変換
s1=pd.Series(l1)
s2=pd.Series(l2)

# pandasを使用してPearson's rを計算
res=s1.corr(s2)   # numpy.float64 に格納される

# 結果
print(res)

# 補足
# s1.corr(s2) は s1.corr(s2, method='pearson') と同じ
# 他にも以下などが使える
display(s1.corr(s2, method='pearson'))
display(s1.corr(s2, method='spearman'))
display(s1.corr(s2, method='kendall'))

[7, 9, 0, 6, 9, 6, 8, 3, 1, 7]
[2, 5, 9, 1, 1, 1, 9, 6, 9, 9]
-0.4641542481730193


-0.4641542481730193

-0.3228831616805113

-0.2571722499368198

# 1つのDataFrameに含まれるデータの総当たり -> pandasのcorr()を使用する

In [2]:
import pandas as pd
import numpy as np

# テスト用のDataFrameを作る
df = pd.DataFrame(index = ['idx' + str(i) for i in range(10)])

for i in range(3):
    df['col' + str(i)] = np.random.rand(10)

# 作ったdata frame
display(df)

# pandasを使用してPearson's rを計算
res=df.corr()   # pandasのDataFrameに格納される

# 結果
display(res)

# 補足
# df.corr() は df.corr(method='pearson') と同じ

# 他にも以下などが使える
display(df.corr(method='pearson'))
display(df.corr(method='spearman'))
display(df.corr(method='kendall'))

Unnamed: 0,col0,col1,col2
idx0,0.371087,0.950281,0.711905
idx1,0.19368,0.627805,0.940009
idx2,0.372649,0.865396,0.771621
idx3,0.075908,0.50818,0.062268
idx4,0.614652,0.543775,0.457844
idx5,0.90365,0.999514,0.67747
idx6,0.615386,0.387643,0.243838
idx7,0.760527,0.778586,0.401577
idx8,0.425545,0.041776,0.55194
idx9,0.733903,0.437113,0.778866


Unnamed: 0,col0,col1,col2
col0,1.0,0.187803,0.090452
col1,0.187803,1.0,0.316056
col2,0.090452,0.316056,1.0


Unnamed: 0,col0,col1,col2
col0,1.0,0.187803,0.090452
col1,0.187803,1.0,0.316056
col2,0.090452,0.316056,1.0


Unnamed: 0,col0,col1,col2
col0,1.0,0.10303,-0.078788
col1,0.10303,1.0,0.309091
col2,-0.078788,0.309091,1.0


Unnamed: 0,col0,col1,col2
col0,1.0,0.111111,-0.111111
col1,0.111111,1.0,0.155556
col2,-0.111111,0.155556,1.0


# 2つの対応のあるDataFrameで、対応しているデータ同士を比較 -> pandasのcorrwith()を使用する

In [3]:
import pandas as pd
import numpy as np

# テスト用のDataFrameを作る
df1=pd.DataFrame(index=['idx'+str(i) for i in range(10)])
for i in range(3):
    df1['col'+str(i)]=np.random.rand(10)

df2=pd.DataFrame(index=['idx'+str(i) for i in range(10)])
for i in range(4):
    df2['col'+str(i)]=np.random.rand(10)

# 作ったdata frame
# indexの名前がdf1とdf2で一致している必要あり
display(df1)
display(df2)

# pandasを使用してPearson's rを計算
res=df1.corrwith(df2)   # pandasのSeriesに格納される

# 結果
# df1とdf2で、同じ名前のカラム同士が比較される
# df2のcol3のように、同じ名前のカラムがdf1に存在しない場合は比較されない
display(res)

# 補足
# df1.corrwith(df2) は df1.corrwith(df2, method='pearson') と同じ
# 他にも以下などが使える
display(df.corrwith(df2, method='pearson'))
display(df.corrwith(df2, method='spearman'))
display(df.corrwith(df2, method='kendall'))

Unnamed: 0,col0,col1,col2
idx0,0.755589,0.471308,0.032547
idx1,0.285211,0.076133,0.070085
idx2,0.408797,0.611113,0.331851
idx3,0.249739,0.774025,0.827919
idx4,0.305625,0.948313,0.338476
idx5,0.389424,0.128673,0.559349
idx6,0.674555,0.818581,0.305893
idx7,0.280734,0.335536,0.57359
idx8,0.240832,0.332733,0.684761
idx9,0.330653,0.259004,0.253075


Unnamed: 0,col0,col1,col2,col3
idx0,0.674439,0.380138,0.93974,0.078903
idx1,0.393078,0.661484,0.614931,0.614736
idx2,0.988543,0.487546,0.223435,0.811712
idx3,0.380908,0.573999,0.068904,0.005098
idx4,0.579113,0.474842,0.025432,0.947246
idx5,0.488172,0.453679,0.865041,0.036813
idx6,0.800256,0.803987,0.059251,0.960699
idx7,0.532282,0.743966,0.261859,0.869578
idx8,0.241775,0.60168,0.203842,0.109384
idx9,0.611211,0.055374,0.240333,0.538409


col0    0.588812
col1    0.196134
col2   -0.419573
col3         NaN
dtype: float64

col0    0.168313
col1   -0.130891
col2    0.589464
col3         NaN
dtype: float64

col0    0.175758
col1   -0.357576
col2    0.575758
col3         NaN
dtype: float64

col0    0.066667
col1   -0.244444
col2    0.422222
col3         NaN
dtype: float64

# 2つの対応のないDataFrameを総当たりで比較 -> scipyのcdistを使用する

In [4]:
import pandas as pd
import numpy as np
from scipy.spatial.distance import cdist

# テスト用のDataFrameを作る
df1=pd.DataFrame(index=['df1idx'+str(i) for i in range(10)])
for i in range(2):
    df1['df1col'+str(i)]=np.random.rand(10)

df2=pd.DataFrame(index=['df2idx'+str(i) for i in range(10)])
for i in range(3):
    df2['df2col'+str(i)]=np.random.rand(10)

# 作ったdata frame
# indexやcolumnの名前が異なっていても大丈夫
# 当然、indexの長さはdf1とdf2で同じである必要がある
display(df1)
display(df2)

# pd.DataFrameをnumpy.ndarrayに変換
ndf1=df1.T.values
ndf2=df2.T.values

# cdistを使用してPearson's rを計算
# cdistは (1から相関係数を引いた値) を返す
# つまり、相関係数は (1 - cdistの結果) となる
res=(1 - cdist(ndf1, ndf2, metric='correlation'))   # numpy.ndarrayに格納される

# 結果
display(res)

# (オプション) 結果をpd.DataFrameに変換する
res=pd.DataFrame(res, index=df1.columns, columns=df2.columns)
display(res)

Unnamed: 0,df1col0,df1col1
df1idx0,0.960435,0.74791
df1idx1,0.466169,0.986008
df1idx2,0.903699,0.846081
df1idx3,0.026144,0.350689
df1idx4,0.583812,0.219648
df1idx5,0.245538,0.972939
df1idx6,0.986448,0.369217
df1idx7,0.060879,0.621413
df1idx8,0.378382,0.800731
df1idx9,0.617216,0.826718


Unnamed: 0,df2col0,df2col1,df2col2
df2idx0,0.972909,0.091834,0.499361
df2idx1,0.232843,0.171582,0.540843
df2idx2,0.553718,0.117093,0.104819
df2idx3,0.606602,0.847437,0.457459
df2idx4,0.039237,0.293063,0.695886
df2idx5,0.550089,0.254106,0.130302
df2idx6,0.266572,0.558932,0.390571
df2idx7,0.674553,0.224411,0.823455
df2idx8,0.594084,0.988025,0.921409
df2idx9,0.675105,0.936591,0.964777


array([[-0.04494975, -0.26422226, -0.24479468],
       [ 0.3826771 , -0.19048527, -0.13595214]])

Unnamed: 0,df2col0,df2col1,df2col2
df1col0,-0.04495,-0.264222,-0.244795
df1col1,0.382677,-0.190485,-0.135952
