## 상관분석

- 두 변수의 선형관계를 확인하기 위해 상관분석을 실시
- 두 수치형 변수의 비교: Pearson's Correlation Coefficient를 확인하며, 그 외 다양한 상관계수 존재
- 상관계수(r)가 0에 가까울수록 선형관계가 약하며, 절댓값이 1에 가까울수록 강한 선형관계를 가짐 (-1 ≤ r ≤ 1)

1. **Pandas: corr() 상관계수 빠르게 확인 가능**
 - method에 'pearson', 'kendall', 'spearman' 중 하나를 지정하면 해당 상관계수를 계산
 - 데이터프레임 전용 메소드
 
2. **Scipy: pearsonr()**
- Pearson 상관분석을 실시하는 scipy 함수
- 입력: 2개의 1차원 벡터를 넣고 / 출력: 상관계수와 p-value 차례대로 출력
3. **Scipy: spearmanr()**
4. **Scipy: kendalltau()**


In [2]:
import pandas as pd
from scipy.stats import pearsonr, spearmanr, kendalltau

In [3]:
# Load the bike data set from the local CSV file.
df = pd.read_csv(filepath_or_buffer="C:/Users/silan/Python/Data/bike.csv")
# Preview the first five rows (five is the default for head()).
df.head()

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count
0,2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0,3,13,16
1,2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0,8,32,40
2,2011-01-01 02:00:00,1,0,0,1,9.02,13.635,80,0.0,5,27,32
3,2011-01-01 03:00:00,1,0,0,1,9.84,14.395,75,0.0,3,10,13
4,2011-01-01 04:00:00,1,0,0,1,9.84,14.395,75,0.0,0,1,1


In [4]:
# Pairwise Pearson correlation of the numeric columns only.  The string
# `datetime` column makes a bare df.corr() raise on pandas >= 2.0, so the
# numeric columns are selected explicitly (works on older versions as well).
df.select_dtypes(include="number").corr()

Unnamed: 0,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count
season,1.0,0.029368,-0.008126,0.008879,0.258689,0.264744,0.19061,-0.147121,0.096758,0.164011,0.163439
holiday,0.029368,1.0,-0.250491,-0.007074,0.000295,-0.005215,0.001929,0.008409,0.043799,-0.020956,-0.005393
workingday,-0.008126,-0.250491,1.0,0.033772,0.029966,0.02466,-0.01088,0.013373,-0.319111,0.11946,0.011594
weather,0.008879,-0.007074,0.033772,1.0,-0.055035,-0.055376,0.406244,0.007261,-0.135918,-0.10934,-0.128655
temp,0.258689,0.000295,0.029966,-0.055035,1.0,0.984948,-0.064949,-0.017852,0.467097,0.318571,0.394454
atemp,0.264744,-0.005215,0.02466,-0.055376,0.984948,1.0,-0.043536,-0.057473,0.462067,0.314635,0.389784
humidity,0.19061,0.001929,-0.01088,0.406244,-0.064949,-0.043536,1.0,-0.318607,-0.348187,-0.265458,-0.317371
windspeed,-0.147121,0.008409,0.013373,0.007261,-0.017852,-0.057473,-0.318607,1.0,0.092276,0.091052,0.101369
casual,0.096758,0.043799,-0.319111,-0.135918,0.467097,0.462067,-0.348187,0.092276,1.0,0.49725,0.690414
registered,0.164011,-0.020956,0.11946,-0.10934,0.318571,0.314635,-0.265458,0.091052,0.49725,1.0,0.970948


In [6]:
# Pearson correlation (the default method) among the three rental-count columns.
df.loc[:, ["casual", "registered", "count"]].corr(method="pearson")

Unnamed: 0,casual,registered,count
casual,1.0,0.49725,0.690414
registered,0.49725,1.0,0.970948
count,0.690414,0.970948,1.0


In [7]:
# Same three columns with Kendall's tau (entered as a test).
df.loc[:, ["casual", "registered", "count"]].corr(method="kendall")

Unnamed: 0,casual,registered,count
casual,1.0,0.582213,0.666411
registered,0.582213,1.0,0.919346
count,0.666411,0.919346,1.0


In [8]:
# Same three columns with Spearman's rank correlation.
df.loc[:, ["casual", "registered", "count"]].corr(method="spearman")

Unnamed: 0,casual,registered,count
casual,1.0,0.775785,0.847378
registered,0.775785,1.0,0.988901
count,0.847378,0.988901,1.0


In [9]:
# Pearson test between casual and registered rentals; the result holds the
# correlation coefficient followed by the p-value.
pearsonr(x=df["casual"], y=df["registered"])

(0.49724968508700823, 0.0)

In [10]:
# Unpack the coefficient and the p-value, then print each on its own line.
stat, p = pearsonr(x=df["casual"], y=df["registered"])
print(stat, p, sep="\n")

0.49724968508700823
0.0


In [15]:
# Correlations rounded to two decimals; the lowest coefficient is -0.35.
df.loc[:, ["temp", "atemp", "humidity", "casual"]].corr().round(decimals=2)

Unnamed: 0,temp,atemp,humidity,casual
temp,1.0,0.98,-0.06,0.47
atemp,0.98,1.0,-0.04,0.46
humidity,-0.06,-0.04,1.0,-0.35
casual,0.47,0.46,-0.35,1.0


In [16]:
# Correlation between atemp and casual, computed separately for each season.
df.groupby("season")[["atemp", "casual"]].corr()

Unnamed: 0_level_0,Unnamed: 1_level_0,atemp,casual
season,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,atemp,1.0,0.478312
1,casual,0.478312,1.0
2,atemp,1.0,0.378122
2,casual,0.378122,1.0
3,atemp,1.0,0.381423
3,casual,0.381423,1.0
4,atemp,1.0,0.443751
4,casual,0.443751,1.0


In [18]:
# Per-season atemp/casual correlations, flattened so that the season and the
# row variable become ordinary columns instead of index levels.
df_corr = df.groupby("season")[["atemp", "casual"]].corr().reset_index()
df_corr

Unnamed: 0,season,level_1,atemp,casual
0,1,atemp,1.0,0.478312
1,1,casual,0.478312,1.0
2,2,atemp,1.0,0.378122
3,2,casual,0.378122,1.0
4,3,atemp,1.0,0.381423
5,3,casual,0.381423,1.0
6,4,atemp,1.0,0.443751
7,4,casual,0.443751,1.0


In [19]:
# Drop atemp's correlation with itself by row label rather than by `atemp < 1`:
# a floating-point diagonal may not be exactly 1.0, and a genuine correlation
# of 1 between two different variables would be discarded by the numeric test.
df_corr = df_corr.loc[df_corr['level_1'] != 'atemp']
df_corr

Unnamed: 0,season,level_1,atemp,casual
1,1,casual,0.478312,1.0
3,2,casual,0.378122,1.0
5,3,casual,0.381423,1.0
7,4,casual,0.443751,1.0


In [21]:
# Distinct weather codes present in the data.
pd.unique(df["weather"])

array([1, 2, 3, 4], dtype=int64)

In [29]:
# Flag rows whose weather code is 1 (treated as "sunny" in this notebook).
# The earlier `== 1 + 0` parsed as `== (1 + 0)` because `+` binds tighter than
# `==`, so the `+ 0` never converted anything; the column is and stays boolean.
df['is_sunny'] = df['weather'] == 1
# Correlation between casual rentals and temperature within each flag value.
df_corr = df.groupby('is_sunny')[['casual', 'temp']].corr()
df_corr

Unnamed: 0_level_0,Unnamed: 1_level_0,casual,temp
is_sunny,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
False,casual,1.0,0.446361
False,temp,0.446361,1.0
True,casual,1.0,0.471053
True,temp,0.471053,1.0


In [36]:
# Absolute gap between the casual-temp correlation on non-sunny days (row 1)
# and on sunny days (row 3), rounded to three decimals.
corr_not_sunny = df_corr.iloc[1, 0]
corr_sunny = df_corr.iloc[3, 0]
round(abs(corr_not_sunny - corr_sunny), 3)

0.025

## 군집분석

1. 계층적 군집분석
 - 특징<br>
 1) 데이터간 유사도를 기반으로 계산하며, 군집의 개수가 정해져 있지 않음<br>
 2) 계층적 군집분석을 실시하는 과정과 실시 후 특정 군집 개수로 데이터를 라벨링하는 과정이 있음<br>
 3) 단점: 데이터 변동에 민감하며, 학습데이터가 많을 경우 연산에 많은 시간 소요<br>
 <br>
 - **sklearn-AgglomerativeClustering()**
  - 계층적 군집분석을 실시하는 sklearn의 함수
  - n_clusters 에 분리할 군집 개수를 설정
  - affinity(scikit-learn 1.2부터는 metric으로 이름 변경)에 데이터 간 거리 계산 방법, linkage에 군집 간 유사도 방법 설정
 <br>
 - 계층도(Dendrogram): matplotlib과 scipy의 dendrogram(), linkage() 사용
 
 <br>
2. 비계층적 군집분석

In [38]:
import pandas as pd
from sklearn.cluster import AgglomerativeClustering
from scipy.cluster.hierarchy import dendrogram, linkage
from matplotlib import pyplot as plt

In [39]:
# Load the iris data set from the local CSV file.
df = pd.read_csv(filepath_or_buffer="C:/Users/silan/Python/Data/iris.csv")

In [40]:
# Preview the first four rows.
df.iloc[:4]

Unnamed: 0,Sepal.Length,Sepal.Width,Petal.Length,Petal.Width,Species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa


In [41]:
# Use every column except the last one (Species) as the clustering features.
n_features = df.shape[1] - 1
df_sub = df.iloc[:, :n_features]
df_sub.iloc[:2]

Unnamed: 0,Sepal.Length,Sepal.Width,Petal.Length,Petal.Width
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2


In [42]:
# Build an agglomerative (hierarchical) clustering model with three clusters,
# then fit it on the feature columns; fit() returns the estimator itself.
model = AgglomerativeClustering(n_clusters=3)
model.fit(df_sub)
model

AgglomerativeClustering(n_clusters=3)

In [43]:
# Cluster label assigned to each row of the fitted data, in row order.
model.labels_

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 2, 2, 2, 2, 0, 2, 2, 2,
       2, 2, 2, 0, 0, 2, 2, 2, 2, 0, 2, 0, 2, 0, 2, 2, 0, 0, 2, 2, 2, 2,
       2, 0, 0, 2, 2, 2, 0, 2, 2, 2, 0, 2, 2, 2, 0, 2, 2, 0], dtype=int64)

In [44]:
# Attach each row's cluster label to the original frame.
df["cluster"] = model.labels_
df.iloc[:2]

Unnamed: 0,Sepal.Length,Sepal.Width,Petal.Length,Petal.Width,Species,cluster
0,5.1,3.5,1.4,0.2,setosa,1
1,4.9,3.0,1.4,0.2,setosa,1


In [47]:
# Cross-tabulate the true species (rows) against the assigned cluster (columns).
pd.crosstab(index=df["Species"], columns=df["cluster"])

cluster,0,1,2
Species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
setosa,0,50,0
versicolor,49,0,1
virginica,15,0,35


In [48]:
# Mean of each numeric feature per cluster, for comparing cluster profiles.
# numeric_only=True keeps the string `Species` column out of the aggregation,
# which would otherwise raise TypeError on pandas >= 2.0.
df.groupby('cluster').mean(numeric_only=True).reset_index()

Unnamed: 0,cluster,Sepal.Length,Sepal.Width,Petal.Length,Petal.Width
0,0,5.920312,2.751562,4.420312,1.434375
1,1,5.006,3.428,1.462,0.246
2,2,6.869444,3.086111,5.769444,2.105556
