In [None]:


import pandas as pd 
import numpy as np
import seaborn as sns

from sklearn.decomposition import PCA

# 필요한 패키지/모듈 가져오기
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm
%matplotlib inline

from factor_analyzer import FactorAnalyzer



In [None]:
data = pd.read_csv('divorce_data.csv', sep =';')

In [None]:
data.head()

In [None]:
data = data.set_index('Divorce')

In [None]:
data.info()

In [None]:
# Column-wise scaling (z-standardization)
from sklearn.preprocessing import StandardScaler

standardized = StandardScaler().fit_transform(data)
data_scale = pd.DataFrame(standardized, index=data.index, columns=data.columns)

In [None]:
data_scale.head()

In [None]:
data_scale.cov()

In [None]:
data_scale.corr()

# Principal Component Analysis

In [None]:
# Run PCA on the standardized data (no limit on the number of components is set)
pca_seed = 20210323  # fixed seed passed to PCA's random_state
pca = PCA(random_state=pca_seed)
X_p = pca.fit_transform(X=data_scale)


In [None]:
# Eigenvalues of the correlation matrix; these feed the scree plots.
# (Correlations are unaffected by standardization, so `data` is fine here.)
# A correlation matrix is symmetric, so use eigh: it always returns real
# eigenvalues, whereas np.linalg.eig gives no ordering guarantee and may
# return a complex dtype.
ev, v = np.linalg.eigh(data.corr())
# eigh returns eigenvalues in ascending order; a scree plot needs largest first.
descending = np.argsort(ev)[::-1]
ev, v = ev[descending], v[:, descending]
ev

In [None]:
#시각화
plt.scatter(range(1, data_scale.shape[1]+1),ev)
plt.plot(range(1,data_scale.shape[1]+1),ev)
plt.title('Scree Plot')
plt.xlabel('Principal Components')
plt.ylabel('Eigenvalue')
plt.grid()
plt.show()

In [None]:
plt.scatter(range(1, data_scale.shape[1]+1),ev)
plt.plot(range(1, data_scale.shape[1]+1),ev)
plt.axis([0,5,0,10])
plt.yticks([0,0.5,1,5,10])
plt.title('Scree Plot')
plt.xlabel('Principal Components')
plt.ylabel('Eigenvalue')
plt.show()

In [None]:
each_variance_ratio = pca.explained_variance_ratio_
each_variance_ratio #각 PC별 분산 설명 비율율

In [None]:
#고유값을 기준으로 설명 가능한 분산 (누적)
cumsum = np.cumsum(pca.explained_variance_ratio_)

pd.Series(np.cumsum(pca.explained_variance_ratio_)) 

In [None]:
# Visualization 2: bar chart of the percentage of variance explained per component
plt.figure(figsize=(30, 10))
percent_variance = np.round(pca.explained_variance_ratio_ * 100, decimals=2)
pc_labels = [f'PC{k}' for k in range(1, len(percent_variance) + 1)]
bar_positions = range(len(percent_variance))

bars = plt.bar(x=bar_positions, height=percent_variance, tick_label=pc_labels)
plt.title('Variance Explained')
plt.xlabel('Principal Component')
plt.ylabel('Percentage of Variance Explained (%)')
plt.show()

In [None]:
# Loadings of the first four principal components.
# pca.components_ is laid out as (n_components, n_features), so the first four
# components are the first four ROWS. Transpose to get one row per variable and
# one column per PC (the previous [:, :4] slice picked the first four features).
pd.DataFrame(pca.components_[:4].T,
             columns=['PC1', 'PC2', 'PC3', 'PC4'],
             index=data.columns)

In [None]:
# Data produced by PCA: scores on PC1-PC4 (reduced to 4 variables in total)
pc_names = [f'PC{k}' for k in range(1, 5)]
X_pp = pd.DataFrame(X_p[:, :4], index=data.index, columns=pc_names)
X_pp

In [None]:
# Visualization 3: heatmap of the loadings of the first four components.
# pca.components_ is (n_components, n_features): take the first four rows and
# transpose so that rows are variables and columns are PCs (the previous
# [:, :4] slice selected the first four features instead of components).
pca_result = pd.DataFrame(pca.components_[:4].T,
                          columns=['PC1', 'PC2', 'PC3', 'PC4'],
                          index=data.columns)
plt.figure(figsize=(6, 10))
sns.heatmap(pca_result, cmap="Blues", annot=True, fmt='.2f')  # two decimal places

In [None]:
# Re-run PCA for the score plots below.
# Fit on the standardized data so the scores are consistent with the eigenvalues
# and loadings examined earlier (this cell previously fit on the unscaled `data`).
pca = PCA(random_state=20210323)
X_p = pca.fit_transform(data_scale)

In [None]:
# Data produced by PCA: keep the first four score columns (PC1-PC4)
first_four_scores = X_p[:, :4]
X_pp = pd.DataFrame(first_four_scores,
                    index=data.index,
                    columns=['PC1', 'PC2', 'PC3', 'PC4'])
X_pp

In [None]:
# PC1 x PC2 
sns.scatterplot(data=X_pp, x='PC1',y='PC2',hue=X_pp.index)

In [None]:
# PC1 x PC3
sns.scatterplot(data=X_pp, x='PC1',y='PC3',hue=X_pp.index)

In [None]:
# PC1 x PC4
sns.scatterplot(data=X_pp, x='PC1',y='PC4',hue=X_pp.index)

In [None]:
# PC2 x PC3 
sns.scatterplot(data=X_pp, x='PC2',y='PC3',hue=X_pp.index)

In [None]:
# PC2 x PC4 
sns.scatterplot(data=X_pp, x='PC2',y='PC4',hue=X_pp.index)

In [None]:
# PC3 x PC4 
sns.scatterplot(data=X_pp, x='PC3',y='PC4',hue=X_pp.index)

In [None]:
X_PC1 = X_pp[['PC1']]
X_PC2 = X_pp[['PC2']]
X_PC3= X_pp[['PC3']]
X_PC4 = X_pp[['PC4']]

In [None]:
X_PC1.groupby('Divorce').describe()

In [None]:
X_PC1.groupby('Divorce').describe()

In [None]:
X_PC1.groupby('Divorce').describe()

In [None]:
X_PC1.groupby('Divorce').describe()

# Factor Analysis

In [None]:
# Bartlett Test

from factor_analyzer.factor_analyzer import calculate_bartlett_sphericity
chi_square_value, p_value = calculate_bartlett_sphericity(data_scale)
chi_square_value, p_value # p-value < 0.05 --> 귀무가설 기각

In [None]:
# KMO Test (Kaiser-Meyer-Olkin Test)


from factor_analyzer.factor_analyzer import calculate_kmo
kmo_all, kmo_model =calculate_kmo(data_scale)
kmo_model # 0.8 이상이므로 꽤 좋음

## 요인 수 선택

In [None]:
# Factor-count selection, method 1: eigenvalues.
# Request as many factors as there are variables rather than hard-coding 54, so
# this cell keeps working if the set of columns changes.
fac = FactorAnalyzer(n_factors=data_scale.shape[1], rotation=None)
fac.fit(data_scale)
# get_eigenvalues() returns (original eigenvalues, common-factor eigenvalues)
ev, v = fac.get_eigenvalues()
ev

In [None]:
plt.scatter(range(1, data_scale.shape[1]+1),ev)
plt.plot(range(1, data_scale.shape[1]+1),ev)

plt.xlabel('Factor Number')
plt.ylabel('Eigenvalue')
plt.title('Scree Plot')
plt.show()

In [None]:
plt.scatter(range(1, data_scale.shape[1]+1),ev)
plt.plot(range(1, data_scale.shape[1]+1),ev)
plt.axis([0,5,0,10])
plt.yticks([0,0.5,1,5,10])
plt.xlabel('Factor Number')
plt.ylabel('Eigenvalue')
plt.title('Scree Plot')
plt.show()

# FA Rotation 시작

## 1) FA Rotation: Varimax

In [None]:
# factor 4개로 선택
fa = FactorAnalyzer(n_factors=4, rotation = "varimax")
fa.fit(data_scale)

In [None]:
# FA: Factor Loadings
fa_result_loading = pd.DataFrame(fa.loadings_, 
                                 columns = ['Factor1', 'Factor2', 'Factor3', 'Factor4'],
                                 index=data_scale.columns)
fa_result_loading

In [None]:
# Varimax로 한 Communality
commu_vari = pd.DataFrame(fa.get_communalities())
commu_vari.columns=['공통성']

In [None]:
# 시각화
plt.figure(figsize=(6,12))
sns.heatmap(fa_result_loading, cmap="Blues", annot=True, fmt='.2f') #소수 둘째자리까지

In [None]:
fa.get_factor_variance()
fa_result_fromLoading = pd.DataFrame(fa.get_factor_variance(), columns = ['Factor1', 'Factor2', 'Factor3','Factor4'])
fa_result_fromLoading.index = ['SS Loadings', 'Proportion Var', 'Cumulative Var']
fa_result_fromLoading

## 2) FA Rotation: Promax

In [None]:
# factor 4개로 선택
fa_p = FactorAnalyzer(n_factors=4, rotation = "promax")
fa_p.fit(data_scale)

In [None]:
# FA: Factor Loadings
fa_p_result_loading = pd.DataFrame(fa_p.loadings_, 
                                 columns = ['Factor1', 'Factor2', 'Factor3', 'Factor4'],
                                 index=data_scale.columns)
fa_p_result_loading

In [None]:
#Promax로 한 Communality
commu_pro = pd.DataFrame(fa_p.get_communalities())
commu_pro.columns=['공통성']

In [None]:
# 시각화
plt.figure(figsize=(6,10))
sns.heatmap(fa_p_result_loading, cmap="Blues", annot=True, fmt='.2f')

In [None]:
fa_p.get_factor_variance()
fa_p_result_fromLoading = pd.DataFrame(fa_p.get_factor_variance(), columns = ['Factor1', 'Factor2', 'Factor3','Factor4'])
fa_p_result_fromLoading.index = ['SS Loadings', 'Proportion Var', 'Cumulative Var']
fa_p_result_fromLoading

## 3) FA Rotation: Quartimax

In [None]:
# factor 4개로 선택
fa_q = FactorAnalyzer(n_factors=4, rotation = "quartimax")
fa_q.fit(data_scale)

In [None]:
# FA: Factor Loadings
fa_q_result_loading = pd.DataFrame(fa_q.loadings_, 
                                 columns = ['Factor1', 'Factor2', 'Factor3', 'Factor4'],
                                 index=data_scale.columns)
fa_q_result_loading.head()

In [None]:
#Quartimax로 한 Communality
commu_Quarti = pd.DataFrame(fa_q.get_communalities())
commu_Quarti.columns=['공통성']

In [None]:
# 시각화
plt.figure(figsize=(6,10))
sns.heatmap(fa_q_result_loading, cmap="Blues", annot=True, fmt='.2f')

In [None]:
fa_q.get_factor_variance()
fa_q_result_fromLoading = pd.DataFrame(fa_q.get_factor_variance(), columns = ['Factor1', 'Factor2', 'Factor3','Factor4'])
fa_q_result_fromLoading.index = ['SS Loadings', 'Proportion Var', 'Cumulative Var']
fa_q_result_fromLoading

## 4) FA Rotation: Oblimin

In [None]:
# factor 4개로 선택
fa_o = FactorAnalyzer(n_factors=4, rotation = "oblimin")
fa_o.fit(data_scale)

In [None]:
# FA: Factor Loadings
fa_o_result_loading = pd.DataFrame(fa_p.loadings_, 
                                 columns = ['Factor1', 'Factor2', 'Factor3', 'Factor4'],
                                 index=data_scale.columns)
fa_o_result_loading.head()

In [None]:
#Oblimin으로 한 Communality
commu_Obli = pd.DataFrame(fa_o.get_communalities())
commu_Obli.columns=['공통성']

In [None]:
# 시각화
plt.figure(figsize=(6,10))
sns.heatmap(fa_o_result_loading, cmap="Blues", annot=True, fmt='.2f')

In [None]:
fa_o.get_factor_variance()
fa_o_result_fromLoading = pd.DataFrame(fa_o.get_factor_variance(), columns = ['Factor1', 'Factor2', 'Factor3','Factor4'])
fa_o_result_fromLoading.index = ['SS Loadings', 'Proportion Var', 'Cumulative Var']
fa_o_result_fromLoading

## >> 네 가지 rotation 방식 비교: 직교회전 두 방식(Varimax, Quartimax)에서는 Cumulative Var가 0.81로 나왔고 사각회전(Promax, Oblimin)에서는 더 낮게 나왔으므로 직교회전을 채택하였다. 그중 Quartimax는 제1요인에 설명력이 과도하게 집중되어 있으므로, 다요인 분석에 용이한 Varimax를 선택하여 분석을 진행하였다.

https://m.blog.naver.com/shoutjoy/221802826087

## Factor 1: Affection (배우자에 대한 애정 및 관심도)
## Factor 2: Aggression (논쟁 시 배우자에 대한 공격성)
## Factor 3: Silence (논쟁 시 말을 많이 하는지)
## Factor 4: Home-Distance (집 내에서 친밀도)

In [None]:
# Factor scores from `fa`, with columns labelled by the interpreted factor names
X_f = fa.fit_transform(data_scale)
score_columns = ['Affection', 'Aggression', 'Silence', 'Home-Distance']
X_ff = pd.DataFrame(X_f[:, :4], index=data_scale.index, columns=score_columns)
X_ff

## FA Score Plot 분석

In [None]:
# Score plot F1xF2
sns.scatterplot(data=X_ff, x='Affection',y='Aggression',hue=X_ff.index)

In [None]:
# Score plot F1xF3
sns.scatterplot(data=X_ff, x='Affection',y='Silence',hue=X_ff.index)

In [None]:
# Score plot F1xF4
sns.scatterplot(data=X_ff, x='Affection',y='Home-Distance',hue=X_ff.index)

In [None]:
# Score plot F2xF4
sns.scatterplot(data=X_ff, x='Aggression',y='Home-Distance',hue=X_ff.index)

In [None]:
# Score plot F2xF3
sns.scatterplot(data=X_ff, x='Aggression',y='Silence',hue=X_ff.index)

In [None]:
# Score plot F3xF4
sns.scatterplot(data=X_ff, x='Silence',y='Home-Distance',hue=X_ff.index)

In [None]:
X_aff = X_ff[['Affection']]
X_agg = X_ff[['Aggression']]
X_si = X_ff[['Silence']]
X_home = X_ff[['Home-Distance']]

In [None]:
X_aff.groupby('Divorce').describe()

In [None]:
X_agg.groupby('Divorce').describe()

In [None]:
X_si.groupby('Divorce').describe()

In [None]:
X_home.groupby('Divorce').describe()