In [1]:
import pandas as pd  
import numpy as np  
from sklearn.preprocessing import StandardScaler  
from sklearn.decomposition import PCA  
import matplotlib.pyplot as plt  
import seaborn as sns  
from matplotlib import rcParams 
import joblib

# Enable Chinese text in figures: use the SimHei font for sans-serif text and
# render minus signs with the ASCII hyphen rather than the Unicode minus glyph.
rcParams.update({
    'font.sans-serif': ['SimHei'],
    'axes.unicode_minus': False,
})

In [2]:
# Load the source dataset from CSV.
data = pd.read_csv(r"C:\Users\ghc_l\Desktop\43final\datause.csv")

# Candidate explanatory variables, examined before feature selection.
initial_features = [
    'area', 'layout', 'site', 'total_height',
    'toward', 'built_year', 'price_per_sqm', 'age',
]

# Split into the candidate feature matrix and the response variable (price).
X_initial = data[initial_features]
y = data['price']

In [3]:
# Pairwise correlation between all candidate features (DataFrame.corr defaults).
correlation_matrix = X_initial.corr()

# Print the header and the matrix, each on its own line.
print("相关性矩阵：", correlation_matrix, sep="\n")

相关性矩阵：
                   area    layout      site  total_height    toward  \
area           1.000000  0.747019 -0.003462      0.091229  0.187621   
layout         0.747019  1.000000 -0.023110      0.029967  0.199039   
site          -0.003462 -0.023110  1.000000     -0.037975 -0.035359   
total_height   0.091229  0.029967 -0.037975      1.000000  0.123839   
toward         0.187621  0.199039 -0.035359      0.123839  1.000000   
built_year     0.021741  0.035812 -0.098482      0.533032  0.194916   
price_per_sqm  0.439363  0.411331 -0.030444      0.290498  0.158511   
age           -0.021741 -0.035812  0.098482     -0.533032 -0.194916   

               built_year  price_per_sqm       age  
area             0.021741       0.439363 -0.021741  
layout           0.035812       0.411331 -0.035812  
site            -0.098482      -0.030444  0.098482  
total_height     0.533032       0.290498 -0.533032  
toward           0.194916       0.158511 -0.194916  
built_year       1.000000       0.2

In [4]:
# Draw the correlation matrix as an annotated heatmap and write it to a PNG.
fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(
    correlation_matrix,
    ax=ax,
    annot=True,       # write the coefficient inside every cell
    fmt='.2f',
    cmap='coolwarm',
    center=0,         # anchor the diverging colormap at zero correlation
    square=True,
)
ax.set_title('Feature Correlation Matrix')
fig.savefig(
    r'C:\Users\ghc_l\Desktop\43final\特征相关性热力图.png',
    dpi=300,
    bbox_inches='tight',
)
# Close the figure once saved so it is not kept open (or rendered inline).
plt.close(fig)

In [5]:
# Feature selection: keep every candidate except built_year. In the correlation
# output the built_year and age rows mirror each other exactly (same magnitudes,
# opposite signs), so the two columns carry the same information.
# NOTE(review): price_per_sqm is presumably price / area, which would leak the
# target into the feature set -- confirm before using these features to model price.
selected_features = [
    'area',
    'layout',
    'site',
    'total_height',
    'toward',
    'price_per_sqm',
    'age',
]
X = data[selected_features]

In [6]:
# Standardise every selected feature to zero mean and unit variance.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

# Re-attach the column names so the summary below is readable.
scaled_df = pd.DataFrame(data=X_scaled, columns=selected_features)

# describe() reports the sample standard deviation (ddof=1) while the scaler
# normalises by the population one, so a std marginally above 1 is expected.
scaled_summary = scaled_df.describe()
print("\n标准化后的数据统计：")
print(scaled_summary)


标准化后的数据统计：
               area        layout          site  total_height        toward  \
count  2.541000e+03  2.541000e+03  2.541000e+03  2.541000e+03  2.541000e+03   
mean  -2.740385e-16  2.366379e-16 -3.355574e-17  6.711147e-17  1.006672e-16   
std    1.000197e+00  1.000197e+00  1.000197e+00  1.000197e+00  1.000197e+00   
min   -1.593543e+00 -2.340238e+00 -2.288847e+00 -2.378360e+00 -1.308024e+00   
25%   -6.191756e-01 -2.501186e-01 -1.133970e+00 -7.414920e-01 -9.803085e-01   
50%   -2.159889e-01 -2.501186e-01  2.090687e-02  2.213714e-01  2.837362e-03   
75%    3.383927e-01  7.845938e-01  1.175784e+00  6.065168e-01  1.313699e+00   
max    1.050239e+01  5.958156e+00  2.330661e+00  2.339671e+00  1.641414e+00   

       price_per_sqm          age  
count   2.541000e+03  2541.000000  
mean   -2.796311e-18     0.000000  
std     1.000197e+00     1.000197  
min    -2.221575e+00    -1.459990  
25%    -6.410715e-01    -0.897872  
50%    -1.697747e-01    -0.335755  
75%     5.074697e-01    

In [7]:
# Exploratory PCA that keeps every component, used to inspect how the variance
# is spread before choosing how many components to retain.
pca_initial = PCA()
X_pca_initial = pca_initial.fit_transform(X_scaled)

# Share of total variance captured by each component, and its running total.
explained_variance_ratio = pca_initial.explained_variance_ratio_
cumulative_variance_ratio = explained_variance_ratio.cumsum()

print("\n各主成分方差解释比例：")
for pc_number, share in enumerate(explained_variance_ratio, start=1):
    print(f"PC{pc_number}: {share:.4f} ({share*100:.2f}%)")

print("\n累计方差解释比例：")
for pc_count, running_share in enumerate(cumulative_variance_ratio, start=1):
    print(f"前{pc_count}个主成分: {running_share:.4f} ({running_share*100:.2f}%)")


各主成分方差解释比例：
PC1: 0.3315 (33.15%)
PC2: 0.2186 (21.86%)
PC3: 0.1415 (14.15%)
PC4: 0.1280 (12.80%)
PC5: 0.0795 (7.95%)
PC6: 0.0658 (6.58%)
PC7: 0.0351 (3.51%)

累计方差解释比例：
前1个主成分: 0.3315 (33.15%)
前2个主成分: 0.5501 (55.01%)
前3个主成分: 0.6917 (69.17%)
前4个主成分: 0.8196 (81.96%)
前5个主成分: 0.8992 (89.92%)
前6个主成分: 0.9649 (96.49%)
前7个主成分: 1.0000 (100.00%)


In [8]:
# Plot cumulative explained variance against the number of components kept,
# then write the figure to a PNG.
fig, ax = plt.subplots(figsize=(10, 6))
component_counts = range(1, len(explained_variance_ratio) + 1)
ax.plot(component_counts, cumulative_variance_ratio, 'bo-')
ax.set_xlabel('Number of Components')
ax.set_ylabel('Cumulative Explained Variance Ratio')
ax.set_title('PCA Cumulative Variance Ratio')
ax.grid(True)
fig.savefig(
    r'C:\Users\ghc_l\Desktop\43final\主成分累积方差解释率.png',
    dpi=300,
    bbox_inches='tight',
)
# Close the figure once saved so it is not kept open (or rendered inline).
plt.close(fig)

In [9]:
# Final PCA: retain the first 6 components, which the cumulative variance
# printout above shows account for 96.49% of the total variance.
n_components = 6
pca_final = PCA(n_components=n_components)
X_reduced = pca_final.fit_transform(X_scaled)

# Weight table: one row per original feature, one column per principal
# component (components_ is component-by-feature, hence the transpose).
pc_labels = [f'PC{k}' for k in range(1, n_components + 1)]
components_df = pd.DataFrame(
    pca_final.components_.T,
    index=selected_features,
    columns=pc_labels,
)

print("\n主成分特征权重：")
print(components_df)


主成分特征权重：
                    PC1       PC2       PC3       PC4       PC5       PC6
area           0.508696  0.379881 -0.000479 -0.092385 -0.288760  0.052150
layout         0.497304  0.398041  0.043868 -0.053540 -0.296244 -0.181810
site          -0.059156  0.138702 -0.973702  0.149338 -0.009103 -0.081649
total_height   0.299525 -0.560282 -0.172886 -0.171622 -0.345655  0.634043
toward         0.278568 -0.085773  0.102008  0.935232  0.109693  0.133931
price_per_sqm  0.487380 -0.034586 -0.087700 -0.248664  0.825954  0.091574
age           -0.292511  0.596277  0.044655  0.018705  0.122736  0.727479


In [10]:
# Export the full component weight table to Excel.
components_df.to_excel(r"C:\Users\ghc_l\Desktop\43final\主成分含义.xlsx")

# Heatmap of the feature weights, restricted to the first three components.
leading_components = components_df[['PC1', 'PC2', 'PC3']]
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(
    leading_components,
    ax=ax,
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    center=0,  # anchor the diverging colormap at a weight of zero
)
ax.set_title('Feature Weights in Principal Components')
fig.savefig(
    r'C:\Users\ghc_l\Desktop\43final\主成分特征权重热力图.png',
    dpi=300,
    bbox_inches='tight',
)
# Close the figure once saved so it is not kept open (or rendered inline).
plt.close(fig)

In [14]:
# Build the final dataset: the retained principal components plus the target price.
pc_columns = [f"PC{i+1}" for i in range(n_components)]
final_df = pd.DataFrame(X_reduced, columns=pc_columns)

# Attach price by position, not by index label. final_df has a fresh 0..n-1
# index, whereas y carries the index of `data`; assigning the Series directly
# would align on labels and silently produce NaN or shifted prices if `data`
# were ever filtered without resetting its index. Assigning the raw array keeps
# row i of the components paired with row i of the source data, and pandas
# raises if the lengths differ.
final_df['price'] = y.to_numpy()

# utf-8-sig writes a BOM at the start of the file.
final_df.to_csv(r"C:\Users\ghc_l\Desktop\43final\降维后的数据.csv",
                index=False,
                encoding='utf-8-sig')

In [16]:
# Persist the fitted preprocessing objects so the same transformation can be
# re-applied at prediction time: the scaler first, then the PCA transformer.
fitted_artifacts = (
    (scaler, r'C:\Users\ghc_l\Desktop\43final\scaler.pkl'),
    (pca_final, r'C:\Users\ghc_l\Desktop\43final\pca_transformer.pkl'),
)
for fitted_object, target_path in fitted_artifacts:
    joblib.dump(fitted_object, target_path)

# Also record the feature names, in the column order the scaler was fitted on.
feature_order = pd.DataFrame(selected_features, columns=['feature_names'])
feature_order.to_csv(r'C:\Users\ghc_l\Desktop\43final\feature_order.csv', index=False)